Compare commits

..

4 Commits

Author SHA1 Message Date
Georgi Gerganov
f8e9f11428 common : add -dkvc arg for enabling kv cache dumps 2023-11-23 18:47:56 +02:00
Kerfuffle
5df7d06c42 llama : allow exporting a view of the KV cache (#4180)
* Allow exporting a view of the KV cache

* Allow dumping the sequences per cell in common

* Track max contiguous cells value and position as well

* Fix max contiguous empty cells index calculation

Make dump functions deal better with lengths or sequence counts > 10

* Fix off by one error in dump_kv_cache_view

* Add doc comments for KV cache view functions

Eliminate cell sequence struct; use llama_seq_id directly

Minor cleanups
2023-11-23 18:31:20 +02:00
Georgi Gerganov
671f639c59 llama : zero KV cache used upon clear
ggml-ci
2023-11-22 19:31:09 +02:00
Georgi Gerganov
79cb8f0040 llama : keep track of used KV cells + better KV cache management 2023-11-22 17:16:57 +02:00
9 changed files with 35 additions and 47 deletions
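
For orientation, a minimal usage sketch of the KV cache view API described in the commit messages above (#4180), together with the dump helpers added to common and exposed via the `-dkvc` flag from f8e9f11428. The function names and signatures below are assumptions inferred from those messages, not copied from this diff; check llama.h and common.h in this range before relying on them.

```cpp
// Hypothetical usage sketch, not part of this diff: exercising the KV cache view
// added in #4180 together with the dump helpers the commit messages mention.
#include "common.h"
#include "llama.h"

static void debug_dump_kv_cache(llama_context * ctx, int32_t n_max_seq) {
    // Allocate a view that can report up to n_max_seq sequence ids per cell.
    llama_kv_cache_view view = llama_kv_cache_view_init(ctx, n_max_seq);

    // Refresh the snapshot (e.g. after each llama_decode call), then print it.
    llama_kv_cache_view_update(ctx, &view);
    dump_kv_cache_view(view, 80);       // per-cell occupancy counts
    dump_kv_cache_view_seqs(view, 40);  // per-cell sequence ids

    llama_kv_cache_view_free(&view);
}
```

The `-dkvc` argument added in f8e9f11428 presumably wires a dump along these lines into the common examples after each decode.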


@@ -10,7 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
### Hot topics
- Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167
- *No hot topics atm. Open to suggestions about what is hot today*
----
@@ -422,9 +422,8 @@ Building the program with BLAS support may lead to some performance improvements
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
cmake --build .
```
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS):
- Using `CMake` for Windows:
```bash
set PATH=%HIP_PATH%\bin;%PATH%
mkdir build
cd build
cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..


@@ -880,21 +880,20 @@ print(f"Loading model: {dir_model.name}")
hparams = Model.load_hparams(dir_model)
with torch.inference_mode():
model_class = Model.from_model_architecture(hparams["architectures"][0])
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
model_class = Model.from_model_architecture(hparams["architectures"][0])
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
print("Set model parameters")
model_instance.set_gguf_parameters()
print("Set model parameters")
model_instance.set_gguf_parameters()
print("Set model tokenizer")
model_instance.set_vocab()
print("Set model tokenizer")
model_instance.set_vocab()
if args.vocab_only:
print(f"Exporting model vocab to '{fname_out}'")
model_instance.write_vocab()
else:
print(f"Exporting model to '{fname_out}'")
model_instance.write()
if args.vocab_only:
print(f"Exporting model vocab to '{fname_out}'")
model_instance.write_vocab()
else:
print(f"Exporting model to '{fname_out}'")
model_instance.write()
print(f"Model successfully exported to '{fname_out}'")
print(f"Model successfully exported to '{fname_out}'")

Binary file not shown.

Binary file not shown.


@@ -153,7 +153,7 @@ while n_cur <= n_len {
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of stream? -> mark the stream as finished
if new_token_id == llama_token_eos(model) || n_cur == n_len {
if new_token_id == llama_token_eos(context) || n_cur == n_len {
i_batch[i] = -1
// print("")
if n_parallel > 1 {


@@ -1,5 +1,5 @@
// A basic application simulating a server with multiple clients.
// The clients submit requests to the server and they are processed in parallel.
// The clients submite requests to the server and they are processed in parallel.
#include "common.h"
#include "llama.h"


@@ -1095,7 +1095,6 @@ struct llama_server_context
std::lock_guard<std::mutex> lock(mutex_results);
task_result res;
res.id = id;
res.stop = false;
res.error = true;
res.result_json = { { "content", error } };
queue_results.push_back(res);
@@ -1256,7 +1255,6 @@ struct llama_server_context
std::lock_guard<std::mutex> lock(mutex_tasks);
task_server task;
task.id = id_gen++;
task.target_id = 0;
task.data = data;
task.infill_mode = infill;
task.embedding_mode = embedding;
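
A guess at why the two assignments above could be dropped, offered only as a sketch: if `task_result` and `task_server` carry in-class default member initializers (an assumption; this hunk does not show the struct definitions), the explicit assignments become redundant.

```cpp
// Assumption only: the struct definitions are not shown in this diff. If they
// default their members like this, the removed "res.stop = false;" and
// "task.target_id = 0;" assignments are unnecessary.
struct task_result {
    int  id;
    bool stop  = false;   // makes "res.stop = false;" redundant
    bool error = false;
    // json result_json;  // omitted in this sketch
};

struct task_server {
    int id;
    int target_id = 0;    // makes "task.target_id = 0;" redundant
    // ... remaining fields (data, infill_mode, embedding_mode, ...)
};
```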


@@ -1,5 +1,4 @@
#include <algorithm>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <limits>
@@ -4610,8 +4609,8 @@ static __global__ void rope(
template<typename T, bool has_pos>
static __global__ void rope_neox(
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
float ext_factor, float attn_factor, rope_corr_dims corr_dims
) {
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
@@ -4620,25 +4619,23 @@ static __global__ void rope_neox(
}
const int row = blockDim.x*blockIdx.x + threadIdx.x;
const int ib = col / n_dims;
const int ic = col % n_dims;
const int i = row*ncols + ib*n_dims + ic/2;
const int i = row*ncols + col/2;
const int i2 = row/p_delta_rows;
float cur_rot = inv_ndims * ic - ib;
// simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
const float cur_rot = -float(col)/ncols;
const int p = has_pos ? pos[i2] : 0;
const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
const float theta_base = p*powf(freq_base, cur_rot);
float cos_theta, sin_theta;
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
const float x0 = x[i + 0];
const float x1 = x[i + n_dims/2];
const float x1 = x[i + ncols/2];
dst[i + 0] = x0*cos_theta - x1*sin_theta;
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
dst[i + 0] = x0*cos_theta - x1*sin_theta;
dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
}
static __global__ void rope_glm_f32(
@@ -5741,26 +5738,20 @@ static void rope_cuda(
template<typename T>
static void rope_neox_cuda(
const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
) {
GGML_ASSERT(ncols % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
const dim3 block_nums(nrows, num_blocks_x, 1);
const float theta_scale = powf(freq_base, -2.0f/n_dims);
const float inv_ndims = -1.0f / n_dims;
if (pos == nullptr) {
rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, inv_ndims
x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
);
} else {
rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, inv_ndims
x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
);
}
}
@@ -6715,14 +6706,15 @@ inline void ggml_cuda_op_rope(
GGML_ASSERT(false);
rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
} else if (is_neox) {
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
if (src0->type == GGML_TYPE_F32) {
rope_neox_cuda(
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
(const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, main_stream
);
} else if (src0->type == GGML_TYPE_F16) {
rope_neox_cuda(
(const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
(const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, main_stream
);
} else {
@@ -8065,7 +8057,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
if (tensor->op == GGML_OP_MUL_MAT) {
if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
#ifndef NDEBUG
fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
#endif
return false;
}
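
Aside, not part of the diff: in the rope_neox hunks above, `theta_base` changes from `p*freq_scale*powf(theta_scale, col/2.0f)` (with `theta_scale = powf(freq_base, -2.0f/n_dims)`) to `p*powf(freq_base, -float(col)/ncols)`. Under the new `GGML_ASSERT(ne00 == n_dims)` the two power terms are identical, since `n_dims == ncols` forces `ib == 0` and `ic == col`; the remaining behavioural difference is that `freq_scale` no longer pre-multiplies `theta_base` and enters only through `rope_yarn`. A small standalone check of that identity, as a sketch with illustrative values (128-dim head, freq_base 10000) not taken from the diff:

```cpp
#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
    const int   ncols     = 128;      // equals n_dims under the new GGML_ASSERT
    const float freq_base = 10000.0f;
    const float p         = 42.0f;    // token position

    for (int col = 0; col < ncols; col += 2) {
        // old kernel: theta_scale = freq_base^(-2/n_dims), power term = theta_scale^(col/2)
        const float theta_scale = std::pow(freq_base, -2.0f / ncols);
        const float pow_old     = std::pow(theta_scale, col / 2.0f);

        // new kernel: cur_rot = -col/ncols, power term = freq_base^cur_rot
        const float cur_rot = -static_cast<float>(col) / ncols;
        const float pow_new = std::pow(freq_base, cur_rot);

        // Note: the old code also multiplied theta_base by freq_scale; the new code
        // applies freq_scale only inside rope_yarn. That difference is not checked here.
        assert(std::fabs(p * pow_old - p * pow_new) <= 1e-3f * p * pow_old);
    }
    std::puts("old and new rope_neox power terms agree");
    return 0;
}
```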


@@ -3469,7 +3469,7 @@ static void llm_build_k_shift(
struct ggml_cgraph * graph,
llm_rope_type type,
int64_t n_ctx,
int n_rot,
int64_t n_rot,
float freq_base,
float freq_scale,
const llm_build_cb & cb) {
@@ -3501,7 +3501,7 @@ static void llm_build_k_shift(
// we rotate only the first n_rot dimensions
ggml_rope_custom_inplace(ctx,
ggml_view_3d(ctx, kv.k,
n_embd_head, n_head_kv, n_ctx,
n_rot, n_head_kv, n_ctx,
ggml_element_size(kv.k)*n_embd_head,
ggml_element_size(kv.k)*n_embd_gqa,
ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),