Compare commits

..

4 Commits

Author SHA1 Message Date
Georgi Gerganov
f8e9f11428 common : add -dkvc arg for enabling kv cache dumps 2023-11-23 18:47:56 +02:00
Kerfuffle
5df7d06c42 llama : allow exporting a view of the KV cache (#4180)
* Allow exporting a view of the KV cache

* Allow dumping the sequences per cell in common

* Track max contiguous cells value and position as well

* Fix max contiguous empty cells index calculation

Make dump functions deal better with lengths or sequence counts > 10

* Fix off by one error in dump_kv_cache_view

* Add doc comments for KV cache view functions

Eliminate cell sequence struct; use llama_seq_id directly

Minor cleanups
2023-11-23 18:31:20 +02:00
Georgi Gerganov
671f639c59 llama : zero KV cache used upon clear
ggml-ci
2023-11-22 19:31:09 +02:00
Georgi Gerganov
79cb8f0040 llama : keep track of used KV cells + better KV cache management 2023-11-22 17:16:57 +02:00
9 changed files with 35 additions and 47 deletions
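
For orientation, a minimal usage sketch of the KV cache view API described in the commit messages above (#4180), together with the dump helpers added to common and exposed via the `-dkvc` flag from f8e9f11428. The function names and signatures below are assumptions inferred from those messages, not copied from this diff; check llama.h and common.h in this range before relying on them.

```cpp
// Hypothetical usage sketch, not part of this diff: exercising the KV cache view
// added in #4180 together with the dump helpers the commit messages mention.
#include "common.h"
#include "llama.h"

static void debug_dump_kv_cache(llama_context * ctx, int32_t n_max_seq) {
    // Allocate a view that can report up to n_max_seq sequence ids per cell.
    llama_kv_cache_view view = llama_kv_cache_view_init(ctx, n_max_seq);

    // Refresh the snapshot (e.g. after each llama_decode call), then print it.
    llama_kv_cache_view_update(ctx, &view);
    dump_kv_cache_view(view, 80);       // per-cell occupancy counts
    dump_kv_cache_view_seqs(view, 40);  // per-cell sequence ids

    llama_kv_cache_view_free(&view);
}
```

The `-dkvc` argument added in f8e9f11428 presumably wires a dump along these lines into the common examples after each decode.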


@@ -10,7 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
### Hot topics
- Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167
- *No hot topics atm. Open to suggestions about what is hot today*
----
@@ -422,9 +422,8 @@ Building the program with BLAS support may lead to some performance improvements
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
cmake --build .
```
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS):
- Using `CMake` for Windows:
```bash
set PATH=%HIP_PATH%\bin;%PATH%
mkdir build
cd build
cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..


@@ -880,21 +880,20 @@ print(f"Loading model: {dir_model.name}")
hparams = Model.load_hparams(dir_model)
with torch.inference_mode():
model_class = Model.from_model_architecture(hparams["architectures"][0])
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
model_class = Model.from_model_architecture(hparams["architectures"][0])
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
print("Set model parameters")
model_instance.set_gguf_parameters()
print("Set model parameters")
model_instance.set_gguf_parameters()
print("Set model tokenizer")
model_instance.set_vocab()
print("Set model tokenizer")
model_instance.set_vocab()
if args.vocab_only:
print(f"Exporting model vocab to '{fname_out}'")
model_instance.write_vocab()
else:
print(f"Exporting model to '{fname_out}'")
model_instance.write()
if args.vocab_only:
print(f"Exporting model vocab to '{fname_out}'")
model_instance.write_vocab()
else:
print(f"Exporting model to '{fname_out}'")
model_instance.write()
print(f"Model successfully exported to '{fname_out}'")
print(f"Model successfully exported to '{fname_out}'")

Binary file not shown.

Binary file not shown.


@@ -153,7 +153,7 @@ while n_cur <= n_len {
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of stream? -> mark the stream as finished
if new_token_id == llama_token_eos(model) || n_cur == n_len {
if new_token_id == llama_token_eos(context) || n_cur == n_len {
i_batch[i] = -1
// print("")
if n_parallel > 1 {


@@ -1,5 +1,5 @@
// A basic application simulating a server with multiple clients.
// The clients submit requests to the server and they are processed in parallel.
// The clients submite requests to the server and they are processed in parallel.
#include "common.h"
#include "llama.h"


@@ -1095,7 +1095,6 @@ struct llama_server_context
std::lock_guard<std::mutex> lock(mutex_results);
task_result res;
res.id = id;
res.stop = false;
res.error = true;
res.result_json = { { "content", error } };
queue_results.push_back(res);
@@ -1256,7 +1255,6 @@ struct llama_server_context
std::lock_guard<std::mutex> lock(mutex_tasks);
task_server task;
task.id = id_gen++;
task.target_id = 0;
task.data = data;
task.infill_mode = infill;
task.embedding_mode = embedding;
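
A guess at why the two assignments above could be dropped, offered only as a sketch: if `task_result` and `task_server` carry in-class default member initializers (an assumption; this hunk does not show the struct definitions), the explicit assignments become redundant.

```cpp
// Assumption only: the struct definitions are not shown in this diff. If they
// default their members like this, the removed "res.stop = false;" and
// "task.target_id = 0;" assignments are unnecessary.
struct task_result {
    int  id;
    bool stop  = false;   // makes "res.stop = false;" redundant
    bool error = false;
    // json result_json;  // omitted in this sketch
};

struct task_server {
    int id;
    int target_id = 0;    // makes "task.target_id = 0;" redundant
    // ... remaining fields (data, infill_mode, embedding_mode, ...)
};
```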


@@ -1,5 +1,4 @@
#include <algorithm>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <limits>
@@ -4610,8 +4609,8 @@ static __global__ void rope(
template<typename T, bool has_pos>
static __global__ void rope_neox(
const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
float ext_factor, float attn_factor, rope_corr_dims corr_dims
) {
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
@@ -4620,25 +4619,23 @@ static __global__ void rope_neox(
}
const int row = blockDim.x*blockIdx.x + threadIdx.x;
const int ib = col / n_dims;
const int ic = col % n_dims;
const int i = row*ncols + ib*n_dims + ic/2;
const int i = row*ncols + col/2;
const int i2 = row/p_delta_rows;
float cur_rot = inv_ndims * ic - ib;
// simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
const float cur_rot = -float(col)/ncols;
const int p = has_pos ? pos[i2] : 0;
const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
const float theta_base = p*powf(freq_base, cur_rot);
float cos_theta, sin_theta;
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
const float x0 = x[i + 0];
const float x1 = x[i + n_dims/2];
const float x1 = x[i + ncols/2];
dst[i + 0] = x0*cos_theta - x1*sin_theta;
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
dst[i + 0] = x0*cos_theta - x1*sin_theta;
dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
}
static __global__ void rope_glm_f32(
@@ -5741,26 +5738,20 @@ static void rope_cuda(
template<typename T>
static void rope_neox_cuda(
const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
) {
GGML_ASSERT(ncols % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
const dim3 block_nums(nrows, num_blocks_x, 1);
const float theta_scale = powf(freq_base, -2.0f/n_dims);
const float inv_ndims = -1.0f / n_dims;
if (pos == nullptr) {
rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, inv_ndims
x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
);
} else {
rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, inv_ndims
x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
);
}
}
@@ -6715,14 +6706,15 @@ inline void ggml_cuda_op_rope(
GGML_ASSERT(false);
rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
} else if (is_neox) {
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
if (src0->type == GGML_TYPE_F32) {
rope_neox_cuda(
(const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
(const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, main_stream
);
} else if (src0->type == GGML_TYPE_F16) {
rope_neox_cuda(
(const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
(const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, main_stream
);
} else {
@@ -8065,7 +8057,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
if (tensor->op == GGML_OP_MUL_MAT) {
if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
#ifndef NDEBUG
fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
#endif
return false;
}
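
Aside, not part of the diff: in the rope_neox hunks above, `theta_base` changes from `p*freq_scale*powf(theta_scale, col/2.0f)` (with `theta_scale = powf(freq_base, -2.0f/n_dims)`) to `p*powf(freq_base, -float(col)/ncols)`. Under the new `GGML_ASSERT(ne00 == n_dims)` the two power terms are identical, since `n_dims == ncols` forces `ib == 0` and `ic == col`; the remaining behavioural difference is that `freq_scale` no longer pre-multiplies `theta_base` and enters only through `rope_yarn`. A small standalone check of that identity, as a sketch with illustrative values (128-dim head, freq_base 10000) not taken from the diff:

```cpp
#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
    const int   ncols     = 128;      // equals n_dims under the new GGML_ASSERT
    const float freq_base = 10000.0f;
    const float p         = 42.0f;    // token position

    for (int col = 0; col < ncols; col += 2) {
        // old kernel: theta_scale = freq_base^(-2/n_dims), power term = theta_scale^(col/2)
        const float theta_scale = std::pow(freq_base, -2.0f / ncols);
        const float pow_old     = std::pow(theta_scale, col / 2.0f);

        // new kernel: cur_rot = -col/ncols, power term = freq_base^cur_rot
        const float cur_rot = -static_cast<float>(col) / ncols;
        const float pow_new = std::pow(freq_base, cur_rot);

        // Note: the old code also multiplied theta_base by freq_scale; the new code
        // applies freq_scale only inside rope_yarn. That difference is not checked here.
        assert(std::fabs(p * pow_old - p * pow_new) <= 1e-3f * p * pow_old);
    }
    std::puts("old and new rope_neox power terms agree");
    return 0;
}
```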


@@ -3469,7 +3469,7 @@ static void llm_build_k_shift(
struct ggml_cgraph * graph,
llm_rope_type type,
int64_t n_ctx,
int n_rot,
int64_t n_rot,
float freq_base,
float freq_scale,
const llm_build_cb & cb) {
@@ -3501,7 +3501,7 @@ static void llm_build_k_shift(
// we rotate only the first n_rot dimensions
ggml_rope_custom_inplace(ctx,
ggml_view_3d(ctx, kv.k,
n_embd_head, n_head_kv, n_ctx,
n_rot, n_head_kv, n_ctx,
ggml_element_size(kv.k)*n_embd_head,
ggml_element_size(kv.k)*n_embd_gqa,
ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),