Compare commits

...

4 Commits

Author SHA1 Message Date
Georgi Gerganov
ff76c6731d cont : cache shift support 2026-03-27 14:39:14 +02:00
Georgi Gerganov
7711b3a36a cont : rotate caches separately + support non-power-of-2 head sizes 2026-03-27 14:07:38 +02:00
Georgi Gerganov
832e32639f cont : rotate V more + refactor 2026-03-27 11:29:16 +02:00
Georgi Gerganov
e5aa067d68 llama : rotate activations for better quantization 2026-03-26 19:04:04 +02:00
5 changed files with 297 additions and 3 deletions

View File

@@ -52,6 +52,43 @@ static bool can_reuse_kq_mask(
// impl
// true iff n is a positive power of two
// note: the guard n > 0 is required — (0 & -1) == 0 would otherwise classify 0
//       (and the bit trick is meaningless for negative values)
static bool ggml_is_power_of_2(int n) {
    return n > 0 && (n & (n - 1)) == 0;
}
// orthonormal Walsh-Hadamard rotation matrix
static void set_input_hadamard(int n, float * data) {
assert(ggml_is_power_of_2(n));
data[0*n + 0] = 1.0 / sqrtf(n);
for (int s = 1; s < n; s *= 2) {
for (int i = 0; i < s; i++) {
for (int j = 0; j < s; j++) {
const float val = data[i*n + j];
data[(i + s)*n + (j )] = val;
data[(i )*n + (j + s)] = val;
data[(i + s)*n + (j + s)] = -val;
}
}
}
}
// left-multiply the first dimension of cur by the n x n rotation matrix rot:
// view cur as an (n, nelements/n) matrix, rotate, then restore the original 4d shape
// note: when rot is the orthonormal Hadamard matrix produced by set_input_hadamard,
//       applying this twice restores the input (the matrix is its own inverse)
static ggml_tensor * ggml_rotate_hadamard(
        ggml_context * ctx,
         ggml_tensor * cur,
         ggml_tensor * rot) {
    const auto n = rot->ne[0];

    // the 2d view below is only valid when the element count is a multiple of n
    assert(ggml_nelements(cur) % n == 0);

    ggml_tensor * res;

    res = ggml_reshape_2d(ctx, cur, n, ggml_nelements(cur)/n);
    res = ggml_mul_mat   (ctx, rot, res);
    res = ggml_reshape_4d(ctx, res, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]);

    return res;
}
void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
if (ubatch->token) {
const int64_t n_tokens = ubatch->n_tokens;
@@ -429,6 +466,22 @@ void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
mctx->set_input_v_idxs(self_v_idxs, ubatch);
mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
if (self_rotk) {
GGML_ASSERT(ggml_backend_buffer_is_host(self_rotk->buffer));
float * data = (float *) self_rotk->data;
set_input_hadamard(self_rotk->ne[0], data);
}
if (self_rotv) {
GGML_ASSERT(ggml_backend_buffer_is_host(self_rotv->buffer));
float * data = (float *) self_rotv->data;
set_input_hadamard(self_rotv->ne[0], data);
}
}
bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
@@ -476,6 +529,22 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
if (self_rotk) {
GGML_ASSERT(ggml_backend_buffer_is_host(self_rotk->buffer));
float * data = (float *) self_rotk->data;
set_input_hadamard(self_rotk->ne[0], data);
}
if (self_rotv) {
GGML_ASSERT(ggml_backend_buffer_is_host(self_rotv->buffer));
float * data = (float *) self_rotv->data;
set_input_hadamard(self_rotv->ne[0], data);
}
}
bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
@@ -532,6 +601,22 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
if (inp_attn->self_rotk) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_attn->self_rotk->buffer));
float * data = (float *) inp_attn->self_rotk->data;
set_input_hadamard(inp_attn->self_rotk->ne[0], data);
}
if (inp_attn->self_rotv) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_attn->self_rotv->buffer));
float * data = (float *) inp_attn->self_rotv->data;
set_input_hadamard(inp_attn->self_rotv->ne[0], data);
}
const int64_t n_rs = mctx->get_recr()->get_n_rs();
if (inp_rs->s_copy) {
@@ -630,6 +715,22 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) {
attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
}
if (inp_attn->self_rotk) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_attn->self_rotk->buffer));
float * data = (float *) inp_attn->self_rotk->data;
set_input_hadamard(inp_attn->self_rotk->ne[0], data);
}
if (inp_attn->self_rotv) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_attn->self_rotv->buffer));
float * data = (float *) inp_attn->self_rotv->data;
set_input_hadamard(inp_attn->self_rotv->ne[0], data);
}
const int64_t n_rs = mctx->get_recr()->get_n_rs();
if (inp_rs->s_copy) {
@@ -2003,12 +2104,52 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur, ubatch, cparams);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
}
{
const bool can_rotk =
!hparams.is_n_embd_k_gqa_variable() &&
hparams.n_embd_head_k() % 64 == 0 &&
ggml_is_quantized(mctx_cur->type_k());
if (can_rotk) {
int nrot = 64;
do {
nrot *= 2;
} while (hparams.n_embd_head_k() % nrot == 0);
nrot /= 2;
inp->self_rotk = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, nrot, nrot);
ggml_set_input(inp->self_rotk);
} else {
inp->self_rotk = nullptr;
}
const bool can_rotv =
!hparams.is_n_embd_v_gqa_variable() &&
hparams.n_embd_head_v() % 64 == 0 &&
ggml_is_quantized(mctx_cur->type_v());
if (can_rotv) {
int nrot = 64;
// TODO: I think we can afford to rotate the V more compared to Q and K - to be confirmed
// ref: https://github.com/ggml-org/llama.cpp/pull/21038#issuecomment-4141323088
//do {
// nrot *= 2;
//} while (hparams.n_embd_head_v() % nrot == 0);
//nrot /= 2;
inp->self_rotv = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, nrot, nrot);
ggml_set_input(inp->self_rotv);
} else {
inp->self_rotv = nullptr;
}
}
return inp;
}
@@ -2034,6 +2175,15 @@ ggml_tensor * llm_graph_context::build_attn(
int il) const {
GGML_ASSERT(v_mla == nullptr);
if (inp->self_rotk) {
q_cur = ggml_rotate_hadamard(ctx0, q_cur, inp->self_rotk);
k_cur = ggml_rotate_hadamard(ctx0, k_cur, inp->self_rotk);
}
if (inp->self_rotv) {
v_cur = ggml_rotate_hadamard(ctx0, v_cur, inp->self_rotv);
}
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
// expand k later to enable rope fusion which directly writes into k-v cache
@@ -2061,6 +2211,10 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il);
if (inp->self_rotv) {
cur = ggml_rotate_hadamard(ctx0, cur, inp->self_rotv);
}
if (wo) {
cur = build_lora_mm(wo, cur);
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
@@ -2171,6 +2325,18 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * v_mla,
float kq_scale,
int il) const {
if (inp->self_rotk) {
q_cur = ggml_rotate_hadamard(ctx0, q_cur, inp->self_rotk);
if (k_cur) {
k_cur = ggml_rotate_hadamard(ctx0, k_cur, inp->self_rotk);
}
}
if (inp->self_rotv) {
if (v_cur) {
v_cur = ggml_rotate_hadamard(ctx0, v_cur, inp->self_rotv);
}
}
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
ggml_build_forward_expand(gf, q_cur);
@@ -2211,6 +2377,10 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il);
if (inp->self_rotv) {
cur = ggml_rotate_hadamard(ctx0, cur, inp->self_rotv);
}
if (wo) {
cur = build_lora_mm(wo, cur);
}
@@ -2315,6 +2485,48 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
ggml_set_name(inp->self_kq_mask_swa_cnv, "self_kq_mask_swa_cnv");
}
{
const bool can_rotk =
!hparams.is_n_embd_k_gqa_variable() &&
hparams.n_embd_head_k() % 64 == 0 &&
ggml_is_quantized(mctx_cur->get_base()->type_k());
if (can_rotk) {
int nrot = 64;
do {
nrot *= 2;
} while (hparams.n_embd_head_k() % nrot == 0);
nrot /= 2;
inp->self_rotk = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, nrot, nrot);
ggml_set_input(inp->self_rotk);
} else {
inp->self_rotk = nullptr;
}
const bool can_rotv =
!hparams.is_n_embd_v_gqa_variable() &&
hparams.n_embd_head_v() % 64 == 0 &&
ggml_is_quantized(mctx_cur->get_base()->type_v());
if (can_rotv) {
int nrot = 64;
// TODO: I think we can afford to rotate the V more compared to Q and K - to be confirmed
// ref: https://github.com/ggml-org/llama.cpp/pull/21038#issuecomment-4141323088
//do {
// nrot *= 2;
//} while (hparams.n_embd_head_v() % nrot == 0);
//nrot /= 2;
inp->self_rotv = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, nrot, nrot);
ggml_set_input(inp->self_rotv);
} else {
inp->self_rotv = nullptr;
}
}
return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
}

View File

@@ -308,6 +308,9 @@ public:
ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
ggml_tensor * self_rotk = nullptr;
ggml_tensor * self_rotv = nullptr;
// note: these have to be copies because in order to be able to reuse a graph, its inputs
// need to carry these parameters with them. otherwise, they can point to freed
// llm_graph_params from a previous batch, causing stack-use-after-return
@@ -384,6 +387,9 @@ public:
ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
ggml_tensor * self_rotk = nullptr;
ggml_tensor * self_rotv = nullptr;
const llama_hparams hparams;
const llama_cparams cparams;

View File

@@ -1004,6 +1004,14 @@ bool llama_kv_cache::get_has_shift() const {
return result;
}
// element type of the K tensors, taken from layer 0
// (assumes all layers share the same K type — TODO confirm against cache construction)
ggml_type llama_kv_cache::type_k() const {
    const auto & layer = layers[0];
    return layer.k->type;
}
// element type of the V tensors, taken from layer 0
// (assumes all layers share the same V type — TODO confirm against cache construction)
ggml_type llama_kv_cache::type_v() const {
    const auto & layer = layers[0];
    return layer.v->type;
}
uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
uint32_t result = 0;
@@ -1537,11 +1545,44 @@ size_t llama_kv_cache::size_v_bytes() const {
return size_v_bytes;
}
// left-multiply the first dimension of cur by the n x n rotation matrix rot:
// view cur as an (n, nelements/n) matrix, rotate, then restore the original 4d shape
// note: when rot is the orthonormal Hadamard matrix produced by set_input_hadamard,
//       applying this twice restores the input (the matrix is its own inverse)
static ggml_tensor * ggml_rotate_hadamard(
        ggml_context * ctx,
         ggml_tensor * cur,
         ggml_tensor * rot) {
    const auto n = rot->ne[0];

    // the 2d view below is only valid when the element count is a multiple of n
    assert(ggml_nelements(cur) % n == 0);

    ggml_tensor * res;

    res = ggml_reshape_2d(ctx, cur, n, ggml_nelements(cur)/n);
    res = ggml_mul_mat   (ctx, rot, res);
    res = ggml_reshape_4d(ctx, res, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]);

    return res;
}
// orthonormal Walsh-Hadamard rotation matrix
// fills data (row-major, n x n) with H_n / sqrt(n) using Sylvester's construction:
//   H_1 = [1],  H_{2s} = [H_s  H_s; H_s  -H_s]
// the result is symmetric and orthonormal, so it is its own inverse
static void set_input_hadamard(int n, float * data) {
    // Sylvester's construction requires a positive power-of-2 size;
    // the check is inlined here: it rejects n == 0 (the bit trick alone does not)
    // and avoids depending on ggml_is_power_of_2, which is not visibly declared
    // in this translation unit — TODO confirm
    assert(n > 0 && (n & (n - 1)) == 0);

    data[0*n + 0] = 1.0f/sqrtf((float) n);

    // repeatedly mirror the filled top-left s x s quadrant into the other three quadrants
    for (int s = 1; s < n; s *= 2) {
        for (int i = 0; i < s; i++) {
            for (int j = 0; j < s; j++) {
                const float val = data[i*n + j];

                data[(i + s)*n + (j    )] =  val;
                data[(i    )*n + (j + s)] =  val;
                data[(i + s)*n + (j + s)] = -val;
            }
        }
    }
}
ggml_tensor * llama_kv_cache::build_rope_shift(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_tensor * cur,
ggml_tensor * shift,
ggml_tensor * rotk,
ggml_tensor * factors,
float freq_base,
float freq_scale,
@@ -1567,10 +1608,14 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
// dequantize to f32 -> RoPE -> quantize back
tmp = ggml_cast(ctx, cur, GGML_TYPE_F32);
tmp = ggml_rotate_hadamard(ctx, tmp, rotk);
tmp = ggml_rope_ext(ctx, tmp,
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
tmp = ggml_rotate_hadamard(ctx, tmp, rotk);
tmp = ggml_cpy(ctx, tmp, cur);
} else {
// we rotate only the first n_rot dimensions
@@ -1591,6 +1636,8 @@ public:
ggml_tensor * k_shift; // I32 [kv_size*n_stream]
ggml_tensor * rotk = nullptr;
const llama_kv_cache * kv_self;
};
@@ -1600,6 +1647,14 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
if (k_shift) {
kv_self->set_input_k_shift(k_shift);
}
if (rotk) {
GGML_ASSERT(ggml_backend_buffer_is_host(rotk->buffer));
float * data = (float *) rotk->data;
set_input_hadamard(rotk->ne[0], data);
}
}
ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_context * lctx) const {
@@ -1611,6 +1666,12 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
ggml_set_input(inp->k_shift);
if (ggml_is_quantized(type_k())) {
int nrot = hparams.n_embd_head_k();
inp->rotk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nrot, nrot);
ggml_set_input(inp->rotk);
}
const auto & cparams = lctx->get_cparams();
for (const auto & layer : layers) {
@@ -1635,7 +1696,7 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
ggml_row_size(layer.k->type, n_embd_k_gqa),
ggml_row_size(layer.k->type, n_embd_nope));
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, il);
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, inp->rotk, rope_factors, freq_base_l, freq_scale_l, il);
ggml_build_forward_expand(gf, cur);
}
@@ -2239,6 +2300,14 @@ uint32_t llama_kv_cache_context::get_n_kv() const {
return n_kv;
}
// element type of the K tensors — forwards to the underlying cache
ggml_type llama_kv_cache_context::type_k() const {
    return kv->type_k();
}
// element type of the V tensors — forwards to the underlying cache
ggml_type llama_kv_cache_context::type_v() const {
    return kv->type_v();
}
ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
}

View File

@@ -152,6 +152,9 @@ public:
bool get_has_shift() const;
ggml_type type_k() const;
ggml_type type_v() const;
//
// graph_build API
//
@@ -262,6 +265,7 @@ private:
ggml_context * ctx,
ggml_tensor * cur,
ggml_tensor * shift,
ggml_tensor * rotk,
ggml_tensor * factors,
float freq_base,
float freq_scale,
@@ -328,6 +332,9 @@ public:
uint32_t get_n_kv() const;
ggml_type type_k() const;
ggml_type type_v() const;
// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;

View File

@@ -628,7 +628,7 @@ int main(int argc, char ** argv) {
const int n_left = n_past - params.n_keep;
const int n_discard = n_left/2;
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
LOG_WRN("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_memory_seq_rm (mem, 0, params.n_keep , params.n_keep + n_discard);