Compare commits

...

4 Commits

Author SHA1 Message Date
Georgi Gerganov
ff76c6731d cont : cache shift support 2026-03-27 14:39:14 +02:00
Georgi Gerganov
7711b3a36a cont : rotate caches separately + support non-power-of-2 head sizes 2026-03-27 14:07:38 +02:00
Georgi Gerganov
832e32639f cont : rotate V more + refactor 2026-03-27 11:29:16 +02:00
Georgi Gerganov
e5aa067d68 llama : rotate activations for better quantization 2026-03-26 19:04:04 +02:00
5 changed files with 297 additions and 3 deletions

View File

@@ -52,6 +52,43 @@ static bool can_reuse_kq_mask(
// impl
// true iff n is a positive power of two
// note: the guard n > 0 is required — (0 & -1) == 0 would otherwise classify 0
//       (and the bit trick is meaningless for negative values)
static bool ggml_is_power_of_2(int n) {
    return n > 0 && (n & (n - 1)) == 0;
}
// orthonormal Walsh-Hadamard rotation matrix
static void set_input_hadamard(int n, float * data) {
assert(ggml_is_power_of_2(n));
data[0*n + 0] = 1.0 / sqrtf(n);
for (int s = 1; s < n; s *= 2) {
for (int i = 0; i < s; i++) {
for (int j = 0; j < s; j++) {
const float val = data[i*n + j];
data[(i + s)*n + (j )] = val;
data[(i )*n + (j + s)] = val;
data[(i + s)*n + (j + s)] = -val;
}
}
}
}
// left-multiply the first dimension of cur by the n x n rotation matrix rot:
// view cur as an (n, nelements/n) matrix, rotate, then restore the original 4d shape
// note: when rot is the orthonormal Hadamard matrix produced by set_input_hadamard,
//       applying this twice restores the input (the matrix is its own inverse)
static ggml_tensor * ggml_rotate_hadamard(
        ggml_context * ctx,
         ggml_tensor * cur,
         ggml_tensor * rot) {
    const auto n = rot->ne[0];

    // the 2d view below is only valid when the element count is a multiple of n
    assert(ggml_nelements(cur) % n == 0);

    ggml_tensor * res;

    res = ggml_reshape_2d(ctx, cur, n, ggml_nelements(cur)/n);
    res = ggml_mul_mat   (ctx, rot, res);
    res = ggml_reshape_4d(ctx, res, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]);

    return res;
}
void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
if (ubatch->token) {
const int64_t n_tokens = ubatch->n_tokens;
@@ -429,6 +466,22 @@ void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
mctx->set_input_v_idxs(self_v_idxs, ubatch);
mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
if (self_rotk) {
GGML_ASSERT(ggml_backend_buffer_is_host(self_rotk->buffer));
float * data = (float *) self_rotk->data;
set_input_hadamard(self_rotk->ne[0], data);
}
if (self_rotv) {
GGML_ASSERT(ggml_backend_buffer_is_host(self_rotv->buffer));
float * data = (float *) self_rotv->data;
set_input_hadamard(self_rotv->ne[0], data);
}
}
bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
@@ -476,6 +529,22 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
if (self_rotk) {
GGML_ASSERT(ggml_backend_buffer_is_host(self_rotk->buffer));
float * data = (float *) self_rotk->data;
set_input_hadamard(self_rotk->ne[0], data);
}
if (self_rotv) {
GGML_ASSERT(ggml_backend_buffer_is_host(self_rotv->buffer));
float * data = (float *) self_rotv->data;
set_input_hadamard(self_rotv->ne[0], data);
}
}
bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
@@ -532,6 +601,22 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
if (inp_attn->self_rotk) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_attn->self_rotk->buffer));
float * data = (float *) inp_attn->self_rotk->data;
set_input_hadamard(inp_attn->self_rotk->ne[0], data);
}
if (inp_attn->self_rotv) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_attn->self_rotv->buffer));
float * data = (float *) inp_attn->self_rotv->data;
set_input_hadamard(inp_attn->self_rotv->ne[0], data);
}
const int64_t n_rs = mctx->get_recr()->get_n_rs();
if (inp_rs->s_copy) {
@@ -630,6 +715,22 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) {
attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
}
if (inp_attn->self_rotk) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_attn->self_rotk->buffer));
float * data = (float *) inp_attn->self_rotk->data;
set_input_hadamard(inp_attn->self_rotk->ne[0], data);
}
if (inp_attn->self_rotv) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_attn->self_rotv->buffer));
float * data = (float *) inp_attn->self_rotv->data;
set_input_hadamard(inp_attn->self_rotv->ne[0], data);
}
const int64_t n_rs = mctx->get_recr()->get_n_rs();
if (inp_rs->s_copy) {
@@ -2003,12 +2104,52 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur, ubatch, cparams);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
}
{
const bool can_rotk =
!hparams.is_n_embd_k_gqa_variable() &&
hparams.n_embd_head_k() % 64 == 0 &&
ggml_is_quantized(mctx_cur->type_k());
if (can_rotk) {
int nrot = 64;
do {
nrot *= 2;
} while (hparams.n_embd_head_k() % nrot == 0);
nrot /= 2;
inp->self_rotk = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, nrot, nrot);
ggml_set_input(inp->self_rotk);
} else {
inp->self_rotk = nullptr;
}
const bool can_rotv =
!hparams.is_n_embd_v_gqa_variable() &&
hparams.n_embd_head_v() % 64 == 0 &&
ggml_is_quantized(mctx_cur->type_v());
if (can_rotv) {
int nrot = 64;
// TODO: I think we can afford to rotate the V more compared to Q and K - to be confirmed
// ref: https://github.com/ggml-org/llama.cpp/pull/21038#issuecomment-4141323088
//do {
// nrot *= 2;
//} while (hparams.n_embd_head_v() % nrot == 0);
//nrot /= 2;
inp->self_rotv = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, nrot, nrot);
ggml_set_input(inp->self_rotv);
} else {
inp->self_rotv = nullptr;
}
}
return inp;
}
@@ -2034,6 +2175,15 @@ ggml_tensor * llm_graph_context::build_attn(
int il) const {
GGML_ASSERT(v_mla == nullptr);
if (inp->self_rotk) {
q_cur = ggml_rotate_hadamard(ctx0, q_cur, inp->self_rotk);
k_cur = ggml_rotate_hadamard(ctx0, k_cur, inp->self_rotk);
}
if (inp->self_rotv) {
v_cur = ggml_rotate_hadamard(ctx0, v_cur, inp->self_rotv);
}
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
// expand k later to enable rope fusion which directly writes into k-v cache
@@ -2061,6 +2211,10 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il);
if (inp->self_rotv) {
cur = ggml_rotate_hadamard(ctx0, cur, inp->self_rotv);
}
if (wo) {
cur = build_lora_mm(wo, cur);
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
@@ -2171,6 +2325,18 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * v_mla,
float kq_scale,
int il) const {
if (inp->self_rotk) {
q_cur = ggml_rotate_hadamard(ctx0, q_cur, inp->self_rotk);
if (k_cur) {
k_cur = ggml_rotate_hadamard(ctx0, k_cur, inp->self_rotk);
}
}
if (inp->self_rotv) {
if (v_cur) {
v_cur = ggml_rotate_hadamard(ctx0, v_cur, inp->self_rotv);
}
}
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
ggml_build_forward_expand(gf, q_cur);
@@ -2211,6 +2377,10 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
cb(cur, "kqv_out", il);
if (inp->self_rotv) {
cur = ggml_rotate_hadamard(ctx0, cur, inp->self_rotv);
}
if (wo) {
cur = build_lora_mm(wo, cur);
}
@@ -2315,6 +2485,48 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
ggml_set_name(inp->self_kq_mask_swa_cnv, "self_kq_mask_swa_cnv");
}
{
const bool can_rotk =
!hparams.is_n_embd_k_gqa_variable() &&
hparams.n_embd_head_k() % 64 == 0 &&
ggml_is_quantized(mctx_cur->get_base()->type_k());
if (can_rotk) {
int nrot = 64;
do {
nrot *= 2;
} while (hparams.n_embd_head_k() % nrot == 0);
nrot /= 2;
inp->self_rotk = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, nrot, nrot);
ggml_set_input(inp->self_rotk);
} else {
inp->self_rotk = nullptr;
}
const bool can_rotv =
!hparams.is_n_embd_v_gqa_variable() &&
hparams.n_embd_head_v() % 64 == 0 &&
ggml_is_quantized(mctx_cur->get_base()->type_v());
if (can_rotv) {
int nrot = 64;
// TODO: I think we can afford to rotate the V more compared to Q and K - to be confirmed
// ref: https://github.com/ggml-org/llama.cpp/pull/21038#issuecomment-4141323088
//do {
// nrot *= 2;
//} while (hparams.n_embd_head_v() % nrot == 0);
//nrot /= 2;
inp->self_rotv = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, nrot, nrot);
ggml_set_input(inp->self_rotv);
} else {
inp->self_rotv = nullptr;
}
}
return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
}

View File

@@ -308,6 +308,9 @@ public:
ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
ggml_tensor * self_rotk = nullptr;
ggml_tensor * self_rotv = nullptr;
// note: these have to be copies because in order to be able to reuse a graph, its inputs
// need to carry these parameters with them. otherwise, they can point to freed
// llm_graph_params from a previous batch, causing stack-use-after-return
@@ -384,6 +387,9 @@ public:
ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
ggml_tensor * self_rotk = nullptr;
ggml_tensor * self_rotv = nullptr;
const llama_hparams hparams;
const llama_cparams cparams;

View File

@@ -1004,6 +1004,14 @@ bool llama_kv_cache::get_has_shift() const {
return result;
}
// element type of the K tensors, taken from layer 0
// (assumes all layers share the same K type — TODO confirm against cache construction)
ggml_type llama_kv_cache::type_k() const {
    const auto & layer = layers[0];
    return layer.k->type;
}
// element type of the V tensors, taken from layer 0
// (assumes all layers share the same V type — TODO confirm against cache construction)
ggml_type llama_kv_cache::type_v() const {
    const auto & layer = layers[0];
    return layer.v->type;
}
uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
uint32_t result = 0;
@@ -1537,11 +1545,44 @@ size_t llama_kv_cache::size_v_bytes() const {
return size_v_bytes;
}
// left-multiply the first dimension of cur by the n x n rotation matrix rot:
// view cur as an (n, nelements/n) matrix, rotate, then restore the original 4d shape
// note: when rot is the orthonormal Hadamard matrix produced by set_input_hadamard,
//       applying this twice restores the input (the matrix is its own inverse)
static ggml_tensor * ggml_rotate_hadamard(
        ggml_context * ctx,
         ggml_tensor * cur,
         ggml_tensor * rot) {
    const auto n = rot->ne[0];

    // the 2d view below is only valid when the element count is a multiple of n
    assert(ggml_nelements(cur) % n == 0);

    ggml_tensor * res;

    res = ggml_reshape_2d(ctx, cur, n, ggml_nelements(cur)/n);
    res = ggml_mul_mat   (ctx, rot, res);
    res = ggml_reshape_4d(ctx, res, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]);

    return res;
}
// orthonormal Walsh-Hadamard rotation matrix
// fills data (row-major, n x n) with H_n / sqrt(n) using Sylvester's construction:
//   H_1 = [1],  H_{2s} = [H_s  H_s; H_s  -H_s]
// the result is symmetric and orthonormal, so it is its own inverse
static void set_input_hadamard(int n, float * data) {
    // Sylvester's construction requires a positive power-of-2 size;
    // the check is inlined here: it rejects n == 0 (the bit trick alone does not)
    // and avoids depending on ggml_is_power_of_2, which is not visibly declared
    // in this translation unit — TODO confirm
    assert(n > 0 && (n & (n - 1)) == 0);

    data[0*n + 0] = 1.0f/sqrtf((float) n);

    // repeatedly mirror the filled top-left s x s quadrant into the other three quadrants
    for (int s = 1; s < n; s *= 2) {
        for (int i = 0; i < s; i++) {
            for (int j = 0; j < s; j++) {
                const float val = data[i*n + j];

                data[(i + s)*n + (j    )] =  val;
                data[(i    )*n + (j + s)] =  val;
                data[(i + s)*n + (j + s)] = -val;
            }
        }
    }
}
ggml_tensor * llama_kv_cache::build_rope_shift(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_tensor * cur,
ggml_tensor * shift,
ggml_tensor * rotk,
ggml_tensor * factors,
float freq_base,
float freq_scale,
@@ -1567,10 +1608,14 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
// dequantize to f32 -> RoPE -> quantize back
tmp = ggml_cast(ctx, cur, GGML_TYPE_F32);
tmp = ggml_rotate_hadamard(ctx, tmp, rotk);
tmp = ggml_rope_ext(ctx, tmp,
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
tmp = ggml_rotate_hadamard(ctx, tmp, rotk);
tmp = ggml_cpy(ctx, tmp, cur);
} else {
// we rotate only the first n_rot dimensions
@@ -1591,6 +1636,8 @@ public:
ggml_tensor * k_shift; // I32 [kv_size*n_stream]
ggml_tensor * rotk = nullptr;
const llama_kv_cache * kv_self;
};
@@ -1600,6 +1647,14 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
if (k_shift) {
kv_self->set_input_k_shift(k_shift);
}
if (rotk) {
GGML_ASSERT(ggml_backend_buffer_is_host(rotk->buffer));
float * data = (float *) rotk->data;
set_input_hadamard(rotk->ne[0], data);
}
}
ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_context * lctx) const {
@@ -1611,6 +1666,12 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
ggml_set_input(inp->k_shift);
if (ggml_is_quantized(type_k())) {
int nrot = hparams.n_embd_head_k();
inp->rotk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nrot, nrot);
ggml_set_input(inp->rotk);
}
const auto & cparams = lctx->get_cparams();
for (const auto & layer : layers) {
@@ -1635,7 +1696,7 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
ggml_row_size(layer.k->type, n_embd_k_gqa),
ggml_row_size(layer.k->type, n_embd_nope));
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, il);
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, inp->rotk, rope_factors, freq_base_l, freq_scale_l, il);
ggml_build_forward_expand(gf, cur);
}
@@ -2239,6 +2300,14 @@ uint32_t llama_kv_cache_context::get_n_kv() const {
return n_kv;
}
// element type of the K tensors — forwards to the underlying cache
ggml_type llama_kv_cache_context::type_k() const {
    return kv->type_k();
}
// element type of the V tensors — forwards to the underlying cache
ggml_type llama_kv_cache_context::type_v() const {
    return kv->type_v();
}
ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
}

View File

@@ -152,6 +152,9 @@ public:
bool get_has_shift() const;
ggml_type type_k() const;
ggml_type type_v() const;
//
// graph_build API
//
@@ -262,6 +265,7 @@ private:
ggml_context * ctx,
ggml_tensor * cur,
ggml_tensor * shift,
ggml_tensor * rotk,
ggml_tensor * factors,
float freq_base,
float freq_scale,
@@ -328,6 +332,9 @@ public:
uint32_t get_n_kv() const;
ggml_type type_k() const;
ggml_type type_v() const;
// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;

View File

@@ -628,7 +628,7 @@ int main(int argc, char ** argv) {
const int n_left = n_past - params.n_keep;
const int n_discard = n_left/2;
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
LOG_WRN("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_memory_seq_rm (mem, 0, params.n_keep , params.n_keep + n_discard);