Compare commits

...

6 Commits

Author SHA1 Message Date
Xuan Son Nguyen         e83ef74733  one less magic number                            2025-09-20 12:58:36 +07:00
Xuan Son Nguyen         f643b957f4  refactor softplus fn                             2025-09-20 12:17:15 +07:00
Xuan Son Nguyen         46110e0630  split q_proj/gate                                2025-09-20 12:00:14 +07:00
Piotr Wilkin            178230ee21  Getting to decode stage...                      2025-09-18 21:47:40 +02:00
Piotr Wilkin (ilintar)  c78f9fce68  Merge branch 'ggml-org:master' into qwen3_next  2025-09-18 12:59:39 +02:00
Piotr Wilkin            344331c2b6  First draft                                      2025-09-18 00:21:17 +02:00
11 changed files with 984 additions and 11 deletions

View File

@@ -3748,6 +3748,32 @@ class Qwen3MoeModel(Qwen2MoeModel):
        super().set_vocab()


@ModelBase.register("Qwen3NextForCausalLM")
class Qwen3NextModel(Qwen3MoeModel):
    model_arch = gguf.MODEL_ARCH.QWEN3NEXT

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["linear_conv_kernel_dim"]))
        self.gguf_writer.add_ssm_state_size(self.find_hparam(["linear_key_head_dim"]))
        self.gguf_writer.add_ssm_group_count(self.find_hparam(["linear_num_key_heads"]))
        self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["linear_num_value_heads"]))
        self.gguf_writer.add_ssm_inner_size(self.find_hparam(["hidden_size"]) * (self.find_hparam(["linear_num_value_heads"]) // self.find_hparam(["linear_num_key_heads"])))

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        if name.endswith(".A_log"):
            data_torch = -torch.exp(data_torch)
        elif name.endswith(".dt_bias"):
            name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
        elif "conv1d" in name:
            data_torch = data_torch.squeeze()
        elif "q_proj.weight" in name:
            q_proj, gate = data_torch.chunk(2, dim=0)
            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_GATE, bid), gate)
            data_torch = q_proj

        yield from Qwen2MoeModel.modify_tensors(self, data_torch, name, bid)
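# Editor's note (sketch): the .A_log rewrite above folds the sign and the exponential into the
# stored tensor, i.e. the written weight already holds A = -exp(A_log), so the runtime does not
# have to re-exponentiate it. The q_proj.weight branch splits the fused projection along its
# output dimension: the first chunk stays the query weight, the second is emitted as the new
# attn_gate tensor.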
@ModelBase.register("GPT2LMHeadModel")
class GPT2Model(TextModel):

View File

@@ -539,7 +539,8 @@ extern "C" {
GGML_OP_RWKV_WKV6,
GGML_OP_GATED_LINEAR_ATTN,
GGML_OP_RWKV_WKV7,
GGML_OP_DELTA_NET,
GGML_OP_UNARY,
GGML_OP_MAP_CUSTOM1,
@@ -2278,6 +2279,31 @@ extern "C" {
struct ggml_tensor * state,
float scale);
// Delta-Net linear layer activation
// Implements the complete Delta-Net gated linear attention mechanism
// This includes causal convolution preprocessing and gated delta rule computation
// k, v, q, g: [S, H, n_tokens, n_seqs] - key, value, query, gate tensors
// conv_weight: [conv_dim, 1, conv_kernel_size] - convolution kernel weights
// conv_bias: [conv_dim] - convolution bias (optional, can be NULL)
// beta: [H, n_tokens, n_seqs] - beta parameter for delta rule
// state: [S, S, H, n_seqs] - recurrent state tensor
// chunk_size: chunk size for chunked computation (0 for recurrent mode)
// use_qk_l2norm: whether to apply L2 normalization to query and key
// scale: attention scaling factor
GGML_API struct ggml_tensor * ggml_delta_net(
struct ggml_context * ctx,
struct ggml_tensor * k,
struct ggml_tensor * v,
struct ggml_tensor * q,
struct ggml_tensor * g,
struct ggml_tensor * conv_weight,
struct ggml_tensor * conv_bias,
struct ggml_tensor * beta,
struct ggml_tensor * state,
int chunk_size,
bool use_qk_l2norm,
float scale);
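// Illustrative usage (editor's sketch, not part of the header): the shapes and the 1/sqrt(S)
// scale below are assumptions for the example, not values taken from any model.
//
//   const int64_t S = 128, H = 16, T = 32, n_seqs = 1;   // head dim, heads, tokens, sequences
//
//   struct ggml_tensor * q    = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, H, T, n_seqs);
//   struct ggml_tensor * k    = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, H, T, n_seqs);
//   struct ggml_tensor * v    = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, H, T, n_seqs);
//   struct ggml_tensor * g    = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, H, T, n_seqs);
//   struct ggml_tensor * beta = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, H, T, n_seqs);
//   struct ggml_tensor * s0   = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, S, H, n_seqs);  // initial state
//
//   // [conv_dim, 1, conv_kernel_size] per the comment above; conv_dim spans the fused q/k/v channels
//   struct ggml_tensor * conv_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 3*S*H, 1, 4);
//
//   // chunk_size = 0 selects the recurrent path; q/k are L2-normalized and q is scaled by 1/sqrt(S)
//   struct ggml_tensor * out = ggml_delta_net(ctx, k, v, q, g, conv_w, /*conv_bias=*/NULL, beta, s0,
//                                             /*chunk_size=*/0, /*use_qk_l2norm=*/true,
//                                             1.0f/sqrtf((float) S));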
GGML_API struct ggml_tensor * ggml_rwkv_wkv7(
struct ggml_context * ctx,
struct ggml_tensor * r,

View File

@@ -1656,6 +1656,172 @@ static void ggml_compute_forward_mul_mat_id(
}
}
// ggml_compute_forward_delta_net
static void ggml_compute_forward_delta_net(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0]; // query
const struct ggml_tensor * src1 = dst->src[1]; // key
const struct ggml_tensor * src2 = dst->src[2]; // value
const struct ggml_tensor * src3 = dst->src[3]; // gate
const struct ggml_tensor * src4 = dst->src[4]; // beta
const struct ggml_tensor * src5 = dst->src[5]; // state
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(src2->type == GGML_TYPE_F32);
GGML_ASSERT(src3->type == GGML_TYPE_F32);
GGML_ASSERT(src4->type == GGML_TYPE_F32);
GGML_ASSERT(src5->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_TENSOR_TERNARY_OP_LOCALS;
GGML_TENSOR_LOCALS(int64_t, ne3, src3, ne);
GGML_TENSOR_LOCALS(size_t, nb3, src3, nb);
GGML_TENSOR_LOCALS(int64_t, ne4, src4, ne);
GGML_TENSOR_LOCALS(size_t, nb4, src4, nb);
GGML_TENSOR_LOCALS(int64_t, ne5, src5, ne);
GGML_TENSOR_LOCALS(size_t, nb5, src5, nb);
const int ith = params->ith;
const int nth = params->nth;
const int64_t S = src0->ne[0]; // head dimension
const int64_t H = src0->ne[1]; // number of heads
const int64_t n_tokens = src0->ne[2];
const int64_t n_seqs = src0->ne[3];
GGML_ASSERT(ne00 == S && ne01 == H && ne02 == n_tokens && ne03 == n_seqs);
GGML_ASSERT(ne10 == S && ne11 == H && ne12 == n_tokens && ne13 == n_seqs);
GGML_ASSERT(ne20 == S && ne21 == H && ne22 == n_tokens && ne23 == n_seqs);
GGML_ASSERT(ne30 == S && ne31 == H && ne32 == n_tokens && ne33 == n_seqs);
GGML_ASSERT(ne40 == H && ne41 == n_tokens && ne42 == n_seqs && ne43 == 1);
GGML_ASSERT(ne50 == S && ne51 == S && ne52 == H && ne53 == n_seqs);
// Get operation parameters
bool use_qk_l2norm = ggml_get_op_params_i32(dst, 1) != 0;
float scale;
memcpy(&scale, ((int32_t*)dst->op_params) + 4, sizeof(float));
GGML_ASSERT(ne0 == S * H);
GGML_ASSERT(ne1 == n_tokens + S * n_seqs);
// Parallelize over sequences and heads
const int64_t n_total = n_seqs * H;
const int64_t n_per_thread = (n_total + nth - 1) / nth;
const int64_t n_start = ith * n_per_thread;
const int64_t n_end = MIN(n_start + n_per_thread, n_total);
for (int64_t n = n_start; n < n_end; ++n) {
const int64_t seq_idx = n / H;
const int64_t head_idx = n % H;
// Get pointers to current sequence and head
float * q_ptr = (float *)((char *)src0->data + seq_idx * nb03 + head_idx * nb01);
float * k_ptr = (float *)((char *)src1->data + seq_idx * nb13 + head_idx * nb11);
float * v_ptr = (float *)((char *)src2->data + seq_idx * nb23 + head_idx * nb21);
float * g_ptr = (float *)((char *)src3->data + seq_idx * nb33 + head_idx * nb31);
float * beta_ptr = (float *)((char *)src4->data + seq_idx * nb43);
float * state_ptr = (float *)((char *)src5->data + seq_idx * nb53 + head_idx * nb51);
float * out_ptr = (float *)((char *)dst->data + n * ne0 * sizeof(float));
float * new_state_ptr = out_ptr + n_tokens * S;
// Apply L2 normalization if requested
if (use_qk_l2norm) {
// Normalize query and key
for (int64_t t = 0; t < n_tokens; ++t) {
float q_sum = 0.0f, k_sum = 0.0f;
for (int64_t s = 0; s < S; ++s) {
float q_val = q_ptr[t * nb02 / sizeof(float) + s];
float k_val = k_ptr[t * nb12 / sizeof(float) + s];
q_sum += q_val * q_val;
k_sum += k_val * k_val;
}
float q_norm = sqrtf(q_sum + 1e-6f);
float k_norm = sqrtf(k_sum + 1e-6f);
for (int64_t s = 0; s < S; ++s) {
q_ptr[t * nb02 / sizeof(float) + s] /= q_norm;
k_ptr[t * nb12 / sizeof(float) + s] /= k_norm;
}
}
}
// Apply scaling to query
for (int64_t i = 0; i < n_tokens * S; ++i) {
q_ptr[i] *= scale;
}
// Apply sigmoid to beta
float * beta_sigmoid = (float *)alloca(n_tokens * sizeof(float));
for (int64_t t = 0; t < n_tokens; ++t) {
beta_sigmoid[t] = 1.0f / (1.0f + expf(-beta_ptr[t * nb42 / sizeof(float)]));
}
// Complete implementation of gated delta rule
// Based on torch_recurrent_gated_delta_rule from the reference implementation
// Process each token sequentially for recurrent computation
for (int64_t t = 0; t < n_tokens; ++t) {
// Get pointers to current token data
float * q_t = q_ptr + t * (nb02 / sizeof(float));
float * k_t = k_ptr + t * (nb12 / sizeof(float));
float * v_t = v_ptr + t * (nb22 / sizeof(float));
float * g_t = g_ptr + t * (nb32 / sizeof(float));
// Apply exponential to gate and multiply by beta
float g_exp = expf(g_t[0]); // g is per-head, not per-dimension
float beta_t = beta_sigmoid[t];
// Update recurrent state: state = state * g_exp
for (int64_t i = 0; i < S * S; ++i) {
state_ptr[i] *= g_exp;
}
// Compute kv_mem = (state * k_t^T).sum(dim=-1)
// This is a matrix-vector multiplication: state[S×S] @ k_t[S]
float kv_mem[S];
for (int64_t i = 0; i < S; ++i) {
kv_mem[i] = 0.0f;
for (int64_t j = 0; j < S; ++j) {
kv_mem[i] += state_ptr[i * S + j] * k_t[j];
}
}
// Compute delta = (v_t - kv_mem) * beta_t
float delta[S];
for (int64_t i = 0; i < S; ++i) {
delta[i] = (v_t[i] - kv_mem[i]) * beta_t;
}
// Update state: state = state + k_t * delta^T
// This is an outer product: k_t[S] ⊗ delta[S]
for (int64_t i = 0; i < S; ++i) {
for (int64_t j = 0; j < S; ++j) {
state_ptr[i * S + j] += k_t[i] * delta[j];
}
}
// Compute output: out = (state * q_t^T).sum(dim=-1)
// This is a matrix-vector multiplication: state[S×S] @ q_t[S]
float * out_t = out_ptr + t * S;
for (int64_t i = 0; i < S; ++i) {
out_t[i] = 0.0f;
for (int64_t j = 0; j < S; ++j) {
out_t[i] += state_ptr[i * S + j] * q_t[j];
}
}
}
// Copy final state to new_state
memcpy(new_state_ptr, state_ptr, S * S * sizeof(float));
}
}
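// In equation form, the per-token loop above is a sketch of the gated delta rule
// (with beta_t = sigmoid(b_t) and the query already scaled):
//
//   S~_t = exp(g_t) * S_{t-1}               // decay the state
//   d_t  = beta_t * (v_t - S~_t * k_t)      // delta against what the state already predicts for k_t
//   S_t  = S~_t + k_t * d_t^T               // rank-1 (outer product) update
//   o_t  = S_t * q_t                        // read out with the scaled query
//
// The final state S_{n_tokens} is copied out after the loop as the new recurrent state.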
/////////////////////////////////
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
@@ -1998,6 +2164,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_rwkv_wkv7(params, tensor);
} break;
case GGML_OP_DELTA_NET:
{
ggml_compute_forward_delta_net(params, tensor);
} break;
case GGML_OP_MAP_CUSTOM1:
{
ggml_compute_forward_map_custom1(params, tensor);
@@ -2291,6 +2461,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_RWKV_WKV6:
case GGML_OP_GATED_LINEAR_ATTN:
case GGML_OP_RWKV_WKV7:
case GGML_OP_DELTA_NET:
{
n_tasks = n_threads;
} break;

View File

@@ -1002,6 +1002,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"RWKV_WKV6",
"GATED_LINEAR_ATTN",
"RWKV_WKV7",
"DELTA_NET",
"UNARY",
@@ -1019,7 +1020,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"GLU",
};
static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");
static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1106,6 +1107,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"rwkv_wkv6(k, v, r, tf, td, s)",
"gated_linear_attn(k, v, q, gate, s)",
"rwkv_wkv7(r, w, k, v, a, b, s)",
"delta_net(k, v, q, g, conv_w, conv_b, beta, state, chunk_size, use_qk_l2norm, scale)",
"unary(x)",
@@ -1123,7 +1125,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"glu(x)",
};
static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");
static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -3433,7 +3435,7 @@ struct ggml_tensor * ggml_reshape_4d(
int64_t ne2,
int64_t ne3) {
GGML_ASSERT(ggml_is_contiguous(a));
GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2*ne3);
const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
@@ -5417,6 +5419,180 @@ struct ggml_tensor * ggml_gated_linear_attn(
return result;
}
// ggml_delta_net
struct ggml_tensor * ggml_delta_net(
struct ggml_context * ctx,
struct ggml_tensor * k,
struct ggml_tensor * v,
struct ggml_tensor * q,
struct ggml_tensor * g,
struct ggml_tensor * conv_weight,
struct ggml_tensor * conv_bias,
struct ggml_tensor * beta,
struct ggml_tensor * state,
int chunk_size,
bool use_qk_l2norm,
float scale) {
GGML_ASSERT(ggml_is_contiguous(k));
GGML_ASSERT(ggml_is_contiguous(v));
GGML_ASSERT(ggml_is_contiguous(q));
GGML_ASSERT(ggml_is_contiguous(g));
GGML_ASSERT(ggml_is_contiguous(beta));
GGML_ASSERT(ggml_is_contiguous(state));
const int64_t S_k = k->ne[0];
const int64_t H_k = k->ne[1];
const int64_t n_tokens = k->ne[2];
const int64_t n_seqs = state->ne[1];
const int64_t S_v = v->ne[0];
const int64_t H_v = v->ne[1];
// Validate dimensions - allow different head dimensions for q/k vs v
GGML_ASSERT(v->ne[2] == n_tokens);
GGML_ASSERT(q->ne[2] == n_tokens);
GGML_ASSERT(g->ne[2] == n_tokens);
GGML_ASSERT(beta->ne[0] == H_v && beta->ne[1] == n_tokens && (beta->ne[2] == n_seqs || beta->ne[2] == 1));
GGML_ASSERT(ggml_nelements(state) == S_v * H_v * n_seqs);
// Check that q and k have the same dimensions
GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens);
GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens);
GGML_ASSERT(g->ne[0] == S_v && g->ne[1] == H_v && g->ne[2] == n_tokens);
// Apply L2 normalization to query and key if requested
struct ggml_tensor * q_norm = q;
struct ggml_tensor * k_norm = k;
if (use_qk_l2norm) {
q_norm = ggml_l2_norm(ctx, q, 1e-6f);
k_norm = ggml_l2_norm(ctx, k, 1e-6f);
}
// Apply scaling to query
q_norm = ggml_scale(ctx, q_norm, scale);
// Apply sigmoid to beta for gating
struct ggml_tensor * beta_sigmoid = ggml_sigmoid(ctx, beta);
struct ggml_tensor * mixed_qkv = ggml_concat(ctx, q_norm, k_norm, 1);
mixed_qkv = ggml_concat(ctx, mixed_qkv, v, 1);
const int64_t dim = (S_v * H_v) + 2 * (H_k * S_k);
mixed_qkv = ggml_reshape_3d(ctx, mixed_qkv, 1, dim, n_tokens);
struct ggml_tensor * mixed_qkv_padded = ggml_pad(ctx, mixed_qkv, 3, 0, 0, 0);
// Apply SSM convolution
struct ggml_tensor * conv_out = ggml_ssm_conv(ctx, mixed_qkv_padded, conv_weight);
// Apply bias if provided
if (conv_bias) {
conv_out = ggml_add(ctx, conv_out, conv_bias);
}
// Apply SiLU activation
conv_out = ggml_silu(ctx, conv_out);
// Reshape back to 4D: [dim, n_tokens, 1] -> [dim, n_tokens, 1, 1]
conv_out = ggml_reshape_4d(ctx, conv_out, dim, n_tokens, 1, 1);
// Transpose to get the right layout: [dim, n_tokens, 1] -> [dim, 1, n_tokens, 1]
conv_out = ggml_permute(ctx, conv_out, 0, 2, 1, 3);
// q projection view
struct ggml_tensor * q_conv = ggml_view_4d(ctx, conv_out,
S_k, // ne0
H_k, // ne1
conv_out->ne[1], // ne2 = sequence length (1)
conv_out->ne[2], // ne3 = batch (1)
H_k * sizeof(float), // nb1 = stride along H_k
conv_out->nb[1], // nb2 = stride along sequence dim
conv_out->nb[2], // nb3 = stride along batch dim
0 // offset in bytes
);
// k projection view
struct ggml_tensor * k_conv = ggml_view_4d(ctx, conv_out,
S_k, // ne0
H_k, // ne1
conv_out->ne[1], // ne2
conv_out->ne[2], // ne3
H_k * sizeof(float), // nb1
conv_out->nb[1], // nb2
conv_out->nb[2], // nb3
S_k * H_k * sizeof(float) // offset = skip q_out (conv_out is F32)
);
// v projection view
struct ggml_tensor * v_conv = ggml_view_4d(ctx, conv_out,
S_v, // ne0
H_v, // ne1
conv_out->ne[1], // ne2
conv_out->ne[2], // ne3
H_v * sizeof(float), // nb1
conv_out->nb[1], // nb2
conv_out->nb[2], // nb3
(2 * S_k * H_k) * sizeof(float) // offset = skip q_out + k_out
);
// Transpose each component back to original layout: [S_v, 1, token_split_size, 1] -> [S_v, token_split_size, 1, 1]
q_conv = ggml_permute(ctx, q_conv, 0, 2, 1, 3);
k_conv = ggml_permute(ctx, k_conv, 0, 2, 1, 3);
v_conv = ggml_permute(ctx, v_conv, 0, 2, 1, 3);
q_conv = ggml_reshape_3d(ctx, ggml_cont(ctx, q_conv), S_k * H_k, 1, n_tokens);
k_conv = ggml_reshape_3d(ctx, ggml_cont(ctx, k_conv), S_k * H_k, 1, n_tokens);
v_conv = ggml_reshape_3d(ctx, ggml_cont(ctx, v_conv), S_v * H_v, 1, n_tokens);
// NOW we repeat query and key to match value head dimensions if needed (after convolution)
struct ggml_tensor * q_broadcast = q_conv;
struct ggml_tensor * k_broadcast = k_conv;
if (H_k != H_v) {
// Calculate the repeat factor: H_v / H_k
GGML_ASSERT(H_v % H_k == 0);
int64_t repeat_factor = H_v / H_k;
// Repeat query and key along the head dimension
// First reshape to separate the repeat dimension: [S_k, H_k, n_tokens, 1] -> [S_k, 1, H_k, n_tokens]
q_broadcast = ggml_reshape_4d(ctx, q_conv, S_k, 1, H_k, n_tokens);
k_broadcast = ggml_reshape_4d(ctx, k_conv, S_k, 1, H_k, n_tokens);
// Repeat along the new dimension: [S_k, repeat_factor, H_k, n_tokens]
q_broadcast = ggml_repeat_4d(ctx, q_broadcast, S_k, repeat_factor, H_k, n_tokens);
k_broadcast = ggml_repeat_4d(ctx, k_broadcast, S_k, repeat_factor, H_k, n_tokens);
// Reshape back to original dimensions but with H_v heads: [S_k, H_v, n_tokens, 1]
q_broadcast = ggml_reshape_4d(ctx, q_broadcast, S_k, H_v, n_tokens, 1);
k_broadcast = ggml_reshape_4d(ctx, k_broadcast, S_k, H_v, n_tokens, 1);
}
// concat output and new_state
const int64_t ne[4] = { S_v * H_v, n_tokens + H_v * n_seqs, 1, 1 };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
// Set operation parameters for the delta rule computation
int32_t params[8] = {
chunk_size,
use_qk_l2norm ? 1 : 0,
0, 0, // reserved
0, 0, 0, 0 // params[4] receives the float scale via the memcpy below
};
memcpy(params + 4, &scale, sizeof(float));
ggml_set_op_params(result, params, sizeof(params));
// Use custom operation for the gated delta rule computation
result->op = GGML_OP_DELTA_NET;
result->src[0] = q_broadcast;
result->src[1] = k_broadcast;
result->src[2] = v_conv;
result->src[3] = g;
result->src[4] = beta_sigmoid;
result->src[5] = state;
return result;
}
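// The result packs the per-token outputs and the updated recurrent state into a single tensor.
// A caller can split them back out with views along ne[1]; a rough sketch (mirroring what the
// graph-building code later in this diff does; the exact layout may still change in this draft):
//
//   struct ggml_tensor * out       = ggml_view_2d(ctx, result, S_v * H_v, n_tokens,
//                                                 result->nb[1], 0);
//   struct ggml_tensor * new_state = ggml_view_2d(ctx, result, S_v * H_v, H_v * n_seqs,
//                                                 result->nb[1], n_tokens * result->nb[1]);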
// ggml_rwkv_wkv7
struct ggml_tensor * ggml_rwkv_wkv7(

View File

@@ -335,6 +335,7 @@ class MODEL_ARCH(IntEnum):
QWEN2VL = auto()
QWEN3 = auto()
QWEN3MOE = auto()
QWEN3NEXT = auto()
PHI2 = auto()
PHI3 = auto()
PHIMOE = auto()
@@ -432,6 +433,7 @@ class MODEL_TENSOR(IntEnum):
ATTN_NORM_2 = auto()
ATTN_OUT_NORM = auto()
ATTN_POST_NORM = auto()
ATTN_GATE = auto()
ATTN_ROT_EMBD = auto()
ATTN_SINKS = auto()
FFN_GATE_INP = auto()
@@ -481,6 +483,7 @@ class MODEL_TENSOR(IntEnum):
SSM_D = auto()
SSM_NORM = auto()
SSM_OUT = auto()
SSM_BETA_ALPHA = auto()
TIME_MIX_W0 = auto()
TIME_MIX_W1 = auto()
TIME_MIX_W2 = auto()
@@ -671,6 +674,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.QWEN2VL: "qwen2vl",
MODEL_ARCH.QWEN3: "qwen3",
MODEL_ARCH.QWEN3MOE: "qwen3moe",
MODEL_ARCH.QWEN3NEXT: "qwen3next",
MODEL_ARCH.PHI2: "phi2",
MODEL_ARCH.PHI3: "phi3",
MODEL_ARCH.PHIMOE: "phimoe",
@@ -773,6 +777,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm",
MODEL_TENSOR.ATTN_GATE: "blk.{bid}.attn_gate",
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
@@ -818,6 +823,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
MODEL_TENSOR.SSM_NORM: "blk.{bid}.ssm_norm",
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
MODEL_TENSOR.SSM_BETA_ALPHA: "blk.{bid}.ssm_ba",
MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0",
MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1",
MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2",
@@ -1462,6 +1468,35 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.QWEN3NEXT: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_POST_NORM,
MODEL_TENSOR.ATTN_GATE,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_INP_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.SSM_A,
MODEL_TENSOR.SSM_CONV1D,
MODEL_TENSOR.SSM_DT,
MODEL_TENSOR.SSM_NORM,
MODEL_TENSOR.SSM_IN,
MODEL_TENSOR.SSM_BETA_ALPHA,
MODEL_TENSOR.SSM_OUT
],
MODEL_ARCH.PLAMO: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,

View File

@@ -628,10 +628,11 @@ class TensorNameMap:
),
MODEL_TENSOR.SSM_IN: (
"model.layers.{bid}.in_proj", # mamba-hf
"backbone.layers.{bid}.mixer.in_proj", # mamba
"model.layers.{bid}.mamba.in_proj", # jamba falcon-h1 granite-hybrid
"model.layers.layers.{bid}.mixer.in_proj", # plamo2
"model.layers.{bid}.in_proj", # mamba-hf
"backbone.layers.{bid}.mixer.in_proj", # mamba
"model.layers.{bid}.mamba.in_proj", # jamba falcon-h1 granite-hybrid
"model.layers.layers.{bid}.mixer.in_proj", # plamo2
"model.layers.{bid}.linear_attn.in_proj_qkvz", # qwen3next
),
MODEL_TENSOR.SSM_CONV1D: (
@@ -639,6 +640,7 @@ class TensorNameMap:
"backbone.layers.{bid}.mixer.conv1d", # mamba
"model.layers.{bid}.mamba.conv1d", # jamba falcon-h1 granite-hybrid
"model.layers.layers.{bid}.mixer.conv1d", # plamo2
"model.layers.{bid}.linear_attn.conv1d", # qwen3next
),
MODEL_TENSOR.SSM_X: (
@@ -653,6 +655,7 @@ class TensorNameMap:
"backbone.layers.{bid}.mixer.dt_proj", # mamba
"model.layers.{bid}.mamba.dt_proj", # jamba falcon-h1 granite-hybrid
"model.layers.layers.{bid}.mixer.dt_proj", # plamo2
"model.layers.{bid}.linear_attn.dt_proj", # qwen3next
),
MODEL_TENSOR.SSM_DT_NORM: (
@@ -665,6 +668,7 @@ class TensorNameMap:
"backbone.layers.{bid}.mixer.A_log", # mamba
"model.layers.{bid}.mamba.A_log", # jamba falcon-h1 granite-hybrid
"model.layers.layers.{bid}.mixer.A_log", # plamo2
"model.layers.{bid}.linear_attn.A_log", # qwen3next
),
MODEL_TENSOR.SSM_B_NORM: (
@@ -687,17 +691,23 @@ class TensorNameMap:
),
MODEL_TENSOR.SSM_NORM: (
"model.layers.{bid}.mamba.norm", # falcon-h1 granite-hybrid
"backbone.layers.{bid}.mixer.norm", # mamba2
"model.layers.{bid}.mamba.norm", # falcon-h1 granite-hybrid
"model.layers.{bid}.linear_attn.norm", # qwen3next
"backbone.layers.{bid}.mixer.norm", # mamba2
),
MODEL_TENSOR.SSM_OUT: (
"model.layers.{bid}.out_proj", # mamba-hf
"backbone.layers.{bid}.mixer.out_proj", # mamba
"model.layers.{bid}.mamba.out_proj", # jamba falcon-h1 granite-hybrid
"model.layers.{bid}.linear_attn.out_proj", # qwen3next
"model.layers.layers.{bid}.mixer.out_proj", # plamo2
),
MODEL_TENSOR.SSM_BETA_ALPHA: (
"model.layers.{bid}.linear_attn.in_proj_ba", # qwen3next
),
MODEL_TENSOR.TIME_MIX_W0: (
"model.layers.{bid}.attention.w0", # rwkv7
),

View File

@@ -31,6 +31,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_QWEN2VL, "qwen2vl" },
{ LLM_ARCH_QWEN3, "qwen3" },
{ LLM_ARCH_QWEN3MOE, "qwen3moe" },
{ LLM_ARCH_QWEN3NEXT, "qwen3next" },
{ LLM_ARCH_PHI2, "phi2" },
{ LLM_ARCH_PHI3, "phi3" },
{ LLM_ARCH_PHIMOE, "phimoe" },
@@ -754,6 +755,39 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{
LLM_ARCH_QWEN3NEXT,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_GATE, "blk.%d.attn_gate" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
{ LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
{ LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
{ LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
{ LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" },
{ LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
{ LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
{ LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
},
},
{
LLM_ARCH_PHI2,
{
@@ -2212,6 +2246,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -2275,6 +2310,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_SSM_D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_SSM_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_SSM_BETA_ALPHA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -2438,6 +2474,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_LFM2:
case LLM_ARCH_NEMOTRON_H:
case LLM_ARCH_QWEN3NEXT:
return true;
default:
return false;

View File

@@ -35,6 +35,7 @@ enum llm_arch {
LLM_ARCH_QWEN2VL,
LLM_ARCH_QWEN3,
LLM_ARCH_QWEN3MOE,
LLM_ARCH_QWEN3NEXT,
LLM_ARCH_PHI2,
LLM_ARCH_PHI3,
LLM_ARCH_PHIMOE,
@@ -334,6 +335,7 @@ enum llm_tensor {
LLM_TENSOR_SSM_D,
LLM_TENSOR_SSM_NORM,
LLM_TENSOR_SSM_OUT,
LLM_TENSOR_SSM_BETA_ALPHA, // qwen3next
LLM_TENSOR_TIME_MIX_W0,
LLM_TENSOR_TIME_MIX_W1,
LLM_TENSOR_TIME_MIX_W2,
@@ -379,6 +381,7 @@ enum llm_tensor {
LLM_TENSOR_ATTN_Q_A_NORM,
LLM_TENSOR_ATTN_KV_A_NORM,
LLM_TENSOR_ATTN_SUB_NORM,
LLM_TENSOR_ATTN_GATE,
LLM_TENSOR_FFN_SUB_NORM,
LLM_TENSOR_DEC_ATTN_NORM,
LLM_TENSOR_DEC_ATTN_Q,

View File

@@ -811,6 +811,7 @@ struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx
}
struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) {
LLAMA_LOG_DEBUG("%s: loading tensor %s as view\n", __func__, name.c_str());
const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
if (cur == NULL) {

View File

@@ -115,6 +115,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_A13B: return "A13B";
case LLM_TYPE_21B_A3B: return "21B.A3B";
case LLM_TYPE_30B_A3B: return "30B.A3B";
case LLM_TYPE_80B_A3B: return "80B.A3B";
case LLM_TYPE_106B_A12B: return "106B.A12B";
case LLM_TYPE_235B_A22B: return "235B.A22B";
case LLM_TYPE_300B_A47B: return "300B.A47B";
@@ -1825,6 +1826,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
// For Granite MoE Shared
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
} break;
case LLM_ARCH_QWEN3NEXT:
{
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
// Load linear attention (gated delta net) parameters
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
// Mark recurrent layers (linear attention layers)
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
hparams.recurrent_layer_arr[i] = ((i + 1) % 4 != 0); // TODO: extract the magic 4 from "full_attention_interval"
}
switch (hparams.n_layer) {
case 80: type = LLM_TYPE_80B_A3B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_CHAMELEON:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -2376,6 +2400,79 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
} break;
case LLM_ARCH_QWEN3NEXT:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
}
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
// Calculate dimensions from hyperparameters
const int64_t head_k_dim = hparams.ssm_d_state;
const int64_t head_v_dim = hparams.ssm_d_state;
const int64_t n_k_heads = hparams.ssm_n_group;
const int64_t n_v_heads = hparams.ssm_dt_rank;
const int64_t key_dim = head_k_dim * n_k_heads;
const int64_t value_dim = head_v_dim * n_v_heads;
const int64_t conv_dim = key_dim * 2 + value_dim;
// Calculate projection sizes
const int64_t qkvz_projection_size = key_dim * 2 + value_dim * 2;
const int64_t ba_projection_size = n_v_heads * 2;
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
if (!hparams.is_recurrent(i)) {
// Attention layers
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
// Q/K normalization for attention layers
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
// attn gate
layer.wq_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
} else {
// Linear attention (gated delta net) specific tensors
// Create tensors with calculated dimensions
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_projection_size }, 0);
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), { hparams.ssm_dt_rank }, 0);
layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_projection_size }, 0);
layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
}
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
// Shared experts
layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
}
}
break;
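// To make the dimension bookkeeping concrete: for a purely hypothetical configuration with
// head_k_dim = head_v_dim = 128, n_k_heads = 16 and n_v_heads = 32 (illustrative numbers, not
// values read from the model), the sizes above work out to
//
//   key_dim              = 128 * 16            = 2048
//   value_dim            = 128 * 32            = 4096
//   conv_dim             = 2 * 2048 + 4096     = 8192
//   qkvz_projection_size = 2 * 2048 + 2 * 4096 = 12288
//   ba_projection_size   = 2 * 32              = 64
//
// so ssm_in is [n_embd, 12288], ssm_conv1d is [ssm_d_conv, 8192] and ssm_ba is [n_embd, 64].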
case LLM_ARCH_LLADA:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
@@ -6090,7 +6187,8 @@ void llama_model::print_info() const {
arch == LLM_ARCH_FALCON_H1 ||
arch == LLM_ARCH_PLAMO2 ||
arch == LLM_ARCH_GRANITE_HYBRID ||
arch == LLM_ARCH_NEMOTRON_H) {
arch == LLM_ARCH_NEMOTRON_H ||
arch == LLM_ARCH_QWEN3NEXT) {
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@@ -18851,6 +18949,386 @@ struct llm_build_smallthinker : public llm_graph_context{
}
};
struct llm_build_qwen3next : public llm_graph_context_mamba {
llm_build_qwen3next(const llama_model & model, const llm_graph_params & params) : llm_graph_context_mamba(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
auto * inp = build_inp_mem_hybrid();
ggml_tensor * inp_pos = build_inp_pos();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
// Pre-norm for attention/linear attention
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// Determine layer type and build appropriate attention mechanism
if (hparams.is_recurrent(il)) {
// Linear attention layer (gated delta net)
cur = build_qwen3next_linear_attn_layer(inp->get_recr(), cur, model, ubatch, il);
} else {
// Full attention layer
cur = build_qwen3next_attention_layer(
cur, inp_pos, inp->get_attn(), model,
n_embd_head, il);
}
// Post-attention norm
cur = build_norm(cur,
model.layers[il].attn_post_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_post_norm", il);
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
// Residual connection
cur = ggml_add(ctx0, cur, inpSA);
cb(cur, "attn_residual", il);
// FFN layer (MoE or dense)
cur = build_layer_ffn(cur, model, il);
// Input for next layer
inpL = cur;
}
cur = inpL;
// Final norm
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// LM head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
private:
ggml_tensor * build_qwen3next_attention_layer(
ggml_tensor * cur,
ggml_tensor * inp_pos,
llm_graph_input_attn_kv * inp_attn,
const llama_model & model,
const int64_t n_embd_head,
const int il) {
ggml_tensor * gate = build_lora_mm(model.layers[il].wq_gate, cur);
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
// Apply Q/K normalization
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "Qcur_normed", il);
cb(Kcur, "Kcur_normed", il);
// Apply RoPE
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
// Attention computation
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp_attn,
model.layers[il].wo, nullptr,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
// Apply gating
cur = ggml_cont(ctx0, ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate)));
cb(cur, "attn_gated", il);
return cur;
}
ggml_tensor * build_qwen3next_linear_attn_layer(llm_graph_input_rs * inp,
ggml_tensor * cur,
const llama_model & model,
const llama_ubatch & ubatch,
int il) {
// Gated Delta Net implementation using the new ggml_delta_net function
const auto * mctx_cur = inp->mctx;
const auto kv_head = mctx_cur->get_head();
const int64_t d_inner = hparams.ssm_d_inner;
const int64_t n_heads = hparams.ssm_dt_rank;
const int64_t head_dim = d_inner / n_heads;
const int64_t n_seqs = ubatch.n_seqs;
const int64_t head_k_dim = hparams.ssm_d_state;
const int64_t head_v_dim = hparams.ssm_d_state;
const int64_t num_k_heads = hparams.ssm_n_group;
const int64_t num_v_heads = hparams.ssm_dt_rank;
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
const int64_t n_tokens = ubatch.n_tokens;
GGML_ASSERT(n_seqs != 0);
GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
// Input projections
ggml_tensor * mixed_qkvz = build_lora_mm(model.layers[il].ssm_in, cur);
cb(mixed_qkvz, "linear_attn_mixed_qkvz", il);
ggml_tensor * mixed_ba = build_lora_mm(model.layers[il].ssm_beta_alpha, cur);
cb(mixed_ba, "linear_attn_mixed_ba", il);
// Reshape mixed_qkvz: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*head_k_dim + 2*head_v_dim*num_v_heads/num_k_heads]
int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * num_v_heads / num_k_heads;
ggml_tensor * mixed_qkvz_reshaped =
ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_tokens, n_seqs);
// Reshape mixed_ba: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*num_v_heads/num_k_heads]
int64_t ba_new_dim = 2 * num_v_heads / num_k_heads;
ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_tokens, n_seqs);
// Split mixed_qkvz into query, key, value, z
int64_t split_sizes_qkvz[4] = {
head_k_dim, // query size
head_k_dim, // key size
head_v_dim * num_v_heads / num_k_heads, // value size
head_v_dim * num_v_heads / num_k_heads // z size
};
ggml_tensor * query = ggml_cont(ctx0, ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[0], num_k_heads, n_tokens,
n_seqs, split_sizes_qkvz[0] * sizeof(float), mixed_qkvz_reshaped->nb[1],
mixed_qkvz_reshaped->nb[2], 0));
ggml_tensor * key = ggml_cont(ctx0, ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[1], num_k_heads, n_tokens, n_seqs,
split_sizes_qkvz[1] * sizeof(float), mixed_qkvz_reshaped->nb[1],
mixed_qkvz_reshaped->nb[2], split_sizes_qkvz[0] * sizeof(float)));
ggml_tensor * value =
ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[2], num_k_heads, n_tokens, n_seqs,
split_sizes_qkvz[2] * sizeof(float), mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2],
(split_sizes_qkvz[0] + split_sizes_qkvz[1]) * sizeof(float));
ggml_tensor * z =
ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[3], num_k_heads, n_tokens, n_seqs,
split_sizes_qkvz[3] * sizeof(float), mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2],
(split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * sizeof(float));
// Reshape value and z to merge head dimensions: [batch, seq_len, num_k_heads, head_v_dim*num_v_heads/num_k_heads] -> [batch, seq_len, num_v_heads, head_v_dim]
ggml_tensor * value_reshaped = ggml_reshape_4d(ctx0, ggml_cont(ctx0, value), head_v_dim, num_v_heads, n_tokens, n_seqs);
ggml_tensor * z_reshaped = ggml_reshape_4d(ctx0, ggml_cont(ctx0, z), head_v_dim, num_v_heads, n_tokens, n_seqs);
GGML_ASSERT(ggml_nelements(query) + ggml_nelements(key) + ggml_nelements(value_reshaped) +
ggml_nelements(z_reshaped) ==
ggml_nelements(mixed_qkvz));
// Split mixed_ba into b and a (beta and alpha parameters)
int64_t split_sizes_ba[2] = {
num_v_heads / num_k_heads, // beta size
num_v_heads / num_k_heads // alpha size
};
ggml_tensor * b =
ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[0], num_k_heads, n_tokens, n_seqs,
split_sizes_ba[0] * sizeof(float), mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], 0);
ggml_tensor * a = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[1], num_k_heads, n_tokens, n_seqs,
split_sizes_ba[1] * sizeof(float), mixed_ba_reshaped->nb[1],
mixed_ba_reshaped->nb[2], split_sizes_ba[0] * sizeof(float));
// Reshape b and a to merge head dimensions: [batch, seq_len, num_k_heads, num_v_heads/num_k_heads] -> [batch, seq_len, num_v_heads]
ggml_tensor * beta = ggml_reshape_3d(ctx0, ggml_cont(ctx0, b), num_v_heads, n_tokens, n_seqs);
ggml_tensor * alpha = ggml_reshape_3d(ctx0, ggml_cont(ctx0, a), num_v_heads, n_tokens, n_seqs);
GGML_ASSERT(ggml_nelements(beta) + ggml_nelements(alpha) == ggml_nelements(mixed_ba));
ggml_tensor * alpha_softplus = softplus(alpha, model.layers[il].ssm_dt);
ggml_tensor * A_log_exp = ggml_exp(ctx0, model.layers[il].ssm_a); // A_log.exp()
ggml_tensor * gate_scaled = ggml_mul(ctx0, alpha_softplus, A_log_exp); // A_log.exp() * softplus
ggml_tensor * gate = ggml_scale(ctx0, gate_scaled, -1.0f); // - (A_log.exp() * softplus)
// Get convolution weights and bias
ggml_tensor * conv_weight = model.layers[il].ssm_conv1d;
ggml_tensor * conv_bias = nullptr; // Add if your model has conv bias
// Get recurrent states (conv_states not needed as it's handled internally by ggml_delta_net)
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
// Beta tensor
beta = ggml_reshape_3d(ctx0, beta, n_heads, n_tokens, n_seqs);
// Get current state slice
ggml_tensor * state = ggml_view_4d(ctx0, ssm_states_all, head_dim, head_dim, n_heads, n_seqs,
ssm_states_all->nb[0], ssm_states_all->nb[1], ssm_states_all->nb[2],
kv_head * head_dim * head_dim * n_heads * ggml_element_size(ssm_states_all));
state = ggml_cont(ctx0, state);
ggml_tensor * target_gate = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_dim, n_heads, n_tokens, n_seqs);
ggml_tensor * gate_broadcast = ggml_reshape_4d(ctx0, gate, 1, n_heads, n_tokens, n_seqs);
gate = ggml_repeat(ctx0, gate_broadcast, target_gate);
// Call the new ggml_delta_net function with the corrected flow
ggml_tensor * output = ggml_delta_net(ctx0,
key, // k tensor
value_reshaped, // v tensor
query, // q tensor
gate, // g tensor
conv_weight, // conv_weight tensor
conv_bias, // conv_bias tensor (can be nullptr)
beta, // beta tensor
state, // state tensor
64, // chunk_size (adjust as needed)
true, // use_qk_l2norm
1.0f // scale (adjust based on your model)
);
cb(output, "delta_net_output", il);
// Extract the output part (first half of the concatenated result)
ggml_tensor * attn_out = ggml_view_4d(ctx0, output, head_dim, n_heads, n_tokens, n_seqs, output->nb[0],
output->nb[1], output->nb[2], 0);
// Extract the new state (second half of the concatenated result)
ggml_tensor * new_state =
ggml_view_4d(ctx0, output, head_dim, head_dim, n_heads, n_seqs, output->nb[0], output->nb[1], output->nb[2],
n_tokens * head_dim * n_heads * sizeof(float));
// Update the recurrent states
ggml_build_forward_expand(
gf, ggml_cpy(ctx0, new_state,
ggml_view_1d(
ctx0, ssm_states_all, head_dim * head_dim * n_heads * n_seqs,
kv_head * n_seqs * head_dim * head_dim * n_heads * ggml_element_size(ssm_states_all))));
// Reshape both attn_out and z to 2D tensors for normalization
// attn_out: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
ggml_tensor * attn_out_2d = ggml_reshape_2d(ctx0, ggml_cont(ctx0, attn_out), head_dim, n_heads * n_tokens * n_seqs);
// z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z_reshaped, head_dim, n_heads * n_tokens * n_seqs);
// Apply gated normalization: self.norm(core_attn_out, z)
// This is Qwen3NextRMSNormGated which applies: RMSNorm(x) * silu(gate)
ggml_tensor * attn_out_norm = build_norm(attn_out_2d, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
// Apply silu gate: attn_out_norm * silu(z_2d)
ggml_tensor * z_silu = ggml_silu(ctx0, z_2d);
ggml_tensor * gated_output = ggml_mul(ctx0, attn_out_norm, z_silu);
// Reshape back to original dimensions: [n_heads * n_tokens * n_seqs, head_dim] -> [head_dim, n_heads, n_tokens, n_seqs]
ggml_tensor * gated_output_4d = ggml_reshape_4d(ctx0, gated_output, head_dim, n_heads, n_tokens, n_seqs);
// Final reshape: [head_dim, n_heads, n_tokens, n_seqs] -> [n_tokens, n_seqs, n_heads * head_dim]
ggml_tensor * final_output = ggml_reshape_3d(ctx0, gated_output_4d, n_heads * head_dim, n_tokens, n_seqs);
// Output projection
cur = build_lora_mm(model.layers[il].ssm_out, final_output);
cb(cur, "linear_attn_out", il);
// Reshape back to original dimensions
cur = ggml_cont(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens));
return cur;
}
ggml_tensor * build_layer_ffn(ggml_tensor * cur, const llama_model & model, const int il) {
ggml_tensor * inpFFN = cur; // keep the pre-FFN activations for the residual connection below
// Check if this is an MoE layer
if (model.layers[il].ffn_gate_inp != nullptr) {
// MoE branch
ggml_tensor * moe_out = build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
// Add shared experts if present
if (model.layers[il].ffn_up_shexp != nullptr) {
ggml_tensor * ffn_shexp = build_ffn(cur,
model.layers[il].ffn_up_shexp, NULL, NULL,
model.layers[il].ffn_gate_shexp, NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(ffn_shexp, "ffn_shexp", il);
cur = ggml_add(ctx0, moe_out, ffn_shexp);
cb(cur, "ffn_out", il);
} else {
cur = moe_out;
}
} else {
// Dense FFN branch
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
}
// Residual connection with the pre-FFN input
cur = ggml_add(ctx0, cur, inpFFN);
cb(cur, "ffn_residual", il);
return cur;
}
ggml_tensor * softplus(ggml_tensor * alpha, ggml_tensor * dt_bias) {
ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, dt_bias); // a + dt_bias
ggml_tensor * alpha_exp = ggml_exp(ctx0, alpha_biased); // exp(a + dt_bias)
ggml_tensor * one_plus_exp = ggml_scale_bias(ctx0, alpha_exp, 1.0f, 1.0f); // 1 + exp(a + dt_bias)
ggml_tensor * alpha_softplus = ggml_log(ctx0, one_plus_exp); // log(1 + exp(...))
return alpha_softplus;
}
};
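// Putting the linear-attention pieces together, the per-head gate and mixing coefficient that the
// helpers above aim to compute are, in the reference formulation,
//
//   g    = -exp(A_log) * softplus(a + dt_bias),   with softplus(x) = log(1 + exp(x))
//   beta = sigmoid(b)
//
// where a and b come from the ssm_ba projection and A_log / dt_bias are the per-layer ssm_a /
// ssm_dt tensors. Note that the converter already stores -exp(A_log) in ssm_a, so the exact
// runtime expression in this draft may not yet match the reference form.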
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
llama_memory_i * res;
@@ -19377,6 +19855,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
}
} break;
case LLM_ARCH_QWEN3NEXT:
{
llm = std::make_unique<llm_build_qwen3next>(*this, params);
} break;
default:
GGML_ABORT("fatal error");
}
@@ -19552,6 +20034,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_QWEN2MOE:
case LLM_ARCH_QWEN3:
case LLM_ARCH_QWEN3MOE:
case LLM_ARCH_QWEN3NEXT:
case LLM_ARCH_LLADA_MOE:
case LLM_ARCH_OLMO2:
case LLM_ARCH_OLMOE:

View File

@@ -107,6 +107,7 @@ enum llm_type {
LLM_TYPE_A13B,
LLM_TYPE_21B_A3B, // Ernie MoE small
LLM_TYPE_30B_A3B,
LLM_TYPE_80B_A3B, // Qwen3 Next
LLM_TYPE_106B_A12B, // GLM-4.5-Air
LLM_TYPE_235B_A22B,
LLM_TYPE_300B_A47B, // Ernie MoE big
@@ -227,6 +228,7 @@ struct llama_layer {
struct ggml_tensor * wk_enc = nullptr;
struct ggml_tensor * wv_enc = nullptr;
struct ggml_tensor * wo_enc = nullptr;
struct ggml_tensor * wq_gate = nullptr;
// attention bias
struct ggml_tensor * bq = nullptr;
@@ -295,6 +297,9 @@ struct llama_layer {
struct ggml_tensor * ssm_conv1d_b = nullptr;
struct ggml_tensor * ssm_dt_b = nullptr;
// qwen3next
struct ggml_tensor * ssm_beta_alpha = nullptr;
// rwkv
struct ggml_tensor * time_mix_w1 = nullptr;
struct ggml_tensor * time_mix_w2 = nullptr;