Compare commits

...

2 Commits

Author SHA1 Message Date
Georgi Gerganov
bf3f12df4c graph : constant topology for tokens/embeddings inputs 2026-01-02 15:46:45 +02:00
Georgi Gerganov
4ed59dc2c7 graph : reduce topology branching 2026-01-02 15:39:12 +02:00
6 changed files with 32 additions and 34 deletions

View File

@@ -21,7 +21,8 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
}
if (ubatch->embd) {
const int64_t n_embd = embd->ne[0];
GGML_ASSERT(n_embd == embd->ne[0]);
const int64_t n_tokens = ubatch->n_tokens;
ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(embd));
@@ -1206,17 +1207,21 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
const int64_t n_embd = hparams.n_embd_inp();
auto inp = std::make_unique<llm_graph_input_embd>();
auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
cb(inp->tokens, "inp_tokens", -1);
ggml_set_input(inp->tokens);
res->t_tokens = inp->tokens;
inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
ggml_set_input(inp->embd);
ggml_tensor * cur = nullptr;
if (ubatch.token) {
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
//cb(inp->tokens, "inp_tokens", -1);
ggml_set_input(inp->tokens);
res->t_tokens = inp->tokens;
{
cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);
cur = ggml_scale(ctx0, cur, ubatch.token ? 1.0f : 0.0f);
// apply lora for embedding tokens if needed
for (const auto & lora : *loras) {
@@ -1235,13 +1240,10 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
cur = ggml_add(ctx0, cur, inpL_delta);
}
} else {
inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
ggml_set_input(inp->embd);
cur = inp->embd;
}
cur = ggml_add(ctx0, cur, ggml_scale(ctx0, inp->embd, ubatch.embd ? 1.0f : 0.0f));
// For Granite architecture
if (hparams.f_embedding_scale != 0.0f) {
cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale);

View File

@@ -104,7 +104,7 @@ using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
class llm_graph_input_embd : public llm_graph_input_i {
public:
llm_graph_input_embd() = default;
llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {}
virtual ~llm_graph_input_embd() = default;
void set_input(const llama_ubatch * ubatch) override;
@@ -113,6 +113,8 @@ public:
ggml_tensor * tokens = nullptr; // I32 [n_batch]
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
const int64_t n_embd = 0;
};
class llm_graph_input_pos : public llm_graph_input_i {

View File

@@ -3,12 +3,14 @@
llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
float kq_scale = 1.0f / sqrtf(float(n_embd_head));
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
ggml_tensor *inpL, *cur;
ggml_tensor * inpL;
ggml_tensor * cur;
inpL = build_inp_embd(model.tok_embd);
ggml_tensor * inp_pos = build_inp_pos();
@@ -44,7 +46,7 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
}
ggml_tensor * inpSA = inpL;
cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
// build self attention
{

View File

@@ -1,7 +1,5 @@
#include "models.h"
llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k;
@@ -12,10 +10,8 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
if (ubatch.token) {
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);
}
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
cb(inpL, "inp_scaled", -1);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

View File

@@ -10,10 +10,9 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
if (ubatch.token) {
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);
}
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
cb(inpL, "inp_scaled", -1);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

View File

@@ -1,7 +1,5 @@
#include "models.h"
llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params),
model(model),
@@ -15,10 +13,9 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
if (ubatch.token) {
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);
}
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
cb(inpL, "inp_scaled", -1);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
@@ -248,7 +245,7 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
// equivalent to get_per_layer_inputs() in python code
// output shape: [n_embd_altup, n_layer, n_tokens]
ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
auto inp = std::make_unique<llm_graph_input_embd>();
auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
ggml_tensor * inp_per_layer;
if (ubatch.token) {
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);