mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-03-05 14:33:24 +02:00
Compare commits
7 Commits
0cc4m/test
...
b8161
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9b62913b40 | ||
|
|
66287bdaac | ||
|
|
1ca3d1de15 | ||
|
|
bd72300591 | ||
|
|
2943210c1e | ||
|
|
3769fe6eb7 | ||
|
|
832aa94762 |
@@ -1578,7 +1578,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--temp"}, "N",
|
||||
{"--temp", "--temperature"}, "N",
|
||||
string_format("temperature (default: %.2f)", (double)params.sampling.temp),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sampling.temp = std::stof(value);
|
||||
@@ -1611,7 +1611,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--top-nsigma"}, "N",
|
||||
{"--top-nsigma", "--top-n-sigma"}, "N",
|
||||
string_format("top-n-sigma sampling (default: %.2f, -1.0 = disabled)", params.sampling.top_n_sigma),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sampling.top_n_sigma = std::stof(value);
|
||||
@@ -1634,7 +1634,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--typical"}, "N",
|
||||
{"--typical", "--typical-p"}, "N",
|
||||
string_format("locally typical sampling, parameter p (default: %.2f, 1.0 = disabled)", (double)params.sampling.typ_p),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sampling.typ_p = std::stof(value);
|
||||
@@ -2642,8 +2642,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.out_file = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA,
|
||||
LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
|
||||
add_opt(common_arg(
|
||||
{"-ofreq", "--output-frequency"}, "N",
|
||||
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
|
||||
|
||||
@@ -104,7 +104,6 @@ enum llama_example {
|
||||
LLAMA_EXAMPLE_DIFFUSION,
|
||||
LLAMA_EXAMPLE_FINETUNE,
|
||||
LLAMA_EXAMPLE_FIT_PARAMS,
|
||||
LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
|
||||
|
||||
LLAMA_EXAMPLE_COUNT,
|
||||
};
|
||||
|
||||
@@ -721,6 +721,8 @@ value member_expression::execute_impl(context & ctx) {
|
||||
int64_t arr_size = 0;
|
||||
if (is_val<value_array>(object)) {
|
||||
arr_size = object->as_array().size();
|
||||
} else if (is_val<value_string>(object)) {
|
||||
arr_size = object->as_string().length();
|
||||
}
|
||||
|
||||
if (is_stmt<slice_expression>(this->property)) {
|
||||
|
||||
@@ -1148,6 +1148,9 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
|
||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
|
||||
res = "jina-v2-de"
|
||||
if chkhsh == "a023e9fdc5a11f034d3ef515b92350e56fb2af1f66c6b6811a4444ea9bf8763d":
|
||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v5-text-nano
|
||||
res = "jina-v5-nano"
|
||||
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
|
||||
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
|
||||
res = "smaug-bpe"
|
||||
@@ -6125,6 +6128,32 @@ class NeoBert(BertModel):
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("EuroBertModel", "JinaEmbeddingsV5Model")
|
||||
class EuroBertModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.EUROBERT
|
||||
|
||||
def set_vocab(self):
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
# EuroBert is bidirectional (encoder)
|
||||
self.gguf_writer.add_causal_attention(False)
|
||||
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
|
||||
self._try_set_pooling_type()
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# Strip "model." prefix from tensor names
|
||||
if name.startswith("model."):
|
||||
name = name[6:]
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
|
||||
class XLMRobertaModel(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.BERT
|
||||
|
||||
@@ -107,6 +107,7 @@ models = [
|
||||
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
||||
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||
{"name": "jina-v5-nano", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v5-text-nano", },
|
||||
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
|
||||
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
|
||||
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
|
||||
|
||||
@@ -11,8 +11,8 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
||||
int ne0, int ne1, int ne2, int ne3,
|
||||
int ne10, int ne11, int ne12, int ne13,
|
||||
/*int s0, */ int s1, int s2, int s3,
|
||||
/*int s00,*/ int s01, int s02, int s03,
|
||||
/*int s10,*/ int s11, int s12, int s13,
|
||||
int s00, int s01, int s02, int s03,
|
||||
int s10, int s11, int s12, int s13,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
@@ -44,7 +44,7 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
|
||||
for (int i0 = i0s; i0 < ne0;
|
||||
i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
|
||||
const int i10 = i0 % ne10;
|
||||
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
||||
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0*s00] : 0.0f, (float)src1_row[i10*s10]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -53,8 +53,8 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t
|
||||
int ne0, int ne1, int ne2, int ne3,
|
||||
int ne10, int ne11, int ne12, int ne13,
|
||||
/*int s0, */ int s1, int s2, int s3,
|
||||
/*int s00,*/ int s01, int s02, int s03,
|
||||
/*int s10,*/ int s11, int s12, int s13,
|
||||
int s00, int s01, int s02, int s03,
|
||||
int s10, int s11, int s12, int s13,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
@@ -82,7 +82,7 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
const int i10 = i0 % ne10;
|
||||
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
||||
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0*s00] : 0.0f, (float)src1_row[i10*s10]);
|
||||
}
|
||||
|
||||
|
||||
@@ -95,7 +95,8 @@ struct bin_bcast_sycl {
|
||||
const int64_t ne3, const size_t nb00, const size_t nb01, const size_t nb02, const size_t nb03,
|
||||
const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0,
|
||||
const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous,
|
||||
const bool src1_is_contiguous, const bool dst_is_contiguous, queue_ptr stream) {
|
||||
const bool src1_is_contiguous, const bool src0_is_permuted, const bool src1_is_permuted,
|
||||
queue_ptr stream) {
|
||||
int nr0 = ne10 / ne0;
|
||||
int nr1 = ne11/ne1;
|
||||
int nr2 = ne12/ne2;
|
||||
@@ -123,7 +124,7 @@ struct bin_bcast_sycl {
|
||||
cnb[3] *= cne[3];
|
||||
};
|
||||
|
||||
if (src0_is_contiguous && src1_is_contiguous && dst_is_contiguous) {
|
||||
if (src0_is_contiguous && src1_is_contiguous && !src0_is_permuted && !src1_is_permuted) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (nr[i] != 1) {
|
||||
break;
|
||||
@@ -164,7 +165,7 @@ struct bin_bcast_sycl {
|
||||
size_t nb12 = cnb1[2];
|
||||
size_t nb13 = cnb1[3];
|
||||
|
||||
size_t s0 = nb0 / sizeof(dst_t);
|
||||
// size_t s0 = nb0 / sizeof(dst_t);
|
||||
size_t s1 = nb1 / sizeof(dst_t);
|
||||
size_t s2 = nb2 / sizeof(dst_t);
|
||||
size_t s3 = nb3 / sizeof(dst_t);
|
||||
@@ -196,9 +197,6 @@ struct bin_bcast_sycl {
|
||||
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
|
||||
|
||||
GGML_ASSERT(s0 == 1);
|
||||
GGML_ASSERT(s10 == 1);
|
||||
|
||||
const int block_size = 128;
|
||||
|
||||
int64_t hne0 = std::max(ne0/2LL, 1LL);
|
||||
@@ -232,8 +230,8 @@ struct bin_bcast_sycl {
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_bin_bcast_unravel<bin_op>(
|
||||
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
|
||||
ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
|
||||
s03, s11, s12, s13, item_ct1);
|
||||
ne10, ne11, ne12, ne13, s1, s2, s3, s00, s01, s02,
|
||||
s03, s10, s11, s12, s13, item_ct1);
|
||||
});
|
||||
}
|
||||
} else {
|
||||
@@ -251,7 +249,7 @@ struct bin_bcast_sycl {
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
|
||||
ne2, ne3, ne10, ne11, ne12, ne13,
|
||||
s1, s2, s3, s01, s02, s03, s11, s12, s13,
|
||||
s1, s2, s3, s00, s01, s02, s03, s10, s11, s12, s13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
@@ -268,24 +266,27 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
op()((const float *) src0->data, (const float *) src1->data, (float *) dst->data, ne00, ne01, ne02, ne03, ne10,
|
||||
ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, nb3,
|
||||
ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
|
||||
ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1), main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
||||
op()((const sycl::half *) src0->data, (const sycl::half *) src1->data, (sycl::half *) dst->data, ne00, ne01,
|
||||
ne02, ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13,
|
||||
nb0, nb1, nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst),
|
||||
nb0, nb1, nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
|
||||
main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
|
||||
op()((const sycl::half *) src0->data, (const float *) src1->data, (sycl::half *) dst->data, ne00, ne01, ne02,
|
||||
ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1,
|
||||
nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
|
||||
nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
|
||||
main_stream);
|
||||
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
|
||||
op()((const int32_t *) src0->data, (const int32_t *) src1->data, (int32_t *) dst->data, ne00, ne01, ne02, ne03,
|
||||
ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
|
||||
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
|
||||
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
|
||||
main_stream);
|
||||
} else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
|
||||
op()((const int16_t *) src0->data, (const int16_t *) src1->data, (int16_t *) dst->data, ne00, ne01, ne02, ne03,
|
||||
ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
|
||||
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
|
||||
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
|
||||
main_stream);
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type),
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
|
||||
@@ -13820,12 +13820,11 @@ static bool ggml_vk_can_fuse_rope_set_rows(ggml_backend_vk_context * ctx, const
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check whether the tensors overlap in memory but are not equal.
|
||||
// Fusions can potenitally overwrite src tensors in ways that are not prevented
|
||||
// by ggml-alloc. If the fusion is entirely elementwise, then it's OK for them
|
||||
// to overlap if they are exactly equal.
|
||||
// XXX TODO this check is probably missing from several fusion optimizations.
|
||||
static bool ggml_vk_tensors_overlap_but_not_equal(const ggml_tensor * a, const ggml_tensor * b) {
|
||||
// Check whether the tensors overlap in memory.
|
||||
// Fusions can potentially overwrite src tensors in ways that are not prevented
|
||||
// by ggml-alloc. If the fusion src is being applied in a way that's elementwise
|
||||
// with the destination, then it's OK for them to overlap if they are exactly equal.
|
||||
static bool ggml_vk_tensors_overlap(const ggml_tensor * a, const ggml_tensor * b, bool elementwise) {
|
||||
ggml_backend_vk_buffer_context * a_buf_ctx = (ggml_backend_vk_buffer_context *)a->buffer->context;
|
||||
vk_buffer a_buf = a_buf_ctx->dev_buffer;
|
||||
ggml_backend_vk_buffer_context * b_buf_ctx = (ggml_backend_vk_buffer_context *)b->buffer->context;
|
||||
@@ -13836,7 +13835,7 @@ static bool ggml_vk_tensors_overlap_but_not_equal(const ggml_tensor * a, const g
|
||||
auto b_base = vk_tensor_offset(b) + b->view_offs;
|
||||
auto b_size = ggml_nbytes(b);
|
||||
|
||||
if (a_base == b_base && a_size == b_size) {
|
||||
if (elementwise && a_base == b_base && a_size == b_size) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -13874,13 +13873,6 @@ static bool ggml_vk_can_fuse_rms_norm_mul_rope(ggml_backend_vk_context * ctx, co
|
||||
return false;
|
||||
}
|
||||
|
||||
// must not overwrite srcs in a way that's not elementwise
|
||||
ggml_tensor *other_src = mul->src[0] == rms ? mul->src[1] : mul->src[0];
|
||||
if (ggml_vk_tensors_overlap_but_not_equal(rms->src[0], rope) ||
|
||||
ggml_vk_tensors_overlap_but_not_equal(other_src, rope)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// conditions for pipeline creation
|
||||
if (!(ctx->device->float_controls_rte_fp16 &&
|
||||
sizeof(vk_op_rms_norm_mul_rope_push_constants) <= ctx->device->properties.limits.maxPushConstantsSize)) {
|
||||
@@ -13942,6 +13934,18 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
|
||||
return num_adds;
|
||||
}
|
||||
|
||||
static int32_t find_first_set(uint32_t x) {
|
||||
int32_t ret = 0;
|
||||
if (!x) {
|
||||
return -1;
|
||||
}
|
||||
while (!(x & 1)) {
|
||||
x >>= 1;
|
||||
ret++;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
@@ -14040,6 +14044,12 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
total_mul_mat_bytes += bytes;
|
||||
}
|
||||
|
||||
// op_srcs_fused_elementwise indicates whether an op's srcs all contribute to
|
||||
// the fused result in an elementwise-way. This affects whether the memory for
|
||||
// the src is allowed to overlap the memory for the destination.
|
||||
// The array is sized to handle the largest fusion (asserted later).
|
||||
bool op_srcs_fused_elementwise[12];
|
||||
|
||||
ctx->fused_topk_moe_mode = TOPK_MOE_COUNT;
|
||||
ctx->fused_topk_moe_scale = false;
|
||||
const char *fusion_string {};
|
||||
@@ -14048,39 +14058,68 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
if (num_adds) {
|
||||
ctx->num_additional_fused_ops = num_adds - 1;
|
||||
fusion_string = "MULTI_ADD";
|
||||
std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, true);
|
||||
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_ADD })) {
|
||||
ctx->num_additional_fused_ops = 2;
|
||||
fusion_string = "MUL_MAT_ADD_ADD";
|
||||
op_srcs_fused_elementwise[0] = false;
|
||||
op_srcs_fused_elementwise[1] = true;
|
||||
op_srcs_fused_elementwise[2] = true;
|
||||
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD })) {
|
||||
ctx->num_additional_fused_ops = 1;
|
||||
fusion_string = "MUL_MAT_ADD";
|
||||
op_srcs_fused_elementwise[0] = false;
|
||||
op_srcs_fused_elementwise[1] = true;
|
||||
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_MUL })) {
|
||||
ctx->num_additional_fused_ops = 2;
|
||||
fusion_string = "MUL_MAT_ID_ADD_ID_MUL";
|
||||
op_srcs_fused_elementwise[0] = false;
|
||||
op_srcs_fused_elementwise[1] = true;
|
||||
op_srcs_fused_elementwise[2] = true;
|
||||
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID })) {
|
||||
ctx->num_additional_fused_ops = 1;
|
||||
fusion_string = "MUL_MAT_ID_ADD_ID";
|
||||
op_srcs_fused_elementwise[0] = false;
|
||||
op_srcs_fused_elementwise[1] = true;
|
||||
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_MUL })) {
|
||||
ctx->num_additional_fused_ops = 1;
|
||||
fusion_string = "MUL_MAT_ID_MUL";
|
||||
op_srcs_fused_elementwise[0] = false;
|
||||
op_srcs_fused_elementwise[1] = true;
|
||||
} else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 4 }) &&
|
||||
ggml_check_edges(cgraph, i, rms_norm_mul_rope_view_set_rows_edges) &&
|
||||
ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i) &&
|
||||
ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i + 2)) {
|
||||
ctx->num_additional_fused_ops = 4;
|
||||
fusion_string = "RMS_NORM_MUL_ROPE_VIEW_SET_ROWS";
|
||||
op_srcs_fused_elementwise[0] = false;
|
||||
op_srcs_fused_elementwise[1] = false;
|
||||
op_srcs_fused_elementwise[2] = false;
|
||||
op_srcs_fused_elementwise[3] = false;
|
||||
op_srcs_fused_elementwise[4] = false;
|
||||
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE })&&
|
||||
ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i)) {
|
||||
ctx->num_additional_fused_ops = 2;
|
||||
fusion_string = "RMS_NORM_MUL_ROPE";
|
||||
// rope is approximately elementwise - whole rows are done by a single workgroup and it's row-wise
|
||||
op_srcs_fused_elementwise[0] = false;
|
||||
op_srcs_fused_elementwise[1] = true;
|
||||
op_srcs_fused_elementwise[2] = true;
|
||||
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
|
||||
ctx->num_additional_fused_ops = 1;
|
||||
fusion_string = "RMS_NORM_MUL";
|
||||
// rms_norm is not elementwise, but whole rows must be consumed and the scale factor computed before
|
||||
// they are overwritten, and one workgroup per row. So close enough.
|
||||
op_srcs_fused_elementwise[0] = true;
|
||||
op_srcs_fused_elementwise[1] = true;
|
||||
} else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) &&
|
||||
ggml_check_edges(cgraph, i, rope_view_set_rows_edges) &&
|
||||
ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) {
|
||||
ctx->num_additional_fused_ops = 2;
|
||||
fusion_string = "ROPE_VIEW_SET_ROWS";
|
||||
op_srcs_fused_elementwise[0] = false;
|
||||
op_srcs_fused_elementwise[1] = false;
|
||||
op_srcs_fused_elementwise[2] = false;
|
||||
} else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
|
||||
ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
|
||||
ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
|
||||
@@ -14089,6 +14128,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
ctx->fused_ops_write_mask |= 1 << 3;
|
||||
ctx->fused_topk_moe_mode = TOPK_MOE_EARLY_SOFTMAX_NORM;
|
||||
fusion_string = "TOPK_MOE_EARLY_SOFTMAX_NORM";
|
||||
std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false);
|
||||
} else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_sigmoid_norm_bias, { i + 4, i + 10 }) &&
|
||||
ggml_check_edges(cgraph, i, topk_moe_sigmoid_norm_bias_edges) &&
|
||||
ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_SIGMOID_NORM_BIAS)) {
|
||||
@@ -14097,6 +14137,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
ctx->fused_ops_write_mask |= 1 << 4;
|
||||
ctx->fused_topk_moe_mode = TOPK_MOE_SIGMOID_NORM_BIAS;
|
||||
fusion_string = "TOPK_MOE_SIGMOID_NORM_BIAS";
|
||||
std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false);
|
||||
} else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
|
||||
ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
|
||||
ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
|
||||
@@ -14105,6 +14146,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
ctx->fused_ops_write_mask |= 1 << 3;
|
||||
ctx->fused_topk_moe_mode = TOPK_MOE_EARLY_SOFTMAX;
|
||||
fusion_string = "TOPK_MOE_EARLY_SOFTMAX";
|
||||
std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false);
|
||||
} else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
|
||||
ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
|
||||
ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) {
|
||||
@@ -14113,6 +14155,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
ctx->fused_ops_write_mask |= 1 << 1;
|
||||
ctx->fused_topk_moe_mode = TOPK_MOE_LATE_SOFTMAX;
|
||||
fusion_string = "TOPK_MOE_LATE_SOFTMAX";
|
||||
std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false);
|
||||
}
|
||||
if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) {
|
||||
// Look for an additional scale op to fuse - occurs in deepseek2 and nemotron3 nano.
|
||||
@@ -14120,11 +14163,73 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
ggml_can_fuse_subgraph(cgraph, i + ctx->num_additional_fused_ops, { GGML_OP_GET_ROWS, GGML_OP_SCALE }, { i + ctx->num_additional_fused_ops + 1 })) {
|
||||
ctx->fused_topk_moe_scale = true;
|
||||
ctx->num_additional_fused_ops++;
|
||||
op_srcs_fused_elementwise[ctx->num_additional_fused_ops] = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(ctx->num_additional_fused_ops < (int)(sizeof(op_srcs_fused_elementwise) / sizeof(op_srcs_fused_elementwise[0])));
|
||||
ctx->fused_ops_write_mask |= 1 << ctx->num_additional_fused_ops;
|
||||
|
||||
// Check whether fusion would overwrite src operands while they're still in use.
|
||||
// If so, disable fusion.
|
||||
if (ctx->num_additional_fused_ops) {
|
||||
// There are up to two output nodes - topk_moe has two.
|
||||
uint32_t bits = ctx->fused_ops_write_mask & ~(1 << ctx->num_additional_fused_ops);
|
||||
ggml_tensor *output_nodes[2] {};
|
||||
output_nodes[0] = cgraph->nodes[i + ctx->num_additional_fused_ops];
|
||||
if (bits) {
|
||||
int output_idx = find_first_set(bits);
|
||||
GGML_ASSERT(bits == (1u << output_idx));
|
||||
output_nodes[1] = cgraph->nodes[i + output_idx];
|
||||
}
|
||||
|
||||
bool need_disable = false;
|
||||
|
||||
// topk_moe often overwrites the source, but for a given row all the src values are
|
||||
// loaded before anything is stored. If there's only one row, this is safe, so treat
|
||||
// this as a special case.
|
||||
bool is_topk_moe_single_row = ctx->fused_topk_moe_mode != TOPK_MOE_COUNT &&
|
||||
ggml_nrows(cgraph->nodes[i]->src[0]) == 1;
|
||||
|
||||
if (!is_topk_moe_single_row) {
|
||||
for (int j = 0; j < 2; ++j) {
|
||||
ggml_tensor *dst = output_nodes[j];
|
||||
if (!dst) {
|
||||
continue;
|
||||
}
|
||||
// Loop over all srcs of all nodes in the fusion. If the src overlaps
|
||||
// the destination and the src is not an intermediate node that's being
|
||||
// elided, then disable fusion.
|
||||
for (int k = 0; k <= ctx->num_additional_fused_ops; ++k) {
|
||||
for (uint32_t s = 0; s < GGML_MAX_SRC; ++s) {
|
||||
ggml_tensor *src = cgraph->nodes[i + k]->src[s];
|
||||
if (!src || src->op == GGML_OP_NONE) {
|
||||
continue;
|
||||
}
|
||||
if (ggml_vk_tensors_overlap(src, dst, op_srcs_fused_elementwise[k])) {
|
||||
bool found = false;
|
||||
for (int n = 0; n < k; ++n) {
|
||||
if (cgraph->nodes[i + n] == src) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
need_disable = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (need_disable) {
|
||||
ctx->num_additional_fused_ops = 0;
|
||||
ctx->fused_ops_write_mask = 1;
|
||||
ctx->fused_topk_moe_mode = TOPK_MOE_COUNT;
|
||||
ctx->fused_topk_moe_scale = false;
|
||||
}
|
||||
}
|
||||
|
||||
// Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
|
||||
bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
|
||||
bool submit = (submitted_nodes >= nodes_per_submit) ||
|
||||
|
||||
@@ -228,13 +228,41 @@ struct gguf_context {
|
||||
};
|
||||
|
||||
struct gguf_reader {
|
||||
FILE * file;
|
||||
gguf_reader(FILE * file) : file(file) {
|
||||
// read the remaining bytes once and update on each read
|
||||
nbytes_remain = file_remain(file);
|
||||
}
|
||||
|
||||
gguf_reader(FILE * file) : file(file) {}
|
||||
// helper for remaining bytes in a file
|
||||
static uint64_t file_remain(FILE * file) {
|
||||
const int64_t cur = gguf_ftell(file);
|
||||
if (cur < 0) {
|
||||
return 0;
|
||||
}
|
||||
if (gguf_fseek(file, 0, SEEK_END) != 0) {
|
||||
gguf_fseek(file, cur, SEEK_SET);
|
||||
|
||||
return 0;
|
||||
}
|
||||
const int64_t end = gguf_ftell(file);
|
||||
if (end < 0) {
|
||||
gguf_fseek(file, cur, SEEK_SET);
|
||||
|
||||
return 0;
|
||||
}
|
||||
gguf_fseek(file, cur, SEEK_SET);
|
||||
return static_cast<uint64_t>(end - cur);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool read(T & dst) const {
|
||||
return fread(&dst, 1, sizeof(dst), file) == sizeof(dst);
|
||||
const size_t size = sizeof(dst);
|
||||
if (nbytes_remain < size) {
|
||||
return false;
|
||||
}
|
||||
const size_t nread = fread(&dst, 1, size, file);
|
||||
nbytes_remain -= nread;
|
||||
return nread == size;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -242,20 +270,19 @@ struct gguf_reader {
|
||||
if (n > GGUF_MAX_ARRAY_ELEMENTS) {
|
||||
return false;
|
||||
}
|
||||
const uint64_t nbytes = nbytes_remain();
|
||||
if constexpr (std::is_same<T, std::string>::value) {
|
||||
// strings are prefixed with their length, so we need to account for that
|
||||
if (n > SIZE_MAX / sizeof(uint64_t)) {
|
||||
return false;
|
||||
}
|
||||
if (nbytes < n * sizeof(uint64_t)) {
|
||||
if (nbytes_remain < n * sizeof(uint64_t)) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (n > SIZE_MAX / sizeof(T)) {
|
||||
return false;
|
||||
}
|
||||
if (nbytes < n * sizeof(T)) {
|
||||
if (nbytes_remain < n * sizeof(T)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -312,39 +339,29 @@ struct gguf_reader {
|
||||
GGML_LOG_ERROR("%s: string length %" PRIu64 " exceeds maximum %" PRIu64 "\n", __func__, size, (uint64_t) GGUF_MAX_STRING_LENGTH);
|
||||
return false;
|
||||
}
|
||||
const uint64_t nbytes = nbytes_remain();
|
||||
if (size > nbytes) {
|
||||
GGML_LOG_ERROR("%s: string length %" PRIu64 " exceeds remaining file size %" PRIu64 " bytes\n", __func__, size, nbytes);
|
||||
if (size > nbytes_remain) {
|
||||
GGML_LOG_ERROR("%s: string length %" PRIu64 " exceeds remaining file size %" PRIu64 " bytes\n", __func__, size, nbytes_remain);
|
||||
return false;
|
||||
}
|
||||
dst.resize(static_cast<size_t>(size));
|
||||
return fread(dst.data(), 1, dst.length(), file) == dst.length();
|
||||
const size_t nread = fread(dst.data(), 1, size, file);
|
||||
nbytes_remain -= nread;
|
||||
return nread == size;
|
||||
}
|
||||
|
||||
bool read(void * dst, const size_t size) const {
|
||||
return fread(dst, 1, size, file) == size;
|
||||
if (size > nbytes_remain) {
|
||||
return false;
|
||||
}
|
||||
const size_t nread = fread(dst, 1, size, file);
|
||||
nbytes_remain -= nread;
|
||||
return nread == size;
|
||||
}
|
||||
|
||||
// remaining bytes in the file
|
||||
uint64_t nbytes_remain() const {
|
||||
const int64_t cur = gguf_ftell(file);
|
||||
if (cur < 0) {
|
||||
return 0;
|
||||
}
|
||||
if (gguf_fseek(file, 0, SEEK_END) != 0) {
|
||||
gguf_fseek(file, cur, SEEK_SET);
|
||||
private:
|
||||
FILE * file;
|
||||
|
||||
return 0;
|
||||
}
|
||||
const int64_t end = gguf_ftell(file);
|
||||
if (end < 0) {
|
||||
gguf_fseek(file, cur, SEEK_SET);
|
||||
|
||||
return 0;
|
||||
}
|
||||
gguf_fseek(file, cur, SEEK_SET);
|
||||
return static_cast<uint64_t>(end - cur);
|
||||
}
|
||||
mutable uint64_t nbytes_remain;
|
||||
};
|
||||
|
||||
struct gguf_context * gguf_init_empty(void) {
|
||||
|
||||
@@ -379,6 +379,7 @@ class MODEL_ARCH(IntEnum):
|
||||
NEO_BERT = auto()
|
||||
JINA_BERT_V2 = auto()
|
||||
JINA_BERT_V3 = auto()
|
||||
EUROBERT = auto()
|
||||
BLOOM = auto()
|
||||
STABLELM = auto()
|
||||
QWEN = auto()
|
||||
@@ -820,6 +821,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.NEO_BERT: "neo-bert",
|
||||
MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
|
||||
MODEL_ARCH.JINA_BERT_V3: "jina-bert-v3",
|
||||
MODEL_ARCH.EUROBERT: "eurobert",
|
||||
MODEL_ARCH.BLOOM: "bloom",
|
||||
MODEL_ARCH.STABLELM: "stablelm",
|
||||
MODEL_ARCH.QWEN: "qwen",
|
||||
@@ -1587,6 +1589,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.LAYER_OUT_NORM,
|
||||
],
|
||||
MODEL_ARCH.EUROBERT: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
],
|
||||
MODEL_ARCH.MPT: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
|
||||
@@ -617,13 +617,6 @@ extern "C" {
|
||||
const char * fname_out,
|
||||
const llama_model_quantize_params * params);
|
||||
|
||||
// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
|
||||
LLAMA_API struct ggml_cgraph * llama_graph_reserve(
|
||||
struct llama_context * ctx,
|
||||
uint32_t n_tokens,
|
||||
uint32_t n_seqs,
|
||||
uint32_t n_outputs);
|
||||
|
||||
//
|
||||
// Adapters
|
||||
//
|
||||
|
||||
@@ -62,6 +62,7 @@ add_library(llama
|
||||
models/dream.cpp
|
||||
models/ernie4-5-moe.cpp
|
||||
models/ernie4-5.cpp
|
||||
models/eurobert.cpp
|
||||
models/exaone-moe.cpp
|
||||
models/exaone.cpp
|
||||
models/exaone4.cpp
|
||||
|
||||
@@ -26,6 +26,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_NEO_BERT, "neo-bert" },
|
||||
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
|
||||
{ LLM_ARCH_JINA_BERT_V3, "jina-bert-v3" },
|
||||
{ LLM_ARCH_EUROBERT, "eurobert" },
|
||||
{ LLM_ARCH_BLOOM, "bloom" },
|
||||
{ LLM_ARCH_STABLELM, "stablelm" },
|
||||
{ LLM_ARCH_QWEN, "qwen" },
|
||||
@@ -819,6 +820,20 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
||||
LLM_TENSOR_CLS,
|
||||
LLM_TENSOR_CLS_OUT,
|
||||
};
|
||||
case LLM_ARCH_EUROBERT:
|
||||
return {
|
||||
LLM_TENSOR_TOKEN_EMBD,
|
||||
LLM_TENSOR_OUTPUT_NORM,
|
||||
LLM_TENSOR_ATTN_NORM,
|
||||
LLM_TENSOR_ATTN_Q,
|
||||
LLM_TENSOR_ATTN_K,
|
||||
LLM_TENSOR_ATTN_V,
|
||||
LLM_TENSOR_ATTN_OUT,
|
||||
LLM_TENSOR_FFN_NORM,
|
||||
LLM_TENSOR_FFN_GATE,
|
||||
LLM_TENSOR_FFN_UP,
|
||||
LLM_TENSOR_FFN_DOWN,
|
||||
};
|
||||
case LLM_ARCH_MODERN_BERT:
|
||||
return {
|
||||
LLM_TENSOR_TOKEN_EMBD,
|
||||
|
||||
@@ -30,6 +30,7 @@ enum llm_arch {
|
||||
LLM_ARCH_NEO_BERT,
|
||||
LLM_ARCH_JINA_BERT_V2,
|
||||
LLM_ARCH_JINA_BERT_V3,
|
||||
LLM_ARCH_EUROBERT,
|
||||
LLM_ARCH_BLOOM,
|
||||
LLM_ARCH_STABLELM,
|
||||
LLM_ARCH_QWEN,
|
||||
|
||||
@@ -3035,19 +3035,6 @@ uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {
|
||||
return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
|
||||
}
|
||||
|
||||
struct ggml_cgraph * llama_graph_reserve(
|
||||
struct llama_context * ctx,
|
||||
uint32_t n_tokens,
|
||||
uint32_t n_seqs,
|
||||
uint32_t n_outputs) {
|
||||
auto * memory = ctx->get_memory();
|
||||
llama_memory_context_ptr mctx;
|
||||
if (memory) {
|
||||
mctx = memory->init_full();
|
||||
}
|
||||
return ctx->graph_reserve(n_tokens, n_seqs, n_outputs, mctx.get());
|
||||
}
|
||||
|
||||
// llama adapter API
|
||||
|
||||
int32_t llama_set_adapters_lora(
|
||||
|
||||
@@ -979,6 +979,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
type = LLM_TYPE_250M;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_EUROBERT:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
||||
|
||||
if (hparams.n_layer == 12) {
|
||||
type = LLM_TYPE_SMALL; // 0.2B
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_BLOOM:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
@@ -3570,6 +3580,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_EUROBERT:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_JINA_BERT_V2:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
|
||||
@@ -8181,6 +8214,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
case LLM_ARCH_NOMIC_BERT:
|
||||
case LLM_ARCH_NOMIC_BERT_MOE:
|
||||
case LLM_ARCH_NEO_BERT:
|
||||
case LLM_ARCH_EUROBERT:
|
||||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
case LLM_ARCH_MODERN_BERT:
|
||||
case LLM_ARCH_GEMMA_EMBEDDING:
|
||||
@@ -8378,6 +8412,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
{
|
||||
llm = std::make_unique<llm_build_neo_bert>(*this, params);
|
||||
} break;
|
||||
case LLM_ARCH_EUROBERT:
|
||||
{
|
||||
llm = std::make_unique<llm_build_eurobert>(*this, params);
|
||||
} break;
|
||||
case LLM_ARCH_BLOOM:
|
||||
{
|
||||
llm = std::make_unique<llm_build_bloom>(*this, params);
|
||||
@@ -9004,6 +9042,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_MODERN_BERT:
|
||||
case LLM_ARCH_NOMIC_BERT:
|
||||
case LLM_ARCH_NOMIC_BERT_MOE:
|
||||
case LLM_ARCH_EUROBERT:
|
||||
case LLM_ARCH_STABLELM:
|
||||
case LLM_ARCH_BITNET:
|
||||
case LLM_ARCH_QWEN:
|
||||
|
||||
@@ -1890,7 +1890,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
tokenizer_pre == "falcon-h1" ||
|
||||
tokenizer_pre == "pixtral" ||
|
||||
tokenizer_pre == "midm-2.0" ||
|
||||
tokenizer_pre == "lfm2") {
|
||||
tokenizer_pre == "lfm2" ||
|
||||
tokenizer_pre == "jina-v5-nano") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
|
||||
ignore_merges = true;
|
||||
add_bos = true;
|
||||
|
||||
97
src/models/eurobert.cpp
Normal file
97
src/models/eurobert.cpp
Normal file
@@ -0,0 +1,97 @@
|
||||
#include "models.h"
|
||||
|
||||
llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
auto * inp_attn = build_attn_inp_no_cache();
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * cur = inpL;
|
||||
|
||||
cur = build_norm(inpL,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
|
||||
{
|
||||
ggml_tensor * Qcur;
|
||||
ggml_tensor * Kcur;
|
||||
ggml_tensor * Vcur;
|
||||
|
||||
Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, nullptr,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
||||
}
|
||||
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
|
||||
ggml_tensor * ffn_inp = cur;
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
cur = build_norm(ffn_inp,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
model.layers[il].ffn_gate, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
inpL = cur;
|
||||
}
|
||||
cur = inpL;
|
||||
|
||||
cur = build_norm(cur,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, -1);
|
||||
|
||||
cb(cur, "result_embd", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
@@ -424,6 +424,10 @@ struct llm_build_neo_bert : public llm_graph_context {
|
||||
llm_build_neo_bert(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
struct llm_build_eurobert : public llm_graph_context {
|
||||
llm_build_eurobert(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
template <bool iswa>
|
||||
struct llm_build_olmo2 : public llm_graph_context {
|
||||
llm_build_olmo2(const llama_model & model, const llm_graph_params & params);
|
||||
|
||||
@@ -31,12 +31,10 @@
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <future>
|
||||
#include <fstream>
|
||||
#include <memory>
|
||||
#include <random>
|
||||
#include <regex>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <thread>
|
||||
@@ -6596,236 +6594,6 @@ struct test_diag : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
// Deserializable generic test case
|
||||
struct input_tensor {
|
||||
ggml_type type;
|
||||
std::array<int64_t, 4> ne;
|
||||
std::array<size_t, 4> nb; // strides (0 = use default contiguous strides)
|
||||
};
|
||||
|
||||
static bool is_non_contiguous(const input_tensor & src) {
|
||||
if (src.nb[0] == 0) {
|
||||
return false;
|
||||
}
|
||||
const size_t default_nb0 = ggml_type_size(src.type);
|
||||
const size_t default_nb1 = default_nb0 * (src.ne[0] / ggml_blck_size(src.type));
|
||||
const size_t default_nb2 = default_nb1 * src.ne[1];
|
||||
const size_t default_nb3 = default_nb2 * src.ne[2];
|
||||
return src.nb[0] != default_nb0 ||
|
||||
src.nb[1] != default_nb1 ||
|
||||
src.nb[2] != default_nb2 ||
|
||||
src.nb[3] != default_nb3;
|
||||
}
|
||||
|
||||
static std::string var_to_str(const std::vector<input_tensor>& sources) {
|
||||
std::ostringstream oss;
|
||||
bool first = true;
|
||||
for (const auto& src : sources) {
|
||||
if (!first) oss << ",";
|
||||
oss << ggml_type_name(src.type) << "[" << src.ne[0] << "," << src.ne[1] << "," << src.ne[2] << "," << src.ne[3] << "]";
|
||||
if (is_non_contiguous(src)) {
|
||||
oss << "nb[" << src.nb[0] << "," << src.nb[1] << "," << src.nb[2] << "," << src.nb[3] << "]";
|
||||
}
|
||||
first = false;
|
||||
}
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
static std::string var_to_str(const std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)>& params) {
|
||||
std::ostringstream oss;
|
||||
oss << "[";
|
||||
bool first = true;
|
||||
for (size_t i = 0; i < params.size(); ++i) {
|
||||
if (params[i] != 0) {
|
||||
if (!first) oss << ",";
|
||||
oss << i << ":" << params[i];
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
oss << "]";
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
|
||||
struct test_generic_op : public test_case {
|
||||
const ggml_op op;
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
const std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params;
|
||||
|
||||
const std::vector<input_tensor> sources;
|
||||
const std::string name;
|
||||
|
||||
std::string vars() override {
|
||||
if (name.empty()) {
|
||||
return VARS_TO_STR4(type, ne, op_params, sources);
|
||||
}
|
||||
|
||||
return VARS_TO_STR5(name, type, ne, op_params, sources);
|
||||
}
|
||||
|
||||
test_generic_op(ggml_op op, ggml_type type, std::array<int64_t, 4> ne,
|
||||
std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params,
|
||||
std::vector<input_tensor> sources, std::string name = "")
|
||||
: op(op), type(type), ne(ne), op_params(op_params), sources(sources), name(std::move(name)) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
const size_t source_count = std::min(sources.size(), (size_t)GGML_MAX_SRC);
|
||||
|
||||
std::array<ggml_tensor *, GGML_MAX_SRC> source_tensors;
|
||||
for (size_t i = 0; i < source_count; ++i) {
|
||||
const input_tensor& src = sources[i];
|
||||
|
||||
if (is_non_contiguous(src)) {
|
||||
size_t total_size;
|
||||
const size_t blck_size = ggml_blck_size(src.type);
|
||||
if (blck_size == 1) {
|
||||
total_size = ggml_type_size(src.type);
|
||||
for (int d = 0; d < 4; d++) {
|
||||
total_size += (src.ne[d] - 1) * src.nb[d];
|
||||
}
|
||||
} else {
|
||||
total_size = src.ne[0] * src.nb[0] / blck_size;
|
||||
for (int d = 1; d < 4; d++) {
|
||||
total_size += (src.ne[d] - 1) * src.nb[d];
|
||||
}
|
||||
}
|
||||
|
||||
// Convert bytes to elements, padded to block size for quantized types
|
||||
const size_t type_size = ggml_type_size(src.type);
|
||||
size_t backing_elements = (total_size * blck_size + type_size - 1) / type_size;
|
||||
backing_elements = ((backing_elements + blck_size - 1) / blck_size) * blck_size;
|
||||
ggml_tensor * backing = ggml_new_tensor_1d(ctx, src.type, backing_elements);
|
||||
source_tensors[i] = ggml_view_4d(ctx, backing,
|
||||
src.ne[0], src.ne[1], src.ne[2], src.ne[3],
|
||||
src.nb[1], src.nb[2], src.nb[3], 0);
|
||||
// nb[0] does not get set by view_4d, so set it manually
|
||||
source_tensors[i]->nb[0] = src.nb[0];
|
||||
} else {
|
||||
source_tensors[i] = ggml_new_tensor_4d(ctx, src.type, src.ne[0], src.ne[1], src.ne[2], src.ne[3]);
|
||||
}
|
||||
}
|
||||
|
||||
// Ops with an inplace flag create a view of src[0] as their output.
|
||||
bool inplace = false;
|
||||
if (op == GGML_OP_SET || op == GGML_OP_ACC) {
|
||||
inplace = op_params[4] != 0;
|
||||
} else if (op == GGML_OP_ADD_REL_POS) {
|
||||
inplace = op_params[0] != 0;
|
||||
}
|
||||
|
||||
ggml_tensor * out;
|
||||
if (inplace && source_count > 0) {
|
||||
out = ggml_view_tensor(ctx, source_tensors[0]);
|
||||
} else {
|
||||
out = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
|
||||
}
|
||||
out->op = op;
|
||||
for (size_t i = 0; i < source_count; ++i) {
|
||||
out->src[i] = source_tensors[i];
|
||||
}
|
||||
|
||||
memcpy(out->op_params, op_params.data(), GGML_MAX_OP_PARAMS);
|
||||
ggml_set_name(out, "out");
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
double max_nmse_err() override {
|
||||
switch (op) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
case GGML_OP_OUT_PROD:
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_CONV_2D:
|
||||
case GGML_OP_CONV_3D:
|
||||
case GGML_OP_SET_ROWS:
|
||||
case GGML_OP_CPY:
|
||||
return 5e-4;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
return 1e-6;
|
||||
case GGML_OP_RWKV_WKV7:
|
||||
return 5e-3;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
// Scale error with kv length to account for accumulating floating point error
|
||||
const int64_t kv = sources[1].ne[1];
|
||||
return 5e-4 * std::max(1.0, kv / 20000.0);
|
||||
}
|
||||
default:
|
||||
return 1e-7;
|
||||
}
|
||||
}
|
||||
|
||||
void initialize_tensors(ggml_context * ctx) override {
|
||||
ggml_tensor * out = ggml_get_tensor(ctx, "out");
|
||||
|
||||
std::random_device rd;
|
||||
std::default_random_engine rng(rd());
|
||||
|
||||
for (size_t i = 0; i < sources.size() && i < GGML_MAX_SRC; i++) {
|
||||
ggml_tensor * t = out->src[i];
|
||||
if (!t) {
|
||||
break;
|
||||
}
|
||||
|
||||
// FLASH_ATTN_EXT: src[3] is the KQ mask
|
||||
if (op == GGML_OP_FLASH_ATTN_EXT && i == 3) {
|
||||
init_tensor_kq_mask(t);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (t->type == GGML_TYPE_I32 || t->type == GGML_TYPE_I64) {
|
||||
if (op == GGML_OP_GET_ROWS || op == GGML_OP_GET_ROWS_BACK) {
|
||||
const int64_t num_rows = sources[0].ne[1];
|
||||
const int64_t nels = ggml_nelements(t);
|
||||
std::vector<int32_t> data(nels);
|
||||
std::uniform_int_distribution<int32_t> dist(0, num_rows - 1);
|
||||
for (int64_t i = 0; i < nels; i++) {
|
||||
data[i] = dist(rng);
|
||||
}
|
||||
ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t));
|
||||
} else if (op == GGML_OP_SET_ROWS) {
|
||||
init_set_rows_row_ids(t, ne[1]);
|
||||
} else if (op == GGML_OP_ROPE) {
|
||||
const int mode = op_params[2];
|
||||
const int64_t nels = (mode & GGML_ROPE_TYPE_MROPE) ? ne[2] * 4 : ne[2];
|
||||
std::vector<int32_t> data(nels);
|
||||
std::uniform_int_distribution<int32_t> dist(0, ne[2] - 1);
|
||||
for (int64_t i = 0; i < nels; i++) {
|
||||
data[i] = dist(rng);
|
||||
}
|
||||
ggml_backend_tensor_set(t, data.data(), 0, nels * sizeof(int32_t));
|
||||
} else if (op == GGML_OP_MUL_MAT_ID || op == GGML_OP_ADD_ID) {
|
||||
const int64_t n_expert = (op == GGML_OP_MUL_MAT_ID) ? sources[0].ne[2] : sources[1].ne[1];
|
||||
for (int64_t r = 0; r < ggml_nrows(t); r++) {
|
||||
std::vector<int32_t> data(t->ne[0]);
|
||||
for (int32_t i = 0; i < t->ne[0]; i++) {
|
||||
data[i] = i % n_expert;
|
||||
}
|
||||
std::shuffle(data.begin(), data.end(), rng);
|
||||
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
|
||||
}
|
||||
} else if (op == GGML_OP_SSM_SCAN) {
|
||||
for (int64_t r = 0; r < ggml_nrows(t); r++) {
|
||||
std::vector<int32_t> data(t->ne[0]);
|
||||
for (int32_t i = 0; i < t->ne[0]; i++) {
|
||||
data[i] = i;
|
||||
}
|
||||
std::shuffle(data.begin(), data.end(), rng);
|
||||
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
|
||||
}
|
||||
} else {
|
||||
init_tensor_uniform(t);
|
||||
}
|
||||
} else {
|
||||
init_tensor_uniform(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
enum llm_norm_type {
|
||||
LLM_NORM,
|
||||
@@ -8885,72 +8653,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
return test_cases;
|
||||
}
|
||||
|
||||
static std::vector<std::unique_ptr<test_case>> make_test_cases_from_file(const char * path) {
|
||||
std::ifstream f(path);
|
||||
|
||||
if (!f.is_open()) {
|
||||
throw std::runtime_error("Unable to read test file");
|
||||
}
|
||||
|
||||
std::vector<std::unique_ptr<test_case>> test_cases;
|
||||
|
||||
std::string line;
|
||||
|
||||
while (std::getline(f, line)) {
|
||||
std::istringstream iss(line);
|
||||
|
||||
ggml_op op;
|
||||
ggml_type type;
|
||||
std::array<int64_t, 4> ne;
|
||||
std::array<int32_t, GGML_MAX_OP_PARAMS / sizeof(int32_t)> op_params = {};
|
||||
std::string name;
|
||||
uint64_t tmp;
|
||||
|
||||
iss >> tmp;
|
||||
op = (ggml_op)tmp;
|
||||
iss >> tmp;
|
||||
type = (ggml_type)tmp;
|
||||
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
iss >> ne[i];
|
||||
}
|
||||
|
||||
iss >> tmp;
|
||||
for (size_t i = 0; i < tmp && i < op_params.size(); i++) {
|
||||
iss >> op_params[i];
|
||||
}
|
||||
|
||||
iss >> tmp;
|
||||
|
||||
size_t num_src = std::min((uint64_t)GGML_MAX_SRC, tmp);
|
||||
std::vector<input_tensor> sources(num_src);
|
||||
for (size_t i = 0; i < num_src; i++) {
|
||||
input_tensor& src = sources[i];
|
||||
iss >> tmp;
|
||||
src.type = (ggml_type)tmp;
|
||||
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
iss >> src.ne[i];
|
||||
}
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
iss >> src.nb[i];
|
||||
}
|
||||
}
|
||||
|
||||
iss >> name;
|
||||
|
||||
if (name.length() == 1 && name[0] == '-') {
|
||||
name = "";
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_generic_op(op, type, ne, op_params, sources, std::move(name)));
|
||||
}
|
||||
|
||||
return test_cases;
|
||||
}
|
||||
|
||||
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_names_filter, const char * params_filter,
|
||||
printer * output_printer, const char * test_file_path) {
|
||||
printer * output_printer) {
|
||||
auto filter_test_cases = [](std::vector<std::unique_ptr<test_case>> & test_cases, const char * params_filter) {
|
||||
if (params_filter == nullptr) {
|
||||
return;
|
||||
@@ -8968,26 +8672,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<std::unique_ptr<test_case>> test_cases;
|
||||
|
||||
if (test_file_path == nullptr) {
|
||||
switch (mode) {
|
||||
case MODE_TEST:
|
||||
case MODE_GRAD:
|
||||
case MODE_SUPPORT:
|
||||
test_cases = make_test_cases_eval();
|
||||
break;
|
||||
case MODE_PERF:
|
||||
test_cases = make_test_cases_perf();
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
test_cases = make_test_cases_from_file(test_file_path);
|
||||
}
|
||||
|
||||
filter_test_cases(test_cases, params_filter);
|
||||
|
||||
if (mode == MODE_TEST) {
|
||||
auto test_cases = make_test_cases_eval();
|
||||
filter_test_cases(test_cases, params_filter);
|
||||
ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
|
||||
if (backend_cpu == NULL) {
|
||||
test_operation_info info("", "", "CPU");
|
||||
@@ -9027,6 +8714,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
}
|
||||
|
||||
if (mode == MODE_GRAD) {
|
||||
auto test_cases = make_test_cases_eval();
|
||||
filter_test_cases(test_cases, params_filter);
|
||||
size_t n_ok = 0;
|
||||
for (auto & test : test_cases) {
|
||||
if (test->eval_grad(backend, op_names_filter, output_printer)) {
|
||||
@@ -9039,6 +8728,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
}
|
||||
|
||||
if (mode == MODE_PERF) {
|
||||
auto test_cases = make_test_cases_perf();
|
||||
filter_test_cases(test_cases, params_filter);
|
||||
for (auto & test : test_cases) {
|
||||
test->eval_perf(backend, op_names_filter, output_printer);
|
||||
}
|
||||
@@ -9046,6 +8737,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
}
|
||||
|
||||
if (mode == MODE_SUPPORT) {
|
||||
auto test_cases = make_test_cases_eval();
|
||||
filter_test_cases(test_cases, params_filter);
|
||||
|
||||
// Filter out fusion cases
|
||||
test_cases.erase(
|
||||
std::remove_if(test_cases.begin(), test_cases.end(), [](const std::unique_ptr<test_case> & tc) {
|
||||
@@ -9164,8 +8858,7 @@ static void show_test_coverage() {
|
||||
}
|
||||
|
||||
static void usage(char ** argv) {
|
||||
printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops]", argv[0]);
|
||||
printf(" [--show-coverage] [--test-file <path>]\n");
|
||||
printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops] [--show-coverage]\n", argv[0]);
|
||||
printf(" valid modes:\n");
|
||||
printf(" - test (default, compare with CPU backend for correctness)\n");
|
||||
printf(" - grad (compare gradients from backpropagation with method of finite differences)\n");
|
||||
@@ -9176,7 +8869,6 @@ static void usage(char ** argv) {
|
||||
printf(" --output specifies output format (default: console, options: console, sql, csv)\n");
|
||||
printf(" --list-ops lists all available GGML operations\n");
|
||||
printf(" --show-coverage shows test coverage\n");
|
||||
printf(" --test-file reads test operators from a test file generated by llama-export-graph-ops\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
@@ -9185,7 +8877,6 @@ int main(int argc, char ** argv) {
|
||||
const char * op_names_filter = nullptr;
|
||||
const char * backend_filter = nullptr;
|
||||
const char * params_filter = nullptr;
|
||||
const char * test_file_path = nullptr;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
if (strcmp(argv[i], "test") == 0) {
|
||||
@@ -9233,13 +8924,6 @@ int main(int argc, char ** argv) {
|
||||
} else if (strcmp(argv[i], "--show-coverage") == 0) {
|
||||
show_test_coverage();
|
||||
return 0;
|
||||
} else if (strcmp(argv[i], "--test-file") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
test_file_path = argv[++i];
|
||||
} else {
|
||||
usage(argv);
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
usage(argv);
|
||||
return 1;
|
||||
@@ -9292,7 +8976,7 @@ int main(int argc, char ** argv) {
|
||||
false, "", ggml_backend_dev_description(dev),
|
||||
total / 1024 / 1024, free / 1024 / 1024, true));
|
||||
|
||||
bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get(), test_file_path);
|
||||
bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get());
|
||||
|
||||
if (ok) {
|
||||
n_ok++;
|
||||
|
||||
@@ -13,7 +13,12 @@ fi
|
||||
name=$1
|
||||
input=$2
|
||||
|
||||
make -j tests/test-tokenizer-0
|
||||
# Build using CMake if binary doesn't exist
|
||||
if [ ! -f ./build/bin/test-tokenizer-0 ]; then
|
||||
printf "Building test-tokenizer-0 with CMake...\n"
|
||||
cmake -B build -DLLAMA_BUILD_TESTS=ON
|
||||
cmake --build build --target test-tokenizer-0 -j
|
||||
fi
|
||||
|
||||
printf "Testing %s on %s ...\n" $name $input
|
||||
|
||||
@@ -23,7 +28,7 @@ printf "Tokenizing using (py) Python AutoTokenizer ...\n"
|
||||
python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1
|
||||
|
||||
printf "Tokenizing using (cpp) llama.cpp ...\n"
|
||||
./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
|
||||
./build/bin/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
|
||||
|
||||
cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
|
||||
cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"
|
||||
|
||||
@@ -37,5 +37,4 @@ else()
|
||||
add_subdirectory(export-lora)
|
||||
endif()
|
||||
add_subdirectory(fit-params)
|
||||
add_subdirectory(export-graph-ops)
|
||||
endif()
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
set(TARGET llama-export-graph-ops)
|
||||
add_executable(${TARGET} export-graph-ops.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
endif()
|
||||
@@ -1,168 +0,0 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <array>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
||||
struct input_tensor {
|
||||
ggml_type type;
|
||||
std::array<int64_t, 4> ne;
|
||||
std::array<size_t, 4> nb;
|
||||
|
||||
input_tensor(ggml_type type, int64_t * ne, size_t * nb): type(type) {
|
||||
memcpy(this->ne.data(), ne, 4 * sizeof(int64_t));
|
||||
memcpy(this->nb.data(), nb, 4 * sizeof(size_t));
|
||||
}
|
||||
|
||||
bool operator<(const input_tensor &b) const {
|
||||
return std::tie(type, ne, nb) <
|
||||
std::tie(b.type, b.ne, b.nb);
|
||||
}
|
||||
|
||||
void serialize(std::ostream& out) const {
|
||||
out << type << ' ';
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
out << ne[i] << ' ';
|
||||
}
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
out << nb[i] << ' ';
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct test_object {
|
||||
ggml_op op;
|
||||
ggml_type type;
|
||||
std::array<int64_t, 4> ne;
|
||||
std::vector<int32_t> op_params;
|
||||
std::vector<input_tensor> sources;
|
||||
std::string name;
|
||||
|
||||
void serialize(std::ostream& out) const {
|
||||
out << op << ' ' << type << ' ';
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
out << ne[i] << ' ';
|
||||
}
|
||||
|
||||
out << op_params.size() << ' ';
|
||||
for (size_t i = 0; i < op_params.size(); i++) {
|
||||
out << op_params[i] << ' ';
|
||||
}
|
||||
|
||||
out << sources.size() << ' ';
|
||||
for (size_t s = 0; s < sources.size(); s++) {
|
||||
sources[s].serialize(out);
|
||||
}
|
||||
|
||||
if (!name.empty()) {
|
||||
out << name;
|
||||
} else {
|
||||
out << '-';
|
||||
}
|
||||
|
||||
out << '\n';
|
||||
}
|
||||
|
||||
bool operator<(const test_object &b) const {
|
||||
return std::tie(op, type, ne, op_params, sources) <
|
||||
std::tie(b.op, b.type, b.ne, b.op_params, b.sources);
|
||||
}
|
||||
};
|
||||
|
||||
static void extract_graph_ops(ggml_cgraph * cgraph, const char * label, std::set<test_object> & tests) {
|
||||
int n_nodes = ggml_graph_n_nodes(cgraph);
|
||||
int n_skipped = 0;
|
||||
int n_before = (int) tests.size();
|
||||
for (int i = 0; i < n_nodes; i++) {
|
||||
ggml_tensor * node = ggml_graph_node(cgraph, i);
|
||||
|
||||
if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE) {
|
||||
n_skipped++;
|
||||
continue;
|
||||
}
|
||||
|
||||
test_object test;
|
||||
|
||||
test.op = node->op;
|
||||
test.type = node->type;
|
||||
memcpy(&test.ne, node->ne, 4 * sizeof(int64_t));
|
||||
|
||||
test.op_params.resize(GGML_MAX_OP_PARAMS / sizeof(int32_t));
|
||||
memcpy(test.op_params.data(), node->op_params, GGML_MAX_OP_PARAMS);
|
||||
|
||||
for (size_t s = 0; s < GGML_MAX_SRC; s++) {
|
||||
if (node->src[s] == nullptr) {
|
||||
break;
|
||||
}
|
||||
|
||||
test.sources.emplace_back(node->src[s]->type, node->src[s]->ne, node->src[s]->nb);
|
||||
}
|
||||
|
||||
test.name = node->name;
|
||||
tests.insert(test);
|
||||
}
|
||||
|
||||
int n_new = (int) tests.size() - n_before;
|
||||
LOG_INF("%s: %d unique ops, %d total nodes, %d skipped (view ops)\n",
|
||||
label, n_new, n_nodes, n_skipped);
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
params.out_file = "tests.txt";
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
common_init();
|
||||
|
||||
// Load CPU-only
|
||||
ggml_backend_dev_t cpu_device = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
params.devices = { cpu_device, nullptr };
|
||||
params.fit_params = false;
|
||||
params.n_gpu_layers = 0;
|
||||
|
||||
params.warmup = false;
|
||||
|
||||
auto init_result = common_init_from_params(params);
|
||||
|
||||
llama_context * ctx = init_result->context();
|
||||
|
||||
const uint32_t n_seqs = llama_n_seq_max(ctx);
|
||||
const uint32_t n_tokens = std::min(llama_n_ctx(ctx), llama_n_ubatch(ctx));
|
||||
|
||||
std::set<test_object> tests;
|
||||
|
||||
auto * gf_pp = llama_graph_reserve(ctx, n_tokens, n_seqs, n_tokens);
|
||||
if (!gf_pp) {
|
||||
throw std::runtime_error("failed to reserve prompt processing graph");
|
||||
}
|
||||
extract_graph_ops(gf_pp, "pp", tests);
|
||||
|
||||
auto * gf_tg = llama_graph_reserve(ctx, n_seqs, n_seqs, n_seqs);
|
||||
if (!gf_tg) {
|
||||
throw std::runtime_error("failed to reserve token generation graph");
|
||||
}
|
||||
extract_graph_ops(gf_tg, "tg", tests);
|
||||
|
||||
LOG_INF("%d unique ops total\n", (int) tests.size());
|
||||
|
||||
std::ofstream f(params.out_file);
|
||||
|
||||
if (!f.is_open()) {
|
||||
throw std::runtime_error("Unable to open output file");
|
||||
}
|
||||
|
||||
for (const auto& test : tests) {
|
||||
test.serialize(f);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -912,7 +912,9 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params, c
|
||||
|
||||
const bool add_bos = llama_vocab_get_add_bos(vocab);
|
||||
|
||||
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
|
||||
if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_LAST) {
|
||||
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
|
||||
}
|
||||
|
||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||
LOG_INF("%s: tokenizing the input ..\n", __func__);
|
||||
|
||||
@@ -1510,7 +1510,7 @@ version = 1
|
||||
; If the same key is defined in a specific preset, it will override the value in this global section.
|
||||
[*]
|
||||
c = 8192
|
||||
n-gpu-layer = 8
|
||||
n-gpu-layers = 8
|
||||
|
||||
; If the key corresponds to an existing model on the server,
|
||||
; this will be used as the default config for that model
|
||||
|
||||
Reference in New Issue
Block a user