Compare commits

...

9 Commits

Author SHA1 Message Date
Kevin Pouget
ffaafde16f ggml-virtgpu: improve the reliability of the code (#19846)
* ggml-virtgpu-backend: validate the consistency of the received objects

This patch adds consistency checks in the
ggml-virtgpu-backend (running on the host side) to ensure that the
data received from the guest is consistent (valid pointers, valid
sizes and offsets).

* ggml-virtgpu-backend: add fallback/skips for optional ggml backend methods

```
  1. bck->iface.synchronize(bck)
  2. buft->iface.get_alloc_size(buft, op)
  3. buft->iface.get_max_size(buft)
```

these three methods are optional in the GGML interface. `get_max_size`
was already properly defaulted, but `backend sychronize` and `butf
get_max_size` would have segfaulted the backend if not implemented.

* ggml-virtgpu-backend: fix log format missing argument

* ggml-virtgpu-backend: improve the abort message

* ggml-virtgpu-backend: more safety checks

* ggml-virtgpu-backend: new error code

* ggml-virtgpu-backend: initialize all the error codes

* ggml-virtgpu: add a missing comment generated by the code generator

* ggml-virtgpu: add the '[virtgpu]' prefix to the device/buffer names

* ggml-virtgpu: apir_device_buffer_from_ptr: improve the error message

* ggml-virtgpu: shared: make it match the latest api_remoting.h of Virglrenderer APIR

(still unmerged)

* ggml-virtgpu: update the code generator to have dispatch_command_name in a host/guest shared file

* ggml-virtgpu: REMOTE_CALL: fail if the backend returns an error

* docs/backend/VirtGPU.md: indicate that the RAM+VRAM size is limed to 64 GB with libkrun

* ggml-virtgpu: turn off clang-format header ordering for some of the files

Compilation breaks when ordered alphabetically.

* ggml-virtgpu: clang-format

* ggml-virtgpu/backend/shared/api_remoting: better comments for the APIR return codes
2026-02-26 20:00:57 +08:00
drrros
efba35a860 server: fix load-on-startup not respected in ini file (#19897)
Co-authored-by: Roman Marchenko <r.marchenko@ideco.ru>
2026-02-26 12:32:31 +01:00
Eric Zhang
9b62913b40 jinja : correct default size for string slices (#19913) 2026-02-26 12:28:09 +01:00
Maximilian Werk
66287bdaac model : add Jina Embeddings v5 Nano (partial EuroBERT) support (#19826)
* WIP: Add EuroBERT support with autoformatting changes

This commit includes:
- EuroBERT model implementation for GGUF conversion
- C++ backend support for EuroBERT architecture
- Unintended autoformatting changes to Python files

Saving before reverting formatting-only changes.

* feat: add back eos assert when not last token pooling

* feat: removed duplicated code and cleanup

* feat: removed not working architectures and unnecessary check

* fix: typo

* fix: dynamic pooling config

* feat: added an example model for eurobert

* feat: proper llama-vocab implementation for jina-v5

* fix: removed unnecessary comments
2026-02-26 12:14:09 +01:00
Georgi Gerganov
1ca3d1de15 gguf : avoid too many file size calls (#19919) 2026-02-26 12:46:32 +02:00
yggdrasil75
bd72300591 server : fix typo in server README.md (#19900)
fix typo
2026-02-26 11:26:16 +01:00
Neo Zhang
2943210c1e support permuted, remove check s0/s10 (#19889)
Co-authored-by: Neo Zhang Jianyu <jianyu.zhang@intel.com>
2026-02-26 10:27:20 +08:00
Jeff Bolz
3769fe6eb7 vulkan: check for memory overlap before doing fusion (#19768)
* vulkan: check for memory overlap before doing fusion

* Update ggml/src/ggml-vulkan/ggml-vulkan.cpp

* address feedback
2026-02-25 18:25:38 +01:00
ddh0
832aa94762 common : add more aliases for sampler CLI params (#19797)
* common : add more aliases for sampler CLI params
2026-02-25 16:34:25 +01:00
49 changed files with 808 additions and 330 deletions

View File

@@ -1578,7 +1578,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
{"--temp"}, "N",
{"--temp", "--temperature"}, "N",
string_format("temperature (default: %.2f)", (double)params.sampling.temp),
[](common_params & params, const std::string & value) {
params.sampling.temp = std::stof(value);
@@ -1611,7 +1611,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
{"--top-nsigma"}, "N",
{"--top-nsigma", "--top-n-sigma"}, "N",
string_format("top-n-sigma sampling (default: %.2f, -1.0 = disabled)", params.sampling.top_n_sigma),
[](common_params & params, const std::string & value) {
params.sampling.top_n_sigma = std::stof(value);
@@ -1634,7 +1634,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
{"--typical"}, "N",
{"--typical", "--typical-p"}, "N",
string_format("locally typical sampling, parameter p (default: %.2f, 1.0 = disabled)", (double)params.sampling.typ_p),
[](common_params & params, const std::string & value) {
params.sampling.typ_p = std::stof(value);

View File

@@ -721,6 +721,8 @@ value member_expression::execute_impl(context & ctx) {
int64_t arr_size = 0;
if (is_val<value_array>(object)) {
arr_size = object->as_array().size();
} else if (is_val<value_string>(object)) {
arr_size = object->as_string().length();
}
if (is_stmt<slice_expression>(this->property)) {

View File

@@ -1148,6 +1148,9 @@ class TextModel(ModelBase):
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
res = "jina-v2-de"
if chkhsh == "a023e9fdc5a11f034d3ef515b92350e56fb2af1f66c6b6811a4444ea9bf8763d":
# ref: https://huggingface.co/jinaai/jina-embeddings-v5-text-nano
res = "jina-v5-nano"
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
res = "smaug-bpe"
@@ -6125,6 +6128,32 @@ class NeoBert(BertModel):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("EuroBertModel", "JinaEmbeddingsV5Model")
class EuroBertModel(TextModel):
model_arch = gguf.MODEL_ARCH.EUROBERT
def set_vocab(self):
self.gguf_writer.add_add_bos_token(False)
self._set_vocab_gpt2()
def set_gguf_parameters(self):
super().set_gguf_parameters()
# EuroBert is bidirectional (encoder)
self.gguf_writer.add_causal_attention(False)
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self._try_set_pooling_type()
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Strip "model." prefix from tensor names
if name.startswith("model."):
name = name[6:]
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
class XLMRobertaModel(BertModel):
model_arch = gguf.MODEL_ARCH.BERT

View File

@@ -107,6 +107,7 @@ models = [
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "jina-v5-nano", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v5-text-nano", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },

View File

@@ -152,7 +152,9 @@ Commands and data are serialized using a custom binary protocol with:
- **VM-specific**: Only works in virtual machines with virtio-gpu support
- **Host dependency**: Requires properly configured host-side backend
- **Latency**: Small overhead from VM escaping for each operation
- **Shared-memory size**: with the `libkrun` hypervisor, the RAM + VRAM
addressable memory is limited to 64 GB. So the maximum GPU memory
will be `64GB - RAM`, regardless of the hardware VRAM size.
* This work is pending upstream changes in the VirglRenderer
project.

View File

@@ -11,8 +11,8 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
int ne0, int ne1, int ne2, int ne3,
int ne10, int ne11, int ne12, int ne13,
/*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13,
int s00, int s01, int s02, int s03,
int s10, int s11, int s12, int s13,
const sycl::nd_item<3> &item_ct1) {
const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
item_ct1.get_local_id(2);
@@ -44,7 +44,7 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
for (int i0 = i0s; i0 < ne0;
i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
const int i10 = i0 % ne10;
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0*s00] : 0.0f, (float)src1_row[i10*s10]);
}
}
@@ -53,8 +53,8 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t
int ne0, int ne1, int ne2, int ne3,
int ne10, int ne11, int ne12, int ne13,
/*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13,
int s00, int s01, int s02, int s03,
int s10, int s11, int s12, int s13,
const sycl::nd_item<3> &item_ct1) {
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
@@ -82,7 +82,7 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t
dst_t * dst_row = dst + i_dst;
const int i10 = i0 % ne10;
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0*s00] : 0.0f, (float)src1_row[i10*s10]);
}
@@ -95,7 +95,8 @@ struct bin_bcast_sycl {
const int64_t ne3, const size_t nb00, const size_t nb01, const size_t nb02, const size_t nb03,
const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0,
const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous,
const bool src1_is_contiguous, const bool dst_is_contiguous, queue_ptr stream) {
const bool src1_is_contiguous, const bool src0_is_permuted, const bool src1_is_permuted,
queue_ptr stream) {
int nr0 = ne10 / ne0;
int nr1 = ne11/ne1;
int nr2 = ne12/ne2;
@@ -123,7 +124,7 @@ struct bin_bcast_sycl {
cnb[3] *= cne[3];
};
if (src0_is_contiguous && src1_is_contiguous && dst_is_contiguous) {
if (src0_is_contiguous && src1_is_contiguous && !src0_is_permuted && !src1_is_permuted) {
for (int i = 0; i < 4; i++) {
if (nr[i] != 1) {
break;
@@ -164,7 +165,7 @@ struct bin_bcast_sycl {
size_t nb12 = cnb1[2];
size_t nb13 = cnb1[3];
size_t s0 = nb0 / sizeof(dst_t);
// size_t s0 = nb0 / sizeof(dst_t);
size_t s1 = nb1 / sizeof(dst_t);
size_t s2 = nb2 / sizeof(dst_t);
size_t s3 = nb3 / sizeof(dst_t);
@@ -196,9 +197,6 @@ struct bin_bcast_sycl {
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
GGML_ASSERT(s0 == 1);
GGML_ASSERT(s10 == 1);
const int block_size = 128;
int64_t hne0 = std::max(ne0/2LL, 1LL);
@@ -232,8 +230,8 @@ struct bin_bcast_sycl {
[=](sycl::nd_item<3> item_ct1) {
k_bin_bcast_unravel<bin_op>(
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
s03, s11, s12, s13, item_ct1);
ne10, ne11, ne12, ne13, s1, s2, s3, s00, s01, s02,
s03, s10, s11, s12, s13, item_ct1);
});
}
} else {
@@ -251,7 +249,7 @@ struct bin_bcast_sycl {
[=](sycl::nd_item<3> item_ct1) {
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
ne2, ne3, ne10, ne11, ne12, ne13,
s1, s2, s3, s01, s02, s03, s11, s12, s13,
s1, s2, s3, s00, s01, s02, s03, s10, s11, s12, s13,
item_ct1);
});
}
@@ -268,24 +266,27 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
op()((const float *) src0->data, (const float *) src1->data, (float *) dst->data, ne00, ne01, ne02, ne03, ne10,
ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, nb3,
ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1), main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
op()((const sycl::half *) src0->data, (const sycl::half *) src1->data, (sycl::half *) dst->data, ne00, ne01,
ne02, ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13,
nb0, nb1, nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst),
nb0, nb1, nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
op()((const sycl::half *) src0->data, (const float *) src1->data, (sycl::half *) dst->data, ne00, ne01, ne02,
ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1,
nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
main_stream);
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
op()((const int32_t *) src0->data, (const int32_t *) src1->data, (int32_t *) dst->data, ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
main_stream);
} else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
op()((const int16_t *) src0->data, (const int16_t *) src1->data, (int16_t *) dst->data, ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
main_stream);
} else {
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type),
ggml_type_name(src0->type), ggml_type_name(src1->type));

View File

@@ -7,9 +7,21 @@
#include <cstdint>
static uint32_t validate_graph_operation(size_t cgraph_size, uint32_t shmem_res_id, const char * operation) {
if (cgraph_size == 0) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Zero-size computation graph\n", operation);
return 1;
}
// place-holder: validate that the size of shmem_res_id is <= cgraph_size
// need to add another method in the Virgl->APIR callback interface
GGML_UNUSED(shmem_res_id);
return 0; // Valid
}
uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
GGML_UNUSED(enc);
static bool async_backend_initialized = false;
static bool async_backend;
@@ -34,10 +46,26 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, v
size_t cgraph_size;
apir_decode_size_t(dec, &cgraph_size);
if (validate_graph_operation(cgraph_size, shmem_res_id, __func__) != 0) {
apir_decoder_set_fatal(dec);
return 1;
}
apir_decoder secondary_dec = apir_new_decoder((const char *) shmem_data, cgraph_size);
ggml_cgraph * cgraph = apir_decode_ggml_cgraph(&secondary_dec, cgraph_size);
if (!cgraph || apir_decoder_get_fatal(&secondary_dec)) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Failed to deserialize computation graph\n", __func__);
return 1;
}
if (cgraph->n_nodes < 0 || cgraph->n_leafs < 0) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid negative node/leaf count: nodes=%d leafs=%d\n", __func__,
cgraph->n_nodes, cgraph->n_leafs);
return 1;
}
ggml_status status;
#if APIR_BACKEND_CHECK_SUPPORTS_OP == 1
for (int idx = 0; idx < cgraph->n_nodes; idx++) {
@@ -45,7 +73,8 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, v
if (dev->iface.supports_op(dev, op)) {
continue;
}
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Graph node %d (%s) not supported by the backend\n", idx, ggml_op_desc(op));
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Graph node %d (%s) not supported by the backend\n", __func__, idx,
ggml_op_desc(op));
status = GGML_STATUS_ABORTED;
apir_encode_ggml_status(enc, &status);
@@ -53,9 +82,17 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, v
return 0;
}
#endif
// Check if backend is properly initialized
if (!bck) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Backend not initialized (bck is null)\n", __func__);
return 1;
}
status = bck->iface.graph_compute(bck, cgraph);
if (async_backend) {
if (async_backend && bck->iface.synchronize) {
bck->iface.synchronize(bck);
}

View File

@@ -85,7 +85,19 @@ uint32_t backend_buffer_type_get_alloc_size(apir_encoder * enc, apir_decoder * d
const ggml_tensor * op = apir_decode_ggml_tensor_inplace(dec);
size_t value = buft->iface.get_alloc_size(buft, op);
// Check for decode error
if (op == nullptr) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Failed to decode tensor\n", __func__);
apir_decoder_set_fatal(dec);
return 1;
}
size_t value;
if (buft->iface.get_alloc_size) {
value = buft->iface.get_alloc_size(buft, op);
} else {
value = ggml_nbytes(op); // Default fallback
}
apir_encode_size_t(enc, &value);

View File

@@ -6,11 +6,26 @@
#include <cstdint>
static uint32_t validate_buffer_operation(size_t offset, size_t size, const char * operation) {
// Only check for critical integer overflow - no arbitrary size limits
if (offset > SIZE_MAX - size) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Integer overflow in offset+size: %zu + %zu\n", operation, offset, size);
return 1;
}
return 0; // Valid
}
uint32_t backend_buffer_get_base(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
GGML_UNUSED(ctx);
ggml_backend_buffer_t buffer;
buffer = apir_decode_ggml_buffer(dec);
if (!buffer || apir_decoder_get_fatal(dec)) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
return 1;
}
uintptr_t base = (uintptr_t) buffer->iface.get_base(buffer);
apir_encode_uintptr_t(enc, &base);
@@ -24,6 +39,11 @@ uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl
ggml_backend_buffer_t buffer;
buffer = apir_decode_ggml_buffer(dec);
if (!buffer || apir_decoder_get_fatal(dec)) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
return 1;
}
ggml_tensor * tensor;
// safe to remove the const qualifier here
tensor = (ggml_tensor *) (uintptr_t) apir_decode_ggml_tensor(dec);
@@ -37,6 +57,10 @@ uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl
size_t size;
apir_decode_size_t(dec, &size);
if (validate_buffer_operation(offset, size, __func__) != 0) {
return 1;
}
void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
if (!shmem_data) {
@@ -56,6 +80,11 @@ uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl
ggml_backend_buffer_t buffer;
buffer = apir_decode_ggml_buffer(dec);
if (!buffer || apir_decoder_get_fatal(dec)) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
return 1;
}
const ggml_tensor * tensor;
// safe to remove the const qualifier here
tensor = apir_decode_ggml_tensor(dec);
@@ -69,6 +98,10 @@ uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl
size_t size;
apir_decode_size_t(dec, &size);
if (validate_buffer_operation(offset, size, __func__) != 0) {
return 1;
}
void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
if (!shmem_data) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Couldn't get the shmem addr from virgl\n", __func__);
@@ -86,6 +119,11 @@ uint32_t backend_buffer_cpy_tensor(apir_encoder * enc, apir_decoder * dec, virgl
ggml_backend_buffer_t buffer;
buffer = apir_decode_ggml_buffer(dec);
if (!buffer || apir_decoder_get_fatal(dec)) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
return 1;
}
const ggml_tensor * src;
// safe to remove the const qualifier here
src = apir_decode_ggml_tensor(dec);
@@ -105,6 +143,11 @@ uint32_t backend_buffer_clear(apir_encoder * enc, apir_decoder * dec, virgl_apir
ggml_backend_buffer_t buffer;
buffer = apir_decode_ggml_buffer(dec);
if (!buffer || apir_decoder_get_fatal(dec)) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
return 1;
}
uint8_t value;
apir_decode_uint8_t(dec, &value);
@@ -120,6 +163,11 @@ uint32_t backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virg
ggml_backend_buffer_t buffer;
buffer = apir_decode_ggml_buffer(dec);
if (!buffer || apir_decoder_get_fatal(dec)) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
return 1;
}
if (!apir_untrack_backend_buffer(buffer)) {
GGML_LOG_WARN(GGML_VIRTGPU_BCK "%s: unknown buffer %p\n", __func__, (void *) buffer);
return 1;

View File

@@ -1,6 +1,6 @@
#include "backend-dispatched.h"
#include "backend-virgl-apir.h"
#include "backend-virgl-apir.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
@@ -28,19 +28,24 @@ uint32_t backend_dispatch_initialize(void * ggml_backend_reg_fct_p) {
return APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED;
}
if (!reg->iface.get_device_count(reg)) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed: no device found\n", __func__);
size_t device_count = reg->iface.get_device_count(reg);
if (!device_count) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: no device found\n", __func__);
return APIR_BACKEND_INITIALIZE_NO_DEVICE;
}
dev = reg->iface.get_device(reg, 0);
if (!dev) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed: no device received\n", __func__);
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: failed to get device\n", __func__);
return APIR_BACKEND_INITIALIZE_NO_DEVICE;
}
bck = dev->iface.init_backend(dev, NULL);
if (!bck) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed\n", __func__);
return APIR_BACKEND_INITIALIZE_BACKEND_INIT_FAILED;
}
return APIR_BACKEND_INITIALIZE_SUCCESS;
}

View File

@@ -32,64 +32,6 @@ uint32_t backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virg
/* backend */
uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
static inline const char * backend_dispatch_command_name(ApirBackendCommandType type) {
switch (type) {
/* device */
case APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT:
return "backend_device_get_device_count";
case APIR_COMMAND_TYPE_DEVICE_GET_COUNT:
return "backend_device_get_count";
case APIR_COMMAND_TYPE_DEVICE_GET_NAME:
return "backend_device_get_name";
case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION:
return "backend_device_get_description";
case APIR_COMMAND_TYPE_DEVICE_GET_TYPE:
return "backend_device_get_type";
case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY:
return "backend_device_get_memory";
case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP:
return "backend_device_supports_op";
case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE:
return "backend_device_get_buffer_type";
case APIR_COMMAND_TYPE_DEVICE_GET_PROPS:
return "backend_device_get_props";
case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR:
return "backend_device_buffer_from_ptr";
/* buffer-type */
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME:
return "backend_buffer_type_get_name";
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT:
return "backend_buffer_type_get_alignment";
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE:
return "backend_buffer_type_get_max_size";
case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST:
return "backend_buffer_type_is_host (DEPRECATED)";
case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER:
return "backend_buffer_type_alloc_buffer";
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE:
return "backend_buffer_type_get_alloc_size";
/* buffer */
case APIR_COMMAND_TYPE_BUFFER_GET_BASE:
return "backend_buffer_get_base";
case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR:
return "backend_buffer_set_tensor";
case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR:
return "backend_buffer_get_tensor";
case APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR:
return "backend_buffer_cpy_tensor";
case APIR_COMMAND_TYPE_BUFFER_CLEAR:
return "backend_buffer_clear";
case APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER:
return "backend_buffer_free_buffer";
/* backend */
case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE:
return "backend_backend_graph_compute";
default:
return "unknown";
}
}
extern "C" {
static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = {

View File

@@ -1,5 +1,6 @@
#pragma once
// clang-format off
#include <cstdint>
#include <cstddef>
@@ -10,6 +11,7 @@
#include "shared/apir_backend.h"
#include "shared/apir_cs.h"
#include "shared/apir_cs_ggml.h"
// clang-format on
#define GGML_VIRTGPU_BCK "ggml-virtgpu-backend: "

View File

@@ -19,7 +19,7 @@ struct virgl_apir_callbacks {
};
extern "C" {
ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks *virgl_cbs);
ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks * virgl_cbs);
void apir_backend_deinit(uint32_t virgl_ctx_id);
uint32_t apir_backend_dispatcher(uint32_t virgl_ctx_id,
virgl_apir_callbacks * virgl_cbs,

View File

@@ -1,6 +1,5 @@
#include "backend-dispatched.h"
#include "backend-virgl-apir.h"
#include "shared/api_remoting.h"
#include "shared/apir_backend.h"
#include "shared/apir_cs.h"
@@ -17,10 +16,10 @@
#define GGML_DEFAULT_BACKEND_REG "ggml_backend_init"
static void * backend_library_handle = NULL;
static FILE * apir_logfile = NULL;
static FILE * apir_logfile = NULL;
static void log_to_file_callback(enum ggml_log_level level, const char * text, void * user_data) {
FILE * logfile = (FILE *)user_data;
FILE * logfile = (FILE *) user_data;
fprintf(logfile, "[%d] %s", level, text);
fflush(logfile);
}
@@ -48,9 +47,9 @@ void apir_backend_deinit(uint32_t virgl_ctx_id) {
}
#define APIR_GGML_LIBRARY_PATH_KEY "ggml.library.path"
#define APIR_GGML_LIBRARY_REG_KEY "ggml.library.reg"
#define APIR_GGML_LIBRARY_REG_KEY "ggml.library.reg"
ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks *virgl_cbs) {
ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks * virgl_cbs) {
const char * dlsym_error;
const char * apir_log_to_file = getenv(APIR_LLAMA_CPP_LOG_TO_FILE_ENV);
@@ -63,15 +62,13 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct
}
}
const char * library_name = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_PATH_KEY);
const char * library_name = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_PATH_KEY);
const char * virgl_library_reg = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_REG_KEY);
const char * library_reg = virgl_library_reg ? virgl_library_reg : GGML_DEFAULT_BACKEND_REG;
const char * library_reg = virgl_library_reg ? virgl_library_reg : GGML_DEFAULT_BACKEND_REG;
if (!library_name) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
"%s: cannot open the GGML library: env var '%s' not defined\n",
__func__, APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV);
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot open the GGML library: env var '%s' not defined\n", __func__,
APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV);
return APIR_LOAD_LIBRARY_ENV_VAR_MISSING;
}
@@ -79,16 +76,14 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct
backend_library_handle = dlopen(library_name, RTLD_LAZY);
if (!backend_library_handle) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
"%s: cannot open the GGML library: %s\n", __func__, dlerror());
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot open the GGML library: %s\n", __func__, dlerror());
return APIR_LOAD_LIBRARY_CANNOT_OPEN;
}
if (!library_reg) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
"%s: cannot register the GGML library: env var '%s' not defined\n",
__func__, APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV);
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot register the GGML library: env var '%s' not defined\n", __func__,
APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV);
return APIR_LOAD_LIBRARY_ENV_VAR_MISSING;
}
@@ -96,11 +91,9 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct
void * ggml_backend_reg_fct = dlsym(backend_library_handle, library_reg);
dlsym_error = dlerror();
if (dlsym_error) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
"%s: cannot find the GGML backend registration symbol '%s' (from %s): %s\n",
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot find the GGML backend registration symbol '%s' (from %s): %s\n",
__func__, library_reg, APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV, dlsym_error);
return APIR_LOAD_LIBRARY_SYMBOL_MISSING;
}
@@ -132,13 +125,12 @@ uint32_t apir_backend_dispatcher(uint32_t virgl_ctx_id,
virgl_apir_context ctx = {
.ctx_id = virgl_ctx_id,
.iface = virgl_cbs,
.iface = virgl_cbs,
};
if (cmd_type >= APIR_BACKEND_DISPATCH_TABLE_COUNT) {
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
"%s: Received an invalid dispatch index (%d >= %d)\n",
__func__, cmd_type, APIR_BACKEND_DISPATCH_TABLE_COUNT);
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Received an invalid dispatch index (%d >= %d)\n", __func__, cmd_type,
APIR_BACKEND_DISPATCH_TABLE_COUNT);
return APIR_BACKEND_FORWARD_INDEX_INVALID;
}

View File

@@ -16,28 +16,32 @@ enum ApirCommandType {
APIR_COMMAND_TYPE_LOADLIBRARY = 1,
APIR_COMMAND_TYPE_FORWARD = 2,
APIR_COMMAND_TYPE_LENGTH = 3,
APIR_COMMAND_TYPE_LENGTH = 3,
};
typedef uint64_t ApirCommandFlags;
enum ApirLoadLibraryReturnCode {
APIR_LOAD_LIBRARY_SUCCESS = 0,
// these error codes are returned by the Virglrenderer APIR component
APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR = 1,
APIR_LOAD_LIBRARY_ALREADY_LOADED = 2,
APIR_LOAD_LIBRARY_ENV_VAR_MISSING = 3,
APIR_LOAD_LIBRARY_CANNOT_OPEN = 4,
APIR_LOAD_LIBRARY_SYMBOL_MISSING = 5,
APIR_LOAD_LIBRARY_INIT_BASE_INDEX = 6, // anything above this is a APIR backend library initialization return code
// any value greater than this is an APIR *backend library* initialization return code
APIR_LOAD_LIBRARY_INIT_BASE_INDEX = 6,
};
enum ApirForwardReturnCode {
APIR_FORWARD_SUCCESS = 0,
APIR_FORWARD_NO_DISPATCH_FCT = 1,
APIR_FORWARD_TIMEOUT = 2,
APIR_FORWARD_BASE_INDEX = 3, // anything above this is a APIR backend library forward return code
} ;
APIR_FORWARD_SUCCESS = 0,
// these error codes are returned by the Virglrenderer APIR component
APIR_FORWARD_NO_DISPATCH_FCT = 1,
APIR_FORWARD_TIMEOUT = 2,
APIR_FORWARD_FAILED_TO_SYNC_STREAMS = 3,
// any value greater than this index an APIR *backend library* forward return code
APIR_FORWARD_BASE_INDEX = 4,
};
__attribute__((unused)) static inline const char * apir_command_name(ApirCommandType type) {
switch (type) {
@@ -82,6 +86,7 @@ __attribute__((unused)) static const char * apir_forward_error(ApirForwardReturn
APIR_FORWARD_ERROR(APIR_FORWARD_SUCCESS);
APIR_FORWARD_ERROR(APIR_FORWARD_NO_DISPATCH_FCT);
APIR_FORWARD_ERROR(APIR_FORWARD_TIMEOUT);
APIR_FORWARD_ERROR(APIR_FORWARD_FAILED_TO_SYNC_STREAMS);
APIR_FORWARD_ERROR(APIR_FORWARD_BASE_INDEX);
return "Unknown APIR_COMMAND_TYPE_FORWARD error";

View File

@@ -34,3 +34,61 @@ typedef enum ApirBackendCommandType {
// last command_type index + 1
APIR_BACKEND_DISPATCH_TABLE_COUNT = 23,
} ApirBackendCommandType;
static inline const char * apir_dispatch_command_name(ApirBackendCommandType type) {
switch (type) {
/* device */
case APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT:
return "device_get_device_count";
case APIR_COMMAND_TYPE_DEVICE_GET_COUNT:
return "device_get_count";
case APIR_COMMAND_TYPE_DEVICE_GET_NAME:
return "device_get_name";
case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION:
return "device_get_description";
case APIR_COMMAND_TYPE_DEVICE_GET_TYPE:
return "device_get_type";
case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY:
return "device_get_memory";
case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP:
return "device_supports_op";
case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE:
return "device_get_buffer_type";
case APIR_COMMAND_TYPE_DEVICE_GET_PROPS:
return "device_get_props";
case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR:
return "device_buffer_from_ptr";
/* buffer-type */
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME:
return "buffer_type_get_name";
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT:
return "buffer_type_get_alignment";
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE:
return "buffer_type_get_max_size";
case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST:
return "buffer_type_is_host";
case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER:
return "buffer_type_alloc_buffer";
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE:
return "buffer_type_get_alloc_size";
/* buffer */
case APIR_COMMAND_TYPE_BUFFER_GET_BASE:
return "buffer_get_base";
case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR:
return "buffer_set_tensor";
case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR:
return "buffer_get_tensor";
case APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR:
return "buffer_cpy_tensor";
case APIR_COMMAND_TYPE_BUFFER_CLEAR:
return "buffer_clear";
case APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER:
return "buffer_free_buffer";
/* backend */
case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE:
return "backend_graph_compute";
default:
return "unknown";
}
}

View File

@@ -14,7 +14,7 @@
#define APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED 6
#define APIR_BACKEND_INITIALIZE_ALREADY_INITED 7
#define APIR_BACKEND_INITIALIZE_NO_DEVICE 8
#define APIR_BACKEND_INITIALIZE_BACKEND_INIT_FAILED 9
// new entries here need to be added to the apir_backend_initialize_error function below
@@ -39,6 +39,10 @@ static const char * apir_backend_initialize_error(int code) {
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_FAILED);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_ALREADY_INITED);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_NO_DEVICE);
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_INIT_FAILED);
return "Unknown APIR_BACKEND_INITIALIZE error:/";

View File

@@ -13,7 +13,6 @@ struct apir_encoder {
const char * start;
const char * end;
bool fatal;
};
struct apir_decoder {
@@ -28,8 +27,8 @@ struct apir_decoder {
static apir_decoder apir_new_decoder(const char * ptr, size_t size) {
apir_decoder dec = {
.cur = ptr,
.end = ptr + size,
.cur = ptr,
.end = ptr + size,
.fatal = false,
};
@@ -79,10 +78,7 @@ static inline bool apir_decoder_get_fatal(const apir_decoder * dec) {
* encode peek
*/
static inline bool apir_decoder_peek_internal(apir_decoder * dec,
size_t size,
void * val,
size_t val_size) {
static inline bool apir_decoder_peek_internal(apir_decoder * dec, size_t size, void * val, size_t val_size) {
assert(val_size <= size);
if (unlikely(size > (size_t) (dec->end - dec->cur))) {
@@ -332,8 +328,7 @@ static inline void apir_decode_char_array(apir_decoder * dec, char * val, size_t
static inline void * apir_decoder_alloc_array(size_t size, size_t count) {
size_t alloc_size;
if (unlikely(__builtin_mul_overflow(size, count, &alloc_size))) {
GGML_LOG_ERROR("%s: overflow in array allocation of %zu * %zu bytes\n",
__func__, size, count);
GGML_LOG_ERROR("%s: overflow in array allocation of %zu * %zu bytes\n", __func__, size, count);
return NULL;
}
@@ -352,20 +347,19 @@ static inline void apir_decode_bool_t(apir_decoder * dec, bool * val) {
/* apir_buffer_type_host_handle_t */
static inline void apir_encode_apir_buffer_type_host_handle_t(apir_encoder * enc,
static inline void apir_encode_apir_buffer_type_host_handle_t(apir_encoder * enc,
const apir_buffer_type_host_handle_t * val) {
apir_encode(enc, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t));
}
static inline void apir_decode_apir_buffer_type_host_handle_t(apir_decoder * dec,
static inline void apir_decode_apir_buffer_type_host_handle_t(apir_decoder * dec,
apir_buffer_type_host_handle_t * val) {
apir_decode(dec, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t));
}
/* apir_buffer_host_handle_t */
static inline void apir_encode_apir_buffer_host_handle_t(apir_encoder * enc,
const apir_buffer_host_handle_t * val) {
static inline void apir_encode_apir_buffer_host_handle_t(apir_encoder * enc, const apir_buffer_host_handle_t * val) {
apir_encode(enc, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t));
}

View File

@@ -1,11 +1,10 @@
#include "ggml-impl.h"
#include "apir_cs.h"
#include "apir_cs_rpc.h"
#include "ggml-impl.h"
// ggml_buffer_to_apir_host_handle(ggml_backend_buffer_t buffer);
static inline void apir_encode_ggml_buffer_host_handle(apir_encoder * enc,
const apir_buffer_host_handle_t * handle);
static inline void apir_encode_ggml_buffer_host_handle(apir_encoder * enc, const apir_buffer_host_handle_t * handle);
static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec);
@@ -22,8 +21,7 @@ static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_inplace(apir_decoder
return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size);
}
static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_array_inplace(apir_decoder * dec,
uint32_t n_tensors) {
static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_array_inplace(apir_decoder * dec, uint32_t n_tensors) {
size_t apir_rpc_tensor_size = sizeof(apir_rpc_tensor) * n_tensors;
return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size);
@@ -45,9 +43,9 @@ static inline const ggml_tensor * apir_decode_ggml_tensor(apir_decoder * dec) {
}
ggml_init_params params{
/*.mem_size =*/ ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
/*.mem_size =*/ggml_tensor_overhead(),
/*.mem_buffer =*/NULL,
/*.no_alloc =*/true,
};
ggml_context * ctx = ggml_init(params);
@@ -105,6 +103,19 @@ static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec)
apir_decoder_read(dec, buffer_ptr_size, &buffer, buffer_ptr_size);
// SECURITY: Validate buffer handle against tracked buffers to prevent
// guest VM from providing arbitrary host memory addresses
if (buffer) {
extern std::unordered_set<ggml_backend_buffer_t> backend_buffers;
if (backend_buffers.find(buffer) == backend_buffers.end()) {
GGML_LOG_WARN("ggml-virtgpu-backend: %s: Invalid buffer handle from guest: %p\n", __func__,
(void *) buffer);
// Set fatal flag to prevent further processing with invalid handle
apir_decoder_set_fatal(dec);
return NULL;
}
}
return buffer;
}

View File

@@ -1,3 +1,6 @@
#pragma once
// clang-format off
#include "ggml.h"
#include "ggml-backend-impl.h"
@@ -5,6 +8,7 @@
#include <unordered_set>
#include <vector>
#include <cstdint>
// clang-format on
// ggml_tensor is serialized into apir_rpc_tensor
struct apir_rpc_tensor {

View File

@@ -34,6 +34,7 @@ static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml
static const char * ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
virtgpu * gpu = BUFT_TO_GPU(buft);
// Return the prefixed name that was built once during initialization
return gpu->cached_buffer_type.name;
}
@@ -53,9 +54,8 @@ static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buff
const ggml_tensor * tensor) {
virtgpu * gpu = BUFT_TO_GPU(buft);
if (tensor->buffer == NULL
|| !tensor->buffer->context
|| !buft->device->iface.supports_buft(buft->device, tensor->buffer->buft)) {
if (tensor->buffer == NULL || !tensor->buffer->context ||
!buft->device->iface.supports_buft(buft->device, tensor->buffer->buft)) {
return ggml_nbytes(tensor);
}

View File

@@ -3,6 +3,7 @@
static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) {
virtgpu * gpu = DEV_TO_GPU(dev);
// Return the prefixed name that was built once during initialization
return gpu->cached_device_info.name;
}
@@ -22,7 +23,7 @@ static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_bac
static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
virtgpu * gpu = DEV_TO_GPU(dev);
*free = gpu->cached_device_info.memory_free;
*free = gpu->cached_device_info.memory_free;
*total = gpu->cached_device_info.memory_total;
}
@@ -72,7 +73,7 @@ static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, ggml_
ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) {
virtgpu * gpu = DEV_TO_GPU(dev);
static std::atomic<bool> initialized = false;
static std::atomic<bool> initialized = false;
static ggml_backend_buffer_type buft;
if (!initialized) {
@@ -95,7 +96,7 @@ ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_bac
static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) {
virtgpu * gpu = DEV_TO_GPU(dev);
static std::atomic<bool> initialized = false;
static std::atomic<bool> initialized = false;
static ggml_backend_buffer_type buft;
if (!initialized) {

View File

@@ -7,8 +7,8 @@
void ggml_virtgpu_cleanup(virtgpu * gpu);
static virtgpu * apir_initialize() {
static virtgpu * gpu = NULL;
static std::atomic<bool> initialized = false;
static virtgpu * gpu = NULL;
static std::atomic<bool> initialized = false;
if (initialized) {
// fast track
@@ -31,29 +31,53 @@ static virtgpu * apir_initialize() {
}
// Pre-fetch and cache all device information, it will not change
gpu->cached_device_info.description = apir_device_get_description(gpu);
gpu->cached_device_info.description = apir_device_get_description(gpu);
if (!gpu->cached_device_info.description) {
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu device description", __func__);
}
gpu->cached_device_info.name = apir_device_get_name(gpu);
if (!gpu->cached_device_info.name) {
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu device name", __func__);
}
gpu->cached_device_info.device_count = apir_device_get_count(gpu);
gpu->cached_device_info.type = apir_device_get_type(gpu);
apir_device_get_memory(gpu,
&gpu->cached_device_info.memory_free,
&gpu->cached_device_info.memory_total);
{
// Get the remote name and create prefixed version
char * rmt_device_name = apir_device_get_name(gpu);
if (!rmt_device_name) {
GGML_ABORT(GGML_VIRTGPU "%s: failed to get the virtgpu device name", __func__);
}
size_t device_name_len = strlen(rmt_device_name) + 11; // "[virtgpu] " + null terminator
gpu->cached_device_info.name = (char *) malloc(device_name_len);
if (!gpu->cached_device_info.name) {
free(rmt_device_name);
GGML_ABORT(GGML_VIRTGPU "%s: failed to allocate memory for prefixed device name", __func__);
}
snprintf(gpu->cached_device_info.name, device_name_len, "[virtgpu] %s", rmt_device_name);
free(rmt_device_name);
}
apir_device_get_memory(gpu, &gpu->cached_device_info.memory_free, &gpu->cached_device_info.memory_total);
apir_buffer_type_host_handle_t buft_host_handle = apir_device_get_buffer_type(gpu);
gpu->cached_buffer_type.host_handle = buft_host_handle;
gpu->cached_buffer_type.name = apir_buffer_type_get_name(gpu, buft_host_handle);
if (!gpu->cached_buffer_type.name) {
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu buffer type name", __func__);
{
// Get the remote name and create prefixed version
char * rmt_name = apir_buffer_type_get_name(gpu, buft_host_handle);
if (!rmt_name) {
GGML_ABORT(GGML_VIRTGPU "%s: failed to get the virtgpu buffer type name", __func__);
}
size_t prefixed_len = strlen(rmt_name) + 11; // "[virtgpu] " + null terminator
gpu->cached_buffer_type.name = (char *) malloc(prefixed_len);
if (!gpu->cached_buffer_type.name) {
free(rmt_name);
GGML_ABORT(GGML_VIRTGPU "%s: failed to allocate memory for prefixed buffer type name", __func__);
}
snprintf(gpu->cached_buffer_type.name, prefixed_len, "[virtgpu] %s", rmt_name);
free(rmt_name);
}
gpu->cached_buffer_type.alignment = apir_buffer_type_get_alignment(gpu, buft_host_handle);
gpu->cached_buffer_type.max_size = apir_buffer_type_get_max_size(gpu, buft_host_handle);
gpu->cached_buffer_type.alignment = apir_buffer_type_get_alignment(gpu, buft_host_handle);
gpu->cached_buffer_type.max_size = apir_buffer_type_get_max_size(gpu, buft_host_handle);
initialized = true;
}
@@ -98,7 +122,7 @@ static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) {
static std::atomic<bool> initialized = false;
if (initialized) {
return; // fast track
return; // fast track
}
{

View File

@@ -1,5 +1,5 @@
#include "ggml-remoting.h"
#include "../../include/ggml-virtgpu.h"
#include "ggml-remoting.h"
static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) {
UNUSED(backend);

View File

@@ -9,7 +9,7 @@
#include <string>
#define GGML_VIRTGPU_NAME "ggml-virtgpu"
#define GGML_VIRTGPU "ggml-virtgpu: "
#define GGML_VIRTGPU "ggml-virtgpu: "
// USE_ALWAYS_TRUE_SUPPORTS_OP: 1 is fast, 0 avoid micro-benchmark crashes

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
struct virgl_renderer_capset_apir {
uint32_t apir_version;
uint32_t supports_blob_resources;
uint32_t reserved[4]; // For future expansion
uint32_t apir_version;
uint32_t supports_blob_resources;
uint32_t reserved[4]; // For future expansion
};

View File

@@ -145,8 +145,31 @@ class RemotingCodebaseGenerator:
enum_lines.append(f" APIR_BACKEND_DISPATCH_TABLE_COUNT = {total_count},")
enum_lines.append("} ApirBackendCommandType;")
# Generate function name mapping
func_lines = []
func_lines.append("static inline const char * apir_dispatch_command_name(ApirBackendCommandType type) {")
func_lines.append(" switch (type) {")
current_group = None
for func in functions:
# Add comment for new group
if func['group_name'] != current_group:
func_lines.append(f" /* {func['group_description']} */")
current_group = func['group_name']
# Generate clean function name without backend_ prefix
clean_name = f"{func['group_name']}_{func['function_name']}"
func_lines.append(f" case {func['enum_name']}:")
func_lines.append(f" return \"{clean_name}\";")
func_lines.append("")
func_lines.append(" default:")
func_lines.append(" return \"unknown\";")
func_lines.append(" }")
func_lines.append("}")
# Full header template
header_content = NL.join(enum_lines) + "\n"
header_content = NL.join(enum_lines) + "\n\n" + NL.join(func_lines) + "\n"
return header_content
@@ -170,19 +193,6 @@ class RemotingCodebaseGenerator:
decl_lines.append(f"{signature} {func['backend_function']}({params});")
# Switch cases
switch_lines = []
current_group = None
for func in functions:
if func['group_name'] != current_group:
switch_lines.append(f" /* {func['group_description']} */")
current_group = func['group_name']
deprecated = " (DEPRECATED)" if func['deprecated'] else ""
switch_lines.append(f" case {func['enum_name']}: return \"{func['backend_function']}{deprecated}\";")
# Dispatch table
table_lines = []
current_group = None
@@ -201,15 +211,6 @@ class RemotingCodebaseGenerator:
{NL.join(decl_lines)}
static inline const char *backend_dispatch_command_name(ApirBackendCommandType type)
{{
switch (type) {{
{NL.join(switch_lines)}
default: return "unknown";
}}
}}
extern "C" {{
static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = {{
{NL.join(table_lines)}

View File

@@ -17,8 +17,8 @@ ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) {
size_t cgraph_size = apir_serialize_ggml_cgraph(cgraph, cgraph_data);
virtgpu_shmem temp_shmem; // Local storage for large buffers
virtgpu_shmem * shmem = &temp_shmem;
bool using_shared_shmem = false;
virtgpu_shmem * shmem = &temp_shmem;
bool using_shared_shmem = false;
if (cgraph_size <= gpu->data_shmem.mmap_size) {
// Lock mutex before using shared data_shmem buffer
@@ -26,7 +26,7 @@ ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) {
GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__);
}
using_shared_shmem = true;
shmem = &gpu->data_shmem;
shmem = &gpu->data_shmem;
} else if (virtgpu_shmem_create(gpu, cgraph_size, shmem)) {
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__);
}

View File

@@ -62,7 +62,9 @@ size_t apir_buffer_type_get_max_size(virtgpu * gpu, apir_buffer_type_host_handle
return max_size;
}
apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, size_t size) {
apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu,
apir_buffer_type_host_handle_t host_handle,
size_t size) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;
@@ -84,7 +86,9 @@ apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, apir_buffer_t
return buffer_context;
}
size_t apir_buffer_type_get_alloc_size(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, const ggml_tensor * op) {
size_t apir_buffer_type_get_alloc_size(virtgpu * gpu,
apir_buffer_type_host_handle_t host_handle,
const ggml_tensor * op) {
apir_encoder * encoder;
apir_decoder * decoder;
ApirForwardReturnCode ret;

View File

@@ -35,8 +35,8 @@ void apir_buffer_set_tensor(virtgpu * gpu,
apir_encode_ggml_tensor(encoder, tensor);
virtgpu_shmem temp_shmem; // Local storage for large buffers
virtgpu_shmem * shmem = &temp_shmem;
bool using_shared_shmem = false;
virtgpu_shmem * shmem = &temp_shmem;
bool using_shared_shmem = false;
if (size <= gpu->data_shmem.mmap_size) {
// Lock mutex before using shared data_shmem buffer
@@ -44,7 +44,7 @@ void apir_buffer_set_tensor(virtgpu * gpu,
GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__);
}
using_shared_shmem = true;
shmem = &gpu->data_shmem;
shmem = &gpu->data_shmem;
} else if (virtgpu_shmem_create(gpu, size, shmem)) {
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__);
@@ -86,8 +86,8 @@ void apir_buffer_get_tensor(virtgpu * gpu,
apir_encode_ggml_tensor(encoder, tensor);
virtgpu_shmem temp_shmem; // Local storage for large buffers
virtgpu_shmem * shmem = &temp_shmem;
bool using_shared_shmem = false;
virtgpu_shmem * shmem = &temp_shmem;
bool using_shared_shmem = false;
if (size <= gpu->data_shmem.mmap_size) {
// Lock mutex before using shared data_shmem buffer
@@ -95,7 +95,7 @@ void apir_buffer_get_tensor(virtgpu * gpu,
GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__);
}
using_shared_shmem = true;
shmem = &gpu->data_shmem;
shmem = &gpu->data_shmem;
} else if (virtgpu_shmem_create(gpu, size, shmem)) {
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__);

View File

@@ -26,7 +26,7 @@ char * apir_device_get_name(virtgpu * gpu) {
REMOTE_CALL(gpu, encoder, decoder, ret);
const size_t string_size = apir_decode_array_size_unchecked(decoder);
char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
if (!string) {
GGML_LOG_ERROR(GGML_VIRTGPU "%s: Could not allocate the device name buffer\n", __func__);
return NULL;
@@ -173,7 +173,7 @@ apir_buffer_context_t apir_device_buffer_from_ptr(virtgpu * gpu, size_t size, si
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR);
if (virtgpu_shmem_create(gpu, size, &buffer_context.shmem)) {
GGML_ABORT(GGML_VIRTGPU "Couldn't allocate the guest-host shared buffer");
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate %ldb of guest-host shared buffer", __func__, size);
}
apir_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem.res_id);

View File

@@ -1,29 +1,36 @@
#include "virtgpu.h"
#pragma once
// clang-format off
#include "virtgpu.h"
#include "ggml-remoting.h"
#include "backend/shared/apir_backend.h"
#include "backend/shared/apir_cs_ggml.h"
#include "ggml-backend-impl.h"
// clang-format on
#define REMOTE_CALL_PREPARE(gpu_dev_name, encoder_name, apir_command_type__) \
do { \
int32_t forward_flag = (int32_t) apir_command_type__; \
encoder_name = remote_call_prepare(gpu_dev_name, APIR_COMMAND_TYPE_FORWARD, forward_flag); \
if (!encoder_name) { \
GGML_ABORT(GGML_VIRTGPU "%s: failed to prepare the remote call encoder", __func__); \
} \
#define REMOTE_CALL_PREPARE(gpu_dev_name, encoder_name, apir_command_type__) \
int32_t REMOTE_CALL_PREPARE_forward_flag = (int32_t) apir_command_type__; \
const char * REMOTE_CALL_PREPARE_command_name = apir_dispatch_command_name(apir_command_type__); \
do { \
encoder_name = remote_call_prepare(gpu_dev_name, APIR_COMMAND_TYPE_FORWARD, REMOTE_CALL_PREPARE_forward_flag); \
if (!encoder_name) { \
GGML_ABORT(GGML_VIRTGPU "%s: failed to prepare the remote call encoder", __func__); \
} \
} while (0)
#define REMOTE_CALL(gpu_dev_name, encoder_name, decoder_name, ret_name) \
do { \
ret_name = (ApirForwardReturnCode) remote_call(gpu_dev_name, encoder_name, &decoder_name, 0, NULL); \
if (!decoder_name) { \
GGML_ABORT(GGML_VIRTGPU "%s: failed to kick the remote call", __func__); \
} \
if (ret_name < APIR_FORWARD_BASE_INDEX) { \
GGML_ABORT(GGML_VIRTGPU "%s: failed to forward the API call: %s: code %d", __func__, \
apir_forward_error(ret_name), ret_name); \
} \
ret_name = (ApirForwardReturnCode) (ret_name - APIR_FORWARD_BASE_INDEX); \
#define REMOTE_CALL(gpu_dev_name, encoder_name, decoder_name, ret_name) \
do { \
ret_name = (ApirForwardReturnCode) remote_call(gpu_dev_name, encoder_name, &decoder_name, 0, NULL); \
if (!decoder_name) { \
GGML_ABORT(GGML_VIRTGPU "%s: failed to kick the remote call", __func__); \
} \
if (ret_name < APIR_FORWARD_BASE_INDEX) { \
GGML_ABORT(GGML_VIRTGPU "%s: failed to forward the API call: %s: code %d", __func__, \
apir_forward_error(ret_name), ret_name); \
} \
ret_name = (ApirForwardReturnCode) (ret_name - APIR_FORWARD_BASE_INDEX); \
if (ret_name != 0) { \
GGML_ABORT(GGML_VIRTGPU "backend function '%s' failed (return code: %d)", \
REMOTE_CALL_PREPARE_command_name, ret_name); \
} \
} while (0)

View File

@@ -20,6 +20,7 @@ apir_buffer_context_t apir_device_buffer_from_ptr(struct virtgpu * gpu,
char * apir_buffer_type_get_name(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle);
size_t apir_buffer_type_get_alignment(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle);
size_t apir_buffer_type_get_max_size(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle);
/* apir_buffer_type_is_host is deprecated. */
apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu * gpu,
apir_buffer_type_host_handle_t host_handle,
size_t size);

View File

@@ -53,9 +53,9 @@ static int virtgpu_handshake(virtgpu * gpu) {
if (!decoder) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to initiate the communication with the virglrenderer library. "
"Most likely, the wrong virglrenderer library was loaded in the hypervisor.",
__func__);
"%s: failed to initiate the communication with the virglrenderer library. "
"Most likely, the wrong virglrenderer library was loaded in the hypervisor.",
__func__);
return 1;
}
@@ -65,8 +65,7 @@ static int virtgpu_handshake(virtgpu * gpu) {
uint32_t host_minor;
if (ret_magic != APIR_HANDSHAKE_MAGIC) {
GGML_ABORT(GGML_VIRTGPU
"%s: handshake with the virglrenderer failed (code=%d | %s)", __func__, ret_magic,
GGML_ABORT(GGML_VIRTGPU "%s: handshake with the virglrenderer failed (code=%d | %s)", __func__, ret_magic,
apir_backend_initialize_error(ret_magic));
} else {
apir_decode_uint32_t(decoder, &host_major);
@@ -140,15 +139,13 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) {
"Make sure virglrenderer is correctly configured by the hypervisor. (%s) ",
__func__, apir_load_library_error(ret));
} else {
GGML_ABORT(GGML_VIRTGPU
"%s: virglrenderer could not load the API Remoting backend library. (%s - code %d)", __func__,
apir_load_library_error(ret), ret);
GGML_ABORT(GGML_VIRTGPU "%s: virglrenderer could not load the API Remoting backend library. (%s - code %d)",
__func__, apir_load_library_error(ret), ret);
}
return ret;
}
GGML_LOG_INFO(GGML_VIRTGPU
"%s: virglrenderer successfully loaded the API Remoting backend library.\n", __func__);
GGML_LOG_INFO(GGML_VIRTGPU "%s: virglrenderer successfully loaded the API Remoting backend library.\n", __func__);
ApirLoadLibraryReturnCode apir_ret = (ApirLoadLibraryReturnCode) (ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX);
@@ -158,10 +155,11 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) {
"Make sure virglrenderer is correctly configured by the hypervisor. (%s)",
__func__, apir_load_library_error(apir_ret));
} else if (apir_ret == APIR_LOAD_LIBRARY_SYMBOL_MISSING) {
GGML_ABORT(GGML_VIRTGPU
"%s: the API Remoting backend library couldn't load the GGML backend library, some symbols are missing. "
"Make sure virglrenderer is correctly configured by the hypervisor. (%s)",
__func__, apir_load_library_error(apir_ret));
GGML_ABORT(
GGML_VIRTGPU
"%s: the API Remoting backend library couldn't load the GGML backend library, some symbols are missing. "
"Make sure virglrenderer is correctly configured by the hypervisor. (%s)",
__func__, apir_load_library_error(apir_ret));
} else if (apir_ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) {
GGML_ABORT(GGML_VIRTGPU
"%s: the API Remoting backend library couldn't load the GGML backend library: apir code=%d | %s)",
@@ -169,8 +167,8 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) {
} else {
uint32_t lib_ret = apir_ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX;
GGML_ABORT(GGML_VIRTGPU
"%s: the API Remoting backend library initialize its backend library: apir code=%d)", __func__,
lib_ret);
"%s: the API Remoting backend library failed to initialize its backend library: apir code=%d)",
__func__, lib_ret);
}
return ret;
}
@@ -184,55 +182,49 @@ virtgpu * create_virtgpu() {
// Initialize mutex to protect shared data_shmem buffer
if (mtx_init(&gpu->data_shmem_mutex, mtx_plain) != thrd_success) {
delete gpu;
GGML_ABORT(GGML_VIRTGPU
"%s: failed to initialize data_shmem mutex", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize data_shmem mutex", __func__);
return NULL;
}
if (virtgpu_open(gpu) != APIR_SUCCESS) {
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: failed to open the virtgpu device\n", __func__);
GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to open the virtgpu device\n", __func__);
return NULL;
}
if (virtgpu_init_capset(gpu) != APIR_SUCCESS) {
if (gpu->use_apir_capset) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to initialize the virtgpu APIR capset. Make sure that the virglrenderer library supports it.", __func__);
"%s: failed to initialize the virtgpu APIR capset. Make sure that the virglrenderer library "
"supports it.",
__func__);
} else {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to initialize the virtgpu Venus capset", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu Venus capset", __func__);
}
return NULL;
}
if (virtgpu_init_context(gpu) != APIR_SUCCESS) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to initialize the GPU context", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the GPU context", __func__);
return NULL;
}
if (virtgpu_shmem_create(gpu, SHMEM_REPLY_SIZE, &gpu->reply_shmem)) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to create the shared reply memory pages", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to create the shared reply memory pages", __func__);
return NULL;
}
if (virtgpu_shmem_create(gpu, SHMEM_DATA_SIZE, &gpu->data_shmem)) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to create the shared data memory pages", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to create the shared data memory pages", __func__);
return NULL;
}
if (virtgpu_handshake(gpu)) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to handshake with the virglrenderer library", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to handshake with the virglrenderer library", __func__);
return NULL;
}
if (virtgpu_load_library(gpu) != APIR_LOAD_LIBRARY_SUCCESS) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to load the backend library", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to load the backend library", __func__);
return NULL;
}
@@ -243,8 +235,7 @@ static virt_gpu_result_t virtgpu_open(virtgpu * gpu) {
drmDevicePtr devs[8];
int count = drmGetDevices2(0, devs, ARRAY_SIZE(devs));
if (count < 0) {
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: failed to enumerate DRM devices\n", __func__);
GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to enumerate DRM devices\n", __func__);
return APIR_ERROR_INITIALIZATION_FAILED;
}
@@ -266,19 +257,17 @@ static virt_gpu_result_t virtgpu_open_device(virtgpu * gpu, const drmDevicePtr d
int fd = open(node_path, O_RDWR | O_CLOEXEC);
if (fd < 0) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to open %s", __func__, node_path);
GGML_ABORT(GGML_VIRTGPU "%s: failed to open %s", __func__, node_path);
return APIR_ERROR_INITIALIZATION_FAILED;
}
drmVersionPtr version = drmGetVersion(fd);
if (!version || strcmp(version->name, "virtio_gpu") || version->version_major != 0) {
if (version) {
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: unknown DRM driver %s version %d\n", __func__, version->name, version->version_major);
GGML_LOG_ERROR(GGML_VIRTGPU "%s: unknown DRM driver %s version %d\n", __func__, version->name,
version->version_major);
} else {
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: failed to get DRM driver version\n", __func__);
GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to get DRM driver version\n", __func__);
}
if (version) {
@@ -322,9 +311,8 @@ static virt_gpu_result_t virtgpu_init_capset(virtgpu * gpu) {
virtgpu_ioctl_get_caps(gpu, gpu->capset.id, gpu->capset.version, &gpu->capset.data, sizeof(gpu->capset.data));
if (ret) {
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: failed to get APIR v%d capset: %s\n",
__func__, gpu->capset.version, strerror(errno));
GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to get APIR v%d capset: %s\n", __func__, gpu->capset.version,
strerror(errno));
return APIR_ERROR_INITIALIZATION_FAILED;
}
@@ -547,13 +535,10 @@ static void log_call_duration(long long call_duration_ns, const char * name) {
double call_duration_s = (double) call_duration_ns / 1e9; // 1 second = 1e9 nanoseconds
if (call_duration_s > 1) {
GGML_LOG_INFO(GGML_VIRTGPU
"waited %.2fs for the %s host reply...\n", call_duration_s, name);
GGML_LOG_INFO(GGML_VIRTGPU "waited %.2fs for the %s host reply...\n", call_duration_s, name);
} else if (call_duration_ms > 1) {
GGML_LOG_INFO(GGML_VIRTGPU
"waited %.2fms for the %s host reply...\n", call_duration_ms, name);
GGML_LOG_INFO(GGML_VIRTGPU "waited %.2fms for the %s host reply...\n", call_duration_ms, name);
} else {
GGML_LOG_INFO(GGML_VIRTGPU
"waited %lldns for the %s host reply...\n", call_duration_ns, name);
GGML_LOG_INFO(GGML_VIRTGPU "waited %lldns for the %s host reply...\n", call_duration_ns, name);
}
}

View File

@@ -1,5 +1,6 @@
#pragma once
// clang-format off
#include "virtgpu-utils.h"
#include "virtgpu-shm.h"
#include "virtgpu-apir.h"
@@ -23,20 +24,21 @@
#include "apir_hw.h"
#include <drm/virtgpu_drm.h>
#include "venus_hw.h"
// clang-format on
#ifndef VIRTGPU_DRM_CAPSET_APIR
// Will be defined include/drm/virtgpu_drm.h when
// https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590/diffs
// is merged
#define VIRTGPU_DRM_CAPSET_APIR 10
# define VIRTGPU_DRM_CAPSET_APIR 10
#endif
// Mesa/Virlgrenderer Venus internal. Only necessary during the
// Venus->APIR transition in Virglrenderer
#define VENUS_COMMAND_TYPE_LENGTH 331
#ifndef VIRTGPU_DRM_CAPSET_VENUS // only available with Linux >= v6.16
#define VIRTGPU_DRM_CAPSET_VENUS 4
#ifndef VIRTGPU_DRM_CAPSET_VENUS // only available with Linux >= v6.16
# define VIRTGPU_DRM_CAPSET_VENUS 4
#endif
typedef uint32_t virgl_renderer_capset;

View File

@@ -13820,12 +13820,11 @@ static bool ggml_vk_can_fuse_rope_set_rows(ggml_backend_vk_context * ctx, const
return true;
}
// Check whether the tensors overlap in memory but are not equal.
// Fusions can potenitally overwrite src tensors in ways that are not prevented
// by ggml-alloc. If the fusion is entirely elementwise, then it's OK for them
// to overlap if they are exactly equal.
// XXX TODO this check is probably missing from several fusion optimizations.
static bool ggml_vk_tensors_overlap_but_not_equal(const ggml_tensor * a, const ggml_tensor * b) {
// Check whether the tensors overlap in memory.
// Fusions can potentially overwrite src tensors in ways that are not prevented
// by ggml-alloc. If the fusion src is being applied in a way that's elementwise
// with the destination, then it's OK for them to overlap if they are exactly equal.
static bool ggml_vk_tensors_overlap(const ggml_tensor * a, const ggml_tensor * b, bool elementwise) {
ggml_backend_vk_buffer_context * a_buf_ctx = (ggml_backend_vk_buffer_context *)a->buffer->context;
vk_buffer a_buf = a_buf_ctx->dev_buffer;
ggml_backend_vk_buffer_context * b_buf_ctx = (ggml_backend_vk_buffer_context *)b->buffer->context;
@@ -13836,7 +13835,7 @@ static bool ggml_vk_tensors_overlap_but_not_equal(const ggml_tensor * a, const g
auto b_base = vk_tensor_offset(b) + b->view_offs;
auto b_size = ggml_nbytes(b);
if (a_base == b_base && a_size == b_size) {
if (elementwise && a_base == b_base && a_size == b_size) {
return false;
}
@@ -13874,13 +13873,6 @@ static bool ggml_vk_can_fuse_rms_norm_mul_rope(ggml_backend_vk_context * ctx, co
return false;
}
// must not overwrite srcs in a way that's not elementwise
ggml_tensor *other_src = mul->src[0] == rms ? mul->src[1] : mul->src[0];
if (ggml_vk_tensors_overlap_but_not_equal(rms->src[0], rope) ||
ggml_vk_tensors_overlap_but_not_equal(other_src, rope)) {
return false;
}
// conditions for pipeline creation
if (!(ctx->device->float_controls_rte_fp16 &&
sizeof(vk_op_rms_norm_mul_rope_push_constants) <= ctx->device->properties.limits.maxPushConstantsSize)) {
@@ -13942,6 +13934,18 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
return num_adds;
}
static int32_t find_first_set(uint32_t x) {
int32_t ret = 0;
if (!x) {
return -1;
}
while (!(x & 1)) {
x >>= 1;
ret++;
}
return ret;
}
static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
@@ -14040,6 +14044,12 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
total_mul_mat_bytes += bytes;
}
// op_srcs_fused_elementwise indicates whether an op's srcs all contribute to
// the fused result in an elementwise-way. This affects whether the memory for
// the src is allowed to overlap the memory for the destination.
// The array is sized to handle the largest fusion (asserted later).
bool op_srcs_fused_elementwise[12];
ctx->fused_topk_moe_mode = TOPK_MOE_COUNT;
ctx->fused_topk_moe_scale = false;
const char *fusion_string {};
@@ -14048,39 +14058,68 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
if (num_adds) {
ctx->num_additional_fused_ops = num_adds - 1;
fusion_string = "MULTI_ADD";
std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, true);
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_ADD })) {
ctx->num_additional_fused_ops = 2;
fusion_string = "MUL_MAT_ADD_ADD";
op_srcs_fused_elementwise[0] = false;
op_srcs_fused_elementwise[1] = true;
op_srcs_fused_elementwise[2] = true;
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD })) {
ctx->num_additional_fused_ops = 1;
fusion_string = "MUL_MAT_ADD";
op_srcs_fused_elementwise[0] = false;
op_srcs_fused_elementwise[1] = true;
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_MUL })) {
ctx->num_additional_fused_ops = 2;
fusion_string = "MUL_MAT_ID_ADD_ID_MUL";
op_srcs_fused_elementwise[0] = false;
op_srcs_fused_elementwise[1] = true;
op_srcs_fused_elementwise[2] = true;
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID })) {
ctx->num_additional_fused_ops = 1;
fusion_string = "MUL_MAT_ID_ADD_ID";
op_srcs_fused_elementwise[0] = false;
op_srcs_fused_elementwise[1] = true;
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_MUL })) {
ctx->num_additional_fused_ops = 1;
fusion_string = "MUL_MAT_ID_MUL";
op_srcs_fused_elementwise[0] = false;
op_srcs_fused_elementwise[1] = true;
} else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 4 }) &&
ggml_check_edges(cgraph, i, rms_norm_mul_rope_view_set_rows_edges) &&
ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i) &&
ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i + 2)) {
ctx->num_additional_fused_ops = 4;
fusion_string = "RMS_NORM_MUL_ROPE_VIEW_SET_ROWS";
op_srcs_fused_elementwise[0] = false;
op_srcs_fused_elementwise[1] = false;
op_srcs_fused_elementwise[2] = false;
op_srcs_fused_elementwise[3] = false;
op_srcs_fused_elementwise[4] = false;
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE })&&
ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i)) {
ctx->num_additional_fused_ops = 2;
fusion_string = "RMS_NORM_MUL_ROPE";
// rope is approximately elementwise - whole rows are done by a single workgroup and it's row-wise
op_srcs_fused_elementwise[0] = false;
op_srcs_fused_elementwise[1] = true;
op_srcs_fused_elementwise[2] = true;
} else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
ctx->num_additional_fused_ops = 1;
fusion_string = "RMS_NORM_MUL";
// rms_norm is not elementwise, but whole rows must be consumed and the scale factor computed before
// they are overwritten, and one workgroup per row. So close enough.
op_srcs_fused_elementwise[0] = true;
op_srcs_fused_elementwise[1] = true;
} else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) &&
ggml_check_edges(cgraph, i, rope_view_set_rows_edges) &&
ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) {
ctx->num_additional_fused_ops = 2;
fusion_string = "ROPE_VIEW_SET_ROWS";
op_srcs_fused_elementwise[0] = false;
op_srcs_fused_elementwise[1] = false;
op_srcs_fused_elementwise[2] = false;
} else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
@@ -14089,6 +14128,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
ctx->fused_ops_write_mask |= 1 << 3;
ctx->fused_topk_moe_mode = TOPK_MOE_EARLY_SOFTMAX_NORM;
fusion_string = "TOPK_MOE_EARLY_SOFTMAX_NORM";
std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false);
} else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_sigmoid_norm_bias, { i + 4, i + 10 }) &&
ggml_check_edges(cgraph, i, topk_moe_sigmoid_norm_bias_edges) &&
ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_SIGMOID_NORM_BIAS)) {
@@ -14097,6 +14137,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
ctx->fused_ops_write_mask |= 1 << 4;
ctx->fused_topk_moe_mode = TOPK_MOE_SIGMOID_NORM_BIAS;
fusion_string = "TOPK_MOE_SIGMOID_NORM_BIAS";
std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false);
} else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
@@ -14105,6 +14146,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
ctx->fused_ops_write_mask |= 1 << 3;
ctx->fused_topk_moe_mode = TOPK_MOE_EARLY_SOFTMAX;
fusion_string = "TOPK_MOE_EARLY_SOFTMAX";
std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false);
} else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) {
@@ -14113,6 +14155,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
ctx->fused_ops_write_mask |= 1 << 1;
ctx->fused_topk_moe_mode = TOPK_MOE_LATE_SOFTMAX;
fusion_string = "TOPK_MOE_LATE_SOFTMAX";
std::fill_n(op_srcs_fused_elementwise, ctx->num_additional_fused_ops + 1, false);
}
if (ctx->fused_topk_moe_mode != TOPK_MOE_COUNT) {
// Look for an additional scale op to fuse - occurs in deepseek2 and nemotron3 nano.
@@ -14120,11 +14163,73 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
ggml_can_fuse_subgraph(cgraph, i + ctx->num_additional_fused_ops, { GGML_OP_GET_ROWS, GGML_OP_SCALE }, { i + ctx->num_additional_fused_ops + 1 })) {
ctx->fused_topk_moe_scale = true;
ctx->num_additional_fused_ops++;
op_srcs_fused_elementwise[ctx->num_additional_fused_ops] = false;
}
}
}
GGML_ASSERT(ctx->num_additional_fused_ops < (int)(sizeof(op_srcs_fused_elementwise) / sizeof(op_srcs_fused_elementwise[0])));
ctx->fused_ops_write_mask |= 1 << ctx->num_additional_fused_ops;
// Check whether fusion would overwrite src operands while they're still in use.
// If so, disable fusion.
if (ctx->num_additional_fused_ops) {
// There are up to two output nodes - topk_moe has two.
uint32_t bits = ctx->fused_ops_write_mask & ~(1 << ctx->num_additional_fused_ops);
ggml_tensor *output_nodes[2] {};
output_nodes[0] = cgraph->nodes[i + ctx->num_additional_fused_ops];
if (bits) {
int output_idx = find_first_set(bits);
GGML_ASSERT(bits == (1u << output_idx));
output_nodes[1] = cgraph->nodes[i + output_idx];
}
bool need_disable = false;
// topk_moe often overwrites the source, but for a given row all the src values are
// loaded before anything is stored. If there's only one row, this is safe, so treat
// this as a special case.
bool is_topk_moe_single_row = ctx->fused_topk_moe_mode != TOPK_MOE_COUNT &&
ggml_nrows(cgraph->nodes[i]->src[0]) == 1;
if (!is_topk_moe_single_row) {
for (int j = 0; j < 2; ++j) {
ggml_tensor *dst = output_nodes[j];
if (!dst) {
continue;
}
// Loop over all srcs of all nodes in the fusion. If the src overlaps
// the destination and the src is not an intermediate node that's being
// elided, then disable fusion.
for (int k = 0; k <= ctx->num_additional_fused_ops; ++k) {
for (uint32_t s = 0; s < GGML_MAX_SRC; ++s) {
ggml_tensor *src = cgraph->nodes[i + k]->src[s];
if (!src || src->op == GGML_OP_NONE) {
continue;
}
if (ggml_vk_tensors_overlap(src, dst, op_srcs_fused_elementwise[k])) {
bool found = false;
for (int n = 0; n < k; ++n) {
if (cgraph->nodes[i + n] == src) {
found = true;
break;
}
}
if (!found) {
need_disable = true;
}
}
}
}
}
}
if (need_disable) {
ctx->num_additional_fused_ops = 0;
ctx->fused_ops_write_mask = 1;
ctx->fused_topk_moe_mode = TOPK_MOE_COUNT;
ctx->fused_topk_moe_scale = false;
}
}
// Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
bool submit = (submitted_nodes >= nodes_per_submit) ||

View File

@@ -228,13 +228,41 @@ struct gguf_context {
};
struct gguf_reader {
FILE * file;
gguf_reader(FILE * file) : file(file) {
// read the remaining bytes once and update on each read
nbytes_remain = file_remain(file);
}
gguf_reader(FILE * file) : file(file) {}
// helper for remaining bytes in a file
static uint64_t file_remain(FILE * file) {
const int64_t cur = gguf_ftell(file);
if (cur < 0) {
return 0;
}
if (gguf_fseek(file, 0, SEEK_END) != 0) {
gguf_fseek(file, cur, SEEK_SET);
return 0;
}
const int64_t end = gguf_ftell(file);
if (end < 0) {
gguf_fseek(file, cur, SEEK_SET);
return 0;
}
gguf_fseek(file, cur, SEEK_SET);
return static_cast<uint64_t>(end - cur);
}
template <typename T>
bool read(T & dst) const {
return fread(&dst, 1, sizeof(dst), file) == sizeof(dst);
const size_t size = sizeof(dst);
if (nbytes_remain < size) {
return false;
}
const size_t nread = fread(&dst, 1, size, file);
nbytes_remain -= nread;
return nread == size;
}
template <typename T>
@@ -242,20 +270,19 @@ struct gguf_reader {
if (n > GGUF_MAX_ARRAY_ELEMENTS) {
return false;
}
const uint64_t nbytes = nbytes_remain();
if constexpr (std::is_same<T, std::string>::value) {
// strings are prefixed with their length, so we need to account for that
if (n > SIZE_MAX / sizeof(uint64_t)) {
return false;
}
if (nbytes < n * sizeof(uint64_t)) {
if (nbytes_remain < n * sizeof(uint64_t)) {
return false;
}
} else {
if (n > SIZE_MAX / sizeof(T)) {
return false;
}
if (nbytes < n * sizeof(T)) {
if (nbytes_remain < n * sizeof(T)) {
return false;
}
}
@@ -312,39 +339,29 @@ struct gguf_reader {
GGML_LOG_ERROR("%s: string length %" PRIu64 " exceeds maximum %" PRIu64 "\n", __func__, size, (uint64_t) GGUF_MAX_STRING_LENGTH);
return false;
}
const uint64_t nbytes = nbytes_remain();
if (size > nbytes) {
GGML_LOG_ERROR("%s: string length %" PRIu64 " exceeds remaining file size %" PRIu64 " bytes\n", __func__, size, nbytes);
if (size > nbytes_remain) {
GGML_LOG_ERROR("%s: string length %" PRIu64 " exceeds remaining file size %" PRIu64 " bytes\n", __func__, size, nbytes_remain);
return false;
}
dst.resize(static_cast<size_t>(size));
return fread(dst.data(), 1, dst.length(), file) == dst.length();
const size_t nread = fread(dst.data(), 1, size, file);
nbytes_remain -= nread;
return nread == size;
}
bool read(void * dst, const size_t size) const {
return fread(dst, 1, size, file) == size;
if (size > nbytes_remain) {
return false;
}
const size_t nread = fread(dst, 1, size, file);
nbytes_remain -= nread;
return nread == size;
}
// remaining bytes in the file
uint64_t nbytes_remain() const {
const int64_t cur = gguf_ftell(file);
if (cur < 0) {
return 0;
}
if (gguf_fseek(file, 0, SEEK_END) != 0) {
gguf_fseek(file, cur, SEEK_SET);
private:
FILE * file;
return 0;
}
const int64_t end = gguf_ftell(file);
if (end < 0) {
gguf_fseek(file, cur, SEEK_SET);
return 0;
}
gguf_fseek(file, cur, SEEK_SET);
return static_cast<uint64_t>(end - cur);
}
mutable uint64_t nbytes_remain;
};
struct gguf_context * gguf_init_empty(void) {

View File

@@ -379,6 +379,7 @@ class MODEL_ARCH(IntEnum):
NEO_BERT = auto()
JINA_BERT_V2 = auto()
JINA_BERT_V3 = auto()
EUROBERT = auto()
BLOOM = auto()
STABLELM = auto()
QWEN = auto()
@@ -820,6 +821,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.NEO_BERT: "neo-bert",
MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
MODEL_ARCH.JINA_BERT_V3: "jina-bert-v3",
MODEL_ARCH.EUROBERT: "eurobert",
MODEL_ARCH.BLOOM: "bloom",
MODEL_ARCH.STABLELM: "stablelm",
MODEL_ARCH.QWEN: "qwen",
@@ -1587,6 +1589,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.LAYER_OUT_NORM,
],
MODEL_ARCH.EUROBERT: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_DOWN,
],
MODEL_ARCH.MPT: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,

View File

@@ -62,6 +62,7 @@ add_library(llama
models/dream.cpp
models/ernie4-5-moe.cpp
models/ernie4-5.cpp
models/eurobert.cpp
models/exaone-moe.cpp
models/exaone.cpp
models/exaone4.cpp

View File

@@ -26,6 +26,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_NEO_BERT, "neo-bert" },
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
{ LLM_ARCH_JINA_BERT_V3, "jina-bert-v3" },
{ LLM_ARCH_EUROBERT, "eurobert" },
{ LLM_ARCH_BLOOM, "bloom" },
{ LLM_ARCH_STABLELM, "stablelm" },
{ LLM_ARCH_QWEN, "qwen" },
@@ -819,6 +820,20 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
};
case LLM_ARCH_EUROBERT:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_K,
LLM_TENSOR_ATTN_V,
LLM_TENSOR_ATTN_OUT,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_UP,
LLM_TENSOR_FFN_DOWN,
};
case LLM_ARCH_MODERN_BERT:
return {
LLM_TENSOR_TOKEN_EMBD,

View File

@@ -30,6 +30,7 @@ enum llm_arch {
LLM_ARCH_NEO_BERT,
LLM_ARCH_JINA_BERT_V2,
LLM_ARCH_JINA_BERT_V3,
LLM_ARCH_EUROBERT,
LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM,
LLM_ARCH_QWEN,

View File

@@ -979,6 +979,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
type = LLM_TYPE_250M;
}
} break;
case LLM_ARCH_EUROBERT:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
if (hparams.n_layer == 12) {
type = LLM_TYPE_SMALL; // 0.2B
}
} break;
case LLM_ARCH_BLOOM:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3570,6 +3580,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
}
} break;
case LLM_ARCH_EUROBERT:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
}
} break;
case LLM_ARCH_JINA_BERT_V2:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
@@ -8181,6 +8214,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_NEO_BERT:
case LLM_ARCH_EUROBERT:
case LLM_ARCH_WAVTOKENIZER_DEC:
case LLM_ARCH_MODERN_BERT:
case LLM_ARCH_GEMMA_EMBEDDING:
@@ -8378,6 +8412,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_neo_bert>(*this, params);
} break;
case LLM_ARCH_EUROBERT:
{
llm = std::make_unique<llm_build_eurobert>(*this, params);
} break;
case LLM_ARCH_BLOOM:
{
llm = std::make_unique<llm_build_bloom>(*this, params);
@@ -9004,6 +9042,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_MODERN_BERT:
case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_EUROBERT:
case LLM_ARCH_STABLELM:
case LLM_ARCH_BITNET:
case LLM_ARCH_QWEN:

View File

@@ -1890,7 +1890,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "falcon-h1" ||
tokenizer_pre == "pixtral" ||
tokenizer_pre == "midm-2.0" ||
tokenizer_pre == "lfm2") {
tokenizer_pre == "lfm2" ||
tokenizer_pre == "jina-v5-nano") {
pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
ignore_merges = true;
add_bos = true;

97
src/models/eurobert.cpp Normal file
View File

@@ -0,0 +1,97 @@
#include "models.h"
llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
ggml_tensor * cur;
ggml_tensor * inpL;
ggml_tensor * inp_pos = build_inp_pos();
inpL = build_inp_embd(model.tok_embd);
cb(inpL, "inp_embd", -1);
auto * inp_attn = build_attn_inp_no_cache();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * cur = inpL;
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
{
ggml_tensor * Qcur;
ggml_tensor * Kcur;
ggml_tensor * Vcur;
Qcur = build_lora_mm(model.layers[il].wq, cur);
Kcur = build_lora_mm(model.layers[il].wk, cur);
Vcur = build_lora_mm(model.layers[il].wv, cur);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, nullptr,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il);
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
cur = ggml_add(ctx0, cur, inpL);
ggml_tensor * ffn_inp = cur;
cb(ffn_inp, "ffn_inp", il);
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_embd", -1);
res->t_embd = cur;
ggml_build_forward_expand(gf, cur);
}

View File

@@ -424,6 +424,10 @@ struct llm_build_neo_bert : public llm_graph_context {
llm_build_neo_bert(const llama_model & model, const llm_graph_params & params);
};
struct llm_build_eurobert : public llm_graph_context {
llm_build_eurobert(const llama_model & model, const llm_graph_params & params);
};
template <bool iswa>
struct llm_build_olmo2 : public llm_graph_context {
llm_build_olmo2(const llama_model & model, const llm_graph_params & params);

View File

@@ -13,7 +13,12 @@ fi
name=$1
input=$2
make -j tests/test-tokenizer-0
# Build using CMake if binary doesn't exist
if [ ! -f ./build/bin/test-tokenizer-0 ]; then
printf "Building test-tokenizer-0 with CMake...\n"
cmake -B build -DLLAMA_BUILD_TESTS=ON
cmake --build build --target test-tokenizer-0 -j
fi
printf "Testing %s on %s ...\n" $name $input
@@ -23,7 +28,7 @@ printf "Tokenizing using (py) Python AutoTokenizer ...\n"
python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1
printf "Tokenizing using (cpp) llama.cpp ...\n"
./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
./build/bin/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"

View File

@@ -912,7 +912,9 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params, c
const bool add_bos = llama_vocab_get_add_bos(vocab);
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_LAST) {
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
}
auto tim1 = std::chrono::high_resolution_clock::now();
LOG_INF("%s: tokenizing the input ..\n", __func__);

View File

@@ -1510,7 +1510,7 @@ version = 1
; If the same key is defined in a specific preset, it will override the value in this global section.
[*]
c = 8192
n-gpu-layer = 8
n-gpu-layers = 8
; If the key corresponds to an existing model on the server,
; this will be used as the default config for that model

View File

@@ -291,7 +291,9 @@ void server_models::load_models() {
for (const auto & [name, inst] : mapping) {
std::string val;
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val)) {
models_to_load.push_back(name);
if (common_arg_utils::is_truthy(val)) {
models_to_load.push_back(name);
}
}
}
if ((int)models_to_load.size() > base_params.models_max) {