mirror of https://github.com/ggerganov/llama.cpp.git
synced 2026-03-05 14:33:24 +02:00

Compare commits — 5 commits:

- 01cd448b8c
- 99bd67c9b2
- b68d75165a
- ffaafde16f
- efba35a860
@@ -116,7 +116,8 @@ class ModelBase:
                  split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
                  small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
                  disable_mistral_community_chat_template: bool = False,
-                 sentence_transformers_dense_modules: bool = False):
+                 sentence_transformers_dense_modules: bool = False,
+                 fuse_gate_up_exps: bool = False):
         if type(self) is ModelBase or \
                 type(self) is TextModel or \
                 type(self) is MmprojModel:
@@ -135,6 +136,9 @@ class ModelBase:
         self.dry_run = dry_run
         self.remote_hf_model_id = remote_hf_model_id
         self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
+        self.fuse_gate_up_exps = fuse_gate_up_exps
+        self._gate_exp_buffer: dict[int, Tensor] = {}
+        self._up_exp_buffer: dict[int, Tensor] = {}
         self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams
         self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id)
         self.metadata_override = metadata_override
@@ -512,8 +516,31 @@ class ModelBase:
         raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid  # unused
-        return [(self.map_tensor_name(name), data_torch)]
+        new_name = self.map_tensor_name(name)
+
+        # Handle gate/up expert tensor fusion if enabled
+        if self.fuse_gate_up_exps and bid is not None:
+            if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid):
+                self._gate_exp_buffer[bid] = data_torch
+            elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid):
+                self._up_exp_buffer[bid] = data_torch
+
+            # Check if both gate and up are buffered for this layer
+            if bid in self._gate_exp_buffer and bid in self._up_exp_buffer:
+                gate_data = self._gate_exp_buffer.pop(bid)
+                up_data = self._up_exp_buffer.pop(bid)
+                # gate/up shape: (n_expert, n_ff, n_embd), concatenate to (n_expert, n_ff*2, n_embd)
+                fused_data = torch.cat([gate_data, up_data], dim=1)
+                fused_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_UP_EXP, bid)
+                logger.info(f"Fused gate_exps and up_exps for layer {bid}")
+                return [(fused_name, fused_data)]
+
+            # If we buffered a gate/up tensor, wait for the other
+            if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid) or \
+                    self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid):
+                return []
+
+        return [(new_name, data_torch)]

     def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
         del name, new_name, bid, n_dims  # unused
@@ -11942,6 +11969,11 @@ def parse_args() -> argparse.Namespace:
              "Default these modules are not included.")
     )

+    parser.add_argument(
+        "--fuse-gate-up-exps", action="store_true",
+        help="Fuse gate_exps and up_exps tensors into a single gate_up_exps tensor for MoE models.",
+    )
+
     args = parser.parse_args()
     if not args.print_supported_models and args.model is None:
         parser.error("the following arguments are required: model")
@@ -12079,7 +12111,8 @@ def main() -> None:
             split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
             small_first_shard=args.no_tensor_first_split,
             remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
-            sentence_transformers_dense_modules=args.sentence_transformers_dense_modules
+            sentence_transformers_dense_modules=args.sentence_transformers_dense_modules,
+            fuse_gate_up_exps=args.fuse_gate_up_exps
         )

     if args.vocab_only:
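To make the fusion concrete: for each expert, the fused `gate_up_exps` tensor stacks the gate rows followed by the up rows along the `n_ff` dimension, which is exactly what `torch.cat([gate, up], dim=1)` produces in the converter hunk above. The sketch below is an illustration only (the function name and the flat row-major layout are my own, not part of the patch); it shows the equivalent copy in C++.

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Illustration: gate and up are row-major [n_expert, n_ff, n_embd] buffers; the
// result is [n_expert, 2 * n_ff, n_embd], where for each expert the first n_ff
// rows come from gate and the next n_ff rows come from up.
static std::vector<float> fuse_gate_up(const std::vector<float> & gate, const std::vector<float> & up,
                                       size_t n_expert, size_t n_ff, size_t n_embd) {
    const size_t expert_stride = n_ff * n_embd;
    std::vector<float> fused(n_expert * 2 * expert_stride);
    for (size_t e = 0; e < n_expert; e++) {
        std::copy(gate.begin() + e * expert_stride, gate.begin() + (e + 1) * expert_stride,
                  fused.begin() + e * 2 * expert_stride);                  // gate half of expert e
        std::copy(up.begin() + e * expert_stride, up.begin() + (e + 1) * expert_stride,
                  fused.begin() + e * 2 * expert_stride + expert_stride);  // up half of expert e
    }
    return fused;
}
```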
@@ -152,7 +152,9 @@ Commands and data are serialized using a custom binary protocol with:
- **VM-specific**: Only works in virtual machines with virtio-gpu support
- **Host dependency**: Requires a properly configured host-side backend
- **Latency**: Small overhead from escaping the VM on each operation

- **Shared-memory size**: with the `libkrun` hypervisor, the RAM + VRAM
  addressable memory is limited to 64 GB. So the maximum GPU memory
  will be `64GB - RAM`, regardless of the hardware VRAM size
  (for example, a guest configured with 16 GB of RAM can expose at most 48 GB of GPU memory).

* This work is pending upstream changes in the VirglRenderer project.
@@ -7,9 +7,21 @@

 #include <cstdint>

+static uint32_t validate_graph_operation(size_t cgraph_size, uint32_t shmem_res_id, const char * operation) {
+    if (cgraph_size == 0) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Zero-size computation graph\n", operation);
+        return 1;
+    }
+
+    // place-holder: validate that the size of shmem_res_id is <= cgraph_size
+    // need to add another method in the Virgl->APIR callback interface
+    GGML_UNUSED(shmem_res_id);
+
+    return 0; // Valid
+}
+
 uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
     GGML_UNUSED(ctx);
     GGML_UNUSED(enc);

     static bool async_backend_initialized = false;
     static bool async_backend;
@@ -34,10 +46,26 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
     size_t cgraph_size;
     apir_decode_size_t(dec, &cgraph_size);

+    if (validate_graph_operation(cgraph_size, shmem_res_id, __func__) != 0) {
+        apir_decoder_set_fatal(dec);
+        return 1;
+    }
+
     apir_decoder secondary_dec = apir_new_decoder((const char *) shmem_data, cgraph_size);

     ggml_cgraph * cgraph = apir_decode_ggml_cgraph(&secondary_dec, cgraph_size);

+    if (!cgraph || apir_decoder_get_fatal(&secondary_dec)) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Failed to deserialize computation graph\n", __func__);
+        return 1;
+    }
+
+    if (cgraph->n_nodes < 0 || cgraph->n_leafs < 0) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid negative node/leaf count: nodes=%d leafs=%d\n", __func__,
+                       cgraph->n_nodes, cgraph->n_leafs);
+        return 1;
+    }
+
     ggml_status status;
 #if APIR_BACKEND_CHECK_SUPPORTS_OP == 1
     for (int idx = 0; idx < cgraph->n_nodes; idx++) {
@@ -45,7 +73,8 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
         if (dev->iface.supports_op(dev, op)) {
             continue;
         }
-        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Graph node %d (%s) not supported by the backend\n", idx, ggml_op_desc(op));
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Graph node %d (%s) not supported by the backend\n", __func__, idx,
+                       ggml_op_desc(op));

         status = GGML_STATUS_ABORTED;
         apir_encode_ggml_status(enc, &status);
@@ -53,9 +82,17 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
         return 0;
     }
 #endif

+    // Check if backend is properly initialized
+    if (!bck) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Backend not initialized (bck is null)\n", __func__);
+
+        return 1;
+    }
+
     status = bck->iface.graph_compute(bck, cgraph);

-    if (async_backend) {
+    if (async_backend && bck->iface.synchronize) {
         bck->iface.synchronize(bck);
     }

@@ -85,7 +85,19 @@ uint32_t backend_buffer_type_get_alloc_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {

     const ggml_tensor * op = apir_decode_ggml_tensor_inplace(dec);

-    size_t value = buft->iface.get_alloc_size(buft, op);
+    // Check for decode error
+    if (op == nullptr) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Failed to decode tensor\n", __func__);
+        apir_decoder_set_fatal(dec);
+        return 1;
+    }
+
+    size_t value;
+    if (buft->iface.get_alloc_size) {
+        value = buft->iface.get_alloc_size(buft, op);
+    } else {
+        value = ggml_nbytes(op); // Default fallback
+    }

     apir_encode_size_t(enc, &value);

@@ -6,11 +6,26 @@

 #include <cstdint>

+static uint32_t validate_buffer_operation(size_t offset, size_t size, const char * operation) {
+    // Only check for critical integer overflow - no arbitrary size limits
+    if (offset > SIZE_MAX - size) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Integer overflow in offset+size: %zu + %zu\n", operation, offset, size);
+        return 1;
+    }
+
+    return 0; // Valid
+}
+
 uint32_t backend_buffer_get_base(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) {
     GGML_UNUSED(ctx);
     ggml_backend_buffer_t buffer;
     buffer = apir_decode_ggml_buffer(dec);

+    if (!buffer || apir_decoder_get_fatal(dec)) {
+        GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
+        return 1;
+    }
+
     uintptr_t base = (uintptr_t) buffer->iface.get_base(buffer);
     apir_encode_uintptr_t(enc, &base);

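A quick note on why the guard is written as `offset > SIZE_MAX - size` rather than `offset + size > SIZE_MAX`: the latter form would itself wrap around and always compare false. The standalone demo below is an illustration only, not part of the patch.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Same wraparound guard as validate_buffer_operation, shown in isolation.
static bool range_overflows(size_t offset, size_t size) {
    return offset > SIZE_MAX - size;
}

int main() {
    printf("%d\n", range_overflows(16, 32));            // 0: offset + size = 48, fine
    printf("%d\n", range_overflows(SIZE_MAX - 8, 32));  // 1: offset + size would wrap past SIZE_MAX
    return 0;
}
```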
@@ -24,6 +39,11 @@ uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl
|
||||
ggml_backend_buffer_t buffer;
|
||||
buffer = apir_decode_ggml_buffer(dec);
|
||||
|
||||
if (!buffer || apir_decoder_get_fatal(dec)) {
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
ggml_tensor * tensor;
|
||||
// safe to remove the const qualifier here
|
||||
tensor = (ggml_tensor *) (uintptr_t) apir_decode_ggml_tensor(dec);
|
||||
@@ -37,6 +57,10 @@ uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl
|
||||
size_t size;
|
||||
apir_decode_size_t(dec, &size);
|
||||
|
||||
if (validate_buffer_operation(offset, size, __func__) != 0) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
|
||||
|
||||
if (!shmem_data) {
|
||||
@@ -56,6 +80,11 @@ uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl
|
||||
ggml_backend_buffer_t buffer;
|
||||
buffer = apir_decode_ggml_buffer(dec);
|
||||
|
||||
if (!buffer || apir_decoder_get_fatal(dec)) {
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const ggml_tensor * tensor;
|
||||
// safe to remove the const qualifier here
|
||||
tensor = apir_decode_ggml_tensor(dec);
|
||||
@@ -69,6 +98,10 @@ uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl
|
||||
size_t size;
|
||||
apir_decode_size_t(dec, &size);
|
||||
|
||||
if (validate_buffer_operation(offset, size, __func__) != 0) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id);
|
||||
if (!shmem_data) {
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Couldn't get the shmem addr from virgl\n", __func__);
|
||||
@@ -86,6 +119,11 @@ uint32_t backend_buffer_cpy_tensor(apir_encoder * enc, apir_decoder * dec, virgl
|
||||
ggml_backend_buffer_t buffer;
|
||||
buffer = apir_decode_ggml_buffer(dec);
|
||||
|
||||
if (!buffer || apir_decoder_get_fatal(dec)) {
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const ggml_tensor * src;
|
||||
// safe to remove the const qualifier here
|
||||
src = apir_decode_ggml_tensor(dec);
|
||||
@@ -105,6 +143,11 @@ uint32_t backend_buffer_clear(apir_encoder * enc, apir_decoder * dec, virgl_apir
|
||||
ggml_backend_buffer_t buffer;
|
||||
buffer = apir_decode_ggml_buffer(dec);
|
||||
|
||||
if (!buffer || apir_decoder_get_fatal(dec)) {
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
uint8_t value;
|
||||
apir_decode_uint8_t(dec, &value);
|
||||
|
||||
@@ -120,6 +163,11 @@ uint32_t backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virg
|
||||
ggml_backend_buffer_t buffer;
|
||||
buffer = apir_decode_ggml_buffer(dec);
|
||||
|
||||
if (!buffer || apir_decoder_get_fatal(dec)) {
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!apir_untrack_backend_buffer(buffer)) {
|
||||
GGML_LOG_WARN(GGML_VIRTGPU_BCK "%s: unknown buffer %p\n", __func__, (void *) buffer);
|
||||
return 1;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include "backend-dispatched.h"
|
||||
#include "backend-virgl-apir.h"
|
||||
|
||||
#include "backend-virgl-apir.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-impl.h"
|
||||
@@ -28,19 +28,24 @@ uint32_t backend_dispatch_initialize(void * ggml_backend_reg_fct_p) {
|
||||
return APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED;
|
||||
}
|
||||
|
||||
if (!reg->iface.get_device_count(reg)) {
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed: no device found\n", __func__);
|
||||
size_t device_count = reg->iface.get_device_count(reg);
|
||||
if (!device_count) {
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: no device found\n", __func__);
|
||||
return APIR_BACKEND_INITIALIZE_NO_DEVICE;
|
||||
}
|
||||
|
||||
dev = reg->iface.get_device(reg, 0);
|
||||
|
||||
if (!dev) {
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed: no device received\n", __func__);
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: failed to get device\n", __func__);
|
||||
return APIR_BACKEND_INITIALIZE_NO_DEVICE;
|
||||
}
|
||||
|
||||
bck = dev->iface.init_backend(dev, NULL);
|
||||
if (!bck) {
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed\n", __func__);
|
||||
return APIR_BACKEND_INITIALIZE_BACKEND_INIT_FAILED;
|
||||
}
|
||||
|
||||
return APIR_BACKEND_INITIALIZE_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -32,64 +32,6 @@ uint32_t backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virg
|
||||
/* backend */
|
||||
uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx);
|
||||
|
||||
static inline const char * backend_dispatch_command_name(ApirBackendCommandType type) {
|
||||
switch (type) {
|
||||
/* device */
|
||||
case APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT:
|
||||
return "backend_device_get_device_count";
|
||||
case APIR_COMMAND_TYPE_DEVICE_GET_COUNT:
|
||||
return "backend_device_get_count";
|
||||
case APIR_COMMAND_TYPE_DEVICE_GET_NAME:
|
||||
return "backend_device_get_name";
|
||||
case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION:
|
||||
return "backend_device_get_description";
|
||||
case APIR_COMMAND_TYPE_DEVICE_GET_TYPE:
|
||||
return "backend_device_get_type";
|
||||
case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY:
|
||||
return "backend_device_get_memory";
|
||||
case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP:
|
||||
return "backend_device_supports_op";
|
||||
case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE:
|
||||
return "backend_device_get_buffer_type";
|
||||
case APIR_COMMAND_TYPE_DEVICE_GET_PROPS:
|
||||
return "backend_device_get_props";
|
||||
case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR:
|
||||
return "backend_device_buffer_from_ptr";
|
||||
/* buffer-type */
|
||||
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME:
|
||||
return "backend_buffer_type_get_name";
|
||||
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT:
|
||||
return "backend_buffer_type_get_alignment";
|
||||
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE:
|
||||
return "backend_buffer_type_get_max_size";
|
||||
case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST:
|
||||
return "backend_buffer_type_is_host (DEPRECATED)";
|
||||
case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER:
|
||||
return "backend_buffer_type_alloc_buffer";
|
||||
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE:
|
||||
return "backend_buffer_type_get_alloc_size";
|
||||
/* buffer */
|
||||
case APIR_COMMAND_TYPE_BUFFER_GET_BASE:
|
||||
return "backend_buffer_get_base";
|
||||
case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR:
|
||||
return "backend_buffer_set_tensor";
|
||||
case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR:
|
||||
return "backend_buffer_get_tensor";
|
||||
case APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR:
|
||||
return "backend_buffer_cpy_tensor";
|
||||
case APIR_COMMAND_TYPE_BUFFER_CLEAR:
|
||||
return "backend_buffer_clear";
|
||||
case APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER:
|
||||
return "backend_buffer_free_buffer";
|
||||
/* backend */
|
||||
case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE:
|
||||
return "backend_backend_graph_compute";
|
||||
|
||||
default:
|
||||
return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = {
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
// clang-format off
|
||||
#include <cstdint>
|
||||
#include <cstddef>
|
||||
|
||||
@@ -10,6 +11,7 @@
|
||||
#include "shared/apir_backend.h"
|
||||
#include "shared/apir_cs.h"
|
||||
#include "shared/apir_cs_ggml.h"
|
||||
// clang-format on
|
||||
|
||||
#define GGML_VIRTGPU_BCK "ggml-virtgpu-backend: "
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ struct virgl_apir_callbacks {
|
||||
};
|
||||
|
||||
extern "C" {
|
||||
ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks *virgl_cbs);
|
||||
ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks * virgl_cbs);
|
||||
void apir_backend_deinit(uint32_t virgl_ctx_id);
|
||||
uint32_t apir_backend_dispatcher(uint32_t virgl_ctx_id,
|
||||
virgl_apir_callbacks * virgl_cbs,
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#include "backend-dispatched.h"
|
||||
#include "backend-virgl-apir.h"
|
||||
|
||||
#include "shared/api_remoting.h"
|
||||
#include "shared/apir_backend.h"
|
||||
#include "shared/apir_cs.h"
|
||||
@@ -17,10 +16,10 @@
|
||||
#define GGML_DEFAULT_BACKEND_REG "ggml_backend_init"
|
||||
|
||||
static void * backend_library_handle = NULL;
|
||||
static FILE * apir_logfile = NULL;
|
||||
static FILE * apir_logfile = NULL;
|
||||
|
||||
static void log_to_file_callback(enum ggml_log_level level, const char * text, void * user_data) {
|
||||
FILE * logfile = (FILE *)user_data;
|
||||
FILE * logfile = (FILE *) user_data;
|
||||
fprintf(logfile, "[%d] %s", level, text);
|
||||
fflush(logfile);
|
||||
}
|
||||
@@ -48,9 +47,9 @@ void apir_backend_deinit(uint32_t virgl_ctx_id) {
|
||||
}
|
||||
|
||||
#define APIR_GGML_LIBRARY_PATH_KEY "ggml.library.path"
|
||||
#define APIR_GGML_LIBRARY_REG_KEY "ggml.library.reg"
|
||||
#define APIR_GGML_LIBRARY_REG_KEY "ggml.library.reg"
|
||||
|
||||
ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks *virgl_cbs) {
|
||||
ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks * virgl_cbs) {
|
||||
const char * dlsym_error;
|
||||
|
||||
const char * apir_log_to_file = getenv(APIR_LLAMA_CPP_LOG_TO_FILE_ENV);
|
||||
@@ -63,15 +62,13 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct
|
||||
}
|
||||
}
|
||||
|
||||
const char * library_name = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_PATH_KEY);
|
||||
const char * library_name = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_PATH_KEY);
|
||||
const char * virgl_library_reg = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_REG_KEY);
|
||||
const char * library_reg = virgl_library_reg ? virgl_library_reg : GGML_DEFAULT_BACKEND_REG;
|
||||
const char * library_reg = virgl_library_reg ? virgl_library_reg : GGML_DEFAULT_BACKEND_REG;
|
||||
|
||||
if (!library_name) {
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
|
||||
"%s: cannot open the GGML library: env var '%s' not defined\n",
|
||||
__func__, APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV);
|
||||
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot open the GGML library: env var '%s' not defined\n", __func__,
|
||||
APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV);
|
||||
|
||||
return APIR_LOAD_LIBRARY_ENV_VAR_MISSING;
|
||||
}
|
||||
@@ -79,16 +76,14 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct
|
||||
backend_library_handle = dlopen(library_name, RTLD_LAZY);
|
||||
|
||||
if (!backend_library_handle) {
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
|
||||
"%s: cannot open the GGML library: %s\n", __func__, dlerror());
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot open the GGML library: %s\n", __func__, dlerror());
|
||||
|
||||
return APIR_LOAD_LIBRARY_CANNOT_OPEN;
|
||||
}
|
||||
|
||||
if (!library_reg) {
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
|
||||
"%s: cannot register the GGML library: env var '%s' not defined\n",
|
||||
__func__, APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV);
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot register the GGML library: env var '%s' not defined\n", __func__,
|
||||
APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV);
|
||||
|
||||
return APIR_LOAD_LIBRARY_ENV_VAR_MISSING;
|
||||
}
|
||||
@@ -96,11 +91,9 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct
|
||||
void * ggml_backend_reg_fct = dlsym(backend_library_handle, library_reg);
|
||||
dlsym_error = dlerror();
|
||||
if (dlsym_error) {
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
|
||||
"%s: cannot find the GGML backend registration symbol '%s' (from %s): %s\n",
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot find the GGML backend registration symbol '%s' (from %s): %s\n",
|
||||
__func__, library_reg, APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV, dlsym_error);
|
||||
|
||||
|
||||
return APIR_LOAD_LIBRARY_SYMBOL_MISSING;
|
||||
}
|
||||
|
||||
@@ -132,13 +125,12 @@ uint32_t apir_backend_dispatcher(uint32_t virgl_ctx_id,
|
||||
|
||||
virgl_apir_context ctx = {
|
||||
.ctx_id = virgl_ctx_id,
|
||||
.iface = virgl_cbs,
|
||||
.iface = virgl_cbs,
|
||||
};
|
||||
|
||||
if (cmd_type >= APIR_BACKEND_DISPATCH_TABLE_COUNT) {
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK
|
||||
"%s: Received an invalid dispatch index (%d >= %d)\n",
|
||||
__func__, cmd_type, APIR_BACKEND_DISPATCH_TABLE_COUNT);
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Received an invalid dispatch index (%d >= %d)\n", __func__, cmd_type,
|
||||
APIR_BACKEND_DISPATCH_TABLE_COUNT);
|
||||
return APIR_BACKEND_FORWARD_INDEX_INVALID;
|
||||
}
|
||||
|
||||
|
||||
@@ -16,28 +16,32 @@ enum ApirCommandType {
|
||||
APIR_COMMAND_TYPE_LOADLIBRARY = 1,
|
||||
APIR_COMMAND_TYPE_FORWARD = 2,
|
||||
|
||||
APIR_COMMAND_TYPE_LENGTH = 3,
|
||||
APIR_COMMAND_TYPE_LENGTH = 3,
|
||||
};
|
||||
|
||||
typedef uint64_t ApirCommandFlags;
|
||||
|
||||
enum ApirLoadLibraryReturnCode {
|
||||
APIR_LOAD_LIBRARY_SUCCESS = 0,
|
||||
// these error codes are returned by the Virglrenderer APIR component
|
||||
APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR = 1,
|
||||
APIR_LOAD_LIBRARY_ALREADY_LOADED = 2,
|
||||
APIR_LOAD_LIBRARY_ENV_VAR_MISSING = 3,
|
||||
APIR_LOAD_LIBRARY_CANNOT_OPEN = 4,
|
||||
APIR_LOAD_LIBRARY_SYMBOL_MISSING = 5,
|
||||
APIR_LOAD_LIBRARY_INIT_BASE_INDEX = 6, // anything above this is a APIR backend library initialization return code
|
||||
// any value greater than this is an APIR *backend library* initialization return code
|
||||
APIR_LOAD_LIBRARY_INIT_BASE_INDEX = 6,
|
||||
};
|
||||
|
||||
enum ApirForwardReturnCode {
|
||||
APIR_FORWARD_SUCCESS = 0,
|
||||
APIR_FORWARD_NO_DISPATCH_FCT = 1,
|
||||
APIR_FORWARD_TIMEOUT = 2,
|
||||
|
||||
APIR_FORWARD_BASE_INDEX = 3, // anything above this is a APIR backend library forward return code
|
||||
} ;
|
||||
APIR_FORWARD_SUCCESS = 0,
|
||||
// these error codes are returned by the Virglrenderer APIR component
|
||||
APIR_FORWARD_NO_DISPATCH_FCT = 1,
|
||||
APIR_FORWARD_TIMEOUT = 2,
|
||||
APIR_FORWARD_FAILED_TO_SYNC_STREAMS = 3,
|
||||
// any value greater than this index an APIR *backend library* forward return code
|
||||
APIR_FORWARD_BASE_INDEX = 4,
|
||||
};
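The forward return code multiplexes two error spaces: values below `APIR_FORWARD_BASE_INDEX` are Virglrenderer-side failures, while values at or above it carry the backend library's own return code shifted by `APIR_FORWARD_BASE_INDEX`. The helpers below are only a sketch of that convention (the names are mine; the real decoding lives in the `REMOTE_CALL` macro shown further down, and the enum above is assumed to be in scope).

```cpp
#include <cstdint>

// Sketch only: split the multiplexed forward return code into its two spaces.
static inline bool apir_forward_is_transport_error(ApirForwardReturnCode ret) {
    return ret < APIR_FORWARD_BASE_INDEX;  // Virglrenderer failed to forward the call
}

static inline uint32_t apir_forward_backend_code(ApirForwardReturnCode ret) {
    return (uint32_t) ret - (uint32_t) APIR_FORWARD_BASE_INDEX;  // 0 means backend success
}
```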
|
||||
|
||||
__attribute__((unused)) static inline const char * apir_command_name(ApirCommandType type) {
|
||||
switch (type) {
|
||||
@@ -82,6 +86,7 @@ __attribute__((unused)) static const char * apir_forward_error(ApirForwardReturn
|
||||
APIR_FORWARD_ERROR(APIR_FORWARD_SUCCESS);
|
||||
APIR_FORWARD_ERROR(APIR_FORWARD_NO_DISPATCH_FCT);
|
||||
APIR_FORWARD_ERROR(APIR_FORWARD_TIMEOUT);
|
||||
APIR_FORWARD_ERROR(APIR_FORWARD_FAILED_TO_SYNC_STREAMS);
|
||||
APIR_FORWARD_ERROR(APIR_FORWARD_BASE_INDEX);
|
||||
|
||||
return "Unknown APIR_COMMAND_TYPE_FORWARD error";
|
||||
|
||||
@@ -34,3 +34,61 @@ typedef enum ApirBackendCommandType {
|
||||
// last command_type index + 1
|
||||
APIR_BACKEND_DISPATCH_TABLE_COUNT = 23,
|
||||
} ApirBackendCommandType;
|
||||
|
||||
static inline const char * apir_dispatch_command_name(ApirBackendCommandType type) {
|
||||
switch (type) {
|
||||
/* device */
|
||||
case APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT:
|
||||
return "device_get_device_count";
|
||||
case APIR_COMMAND_TYPE_DEVICE_GET_COUNT:
|
||||
return "device_get_count";
|
||||
case APIR_COMMAND_TYPE_DEVICE_GET_NAME:
|
||||
return "device_get_name";
|
||||
case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION:
|
||||
return "device_get_description";
|
||||
case APIR_COMMAND_TYPE_DEVICE_GET_TYPE:
|
||||
return "device_get_type";
|
||||
case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY:
|
||||
return "device_get_memory";
|
||||
case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP:
|
||||
return "device_supports_op";
|
||||
case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE:
|
||||
return "device_get_buffer_type";
|
||||
case APIR_COMMAND_TYPE_DEVICE_GET_PROPS:
|
||||
return "device_get_props";
|
||||
case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR:
|
||||
return "device_buffer_from_ptr";
|
||||
/* buffer-type */
|
||||
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME:
|
||||
return "buffer_type_get_name";
|
||||
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT:
|
||||
return "buffer_type_get_alignment";
|
||||
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE:
|
||||
return "buffer_type_get_max_size";
|
||||
case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST:
|
||||
return "buffer_type_is_host";
|
||||
case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER:
|
||||
return "buffer_type_alloc_buffer";
|
||||
case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE:
|
||||
return "buffer_type_get_alloc_size";
|
||||
/* buffer */
|
||||
case APIR_COMMAND_TYPE_BUFFER_GET_BASE:
|
||||
return "buffer_get_base";
|
||||
case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR:
|
||||
return "buffer_set_tensor";
|
||||
case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR:
|
||||
return "buffer_get_tensor";
|
||||
case APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR:
|
||||
return "buffer_cpy_tensor";
|
||||
case APIR_COMMAND_TYPE_BUFFER_CLEAR:
|
||||
return "buffer_clear";
|
||||
case APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER:
|
||||
return "buffer_free_buffer";
|
||||
/* backend */
|
||||
case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE:
|
||||
return "backend_graph_compute";
|
||||
|
||||
default:
|
||||
return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
#define APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED 6
|
||||
#define APIR_BACKEND_INITIALIZE_ALREADY_INITED 7
|
||||
#define APIR_BACKEND_INITIALIZE_NO_DEVICE 8
|
||||
|
||||
#define APIR_BACKEND_INITIALIZE_BACKEND_INIT_FAILED 9
|
||||
|
||||
// new entries here need to be added to the apir_backend_initialize_error function below
|
||||
|
||||
@@ -39,6 +39,10 @@ static const char * apir_backend_initialize_error(int code) {
|
||||
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS);
|
||||
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS);
|
||||
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_FAILED);
|
||||
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED);
|
||||
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_ALREADY_INITED);
|
||||
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_NO_DEVICE);
|
||||
APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_INIT_FAILED);
|
||||
|
||||
return "Unknown APIR_BACKEND_INITIALIZE error:/";
|
||||
|
||||
|
||||
@@ -13,7 +13,6 @@ struct apir_encoder {
|
||||
const char * start;
|
||||
const char * end;
|
||||
bool fatal;
|
||||
|
||||
};
|
||||
|
||||
struct apir_decoder {
|
||||
@@ -28,8 +27,8 @@ struct apir_decoder {
|
||||
|
||||
static apir_decoder apir_new_decoder(const char * ptr, size_t size) {
|
||||
apir_decoder dec = {
|
||||
.cur = ptr,
|
||||
.end = ptr + size,
|
||||
.cur = ptr,
|
||||
.end = ptr + size,
|
||||
.fatal = false,
|
||||
};
|
||||
|
||||
@@ -79,10 +78,7 @@ static inline bool apir_decoder_get_fatal(const apir_decoder * dec) {
|
||||
* encode peek
|
||||
*/
|
||||
|
||||
static inline bool apir_decoder_peek_internal(apir_decoder * dec,
|
||||
size_t size,
|
||||
void * val,
|
||||
size_t val_size) {
|
||||
static inline bool apir_decoder_peek_internal(apir_decoder * dec, size_t size, void * val, size_t val_size) {
|
||||
assert(val_size <= size);
|
||||
|
||||
if (unlikely(size > (size_t) (dec->end - dec->cur))) {
|
||||
@@ -332,8 +328,7 @@ static inline void apir_decode_char_array(apir_decoder * dec, char * val, size_t
|
||||
static inline void * apir_decoder_alloc_array(size_t size, size_t count) {
|
||||
size_t alloc_size;
|
||||
if (unlikely(__builtin_mul_overflow(size, count, &alloc_size))) {
|
||||
GGML_LOG_ERROR("%s: overflow in array allocation of %zu * %zu bytes\n",
|
||||
__func__, size, count);
|
||||
GGML_LOG_ERROR("%s: overflow in array allocation of %zu * %zu bytes\n", __func__, size, count);
|
||||
return NULL;
|
||||
}
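`__builtin_mul_overflow` is the GCC/Clang builtin doing the heavy lifting here: it multiplies, stores the result, and reports whether the product wrapped, so the allocation size can be rejected before `malloc` is ever called. A minimal standalone demonstration (illustration only):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
    size_t alloc_size;

    // A count just over half of SIZE_MAX, times 4 bytes per element, cannot fit in size_t.
    if (__builtin_mul_overflow(SIZE_MAX / 2 + 1, (size_t) 4, &alloc_size)) {
        printf("overflow detected, refusing to allocate\n");
    } else {
        printf("alloc_size = %zu\n", alloc_size);
    }
    return 0;
}
```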
|
||||
|
||||
@@ -352,20 +347,19 @@ static inline void apir_decode_bool_t(apir_decoder * dec, bool * val) {
|
||||
|
||||
/* apir_buffer_type_host_handle_t */
|
||||
|
||||
static inline void apir_encode_apir_buffer_type_host_handle_t(apir_encoder * enc,
|
||||
static inline void apir_encode_apir_buffer_type_host_handle_t(apir_encoder * enc,
|
||||
const apir_buffer_type_host_handle_t * val) {
|
||||
apir_encode(enc, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t));
|
||||
}
|
||||
|
||||
static inline void apir_decode_apir_buffer_type_host_handle_t(apir_decoder * dec,
|
||||
static inline void apir_decode_apir_buffer_type_host_handle_t(apir_decoder * dec,
|
||||
apir_buffer_type_host_handle_t * val) {
|
||||
apir_decode(dec, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t));
|
||||
}
|
||||
|
||||
/* apir_buffer_host_handle_t */
|
||||
|
||||
static inline void apir_encode_apir_buffer_host_handle_t(apir_encoder * enc,
|
||||
const apir_buffer_host_handle_t * val) {
|
||||
static inline void apir_encode_apir_buffer_host_handle_t(apir_encoder * enc, const apir_buffer_host_handle_t * val) {
|
||||
apir_encode(enc, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t));
|
||||
}
|
||||
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
#include "ggml-impl.h"
|
||||
#include "apir_cs.h"
|
||||
#include "apir_cs_rpc.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
// ggml_buffer_to_apir_host_handle(ggml_backend_buffer_t buffer);
|
||||
|
||||
static inline void apir_encode_ggml_buffer_host_handle(apir_encoder * enc,
|
||||
const apir_buffer_host_handle_t * handle);
|
||||
static inline void apir_encode_ggml_buffer_host_handle(apir_encoder * enc, const apir_buffer_host_handle_t * handle);
|
||||
|
||||
static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec);
|
||||
|
||||
@@ -22,8 +21,7 @@ static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_inplace(apir_decoder
|
||||
return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size);
|
||||
}
|
||||
|
||||
static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_array_inplace(apir_decoder * dec,
|
||||
uint32_t n_tensors) {
|
||||
static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_array_inplace(apir_decoder * dec, uint32_t n_tensors) {
|
||||
size_t apir_rpc_tensor_size = sizeof(apir_rpc_tensor) * n_tensors;
|
||||
|
||||
return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size);
|
||||
@@ -45,9 +43,9 @@ static inline const ggml_tensor * apir_decode_ggml_tensor(apir_decoder * dec) {
|
||||
}
|
||||
|
||||
ggml_init_params params{
|
||||
/*.mem_size =*/ ggml_tensor_overhead(),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
/*.mem_size =*/ggml_tensor_overhead(),
|
||||
/*.mem_buffer =*/NULL,
|
||||
/*.no_alloc =*/true,
|
||||
};
|
||||
|
||||
ggml_context * ctx = ggml_init(params);
|
||||
@@ -105,6 +103,19 @@ static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec)
|
||||
|
||||
apir_decoder_read(dec, buffer_ptr_size, &buffer, buffer_ptr_size);
|
||||
|
||||
// SECURITY: Validate buffer handle against tracked buffers to prevent
|
||||
// guest VM from providing arbitrary host memory addresses
|
||||
if (buffer) {
|
||||
extern std::unordered_set<ggml_backend_buffer_t> backend_buffers;
|
||||
if (backend_buffers.find(buffer) == backend_buffers.end()) {
|
||||
GGML_LOG_WARN("ggml-virtgpu-backend: %s: Invalid buffer handle from guest: %p\n", __func__,
|
||||
(void *) buffer);
|
||||
// Set fatal flag to prevent further processing with invalid handle
|
||||
apir_decoder_set_fatal(dec);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return buffer;
|
||||
}
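The `backend_buffers` set referenced here is the host-side registry of every buffer handle the backend has handed out to the guest; only pointers found in it are accepted back. A minimal sketch of that tracking pattern follows — only `backend_buffers` and `apir_untrack_backend_buffer` appear in the patch, the `track`/`is_tracked` helper names are my assumptions.

```cpp
#include <unordered_set>

#include "ggml-backend.h"  // for ggml_backend_buffer_t

// Host-side registry of buffer handles created on behalf of the guest.
std::unordered_set<ggml_backend_buffer_t> backend_buffers;

// Record a buffer when it is allocated for the guest.
static void apir_track_backend_buffer(ggml_backend_buffer_t buffer) {
    backend_buffers.insert(buffer);
}

// Forget a buffer on free; returns false if the guest freed an unknown handle.
bool apir_untrack_backend_buffer(ggml_backend_buffer_t buffer) {
    return backend_buffers.erase(buffer) > 0;
}

// Reject any guest-provided pointer that the host never handed out.
static bool apir_is_tracked_backend_buffer(ggml_backend_buffer_t buffer) {
    return backend_buffers.find(buffer) != backend_buffers.end();
}
```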
|
||||
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
// clang-format off
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
@@ -5,6 +8,7 @@
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
#include <cstdint>
|
||||
// clang-format on
|
||||
|
||||
// ggml_tensor is serialized into apir_rpc_tensor
|
||||
struct apir_rpc_tensor {
|
||||
|
||||
@@ -34,6 +34,7 @@ static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml
|
||||
static const char * ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
virtgpu * gpu = BUFT_TO_GPU(buft);
|
||||
|
||||
// Return the prefixed name that was built once during initialization
|
||||
return gpu->cached_buffer_type.name;
|
||||
}
|
||||
|
||||
@@ -53,9 +54,8 @@ static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buff
|
||||
const ggml_tensor * tensor) {
|
||||
virtgpu * gpu = BUFT_TO_GPU(buft);
|
||||
|
||||
if (tensor->buffer == NULL
|
||||
|| !tensor->buffer->context
|
||||
|| !buft->device->iface.supports_buft(buft->device, tensor->buffer->buft)) {
|
||||
if (tensor->buffer == NULL || !tensor->buffer->context ||
|
||||
!buft->device->iface.supports_buft(buft->device, tensor->buffer->buft)) {
|
||||
return ggml_nbytes(tensor);
|
||||
}
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) {
|
||||
virtgpu * gpu = DEV_TO_GPU(dev);
|
||||
|
||||
// Return the prefixed name that was built once during initialization
|
||||
return gpu->cached_device_info.name;
|
||||
}
|
||||
|
||||
@@ -22,7 +23,7 @@ static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_bac
|
||||
static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
virtgpu * gpu = DEV_TO_GPU(dev);
|
||||
|
||||
*free = gpu->cached_device_info.memory_free;
|
||||
*free = gpu->cached_device_info.memory_free;
|
||||
*total = gpu->cached_device_info.memory_total;
|
||||
}
|
||||
|
||||
@@ -72,7 +73,7 @@ static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, ggml_
|
||||
ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
virtgpu * gpu = DEV_TO_GPU(dev);
|
||||
|
||||
static std::atomic<bool> initialized = false;
|
||||
static std::atomic<bool> initialized = false;
|
||||
static ggml_backend_buffer_type buft;
|
||||
|
||||
if (!initialized) {
|
||||
@@ -95,7 +96,7 @@ ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_bac
|
||||
static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) {
|
||||
virtgpu * gpu = DEV_TO_GPU(dev);
|
||||
|
||||
static std::atomic<bool> initialized = false;
|
||||
static std::atomic<bool> initialized = false;
|
||||
static ggml_backend_buffer_type buft;
|
||||
|
||||
if (!initialized) {
|
||||
|
||||
@@ -7,8 +7,8 @@
|
||||
void ggml_virtgpu_cleanup(virtgpu * gpu);
|
||||
|
||||
static virtgpu * apir_initialize() {
|
||||
static virtgpu * gpu = NULL;
|
||||
static std::atomic<bool> initialized = false;
|
||||
static virtgpu * gpu = NULL;
|
||||
static std::atomic<bool> initialized = false;
|
||||
|
||||
if (initialized) {
|
||||
// fast track
|
||||
@@ -31,29 +31,53 @@ static virtgpu * apir_initialize() {
|
||||
}
|
||||
|
||||
// Pre-fetch and cache all device information, it will not change
|
||||
gpu->cached_device_info.description = apir_device_get_description(gpu);
|
||||
gpu->cached_device_info.description = apir_device_get_description(gpu);
|
||||
if (!gpu->cached_device_info.description) {
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu device description", __func__);
|
||||
}
|
||||
gpu->cached_device_info.name = apir_device_get_name(gpu);
|
||||
if (!gpu->cached_device_info.name) {
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu device name", __func__);
|
||||
}
|
||||
gpu->cached_device_info.device_count = apir_device_get_count(gpu);
|
||||
gpu->cached_device_info.type = apir_device_get_type(gpu);
|
||||
|
||||
apir_device_get_memory(gpu,
|
||||
&gpu->cached_device_info.memory_free,
|
||||
&gpu->cached_device_info.memory_total);
|
||||
{
|
||||
// Get the remote name and create prefixed version
|
||||
char * rmt_device_name = apir_device_get_name(gpu);
|
||||
if (!rmt_device_name) {
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to get the virtgpu device name", __func__);
|
||||
}
|
||||
|
||||
size_t device_name_len = strlen(rmt_device_name) + 11; // "[virtgpu] " + null terminator
|
||||
gpu->cached_device_info.name = (char *) malloc(device_name_len);
|
||||
if (!gpu->cached_device_info.name) {
|
||||
free(rmt_device_name);
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to allocate memory for prefixed device name", __func__);
|
||||
}
|
||||
snprintf(gpu->cached_device_info.name, device_name_len, "[virtgpu] %s", rmt_device_name);
|
||||
free(rmt_device_name);
|
||||
}
|
||||
|
||||
apir_device_get_memory(gpu, &gpu->cached_device_info.memory_free, &gpu->cached_device_info.memory_total);
|
||||
|
||||
apir_buffer_type_host_handle_t buft_host_handle = apir_device_get_buffer_type(gpu);
|
||||
gpu->cached_buffer_type.host_handle = buft_host_handle;
|
||||
gpu->cached_buffer_type.name = apir_buffer_type_get_name(gpu, buft_host_handle);
|
||||
if (!gpu->cached_buffer_type.name) {
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu buffer type name", __func__);
|
||||
{
|
||||
// Get the remote name and create prefixed version
|
||||
char * rmt_name = apir_buffer_type_get_name(gpu, buft_host_handle);
|
||||
if (!rmt_name) {
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to get the virtgpu buffer type name", __func__);
|
||||
}
|
||||
|
||||
size_t prefixed_len = strlen(rmt_name) + 11; // "[virtgpu] " + null terminator
|
||||
gpu->cached_buffer_type.name = (char *) malloc(prefixed_len);
|
||||
if (!gpu->cached_buffer_type.name) {
|
||||
free(rmt_name);
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to allocate memory for prefixed buffer type name", __func__);
|
||||
}
|
||||
snprintf(gpu->cached_buffer_type.name, prefixed_len, "[virtgpu] %s", rmt_name);
|
||||
free(rmt_name);
|
||||
}
|
||||
gpu->cached_buffer_type.alignment = apir_buffer_type_get_alignment(gpu, buft_host_handle);
|
||||
gpu->cached_buffer_type.max_size = apir_buffer_type_get_max_size(gpu, buft_host_handle);
|
||||
|
||||
gpu->cached_buffer_type.alignment = apir_buffer_type_get_alignment(gpu, buft_host_handle);
|
||||
gpu->cached_buffer_type.max_size = apir_buffer_type_get_max_size(gpu, buft_host_handle);
|
||||
|
||||
initialized = true;
|
||||
}
|
||||
@@ -98,7 +122,7 @@ static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) {
|
||||
static std::atomic<bool> initialized = false;
|
||||
|
||||
if (initialized) {
|
||||
return; // fast track
|
||||
return; // fast track
|
||||
}
|
||||
|
||||
{
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#include "ggml-remoting.h"
|
||||
#include "../../include/ggml-virtgpu.h"
|
||||
#include "ggml-remoting.h"
|
||||
|
||||
static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) {
|
||||
UNUSED(backend);
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
#include <string>
|
||||
|
||||
#define GGML_VIRTGPU_NAME "ggml-virtgpu"
|
||||
#define GGML_VIRTGPU "ggml-virtgpu: "
|
||||
#define GGML_VIRTGPU "ggml-virtgpu: "
|
||||
|
||||
// USE_ALWAYS_TRUE_SUPPORTS_OP: 1 is fast, 0 avoid micro-benchmark crashes
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#include <stdint.h>
|
||||
|
||||
struct virgl_renderer_capset_apir {
|
||||
uint32_t apir_version;
|
||||
uint32_t supports_blob_resources;
|
||||
uint32_t reserved[4]; // For future expansion
|
||||
uint32_t apir_version;
|
||||
uint32_t supports_blob_resources;
|
||||
uint32_t reserved[4]; // For future expansion
|
||||
};
|
||||
|
||||
@@ -145,8 +145,31 @@ class RemotingCodebaseGenerator:
|
||||
enum_lines.append(f" APIR_BACKEND_DISPATCH_TABLE_COUNT = {total_count},")
|
||||
enum_lines.append("} ApirBackendCommandType;")
|
||||
|
||||
# Generate function name mapping
|
||||
func_lines = []
|
||||
func_lines.append("static inline const char * apir_dispatch_command_name(ApirBackendCommandType type) {")
|
||||
func_lines.append(" switch (type) {")
|
||||
|
||||
current_group = None
|
||||
for func in functions:
|
||||
# Add comment for new group
|
||||
if func['group_name'] != current_group:
|
||||
func_lines.append(f" /* {func['group_description']} */")
|
||||
current_group = func['group_name']
|
||||
|
||||
# Generate clean function name without backend_ prefix
|
||||
clean_name = f"{func['group_name']}_{func['function_name']}"
|
||||
func_lines.append(f" case {func['enum_name']}:")
|
||||
func_lines.append(f" return \"{clean_name}\";")
|
||||
|
||||
func_lines.append("")
|
||||
func_lines.append(" default:")
|
||||
func_lines.append(" return \"unknown\";")
|
||||
func_lines.append(" }")
|
||||
func_lines.append("}")
|
||||
|
||||
# Full header template
|
||||
header_content = NL.join(enum_lines) + "\n"
|
||||
header_content = NL.join(enum_lines) + "\n\n" + NL.join(func_lines) + "\n"
|
||||
|
||||
return header_content
|
||||
|
||||
@@ -170,19 +193,6 @@ class RemotingCodebaseGenerator:
|
||||
|
||||
decl_lines.append(f"{signature} {func['backend_function']}({params});")
|
||||
|
||||
# Switch cases
|
||||
switch_lines = []
|
||||
current_group = None
|
||||
|
||||
for func in functions:
|
||||
if func['group_name'] != current_group:
|
||||
switch_lines.append(f" /* {func['group_description']} */")
|
||||
current_group = func['group_name']
|
||||
|
||||
deprecated = " (DEPRECATED)" if func['deprecated'] else ""
|
||||
|
||||
switch_lines.append(f" case {func['enum_name']}: return \"{func['backend_function']}{deprecated}\";")
|
||||
|
||||
# Dispatch table
|
||||
table_lines = []
|
||||
current_group = None
|
||||
@@ -201,15 +211,6 @@ class RemotingCodebaseGenerator:
|
||||
|
||||
{NL.join(decl_lines)}
|
||||
|
||||
static inline const char *backend_dispatch_command_name(ApirBackendCommandType type)
|
||||
{{
|
||||
switch (type) {{
|
||||
{NL.join(switch_lines)}
|
||||
|
||||
default: return "unknown";
|
||||
}}
|
||||
}}
|
||||
|
||||
extern "C" {{
|
||||
static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = {{
|
||||
{NL.join(table_lines)}
|
||||
|
||||
@@ -17,8 +17,8 @@ ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) {
|
||||
size_t cgraph_size = apir_serialize_ggml_cgraph(cgraph, cgraph_data);
|
||||
|
||||
virtgpu_shmem temp_shmem; // Local storage for large buffers
|
||||
virtgpu_shmem * shmem = &temp_shmem;
|
||||
bool using_shared_shmem = false;
|
||||
virtgpu_shmem * shmem = &temp_shmem;
|
||||
bool using_shared_shmem = false;
|
||||
|
||||
if (cgraph_size <= gpu->data_shmem.mmap_size) {
|
||||
// Lock mutex before using shared data_shmem buffer
|
||||
@@ -26,7 +26,7 @@ ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) {
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__);
|
||||
}
|
||||
using_shared_shmem = true;
|
||||
shmem = &gpu->data_shmem;
|
||||
shmem = &gpu->data_shmem;
|
||||
} else if (virtgpu_shmem_create(gpu, cgraph_size, shmem)) {
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__);
|
||||
}
|
||||
|
||||
@@ -62,7 +62,9 @@ size_t apir_buffer_type_get_max_size(virtgpu * gpu, apir_buffer_type_host_handle
|
||||
return max_size;
|
||||
}
|
||||
|
||||
apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, size_t size) {
|
||||
apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu,
|
||||
apir_buffer_type_host_handle_t host_handle,
|
||||
size_t size) {
|
||||
apir_encoder * encoder;
|
||||
apir_decoder * decoder;
|
||||
ApirForwardReturnCode ret;
|
||||
@@ -84,7 +86,9 @@ apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, apir_buffer_t
|
||||
return buffer_context;
|
||||
}
|
||||
|
||||
size_t apir_buffer_type_get_alloc_size(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, const ggml_tensor * op) {
|
||||
size_t apir_buffer_type_get_alloc_size(virtgpu * gpu,
|
||||
apir_buffer_type_host_handle_t host_handle,
|
||||
const ggml_tensor * op) {
|
||||
apir_encoder * encoder;
|
||||
apir_decoder * decoder;
|
||||
ApirForwardReturnCode ret;
|
||||
|
||||
@@ -35,8 +35,8 @@ void apir_buffer_set_tensor(virtgpu * gpu,
|
||||
apir_encode_ggml_tensor(encoder, tensor);
|
||||
|
||||
virtgpu_shmem temp_shmem; // Local storage for large buffers
|
||||
virtgpu_shmem * shmem = &temp_shmem;
|
||||
bool using_shared_shmem = false;
|
||||
virtgpu_shmem * shmem = &temp_shmem;
|
||||
bool using_shared_shmem = false;
|
||||
|
||||
if (size <= gpu->data_shmem.mmap_size) {
|
||||
// Lock mutex before using shared data_shmem buffer
|
||||
@@ -44,7 +44,7 @@ void apir_buffer_set_tensor(virtgpu * gpu,
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__);
|
||||
}
|
||||
using_shared_shmem = true;
|
||||
shmem = &gpu->data_shmem;
|
||||
shmem = &gpu->data_shmem;
|
||||
|
||||
} else if (virtgpu_shmem_create(gpu, size, shmem)) {
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__);
|
||||
@@ -86,8 +86,8 @@ void apir_buffer_get_tensor(virtgpu * gpu,
|
||||
apir_encode_ggml_tensor(encoder, tensor);
|
||||
|
||||
virtgpu_shmem temp_shmem; // Local storage for large buffers
|
||||
virtgpu_shmem * shmem = &temp_shmem;
|
||||
bool using_shared_shmem = false;
|
||||
virtgpu_shmem * shmem = &temp_shmem;
|
||||
bool using_shared_shmem = false;
|
||||
|
||||
if (size <= gpu->data_shmem.mmap_size) {
|
||||
// Lock mutex before using shared data_shmem buffer
|
||||
@@ -95,7 +95,7 @@ void apir_buffer_get_tensor(virtgpu * gpu,
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__);
|
||||
}
|
||||
using_shared_shmem = true;
|
||||
shmem = &gpu->data_shmem;
|
||||
shmem = &gpu->data_shmem;
|
||||
|
||||
} else if (virtgpu_shmem_create(gpu, size, shmem)) {
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__);
|
||||
|
||||
@@ -26,7 +26,7 @@ char * apir_device_get_name(virtgpu * gpu) {
|
||||
REMOTE_CALL(gpu, encoder, decoder, ret);
|
||||
|
||||
const size_t string_size = apir_decode_array_size_unchecked(decoder);
|
||||
char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
|
||||
char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size);
|
||||
if (!string) {
|
||||
GGML_LOG_ERROR(GGML_VIRTGPU "%s: Could not allocate the device name buffer\n", __func__);
|
||||
return NULL;
|
||||
@@ -173,7 +173,7 @@ apir_buffer_context_t apir_device_buffer_from_ptr(virtgpu * gpu, size_t size, si
|
||||
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR);
|
||||
|
||||
if (virtgpu_shmem_create(gpu, size, &buffer_context.shmem)) {
|
||||
GGML_ABORT(GGML_VIRTGPU "Couldn't allocate the guest-host shared buffer");
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate %ldb of guest-host shared buffer", __func__, size);
|
||||
}
|
||||
|
||||
apir_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem.res_id);
|
||||
|
||||
@@ -1,29 +1,36 @@
|
||||
#include "virtgpu.h"
|
||||
#pragma once
|
||||
|
||||
// clang-format off
|
||||
#include "virtgpu.h"
|
||||
#include "ggml-remoting.h"
|
||||
#include "backend/shared/apir_backend.h"
|
||||
#include "backend/shared/apir_cs_ggml.h"
|
||||
|
||||
#include "ggml-backend-impl.h"
|
||||
// clang-format on
|
||||
|
||||
#define REMOTE_CALL_PREPARE(gpu_dev_name, encoder_name, apir_command_type__) \
|
||||
do { \
|
||||
int32_t forward_flag = (int32_t) apir_command_type__; \
|
||||
encoder_name = remote_call_prepare(gpu_dev_name, APIR_COMMAND_TYPE_FORWARD, forward_flag); \
|
||||
if (!encoder_name) { \
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to prepare the remote call encoder", __func__); \
|
||||
} \
|
||||
#define REMOTE_CALL_PREPARE(gpu_dev_name, encoder_name, apir_command_type__) \
|
||||
int32_t REMOTE_CALL_PREPARE_forward_flag = (int32_t) apir_command_type__; \
|
||||
const char * REMOTE_CALL_PREPARE_command_name = apir_dispatch_command_name(apir_command_type__); \
|
||||
do { \
|
||||
encoder_name = remote_call_prepare(gpu_dev_name, APIR_COMMAND_TYPE_FORWARD, REMOTE_CALL_PREPARE_forward_flag); \
|
||||
if (!encoder_name) { \
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to prepare the remote call encoder", __func__); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define REMOTE_CALL(gpu_dev_name, encoder_name, decoder_name, ret_name) \
|
||||
do { \
|
||||
ret_name = (ApirForwardReturnCode) remote_call(gpu_dev_name, encoder_name, &decoder_name, 0, NULL); \
|
||||
if (!decoder_name) { \
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to kick the remote call", __func__); \
|
||||
} \
|
||||
if (ret_name < APIR_FORWARD_BASE_INDEX) { \
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to forward the API call: %s: code %d", __func__, \
|
||||
apir_forward_error(ret_name), ret_name); \
|
||||
} \
|
||||
ret_name = (ApirForwardReturnCode) (ret_name - APIR_FORWARD_BASE_INDEX); \
|
||||
#define REMOTE_CALL(gpu_dev_name, encoder_name, decoder_name, ret_name) \
|
||||
do { \
|
||||
ret_name = (ApirForwardReturnCode) remote_call(gpu_dev_name, encoder_name, &decoder_name, 0, NULL); \
|
||||
if (!decoder_name) { \
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to kick the remote call", __func__); \
|
||||
} \
|
||||
if (ret_name < APIR_FORWARD_BASE_INDEX) { \
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: failed to forward the API call: %s: code %d", __func__, \
|
||||
apir_forward_error(ret_name), ret_name); \
|
||||
} \
|
||||
ret_name = (ApirForwardReturnCode) (ret_name - APIR_FORWARD_BASE_INDEX); \
|
||||
if (ret_name != 0) { \
|
||||
GGML_ABORT(GGML_VIRTGPU "backend function '%s' failed (return code: %d)", \
|
||||
REMOTE_CALL_PREPARE_command_name, ret_name); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@@ -20,6 +20,7 @@ apir_buffer_context_t apir_device_buffer_from_ptr(struct virtgpu * gpu,
|
||||
char * apir_buffer_type_get_name(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle);
|
||||
size_t apir_buffer_type_get_alignment(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle);
|
||||
size_t apir_buffer_type_get_max_size(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle);
|
||||
/* apir_buffer_type_is_host is deprecated. */
|
||||
apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu * gpu,
|
||||
apir_buffer_type_host_handle_t host_handle,
|
||||
size_t size);
|
||||
|
||||
@@ -53,9 +53,9 @@ static int virtgpu_handshake(virtgpu * gpu) {
|
||||
|
||||
if (!decoder) {
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: failed to initiate the communication with the virglrenderer library. "
|
||||
"Most likely, the wrong virglrenderer library was loaded in the hypervisor.",
|
||||
__func__);
|
||||
"%s: failed to initiate the communication with the virglrenderer library. "
|
||||
"Most likely, the wrong virglrenderer library was loaded in the hypervisor.",
|
||||
__func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -65,8 +65,7 @@ static int virtgpu_handshake(virtgpu * gpu) {
|
||||
uint32_t host_minor;
|
||||
|
||||
if (ret_magic != APIR_HANDSHAKE_MAGIC) {
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: handshake with the virglrenderer failed (code=%d | %s)", __func__, ret_magic,
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: handshake with the virglrenderer failed (code=%d | %s)", __func__, ret_magic,
|
||||
apir_backend_initialize_error(ret_magic));
|
||||
} else {
|
||||
apir_decode_uint32_t(decoder, &host_major);
|
||||
@@ -140,15 +139,13 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) {
|
||||
"Make sure virglrenderer is correctly configured by the hypervisor. (%s) ",
|
||||
__func__, apir_load_library_error(ret));
|
||||
} else {
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: virglrenderer could not load the API Remoting backend library. (%s - code %d)", __func__,
|
||||
apir_load_library_error(ret), ret);
|
||||
GGML_ABORT(GGML_VIRTGPU "%s: virglrenderer could not load the API Remoting backend library. (%s - code %d)",
|
||||
__func__, apir_load_library_error(ret), ret);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
GGML_LOG_INFO(GGML_VIRTGPU
|
||||
"%s: virglrenderer successfully loaded the API Remoting backend library.\n", __func__);
|
||||
GGML_LOG_INFO(GGML_VIRTGPU "%s: virglrenderer successfully loaded the API Remoting backend library.\n", __func__);
|
||||
|
||||
ApirLoadLibraryReturnCode apir_ret = (ApirLoadLibraryReturnCode) (ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX);
|
||||
|
||||
@@ -158,10 +155,11 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) {
|
||||
"Make sure virglrenderer is correctly configured by the hypervisor. (%s)",
|
||||
__func__, apir_load_library_error(apir_ret));
|
||||
} else if (apir_ret == APIR_LOAD_LIBRARY_SYMBOL_MISSING) {
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: the API Remoting backend library couldn't load the GGML backend library, some symbols are missing. "
|
||||
"Make sure virglrenderer is correctly configured by the hypervisor. (%s)",
|
||||
__func__, apir_load_library_error(apir_ret));
|
||||
GGML_ABORT(
|
||||
GGML_VIRTGPU
|
||||
"%s: the API Remoting backend library couldn't load the GGML backend library, some symbols are missing. "
|
||||
"Make sure virglrenderer is correctly configured by the hypervisor. (%s)",
|
||||
__func__, apir_load_library_error(apir_ret));
|
||||
} else if (apir_ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) {
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: the API Remoting backend library couldn't load the GGML backend library: apir code=%d | %s)",
|
||||
@@ -169,8 +167,8 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) {
|
||||
} else {
|
||||
uint32_t lib_ret = apir_ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX;
|
||||
GGML_ABORT(GGML_VIRTGPU
|
||||
"%s: the API Remoting backend library initialize its backend library: apir code=%d)", __func__,
|
||||
lib_ret);
|
||||
"%s: the API Remoting backend library failed to initialize its backend library: apir code=%d)",
|
||||
__func__, lib_ret);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
@@ -184,55 +182,49 @@ virtgpu * create_virtgpu() {
// Initialize mutex to protect shared data_shmem buffer
if (mtx_init(&gpu->data_shmem_mutex, mtx_plain) != thrd_success) {
delete gpu;
GGML_ABORT(GGML_VIRTGPU
"%s: failed to initialize data_shmem mutex", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize data_shmem mutex", __func__);
return NULL;
}

if (virtgpu_open(gpu) != APIR_SUCCESS) {
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: failed to open the virtgpu device\n", __func__);
GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to open the virtgpu device\n", __func__);
return NULL;
}

if (virtgpu_init_capset(gpu) != APIR_SUCCESS) {
if (gpu->use_apir_capset) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to initialize the virtgpu APIR capset. Make sure that the virglrenderer library supports it.", __func__);
"%s: failed to initialize the virtgpu APIR capset. Make sure that the virglrenderer library "
"supports it.",
__func__);
} else {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to initialize the virtgpu Venus capset", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu Venus capset", __func__);
}
return NULL;
}

if (virtgpu_init_context(gpu) != APIR_SUCCESS) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to initialize the GPU context", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the GPU context", __func__);
return NULL;
}

if (virtgpu_shmem_create(gpu, SHMEM_REPLY_SIZE, &gpu->reply_shmem)) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to create the shared reply memory pages", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to create the shared reply memory pages", __func__);
return NULL;
}

if (virtgpu_shmem_create(gpu, SHMEM_DATA_SIZE, &gpu->data_shmem)) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to create the shared data memory pages", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to create the shared data memory pages", __func__);
return NULL;
}

if (virtgpu_handshake(gpu)) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to handshake with the virglrenderer library", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to handshake with the virglrenderer library", __func__);
return NULL;
}

if (virtgpu_load_library(gpu) != APIR_LOAD_LIBRARY_SUCCESS) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to load the backend library", __func__);
GGML_ABORT(GGML_VIRTGPU "%s: failed to load the backend library", __func__);
return NULL;
}

@@ -243,8 +235,7 @@ static virt_gpu_result_t virtgpu_open(virtgpu * gpu) {
drmDevicePtr devs[8];
int count = drmGetDevices2(0, devs, ARRAY_SIZE(devs));
if (count < 0) {
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: failed to enumerate DRM devices\n", __func__);
GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to enumerate DRM devices\n", __func__);
return APIR_ERROR_INITIALIZATION_FAILED;
}

@@ -266,19 +257,17 @@ static virt_gpu_result_t virtgpu_open_device(virtgpu * gpu, const drmDevicePtr d

int fd = open(node_path, O_RDWR | O_CLOEXEC);
if (fd < 0) {
GGML_ABORT(GGML_VIRTGPU
"%s: failed to open %s", __func__, node_path);
GGML_ABORT(GGML_VIRTGPU "%s: failed to open %s", __func__, node_path);
return APIR_ERROR_INITIALIZATION_FAILED;
}

drmVersionPtr version = drmGetVersion(fd);
if (!version || strcmp(version->name, "virtio_gpu") || version->version_major != 0) {
if (version) {
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: unknown DRM driver %s version %d\n", __func__, version->name, version->version_major);
GGML_LOG_ERROR(GGML_VIRTGPU "%s: unknown DRM driver %s version %d\n", __func__, version->name,
version->version_major);
} else {
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: failed to get DRM driver version\n", __func__);
GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to get DRM driver version\n", __func__);
}

if (version) {
@@ -322,9 +311,8 @@ static virt_gpu_result_t virtgpu_init_capset(virtgpu * gpu) {
virtgpu_ioctl_get_caps(gpu, gpu->capset.id, gpu->capset.version, &gpu->capset.data, sizeof(gpu->capset.data));

if (ret) {
GGML_LOG_ERROR(GGML_VIRTGPU
"%s: failed to get APIR v%d capset: %s\n",
__func__, gpu->capset.version, strerror(errno));
GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to get APIR v%d capset: %s\n", __func__, gpu->capset.version,
strerror(errno));
return APIR_ERROR_INITIALIZATION_FAILED;
}

@@ -547,13 +535,10 @@ static void log_call_duration(long long call_duration_ns, const char * name) {
double call_duration_s = (double) call_duration_ns / 1e9; // 1 second = 1e9 nanoseconds

if (call_duration_s > 1) {
GGML_LOG_INFO(GGML_VIRTGPU
"waited %.2fs for the %s host reply...\n", call_duration_s, name);
GGML_LOG_INFO(GGML_VIRTGPU "waited %.2fs for the %s host reply...\n", call_duration_s, name);
} else if (call_duration_ms > 1) {
GGML_LOG_INFO(GGML_VIRTGPU
"waited %.2fms for the %s host reply...\n", call_duration_ms, name);
GGML_LOG_INFO(GGML_VIRTGPU "waited %.2fms for the %s host reply...\n", call_duration_ms, name);
} else {
GGML_LOG_INFO(GGML_VIRTGPU
"waited %lldns for the %s host reply...\n", call_duration_ns, name);
GGML_LOG_INFO(GGML_VIRTGPU "waited %lldns for the %s host reply...\n", call_duration_ns, name);
}
}

@@ -1,5 +1,6 @@
#pragma once

// clang-format off
#include "virtgpu-utils.h"
#include "virtgpu-shm.h"
#include "virtgpu-apir.h"
@@ -23,20 +24,21 @@
#include "apir_hw.h"
#include <drm/virtgpu_drm.h>
#include "venus_hw.h"
// clang-format on

#ifndef VIRTGPU_DRM_CAPSET_APIR
// Will be defined in include/drm/virtgpu_drm.h when
// https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590/diffs
// is merged
#define VIRTGPU_DRM_CAPSET_APIR 10
# define VIRTGPU_DRM_CAPSET_APIR 10
#endif

// Mesa/Virglrenderer Venus internal. Only necessary during the
// Venus->APIR transition in Virglrenderer
#define VENUS_COMMAND_TYPE_LENGTH 331

#ifndef VIRTGPU_DRM_CAPSET_VENUS // only available with Linux >= v6.16
#define VIRTGPU_DRM_CAPSET_VENUS 4
#ifndef VIRTGPU_DRM_CAPSET_VENUS // only available with Linux >= v6.16
# define VIRTGPU_DRM_CAPSET_VENUS 4
#endif

typedef uint32_t virgl_renderer_capset;

@@ -532,6 +532,7 @@ class MODEL_TENSOR(IntEnum):
FFN_GATE_EXP = auto()
FFN_DOWN_EXP = auto()
FFN_UP_EXP = auto()
FFN_GATE_UP_EXP = auto()
FFN_GATE_SHEXP = auto()
FFN_DOWN_SHEXP = auto()
FFN_UP_SHEXP = auto()
@@ -980,6 +981,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
MODEL_TENSOR.FFN_GATE_UP_EXP: "blk.{bid}.ffn_gate_up_exps",
MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b",
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n
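Aside (not part of the patch): assuming the additions above are applied, the new enum value resolves to its per-block GGUF tensor name through the same template lookup as the other expert tensors. A minimal gguf-py sketch:

from gguf.constants import MODEL_TENSOR, TENSOR_NAMES

# Hypothetical lookup, shown only to illustrate the naming convention added above.
name = TENSOR_NAMES[MODEL_TENSOR.FFN_GATE_UP_EXP].format(bid=0)
print(name)  # -> blk.0.ffn_gate_up_exps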
@@ -1820,6 +1822,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_GATE_UP_EXP,
MODEL_TENSOR.SSM_A,
MODEL_TENSOR.SSM_CONV1D,
MODEL_TENSOR.SSM_DT,
@@ -1909,6 +1912,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_GATE_UP_EXP,
MODEL_TENSOR.SSM_A,
MODEL_TENSOR.SSM_CONV1D,
MODEL_TENSOR.SSM_DT,
@@ -2610,6 +2614,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_UP_EXP,
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,

@@ -567,6 +567,10 @@ class TensorNameMap:
"model.layers.{bid}.mlp.chunk_experts.gate_proj", # grovemoe
),

MODEL_TENSOR.FFN_GATE_UP_EXP: (
"model.layers.{bid}.mlp.experts.gate_up_proj",
),

# Feed-forward down
MODEL_TENSOR.FFN_DOWN: (
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox

@@ -349,6 +349,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_GATE_UP_EXPS, "blk.%d.ffn_gate_up_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
@@ -1004,6 +1005,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
LLM_TENSOR_FFN_UP_EXPS,
LLM_TENSOR_FFN_GATE_UP_EXPS,
LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_DOWN_SHEXP,
@@ -1061,6 +1063,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
LLM_TENSOR_FFN_UP_EXPS,
LLM_TENSOR_FFN_GATE_UP_EXPS,
LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_DOWN_SHEXP,
@@ -1601,6 +1604,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
LLM_TENSOR_FFN_UP_EXPS,
LLM_TENSOR_FFN_GATE_UP_EXPS,
LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_DOWN_SHEXP,
@@ -2685,6 +2689,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_GATE_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_DOWN_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_GATE_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_UP_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},

@@ -373,6 +373,7 @@ enum llm_tensor {
LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
LLM_TENSOR_FFN_GATE_EXPS,
LLM_TENSOR_FFN_UP_EXPS,
LLM_TENSOR_FFN_GATE_UP_EXPS,
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_UP_SHEXP,

@@ -1165,7 +1165,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
ggml_tensor * probs_in) const {
ggml_tensor * probs_in,
ggml_tensor * gate_up_exps) const {
return build_moe_ffn(
cur,
gate_inp, /* gate_inp_b */ nullptr,
@@ -1181,7 +1182,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
w_scale,
gating_op,
il,
probs_in
probs_in,
gate_up_exps
);
}

@@ -1204,7 +1206,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
ggml_tensor * probs_in) const {
ggml_tensor * probs_in,
ggml_tensor * gate_up_exps,
ggml_tensor * gate_up_exps_b) const {
const int64_t n_embd = cur->ne[0];
const int64_t n_tokens = cur->ne[1];
const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
@@ -1343,26 +1347,48 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cb(cur, "ffn_moe_weighted", il);
}

ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
cb(up, "ffn_moe_up", il);

if (up_exps_b) {
up = ggml_add_id(ctx0, up, up_exps_b, selected_experts);
cb(up, "ffn_moe_up_biased", il);
}

ggml_tensor * up = nullptr;
ggml_tensor * experts = nullptr;
if (gate_exps) {
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]

if (gate_up_exps) {
// merged gate_up path: one mul_mat_id, then split into gate and up views
ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts); // [n_ff*2, n_expert_used, n_tokens]
cb(gate_up, "ffn_moe_gate_up", il);

if (gate_up_exps_b) {
gate_up = ggml_add_id(ctx0, gate_up, gate_up_exps_b, selected_experts);
cb(gate_up, "ffn_moe_gate_up_biased", il);
}

const int64_t n_ff = gate_up->ne[0] / 2;
cur = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], 0);
cb(cur, "ffn_moe_gate", il);
up = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], n_ff * gate_up->nb[0]);
cb(up, "ffn_moe_up", il);
} else {
cur = up;
// separate gate and up path
up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
cb(up, "ffn_moe_up", il);

if (up_exps_b) {
up = ggml_add_id(ctx0, up, up_exps_b, selected_experts);
cb(up, "ffn_moe_up_biased", il);
}

if (gate_exps) {
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
cb(cur, "ffn_moe_gate", il);
} else {
cur = up;
}

if (gate_exps_b) {
cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
cb(cur, "ffn_moe_gate_biased", il);
}
}

if (gate_exps_b) {
cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
cb(cur, "ffn_moe_gate_biased", il);
}
const bool has_gate = gate_exps || gate_up_exps;

switch (type_op) {
case LLM_FFN_SILU:
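Aside (not part of the patch): a minimal NumPy sketch of why the merged gate_up path above matches the separate gate and up matmuls; sizes and variable names are illustrative only. Concatenating the gate and up weights along the n_ff dimension lets one matrix product compute both projections, which are then split back into a gate half and an up half, mirroring the two ggml_view_3d calls on the fused mul_mat_id result.

import numpy as np

# Illustrative sizes only.
n_embd, n_ff = 8, 16
rng = np.random.default_rng(0)

w_gate = rng.standard_normal((n_ff, n_embd))  # per-expert gate projection
w_up   = rng.standard_normal((n_ff, n_embd))  # per-expert up projection
x      = rng.standard_normal(n_embd)          # hidden state of one routed token

# Separate path: two matrix products.
gate_sep = w_gate @ x
up_sep   = w_up @ x

# Fused path: concatenate along the n_ff dimension (gate first, then up),
# run one matrix product, then split the result into two halves.
w_gate_up = np.concatenate([w_gate, w_up], axis=0)  # (2 * n_ff, n_embd)
fused     = w_gate_up @ x                           # (2 * n_ff,)
gate_fused, up_fused = fused[:n_ff], fused[n_ff:]

assert np.allclose(gate_sep, gate_fused)
assert np.allclose(up_sep, up_fused)
# SwiGLU then applies silu(gate) * up to either pair, producing the same activations.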
@@ -1385,7 +1411,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
break;
}
}
}

if (has_gate) {
cur = ggml_swiglu_split(ctx0, cur, up);
cb(cur, "ffn_moe_swiglu", il);
} else {
@@ -1393,7 +1421,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cb(cur, "ffn_moe_silu", il);
} break;
case LLM_FFN_GELU:
if (gate_exps) {
if (has_gate) {
cur = ggml_geglu_split(ctx0, cur, up);
cb(cur, "ffn_moe_geglu", il);
} else {
@@ -1409,7 +1437,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cb(cur, "ffn_moe_swiglu_oai", il);
} break;
case LLM_FFN_RELU:
if (gate_exps) {
if (has_gate) {
cur = ggml_reglu_split(ctx0, cur, up);
cb(cur, "ffn_moe_reglu", il);
} else {
@@ -1417,7 +1445,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cb(cur, "ffn_moe_relu", il);
} break;
case LLM_FFN_RELU_SQR:
if (gate_exps) {
if (has_gate) {
// TODO: add support for gated squared relu
GGML_ABORT("fatal error: gated squared relu not implemented");
} else {

@@ -814,7 +814,8 @@ struct llm_graph_context {
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
ggml_tensor * probs_in = nullptr) const;
ggml_tensor * probs_in = nullptr,
ggml_tensor * gate_up_exps = nullptr) const;

ggml_tensor * build_moe_ffn(
ggml_tensor * cur,
@@ -835,7 +836,9 @@ struct llm_graph_context {
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
ggml_tensor * probs_in = nullptr) const;
ggml_tensor * probs_in = nullptr,
ggml_tensor * gate_up_exps = nullptr,
ggml_tensor * gate_up_exps_b = nullptr) const;

//
// inputs

@@ -978,6 +978,9 @@ bool llama_kv_cache::get_can_shift() const {
if (model.arch == LLM_ARCH_STEP35) {
return false;
}
if (hparams.n_pos_per_embd() > 1) {
return false;
}
return true;
}

@@ -2980,6 +2980,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

// TODO: move to a separate function
const auto tn = LLM_TN(arch);

// helper: try merged gate_up_exps first, fall back to separate gate and up
auto create_tensor_gate_up_exps = [&](llama_layer & layer, int bid, int64_t n_embd_, int64_t n_ff_, int64_t n_expert_, int flags) {
layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", bid), {n_embd_, n_ff_ * 2, n_expert_}, TENSOR_NOT_REQUIRED);
if (layer.ffn_gate_up_exps == nullptr) {
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", bid), {n_embd_, n_ff_, n_expert_}, flags);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", bid), {n_embd_, n_ff_, n_expert_}, flags);
}
};
switch (arch) {
case LLM_ARCH_LLAMA:
case LLM_ARCH_REFACT:
@@ -5221,9 +5230,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}

// MoE branch
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);

// Shared expert branch
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
@@ -7425,9 +7433,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}

layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);

// Shared experts
layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
@@ -7491,9 +7498,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}

layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);

// Shared experts
const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;

@@ -280,14 +280,16 @@ struct llama_layer {
struct ggml_tensor * ffn_up_enc = nullptr;

// ff MoE
struct ggml_tensor * ffn_gate_inp = nullptr;
struct ggml_tensor * ffn_gate_exps = nullptr;
struct ggml_tensor * ffn_down_exps = nullptr;
struct ggml_tensor * ffn_up_exps = nullptr;
struct ggml_tensor * ffn_gate_inp_b = nullptr;
struct ggml_tensor * ffn_gate_exps_b = nullptr;
struct ggml_tensor * ffn_down_exps_b = nullptr;
struct ggml_tensor * ffn_up_exps_b = nullptr;
struct ggml_tensor * ffn_gate_inp = nullptr;
struct ggml_tensor * ffn_gate_exps = nullptr;
struct ggml_tensor * ffn_down_exps = nullptr;
struct ggml_tensor * ffn_up_exps = nullptr;
struct ggml_tensor * ffn_gate_up_exps = nullptr;
struct ggml_tensor * ffn_gate_inp_b = nullptr;
struct ggml_tensor * ffn_gate_exps_b = nullptr;
struct ggml_tensor * ffn_down_exps_b = nullptr;
struct ggml_tensor * ffn_up_exps_b = nullptr;
struct ggml_tensor * ffn_gate_up_exps_b = nullptr;

// ff shared expert (shexp)
struct ggml_tensor * ffn_gate_inp_shexp = nullptr;

@@ -218,7 +218,9 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
LLM_FFN_SILU, hparams.expert_weights_norm,
hparams.expert_weights_scale, hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
il,
nullptr,
model.layers[il].ffn_gate_up_exps);
cb(moe_out, "ffn_moe_out", il);

// FFN shared expert

@@ -380,7 +380,8 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_ffn(ggml_tensor * cur, const int
model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used, LLM_FFN_SILU,
true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
nullptr, model.layers[il].ffn_gate_up_exps);
cb(moe_out, "ffn_moe_out", il);

// Add shared experts if present - following Qwen3Next reference implementation

@@ -479,7 +479,8 @@ ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int
model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used, LLM_FFN_SILU,
true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
nullptr, model.layers[il].ffn_gate_up_exps);
cb(moe_out, "ffn_moe_out", il);

// Add shared experts if present - following Qwen3Next reference implementation

@@ -2363,7 +2363,7 @@ private:
//printf("[DEBUG] `do_reset` was set to `true` after failing to restore a checkpoint");
} else {
pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max));
n_past = slot.prompt.tokens.size_up_to_pos(pos_next);
n_past = std::min(slot.prompt.tokens.size_up_to_pos(pos_next), (size_t) it->n_tokens);
SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n", it->pos_min, it->pos_max, it->n_tokens, (float) checkpoint_size / 1024 / 1024);
}
}

@@ -291,7 +291,9 @@ void server_models::load_models() {
for (const auto & [name, inst] : mapping) {
std::string val;
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val)) {
models_to_load.push_back(name);
if (common_arg_utils::is_truthy(val)) {
models_to_load.push_back(name);
}
}
}
if ((int)models_to_load.size() > base_params.models_max) {