Compare commits

22 Commits

Author  SHA1  Message  Date
Georgi Gerganov  e7fbfc9b80  ci : tmp fixes  2026-02-11 15:48:40 +02:00
Andreas Kieslinger  d46bd7ef2d  Apply suggestion from @ggerganov (src->buffer to buf_src) v2 (Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>)  2026-02-11 13:54:57 +02:00
Andreas Kieslinger  070933684f  Apply suggestion from @ggerganov (src->buffer to buf_src) (Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>)  2026-02-11 13:54:25 +02:00
aendk  1528c841dc  Simplifies synchronizations to adhere to saaasg pattern.  2026-02-11 13:54:25 +02:00
aendk  ff28ae93a2  Corrects initialization of ggml_backend_sync_mode in ggml_backend_sched_split initialization  2026-02-11 13:54:25 +02:00
aendk  01d89f9b96  Reintroduces stricter check for CPU->CUDA backend async copy via GGML_DEVICE_TYPE_CPU.  2026-02-11 13:54:25 +02:00
aendk  e74b070e30  Makes opt-in to relax use of explicit syncs more general. Backends like vulkan which require a synchronization between HtoD copies and graph execution could also adopt this change now.  2026-02-11 13:54:25 +02:00
aendk  b7376c3ed7  Minor cleanup  2026-02-11 13:54:24 +02:00
aendk  d776354dc9  Relax requirement of checks in async CUDA copies from backend and buffer type to just buffer type, to avoid linking issues  2026-02-11 13:54:23 +02:00
aendk  79a77277ad  Reworked backend detection in ggml-backend.cpp to avoid linking conflicts  2026-02-11 13:53:43 +02:00
aendk  44e481bb34  Adds macro guards to allow compilation in non-CUDA builds  2026-02-11 13:53:43 +02:00
aendk  91c6026b5c  Exchanges synchronous copy with async copy function.  2026-02-11 13:53:43 +02:00
aendk  2ad0d391e1  Adds function to relax sync requirements between input copies on supported backends (CUDA for now)  2026-02-11 13:53:43 +02:00
aendk  dd9f1faf42  Adds CPU-to-CUDA copy capability to ggml_backend_cuda_cpy_tensor_async()  2026-02-11 13:53:41 +02:00
Georgi Gerganov  a554bdd70f  metal : fix event synchronization in cpy_tensor_async (#19402)  2026-02-11 13:52:47 +02:00
Johannes Gäßler  02ee504f90  fix output pattern  2026-02-09 23:04:31 +01:00
Johannes Gäßler  94c66557b8  re-use buffers + ggml contexts  2026-02-09 00:37:11 +01:00
Johannes Gäßler  6ab02d0908  unconditional peer access  2026-02-08 12:20:28 +01:00
Johannes Gäßler  2b7055381c  add support for 4/8 GPUs  2026-02-07 23:08:10 +01:00
Johannes Gäßler  b7fd10664e  partial Vulkan fix  2026-02-07 15:31:57 +01:00
Johannes Gäßler  a630b27da7  support for GPT-OSS, Qwen 3 MoE  2026-02-06 22:13:53 +01:00
Johannes Gäßler  39b96f8fe1  ggml: backend-agnostic tensor parallelism  2026-02-05 21:49:34 +01:00
38 changed files with 1891 additions and 262 deletions

View File

@@ -468,7 +468,7 @@ jobs:
export GGML_VK_VISIBLE_DEVICES=0
export GGML_VK_DISABLE_F16=1
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 4800
ctest -L main --verbose --timeout 4200
ubuntu-24-cmake-webgpu:
runs-on: ubuntu-24.04

View File

@@ -413,8 +413,6 @@ function gg_run_qwen3_0_6b {
./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)
(time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
(time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
(time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -540,8 +538,6 @@ function gg_run_embd_bge_small {
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
(time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -582,8 +578,6 @@ function gg_run_rerank_tiny {
model_f16="${path_models}/ggml-model-f16.gguf"
(time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
# for this model, the SEP token is "</s>"
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
@@ -683,8 +677,8 @@ fi
ret=0
test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release
#test $ret -eq 0 && gg_run ctest_debug
#test $ret -eq 0 && gg_run ctest_release
if [ ! -z ${GG_BUILD_HIGH_PERF} ]; then
test $ret -eq 0 && gg_run test_backend_ops_cpu

View File

@@ -2331,19 +2331,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
add_opt(common_arg(
{"-sm", "--split-mode"}, "{none,layer,row}",
{"-sm", "--split-mode"}, "{none,layer,row,tensor}",
"how to split the model across multiple GPUs, one of:\n"
"- none: use one GPU only\n"
"- layer (default): split layers and KV across GPUs\n"
"- row: split rows across GPUs",
"- layer (default): split layers and KV across GPUs (pipelined)\n"
"- row: split weight across GPUs by rows (parallelized)\n"
"- tensor: split weights and KV across GPUs (parallelized)",
[](common_params & params, const std::string & value) {
std::string arg_next = value;
if (arg_next == "none") {
if (value == "none") {
params.split_mode = LLAMA_SPLIT_MODE_NONE;
} else if (arg_next == "layer") {
} else if (value == "layer") {
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
} else if (arg_next == "row") {
} else if (value == "row") {
params.split_mode = LLAMA_SPLIT_MODE_ROW;
} else if (value == "tensor") {
params.split_mode = LLAMA_SPLIT_MODE_TENSOR;
} else {
throw std::invalid_argument("invalid value");
}

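For reference, a minimal sketch of how the new mode would be selected from the command line (the binary name and the flags other than -sm follow the CI scripts above; the model path is illustrative):

# illustrative invocation: split weights and KV across all visible GPUs
./bin/llama-completion -no-cnv --model model.gguf -ngl 99 -sm tensor -p "I believe the meaning of life is"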
View File

@@ -68,7 +68,7 @@ extern "C" {
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API void ggml_backend_tensor_copy(const struct ggml_tensor * src, struct ggml_tensor * dst);
//
// Backend (stream)
@@ -109,7 +109,18 @@ extern "C" {
// the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
// asynchronous tensor shuffle
// - src1, dst1 belong to backend_1
// - src2, dst2 belong to backend_2
// - src1 is copied to dst2
// - src2 is copied to dst1
// - both backends wait until both copies have completed
GGML_API void ggml_backend_tensor_shfl_async(
ggml_backend_t backend_1, ggml_backend_t backend_2,
const struct ggml_tensor * src1, const struct ggml_tensor * src2,
struct ggml_tensor * dst1, struct ggml_tensor * dst2);
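A minimal usage sketch for the shuffle primitive declared above (assumes backend_1/backend_2 and the four tensors were created and allocated elsewhere, with matching layouts for src1/dst1 and src2/dst2):
// exchange src1 (on backend_1) with src2 (on backend_2) in a single call;
// falls back to two async copies if the backends do not implement shfl_tensor_async
ggml_backend_tensor_shfl_async(backend_1, backend_2, src1, src2, dst1, dst2);
// only needed if the host must observe the result immediately
ggml_backend_synchronize(backend_1);
ggml_backend_synchronize(backend_2);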
GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
@@ -135,7 +146,9 @@ extern "C" {
// integrated GPU device using host memory
GGML_BACKEND_DEVICE_TYPE_IGPU,
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_ACCEL
GGML_BACKEND_DEVICE_TYPE_ACCEL,
// "meta" device wrapping multiple other devices for tensor parallelism
GGML_BACKEND_DEVICE_TYPE_META,
};
// functionality supported by the device
@@ -211,6 +224,52 @@ extern "C" {
};
typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
//
// Meta backend
//
enum ggml_backend_meta_split_state {
// tensor split by tensor dimensions:
GGML_BACKEND_SPLIT_STATE_BY_NE0 = 0,
GGML_BACKEND_SPLIT_STATE_BY_NE1 = 1,
GGML_BACKEND_SPLIT_STATE_BY_NE2 = 2,
GGML_BACKEND_SPLIT_STATE_BY_NE3 = 3,
GGML_BACKEND_SPLIT_STATE_MIRRORED = 10, // all values on all backends
GGML_BACKEND_SPLIT_STATE_PARTIAL = 11, // each backend has a partial sum
// for internal bookkeeping only:
GGML_BACKEND_SPLIT_STATE_NONE = 98,
GGML_BACKEND_SPLIT_STATE_UNKNOWN = 99,
};
// function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
typedef enum ggml_backend_meta_split_state (*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
GGML_API bool ggml_backend_dev_is_meta(ggml_backend_dev_t dev);
GGML_API size_t ggml_backend_meta_dev_n_devs(ggml_backend_dev_t meta_dev);
GGML_API ggml_backend_dev_t ggml_backend_meta_dev_simple_dev(ggml_backend_dev_t meta_dev, size_t index);
// create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
GGML_API ggml_backend_dev_t ggml_backend_meta_device(
ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
GGML_API bool ggml_backend_buft_is_meta(ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_meta_buft_n_bufts(ggml_backend_buffer_type_t meta_buft);
GGML_API ggml_backend_buffer_type_t ggml_backend_meta_buft_simple_buft(ggml_backend_buffer_type_t meta_buft, size_t index);
GGML_API bool ggml_backend_buffer_is_meta(ggml_backend_buffer_t buf);
GGML_API size_t ggml_backend_meta_buffer_n_bufs(ggml_backend_buffer_t meta_buf);
GGML_API ggml_backend_buffer_t ggml_backend_meta_buffer_simple_buffer(ggml_backend_buffer_t meta_buf, size_t index);
GGML_API struct ggml_tensor * ggml_backend_meta_buffer_simple_tensor(const struct ggml_tensor * tensor, size_t index);
GGML_API bool ggml_backend_is_meta(ggml_backend_t backend);
GGML_API size_t ggml_backend_meta_n_backends(ggml_backend_t meta_backend);
GGML_API ggml_backend_t ggml_backend_meta_simple_backend(ggml_backend_t meta_backend, size_t index);
GGML_API enum ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync);
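A minimal sketch of how the meta-device API above could be wired up (assumptions: at least two non-CPU devices are reachable via ggml_backend_dev_get(), and the callback mirrors every tensor, which is the simplest valid policy):
// illustrative split-state callback: keep a full copy of every tensor on each device
static enum ggml_backend_meta_split_state mirror_all(const struct ggml_tensor * tensor, void * userdata) {
    (void) tensor; (void) userdata;
    return GGML_BACKEND_SPLIT_STATE_MIRRORED;
}

ggml_backend_dev_t devs[2] = { ggml_backend_dev_get(0), ggml_backend_dev_get(1) };
ggml_backend_dev_t meta   = ggml_backend_meta_device(devs, 2, mirror_all, /*get_split_state_ud =*/ NULL);
// meta buffer type / buffer / backend are then derived from this device as usual
ggml_backend_t backend    = ggml_backend_dev_init(meta, NULL);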
//
// Backend registry
//

View File

@@ -200,6 +200,7 @@ add_library(ggml-base
ggml.cpp
ggml-alloc.c
ggml-backend.cpp
ggml-backend-meta.cpp
ggml-opt.cpp
ggml-threading.cpp
ggml-threading.h

View File

@@ -1,5 +1,6 @@
#include "ggml-alloc.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml.h"
#include "ggml-impl.h"
#include <assert.h>
@@ -1240,6 +1241,9 @@ size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx,
}
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
if (ggml_backend_buft_is_meta(buft)) {
return ggml_backend_meta_alloc_ctx_tensors_from_buft(ctx, buft);
}
size_t nbytes_total = 0;
return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
}

View File

@@ -2,7 +2,9 @@
// ggml-backend internal header
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
@@ -90,9 +92,16 @@ extern "C" {
void (*free)(ggml_backend_t backend);
// (optional) asynchronous tensor data access
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
void (*set_tensor_async) (ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async) (ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
void (*set_tensor_2d_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
void (*get_tensor_2d_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
bool (*shfl_tensor_async)(ggml_backend_t backend_1, ggml_backend_t backend_2,
const struct ggml_tensor * src1, const struct ggml_tensor * src2, struct ggml_tensor * dst1, struct ggml_tensor * dst2);
// (optional) backend-specific AllReduce operation for meta backend
bool (*allreduce_tensor_async)(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
// (optional) complete all pending operations (required if the backend supports async operations)
void (*synchronize)(ggml_backend_t backend);
@@ -250,6 +259,9 @@ extern "C" {
# define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
#endif
// temporary workaround to statically allocate tensors from a context in a deduplicated way:
GGML_API struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large

View File

@@ -123,7 +123,7 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
// get_base is optional if the buffer is zero-sized
if (buffer->size == 0) {
if (!ggml_backend_buffer_is_meta(buffer) && buffer->size == 0) {
return NULL;
}
@@ -388,7 +388,7 @@ ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
// backend copy
void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
void ggml_backend_tensor_copy(const struct ggml_tensor * src, struct ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
if (src == dst) {
@@ -402,7 +402,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
} else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
#endif
#endif // NDEBUG
size_t nbytes = ggml_nbytes(src);
void * data = malloc(nbytes);
ggml_backend_tensor_get(src, data, 0, nbytes);
@@ -411,7 +411,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
}
}
void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
if (src == dst) {
@@ -432,6 +432,20 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
ggml_backend_tensor_copy(src, dst);
}
void ggml_backend_tensor_shfl_async(
ggml_backend_t backend_1, ggml_backend_t backend_2,
const struct ggml_tensor * src1, const struct ggml_tensor * src2,
struct ggml_tensor * dst1, struct ggml_tensor * dst2) {
GGML_ASSERT(ggml_are_same_layout(src1, dst1) && "cannot shuffle tensors with different layouts");
GGML_ASSERT(ggml_are_same_layout(src2, dst2) && "cannot shuffle tensors with different layouts");
if (backend_1->iface.shfl_tensor_async != NULL) {
if (backend_1->iface.shfl_tensor_async(backend_1, backend_2, src1, src2, dst1, dst2)) {
return;
}
}
ggml_backend_tensor_copy_async(backend_1, backend_2, src1, dst2);
ggml_backend_tensor_copy_async(backend_2, backend_1, src2, dst1);
}
// events
ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
@@ -500,6 +514,7 @@ enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
}
void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
GGML_ASSERT(device);
memset(props, 0, sizeof(*props));
device->iface.get_props(device, props);
}
@@ -1455,6 +1470,10 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
int split_backend_id = split->backend_id;
ggml_backend_t split_backend = sched->backends[split_backend_id];
if (sched->events[split_backend_id][sched->cur_copy] == NULL) {
ggml_backend_synchronize(split_backend);
}
// copy the input tensors to the split backend
for (int input_id = 0; input_id < split->n_inputs; input_id++) {
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
@@ -1465,16 +1484,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
ggml_backend_tensor_copy(input, input_cpy);
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
} else {
// wait for the split backend to finish using the input before overwriting it
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
// when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used
@@ -1578,6 +1593,10 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
}
if (sched->events[split_backend_id][sched->cur_copy] == NULL) {
ggml_backend_synchronize(split_backend);
}
if (!sched->callback_eval) {
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
if (ec != GGML_STATUS_SUCCESS) {
@@ -1899,8 +1918,9 @@ enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct
GGML_ASSERT(tensor->data == NULL);
GGML_ASSERT(tensor->view_src == NULL);
GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
(char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
GGML_ASSERT(ggml_backend_buffer_is_meta(buffer) ||
(char *) addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
(char *) ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
tensor->buffer = buffer;
tensor->data = addr;

View File

@@ -260,8 +260,12 @@ static struct ggml_backend_i blas_backend_i = {
/* .get_name = */ ggml_backend_blas_get_name,
/* .free = */ ggml_backend_blas_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -2582,7 +2582,11 @@ static const ggml_backend_i ggml_backend_cann_interface = {
/* .free = */ ggml_backend_cann_free,
/* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_cann_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -195,7 +195,11 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
/* .free = */ ggml_backend_cpu_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,

View File

@@ -309,6 +309,19 @@ static ggml_cuda_device_info ggml_cuda_init() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
for (int id = 0; id < info.device_count; ++id) {
ggml_cuda_set_device(id);
for (int id_other = 0; id_other < info.device_count; ++id_other) {
if (id == id_other) {
continue;
}
int can_access_peer;
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
if (can_access_peer) {
CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
}
}
}
return info;
}
@@ -1371,64 +1384,6 @@ static void ggml_cuda_op_mul_mat_cublas(
GGML_UNUSED_VARS(dst, src1_ddq_i, src1_padded_row_size);
}
static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
static bool peer_access_enabled = false;
const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
if (peer_access_enabled == enable_peer_access) {
return;
}
#ifdef NDEBUG
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
ggml_cuda_set_device(id);
CUDA_CHECK(cudaDeviceSynchronize());
}
for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
ggml_cuda_set_device(id);
for (int id_other = 0; id_other < ggml_backend_cuda_get_device_count(); ++id_other) {
if (id == id_other) {
continue;
}
if (id != main_device && id_other != main_device) {
continue;
}
int can_access_peer;
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
if (can_access_peer) {
if (enable_peer_access) {
cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
if (err != cudaErrorPeerAccessAlreadyEnabled) {
CUDA_CHECK(err);
} else {
// reset the error
(void)cudaGetLastError();
}
} else {
cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
if (err != cudaErrorPeerAccessNotEnabled) {
CUDA_CHECK(err);
} else {
// reset the error
(void)cudaGetLastError();
}
}
}
}
}
ggml_cuda_set_device(main_device);
#endif // NDEBUG
peer_access_enabled = enable_peer_access;
GGML_UNUSED(main_device);
}
static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
@@ -2420,11 +2375,6 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
}
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
// why is this here instead of mul_mat?
if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
}
switch (dst->op) {
case GGML_OP_ARGMAX:
ggml_cuda_argmax(ctx, dst);
@@ -2800,29 +2750,35 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
//enables async copies from CPU to CUDA, instead of only CUDA-to-CUDA
bool copy_from_host = ggml_backend_buffer_is_host(buf_src) && ggml_backend_dev_type(backend_src->device) == GGML_BACKEND_DEVICE_TYPE_CPU;
if (!(copy_from_host || ggml_backend_is_cuda(backend_src)) || !ggml_backend_is_cuda(backend_dst)) {
return false;
}
if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
if (!(copy_from_host || ggml_backend_buffer_is_cuda(buf_src)) || !ggml_backend_buffer_is_cuda(buf_dst)) {
return false;
}
// device -> device copy
ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *) backend_src->context;
ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *) backend_dst->context;
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *) buf_src->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *) buf_dst->context;
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
if ((copy_from_host && cuda_ctx_dst->device != buf_ctx_dst->device) ||
!copy_from_host && (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device)) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
#endif
#endif // NDEBUG
return false;
}
if (backend_src != backend_dst) {
if (copy_from_host) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyHostToDevice, cuda_ctx_dst->stream()));
} else if (backend_src != backend_dst) {
// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
@@ -2831,7 +2787,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
return false;
#else
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
#endif
#endif // GGML_CUDA_NO_PEER_COPY
}
// record event on src stream after the copy
@@ -2851,6 +2807,77 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
return true;
}
static bool ggml_backend_cuda_shfl_tensor_async(
ggml_backend_t backend_1, ggml_backend_t backend_2, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst1, ggml_tensor * dst2) {
ggml_backend_buffer_t buf_src1 = src1->view_src ? src1->view_src->buffer : src1->buffer;
ggml_backend_buffer_t buf_src2 = src2->view_src ? src2->view_src->buffer : src2->buffer;
ggml_backend_buffer_t buf_dst1 = dst1->view_src ? dst1->view_src->buffer : dst1->buffer;
ggml_backend_buffer_t buf_dst2 = dst2->view_src ? dst2->view_src->buffer : dst2->buffer;
if (!ggml_backend_is_cuda(backend_1) || !ggml_backend_is_cuda(backend_2)) {
return false;
}
if (!ggml_backend_buffer_is_cuda(buf_src1) || !ggml_backend_buffer_is_cuda(buf_src2) ||
!ggml_backend_buffer_is_cuda(buf_dst1) || !ggml_backend_buffer_is_cuda(buf_dst2)) {
return false;
}
// device -> device copy
ggml_backend_cuda_context * cuda_ctx_1 = (ggml_backend_cuda_context *) backend_1->context;
ggml_backend_cuda_context * cuda_ctx_2 = (ggml_backend_cuda_context *) backend_2->context;
ggml_backend_cuda_buffer_context * buf_ctx_src1 = (ggml_backend_cuda_buffer_context *) buf_src1->context;
ggml_backend_cuda_buffer_context * buf_ctx_src2 = (ggml_backend_cuda_buffer_context *) buf_src2->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst1 = (ggml_backend_cuda_buffer_context *) buf_dst1->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst2 = (ggml_backend_cuda_buffer_context *) buf_dst2->context;
if (cuda_ctx_1->device != buf_ctx_src1->device || cuda_ctx_2->device != buf_ctx_src2->device ||
cuda_ctx_1->device != buf_ctx_dst1->device || cuda_ctx_2->device != buf_ctx_dst2->device) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
#endif // NDEBUG
return false;
}
if (backend_1 != backend_2) {
// Copies under control of src streams:
if (cuda_ctx_1->device == cuda_ctx_2->device) {
CUDA_CHECK(cudaMemcpyAsync(dst2->data, src1->data, ggml_nbytes(dst2), cudaMemcpyDeviceToDevice, cuda_ctx_1->stream()));
CUDA_CHECK(cudaMemcpyAsync(dst1->data, src2->data, ggml_nbytes(dst1), cudaMemcpyDeviceToDevice, cuda_ctx_2->stream()));
} else {
#ifdef GGML_CUDA_NO_PEER_COPY
return false;
#else
CUDA_CHECK(cudaMemcpyPeerAsync(dst2->data, cuda_ctx_2->device, src1->data, cuda_ctx_1->device, ggml_nbytes(dst2), cuda_ctx_1->stream()));
CUDA_CHECK(cudaMemcpyPeerAsync(dst1->data, cuda_ctx_1->device, src2->data, cuda_ctx_2->device, ggml_nbytes(dst1), cuda_ctx_2->stream()));
#endif // GGML_CUDA_NO_PEER_COPY
}
// Record event on src streams after the copy:
if (!cuda_ctx_1->copy_event) {
ggml_cuda_set_device(cuda_ctx_1->device);
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_1->copy_event, cudaEventDisableTiming));
}
if (!cuda_ctx_2->copy_event) {
ggml_cuda_set_device(cuda_ctx_2->device);
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_2->copy_event, cudaEventDisableTiming));
}
CUDA_CHECK(cudaEventRecord(cuda_ctx_1->copy_event, cuda_ctx_1->stream()));
CUDA_CHECK(cudaEventRecord(cuda_ctx_2->copy_event, cuda_ctx_2->stream()));
// Wait on dst stream for the copies to complete:
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_2->stream(), cuda_ctx_1->copy_event, 0));
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_1->stream(), cuda_ctx_2->copy_event, 0));
} else {
// srcs and dsts are on the same backend:
CUDA_CHECK(cudaMemcpyAsync(dst2->data, src1->data, ggml_nbytes(dst2), cudaMemcpyDeviceToDevice, cuda_ctx_1->stream()));
CUDA_CHECK(cudaMemcpyAsync(dst1->data, src2->data, ggml_nbytes(dst1), cudaMemcpyDeviceToDevice, cuda_ctx_2->stream()));
}
return true;
}
static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
@@ -2979,7 +3006,8 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_
}
}
if (memcmp(props->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
if ((node->op == GGML_OP_SCALE || node->op == GGML_OP_GLU) &&
memcmp(props->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
return false;
}
@@ -4250,7 +4278,11 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .free = */ ggml_backend_cuda_free,
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
/* .shfl_tensor_async = */ ggml_backend_cuda_shfl_tensor_async,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_cuda_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -2756,7 +2756,11 @@ static struct ggml_backend_i hexagon_backend_i = {
/* .free = */ ggml_backend_hexagon_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_hexagon_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -394,7 +394,7 @@ bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, con
[encoder endEncoding];
ggml_metal_event_t ev_cpy = ggml_metal_get_ev_cpy(ctx_src);
ggml_metal_event_record(ctx_src, ev_cpy);
ggml_metal_event_encode_signal(ev_cpy, cmd_buf);
[cmd_buf commit];

View File

@@ -563,7 +563,11 @@ static ggml_backend_i ggml_backend_metal_i = {
/* .free = */ ggml_backend_metal_free,
/* .set_tensor_async = */ ggml_backend_metal_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_metal_get_tensor_async,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ ggml_backend_metal_cpy_tensor_async, // only needed for multi-GPU setups
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_metal_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -5285,7 +5285,6 @@ constant int32_t FC_flash_attn_ext_blk_ncpsg [[function_constant(FC_FLASH_ATTN_E
// scan the blocks of the mask that are not masked
// 0 - masked (i.e. full of -INF, skip)
// 1 - not masked (i.e. at least one element of the mask is not -INF)
// 2 - all zero
kernel void kernel_flash_attn_ext_blk(
constant ggml_metal_kargs_flash_attn_ext_blk & args,
device const char * mask,
@@ -5307,29 +5306,27 @@ kernel void kernel_flash_attn_ext_blk(
device const half * mask_src = (device const half *) (mask + (i1*Q)*args.nb31 + i2*args.nb32 + i3*args.nb33) + i0*C + tiisg;
// fast route
if (res == 0) {
if (simd_max(*mask_src) > -MAXHALF/2) {
res = 1;
}
}
// detailed check of the elements of the block
if ((C > NW || Q > 1) && res == 0) {
half mmin = MAXHALF;
half mmax = -MAXHALF;
half m = -MAXHALF;
FOR_UNROLL (short j = 0; j < Q; ++j) {
FOR_UNROLL (short ii = 0; ii < C/NW; ++ii) {
mmin = min(mmin, mask_src[ii*NW]);
mmax = max(mmax, mask_src[ii*NW]);
m = max(m, mask_src[ii*NW]);
}
mask_src += args.nb31/2;
}
mmin = simd_min(mmin);
mmax = simd_max(mmax);
if (mmax > -MAXHALF) {
if (mmin == 0.0 && mmax == 0.0) {
res = 2;
} else {
res = 1;
}
if (simd_max(m) > -MAXHALF/2) {
res = 1;
}
}
@@ -5571,13 +5568,9 @@ void kernel_flash_attn_ext_impl(
ic = 0;
}
char blk_cur = 1;
// read the mask into shared mem
if (FC_flash_attn_ext_has_mask) {
blk_cur = blk[ic0];
if (blk_cur == 0) {
if (blk[ic0] == 0) {
FOR_UNROLL (short jj = 0; jj < NQ; ++jj) {
pm2[jj] += NW;
}
@@ -5585,22 +5578,16 @@ void kernel_flash_attn_ext_impl(
continue;
}
if (blk_cur == 1) {
FOR_UNROLL (short jj = 0; jj < NQ; ++jj) {
const short j = jj*NSG + sgitg;
FOR_UNROLL (short jj = 0; jj < NQ; ++jj) {
const short j = jj*NSG + sgitg;
if (FC_flash_attn_ext_bc_mask) {
sm2[j*SH + tiisg] = (iq1 + j) < args.ne31 ? pm2[jj][tiisg] : half2(-MAXHALF, -MAXHALF);
} else {
sm2[j*SH + tiisg] = pm2[jj][tiisg];
}
if (FC_flash_attn_ext_bc_mask) {
sm2[j*SH + tiisg] = (iq1 + j) < args.ne31 ? pm2[jj][tiisg] : half2(-MAXHALF, -MAXHALF);
} else {
sm2[j*SH + tiisg] = pm2[jj][tiisg];
}
pm2[jj] += NW;
}
} else if (blk_cur == 2) {
FOR_UNROLL (short jj = 0; jj < NQ; ++jj) {
pm2[jj] += NW;
}
pm2[jj] += NW;
}
#if 0
@@ -5765,12 +5752,10 @@ void kernel_flash_attn_ext_impl(
}
// mqk = mqk + slope*mask
if (blk_cur != 2) {
if (FC_flash_attn_ext_has_bias) {
s2 += s2_t(sm2[j*SH + tiisg])*slope;
} else {
s2 += s2_t(sm2[j*SH + tiisg]);
}
if (FC_flash_attn_ext_has_bias) {
s2 += s2_t(sm2[j*SH + tiisg])*slope;
} else {
s2 += s2_t(sm2[j*SH + tiisg]);
}
M[jj] = simd_max(max(M[jj], max(s2[0], s2[1])));

View File

@@ -3478,6 +3478,10 @@ static ggml_backend_i ggml_backend_opencl_i = {
/* .set_tensor_async = */ NULL, /* ggml_backend_opencl_set_tensor_async */
/* .get_tensor_async = */ NULL, /* ggml_backend_opencl_get_tensor_async */
/* .cpy_tensor_async = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_opencl_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -893,6 +893,10 @@ static ggml_backend_i ggml_backend_rpc_interface = {
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_rpc_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -4455,9 +4455,13 @@ static ggml_backend_i ggml_backend_sycl_interface = {
/* .free = */ ggml_backend_sycl_free,
/* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
// // TODO: update for the new
// interface
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_sycl_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -34,7 +34,11 @@ static ggml_backend_i ggml_backend_remoting_interface = {
/* .free = */ ggml_backend_remoting_free,
/* .set_tensor_async = */ NULL, // ggml_backend_remoting_set_tensor_async,
/* .get_tensor_async = */ NULL, // ggml_backend_remoting_get_tensor_async,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL, // ggml_backend_remoting_cpy_tensor_async,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ NULL, // ggml_backend_remoting_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -402,19 +402,19 @@ enum FaCodePath {
};
struct vk_fa_pipeline_state {
vk_fa_pipeline_state(uint32_t HSK, uint32_t HSV, bool small_rows, bool small_cache, FaCodePath path, bool aligned, bool f32acc, uint32_t flags)
: HSK(HSK), HSV(HSV), small_rows(small_rows), small_cache(small_cache), path(path), aligned(aligned), f32acc(f32acc), flags(flags) {}
vk_fa_pipeline_state(uint32_t HSK, uint32_t HSV, bool small_rows, bool small_cache, FaCodePath path, bool aligned, bool f32acc, bool use_mask_opt)
: HSK(HSK), HSV(HSV), small_rows(small_rows), small_cache(small_cache), path(path), aligned(aligned), f32acc(f32acc), use_mask_opt(use_mask_opt) {}
uint32_t HSK, HSV;
bool small_rows, small_cache;
FaCodePath path;
bool aligned;
bool f32acc;
uint32_t flags;
bool use_mask_opt;
bool operator<(const vk_fa_pipeline_state &b) const {
return std::tie(HSK, HSV, small_rows, small_cache, path, aligned, f32acc, flags) <
std::tie(b.HSK, b.HSV, b.small_rows, b.small_cache, b.path, b.aligned, b.f32acc, b.flags);
return std::tie(HSK, HSV, small_rows, small_cache, path, aligned, f32acc, use_mask_opt) <
std::tie(b.HSK, b.HSV, b.small_rows, b.small_cache, b.path, b.aligned, b.f32acc, b.use_mask_opt);
}
};
@@ -3193,7 +3193,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
return {fa_rows_cols(path, hsk, hsv, clamp, type, small_rows, small_cache)[0], 1, 1};
};
auto const &fa_spec_constants = [&](FaCodePath path, uint32_t hsk, uint32_t hsv, uint32_t clamp, ggml_type type, bool small_rows, bool small_cache, uint32_t flags) -> std::vector<uint32_t> {
auto const &fa_spec_constants = [&](FaCodePath path, uint32_t hsk, uint32_t hsv, uint32_t clamp, ggml_type type, bool small_rows, bool small_cache, bool use_mask_opt) -> std::vector<uint32_t> {
// For large number of rows, 128 invocations seems to work best.
// For small number of rows (e.g. N==1), 256 works better. But matrix granularity for 256 is 32, so we
// can't use 256 for D==80.
@@ -3225,7 +3225,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
// AMD prefers loading K directly from global memory
const uint32_t k_load_shmem = device->vendor_id == VK_VENDOR_ID_NVIDIA && hsk < 256 ? 1 : 0;
return {wg_size, rows_cols[0], rows_cols[1], hsk, hsv, clamp, D_split, device->subgroup_size, k_load_shmem, flags};
return {wg_size, rows_cols[0], rows_cols[1], hsk, hsv, clamp, D_split, device->subgroup_size, k_load_shmem, use_mask_opt};
};
#define CREATE_FA(TYPE, NAMELC, FAPATH, SUFFIX) \
@@ -3237,19 +3237,19 @@ static void ggml_vk_load_shaders(vk_device& device) {
FaCodePath path = fa.first.path; \
bool aligned = fa.first.aligned; \
bool f32acc = fa.first.f32acc; \
uint32_t flags = fa.first.flags; \
bool use_mask_opt = fa.first.use_mask_opt; \
if (path == FAPATH) { \
if (aligned) { \
if (f32acc) { \
ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache,flags), fa_align(FAPATH,HSK,HSV,TYPE,small_rows,small_cache), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \
ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache,use_mask_opt), fa_align(FAPATH,HSK,HSV,TYPE,small_rows,small_cache), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \
} else { \
ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache,flags), fa_align(FAPATH,HSK,HSV,TYPE,small_rows,small_cache), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \
ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache,use_mask_opt), fa_align(FAPATH,HSK,HSV,TYPE,small_rows,small_cache), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \
} \
} else { \
if (f32acc) { \
ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache,flags), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \
ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache,use_mask_opt), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \
} else { \
ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache,flags), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \
ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache,use_mask_opt), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \
} \
} \
} \
@@ -8595,26 +8595,10 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
bool f32acc = path == FA_SCALAR || dst->op_params[3] == GGML_PREC_F32;
float scale = 1.0f;
float max_bias = 0.0f;
float logit_softcap = 0.0f;
memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float));
memcpy(&logit_softcap, (const float *) dst->op_params + 2, sizeof(float));
if (logit_softcap != 0) {
scale /= logit_softcap;
}
// Only use mask opt when the mask is fairly large. This hasn't been tuned extensively.
bool use_mask_opt = mask && nem1 >= 32 && nem0 * nem1 > 32768;
uint32_t flags = (use_mask_opt ? 1 : 0) |
(mask != nullptr ? 2 : 0) |
(logit_softcap != 0 ? 4 : 0);
vk_fa_pipeline_state fa_pipeline_state(HSK, HSV, small_rows, small_cache, path, aligned, f32acc, flags);
vk_fa_pipeline_state fa_pipeline_state(HSK, HSV, small_rows, small_cache, path, aligned, f32acc, use_mask_opt);
vk_pipeline pipeline = nullptr;
@@ -8694,6 +8678,18 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
}
}
float scale = 1.0f;
float max_bias = 0.0f;
float logit_softcap = 0.0f;
memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float));
memcpy(&logit_softcap, (const float *) dst->op_params + 2, sizeof(float));
if (logit_softcap != 0) {
scale /= logit_softcap;
}
const uint32_t n_head_kv = neq2;
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
@@ -8707,7 +8703,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
vk_subbuffer sinks_buf = sinks ? ggml_vk_tensor_subbuffer(ctx, sinks) : q_buf;
vk_subbuffer mask_opt_buf = use_mask_opt ? ggml_vk_subbuffer(ctx, ctx->prealloc_y, 0) : q_buf;
uint32_t mask_n_head_log2 = ((sinks != nullptr) << 24) | n_head_log2;
uint32_t mask_n_head_log2 = ((sinks != nullptr) << 24) | ((mask != nullptr) << 16) | n_head_log2;
if (use_mask_opt)
{
@@ -14374,7 +14370,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
/* .free = */ ggml_backend_vk_free,
/* .set_tensor_async = */ ggml_backend_vk_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_vk_get_tensor_async,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_vk_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -127,7 +127,7 @@ void main() {
continue;
}
// Only load if the block is not all zeros
if (MASK_ENABLE && mask_opt_bits != MASK_OPT_ALL_ZERO) {
if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0 && mask_opt_bits != MASK_OPT_ALL_ZERO) {
bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0;
[[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) {
@@ -181,7 +181,7 @@ void main() {
}
}
if (LOGIT_SOFTCAP) {
if (p.logit_softcap != 0.0f) {
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
Sf[r][c] = p.logit_softcap * tanh(Sf[r][c]);
@@ -189,7 +189,7 @@ void main() {
}
}
if (MASK_ENABLE && mask_opt_bits != MASK_OPT_ALL_ZERO) {
if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0 && mask_opt_bits != MASK_OPT_ALL_ZERO) {
[[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
float mvf = masksh[c * cols_per_iter + col_tid][r];

View File

@@ -10,11 +10,7 @@ layout (constant_id = 5) const uint32_t Clamp = 0;
layout (constant_id = 6) const uint32_t D_split = 16;
layout (constant_id = 7) const uint32_t SubGroupSize = 32;
layout (constant_id = 8) const uint32_t K_LOAD_SHMEM = 0;
layout (constant_id = 9) const uint32_t Flags = 0;
const bool USE_MASK_OPT = (Flags & 1) != 0;
const bool MASK_ENABLE = (Flags & 2) != 0;
const bool LOGIT_SOFTCAP = (Flags & 4) != 0;
layout (constant_id = 9) const bool USE_MASK_OPT = false;
// Round up head sizes to a multiple of 16, for coopmat1/coopmat2 paths
const uint32_t HSK_pad = (HSK + 15) & ~15;
@@ -64,6 +60,7 @@ layout (push_constant) uniform parameter {
} p;
#define SINK_ENABLE_BIT (1<<24)
#define MASK_ENABLE_BIT (1<<16)
#define N_LOG2_MASK 0xFFFF
layout (binding = 4) readonly buffer S {float data_s[];};

View File

@@ -160,7 +160,7 @@ void main() {
mask_cache[idx] = f16vec4(0);
}
if (MASK_ENABLE) {
if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
if (USE_MASK_OPT && mask_opt_idx != j / 16) {
mask_opt_idx = j / 16;
@@ -303,7 +303,7 @@ void main() {
coopMatStore(SfMat, sfsh, coord, sfshstride, gl_CooperativeMatrixLayoutRowMajor);
barrier();
if (LOGIT_SOFTCAP) {
if (p.logit_softcap != 0.0f) {
[[unroll]] for (uint32_t idx = 0; idx < Bc * Br / 4; idx += gl_WorkGroupSize.x) {
uint32_t c = (idx + tid) / (Br / 4);
uint32_t r = (idx + tid) % (Br / 4);
@@ -314,7 +314,7 @@ void main() {
barrier();
}
if (MASK_ENABLE) {
if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
[[unroll]] for (uint32_t idx = 0; idx < Bc * Br / 4; idx += gl_WorkGroupSize.x) {
uint32_t c = (idx + tid) / (Br / 4);
uint32_t r = (idx + tid) % (Br / 4);

View File

@@ -155,7 +155,7 @@ void main() {
for (uint32_t j = start_j; j < end_j; ++j) {
coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv = coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
if (MASK_ENABLE) {
if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
if (USE_MASK_OPT && mask_opt_idx != j / 16) {
mask_opt_idx = j / 16;
@@ -197,14 +197,14 @@ void main() {
coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose DECODEFUNC);
S = coopMatMulAdd(Qf16, K_T, S);
if (LOGIT_SOFTCAP) {
if (p.logit_softcap != 0.0f) {
[[unroll]]
for (int k = 0; k < S.length(); ++k) {
S[k] = ACC_TYPE(p.logit_softcap)*tanh(S[k]);
}
}
if (MASK_ENABLE) {
if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
S += slopeMat*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
}

View File

@@ -2140,7 +2140,11 @@ static ggml_backend_i ggml_backend_webgpu_i = {
/* .free = */ ggml_backend_webgpu_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -417,20 +417,24 @@ static enum ggml_status ggml_backend_zdnn_graph_compute(ggml_backend_t backend,
}
static ggml_backend_i ggml_backend_zdnn_i = {
/* .get_name = */ ggml_backend_zdnn_name,
/* .free = */ ggml_backend_zdnn_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_zdnn_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .graph_optimize = */ NULL,
/* .get_name = */ ggml_backend_zdnn_name,
/* .free = */ ggml_backend_zdnn_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_zdnn_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .graph_optimize = */ NULL,
};
static ggml_guid_t ggml_backend_zdnn_guid(void) {

View File

@@ -240,7 +240,11 @@ static struct ggml_backend_i ggml_backend_zendnn_i = {
/* .free = */ ggml_backend_zendnn_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -189,9 +189,10 @@ extern "C" {
LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
enum llama_split_mode {
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
LLAMA_SPLIT_MODE_TENSOR = 3,
};
// TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)

View File

@@ -31,7 +31,7 @@ add_library(llama
llama-model-saver.cpp
llama-model.cpp
llama-quant.cpp
llama-sampler.cpp
llama-sampling.cpp
llama-vocab.cpp
unicode-data.cpp
unicode.cpp

View File

@@ -972,9 +972,11 @@ void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void
for (auto & backend : backends) {
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
if (set_abort_callback_fn) {
set_abort_callback_fn(backend.get(), this->abort_callback, this->abort_callback_data);
if (reg) {
auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
if (set_abort_callback_fn) {
set_abort_callback_fn(backend.get(), this->abort_callback, this->abort_callback_data);
}
}
}
}

View File

@@ -2,7 +2,7 @@
#include "llama-impl.h"
#include "llama-vocab.h"
#include "llama-sampler.h"
#include "llama-sampling.h"
#include <cmath>
#include <algorithm>

View File

@@ -416,14 +416,16 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
// add the device extra buffer type (if any)
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
if (reg) {
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
if (ggml_backend_dev_get_extra_bufts_fn) {
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
while (extra_bufts && *extra_bufts) {
buft_list.emplace_back(dev, *extra_bufts);
++extra_bufts;
if (ggml_backend_dev_get_extra_bufts_fn) {
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
while (extra_bufts && *extra_bufts) {
buft_list.emplace_back(dev, *extra_bufts);
++extra_bufts;
}
}
}

View File

@@ -1,4 +1,4 @@
#include "llama-sampler.h"
#include "llama-sampling.h"
#include "llama-impl.h"
#include "llama-vocab.h"

View File

@@ -1,5 +1,7 @@
#pragma once
// TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ?
#include "llama.h"
#include <vector>

View File

@@ -21,7 +21,9 @@
#include <cstdio>
#include <cstring>
#include <ctime>
#include <regex>
#include <stdexcept>
#include <vector>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -160,6 +162,9 @@ static void llama_params_fit_impl(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
if (mparams->split_mode == LLAMA_SPLIT_MODE_TENSOR) {
throw llama_params_fit_exception("llama_params_fit is not implemented for SPLIT_MODE_TENSOR, abort");
}
constexpr int64_t MiB = 1024*1024;
typedef std::vector<llama_device_memory_data> dmds_t;
const llama_model_params default_mparams = llama_model_default_params();
@@ -879,6 +884,67 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
return 0;
}
static enum ggml_backend_meta_split_state llama_meta_device_get_tensor_split(const struct ggml_tensor * tensor, void * userdata) {
// attention
const std::regex pattern_qkv_weight("blk\\.\\d*\\.attn_(q|k|v).weight");
if (std::regex_match(tensor->name, pattern_qkv_weight)) {
return GGML_BACKEND_SPLIT_STATE_BY_NE1;
}
const std::regex pattern_qkv_bias("blk\\.\\d*\\.attn_(q|k|v)\\.bias");
if (std::regex_match(tensor->name, pattern_qkv_bias)) {
return GGML_BACKEND_SPLIT_STATE_BY_NE0;
}
const std::regex pattern_qk_norm("blk\\.\\d*\\.attn_(q|k)_norm\\.weight");
if (std::regex_match(tensor->name, pattern_qk_norm)) {
return tensor->ne[1] == 1 ? GGML_BACKEND_SPLIT_STATE_MIRRORED : GGML_BACKEND_SPLIT_STATE_BY_NE1;
}
const std::regex pattern_kv_cache("cache_(k|v)_l\\d*");
const std::regex pattern_attn_sinks("blk\\.\\d*\\.attn_sinks\\.weight");
if (std::regex_match(tensor->name, pattern_kv_cache) || std::regex_match(tensor->name, pattern_attn_sinks)) {
return GGML_BACKEND_SPLIT_STATE_BY_NE0;
}
const std::regex pattern_attn_out_weight("blk\\.\\d*\\.attn_output\\.weight");
if (std::regex_match(tensor->name, pattern_attn_out_weight)) {
return GGML_BACKEND_SPLIT_STATE_BY_NE0;
}
const std::regex pattern_attn_out_bias("blk\\.\\d*\\.attn_output\\.bias");
if (std::regex_match(tensor->name, pattern_attn_out_bias)) {
return GGML_BACKEND_SPLIT_STATE_MIRRORED;
}
// FFN
const std::regex pattern_ffn_up_gate_weight("blk\\.\\d*\\.ffn_(up|gate)(_exps)?\\.weight");
if (std::regex_match(tensor->name, pattern_ffn_up_gate_weight)) {
return GGML_BACKEND_SPLIT_STATE_BY_NE1;
}
const std::regex pattern_ffn_up_gate_bias("blk\\.\\d*\\.ffn_(up|gate)(_exps)?\\.bias");
if (std::regex_match(tensor->name, pattern_ffn_up_gate_bias)) {
return GGML_BACKEND_SPLIT_STATE_BY_NE0;
}
const std::regex pattern_ffn_down_weight("blk\\.\\d*\\.ffn_down(_exps)?\\.weight");
if (std::regex_match(tensor->name, pattern_ffn_down_weight)) {
return GGML_BACKEND_SPLIT_STATE_BY_NE0;
}
const std::regex pattern_ffn_down_bias("blk\\.\\d*\\.ffn_down(_exps)?\\.bias");
if (std::regex_match(tensor->name, pattern_ffn_down_bias)) {
return GGML_BACKEND_SPLIT_STATE_MIRRORED;
}
// output
const std::regex pattern_output_weight("output\\.weight");
if (std::regex_match(tensor->name, pattern_output_weight)) {
return GGML_BACKEND_SPLIT_STATE_BY_NE1;
}
const std::regex pattern_output_bias("output\\.bias");
if (std::regex_match(tensor->name, pattern_output_bias)) {
return GGML_BACKEND_SPLIT_STATE_BY_NE0;
}
// everything else
return GGML_BACKEND_SPLIT_STATE_MIRRORED;
GGML_UNUSED(userdata);
}
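The policy above keys purely off the tensor name (plus ne[1] for the q/k norms), so it can be spot-checked with bare ggml_tensor structs. A rough sketch of such a check, assuming it lives in the same translation unit as the static function above (the helper name is illustrative):

static void check_tensor_split_policy() {
    ggml_tensor t = {};
    t.ne[0] = 4096; t.ne[1] = 4096; t.ne[2] = 1; t.ne[3] = 1;

    snprintf(t.name, sizeof(t.name), "blk.0.attn_q.weight");   // attention weight -> split by ne1
    GGML_ASSERT(llama_meta_device_get_tensor_split(&t, nullptr) == GGML_BACKEND_SPLIT_STATE_BY_NE1);

    snprintf(t.name, sizeof(t.name), "blk.0.ffn_down.weight"); // FFN down projection -> split by ne0
    GGML_ASSERT(llama_meta_device_get_tensor_split(&t, nullptr) == GGML_BACKEND_SPLIT_STATE_BY_NE0);

    snprintf(t.name, sizeof(t.name), "output_norm.weight");    // no pattern matches -> mirrored on every device
    GGML_ASSERT(llama_meta_device_get_tensor_split(&t, nullptr) == GGML_BACKEND_SPLIT_STATE_MIRRORED);
}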
static struct llama_model * llama_model_load_from_file_impl(
const std::string & path_model,
std::vector<std::string> & splits,
@@ -909,10 +975,23 @@ static struct llama_model * llama_model_load_from_file_impl(
llama_model * model = new llama_model(params);
// temporary workaround: the small test models are loaded without tensor parallelism
if (params.split_mode == LLAMA_SPLIT_MODE_TENSOR && (path_model.find("stories") != std::string::npos || path_model.find("bge-small") != std::string::npos || path_model.find("reranker") != std::string::npos || path_model.find("tinygemma") != std::string::npos)) {
params.split_mode = LLAMA_SPLIT_MODE_NONE;
}
// create list of devices to use with this model
if (params.devices) {
for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
model->devices.push_back(*dev);
if (params.split_mode == LLAMA_SPLIT_MODE_TENSOR) {
size_t n_devs = 0;
while (params.devices[n_devs]) {
n_devs++;
}
model->devices.push_back(ggml_backend_meta_device(params.devices, n_devs, llama_meta_device_get_tensor_split, nullptr));
} else {
for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
model->devices.push_back(*dev);
}
}
} else {
// default device selection
@@ -922,47 +1001,61 @@ static struct llama_model * llama_model_load_from_file_impl(
std::vector<ggml_backend_dev_t> igpus;
std::vector<ggml_backend_dev_t> rpc_servers;
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
switch (ggml_backend_dev_type(dev)) {
case GGML_BACKEND_DEVICE_TYPE_CPU:
case GGML_BACKEND_DEVICE_TYPE_ACCEL:
// skip CPU backends since they are handled separately
break;
if (params.split_mode == LLAMA_SPLIT_MODE_TENSOR) {
std::vector<ggml_backend_dev_t> devs;
devs.reserve(ggml_backend_dev_count());
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
devs.push_back(ggml_backend_dev_get(i));
}
GGML_ASSERT(devs.size() >= 2);
GGML_ASSERT(ggml_backend_dev_buffer_type(devs.back()) == ggml_backend_cpu_buffer_type());
gpus.push_back(ggml_backend_meta_device(devs.data(), devs.size() - 1, llama_meta_device_get_tensor_split, nullptr));
} else {
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
switch (ggml_backend_dev_type(dev)) {
case GGML_BACKEND_DEVICE_TYPE_CPU:
case GGML_BACKEND_DEVICE_TYPE_ACCEL:
// skip CPU backends since they are handled separately
break;
case GGML_BACKEND_DEVICE_TYPE_GPU: {
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
if (ggml_backend_reg_name(reg) == std::string("RPC")) {
rpc_servers.push_back(dev);
} else {
// check if there is already a GPU with the same device id
ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props);
auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
ggml_backend_dev_props d_props;
ggml_backend_dev_get_props(d, &d_props);
if (props.device_id && d_props.device_id) {
return strcmp(props.device_id, d_props.device_id) == 0;
}
return false;
});
if (it != gpus.end()) {
LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
__func__,
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
props.device_id ? props.device_id : "unknown id",
ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
case GGML_BACKEND_DEVICE_TYPE_GPU: {
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
if (ggml_backend_reg_name(reg) == std::string("RPC")) {
rpc_servers.push_back(dev);
} else {
gpus.push_back(dev);
}
}
break;
}
// check if there is already a GPU with the same device id
ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props);
auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
ggml_backend_dev_props d_props;
ggml_backend_dev_get_props(d, &d_props);
if (props.device_id && d_props.device_id) {
return strcmp(props.device_id, d_props.device_id) == 0;
}
return false;
});
case GGML_BACKEND_DEVICE_TYPE_IGPU:
igpus.push_back(dev);
break;
if (it != gpus.end()) {
LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
__func__,
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
props.device_id ? props.device_id : "unknown id",
ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
} else {
gpus.push_back(dev);
}
}
break;
}
case GGML_BACKEND_DEVICE_TYPE_IGPU:
igpus.push_back(dev);
break;
case GGML_BACKEND_DEVICE_TYPE_META:
GGML_ABORT("fatal error");
break;
}
}
}
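From the caller's side the meta device is invisible; selecting the new split mode is the only required change. A hedged usage sketch of the public API path (the model path and layer count are placeholders):

#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode   = LLAMA_SPLIT_MODE_TENSOR; // introduced by this change set
    mparams.n_gpu_layers = 99;

    // with no explicit device list, the loader above assembles one meta device from the detected GPUs
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == nullptr) {
        llama_backend_free();
        return 1;
    }

    llama_model_free(model);
    llama_backend_free();
    return 0;
}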

View File

@@ -259,6 +259,8 @@ static const char * split_mode_str(llama_split_mode mode) {
return "layer";
case LLAMA_SPLIT_MODE_ROW:
return "row";
case LLAMA_SPLIT_MODE_TENSOR:
return "tensor";
default:
GGML_ABORT("invalid split mode");
}
@@ -440,7 +442,7 @@ static void print_usage(int /* argc */, char ** argv) {
join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -ncmoe, --n-cpu-moe <n> (default: %s)\n",
join(cmd_params_defaults.n_cpu_moe, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n",
printf(" -sm, --split-mode <none|layer|row|tensor> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n",
join(cmd_params_defaults.main_gpu, ",").c_str());
@@ -723,6 +725,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
mode = LLAMA_SPLIT_MODE_LAYER;
} else if (m == "row") {
mode = LLAMA_SPLIT_MODE_ROW;
} else if (m == "tensor") {
mode = LLAMA_SPLIT_MODE_TENSOR;
} else {
invalid_param = true;
break;
@@ -1685,7 +1689,7 @@ struct markdown_printer : public printer {
return 6;
}
if (field == "split_mode") {
return 5;
return 6;
}
if (field == "flash_attn") {
return 2;