Compare commits

...

10 Commits
b8079 ... b8089

Author SHA1 Message Date
Jeff Bolz
d0061be838 vulkan: split mul_mat into multiple dispatches to avoid overflow (#19509)
* vulkan: split mul_mat into multiple dispatches to avoid overflow

The batch dimensions can be greater than the max workgroup count limit,
in which case we need to split into multiple dispatches and pass the base
index through a push constant.

Fall back for the less common p021 and nc variants.

* address feedback
2026-02-18 10:47:10 +01:00
Adrien Gallouët
a569bda445 common : make small string helpers as inline functions (#19693)
Also use string_view when it make sense and fix some corner cases.

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-02-18 08:03:01 +01:00
shaofeiqi
e2f19b320f opencl: refactor expm1 and softplus (#19404)
* opencl: refactor expm1

* opencl: refactor softplus

* opencl: use h for half literals

---------

Co-authored-by: Li He <lih@qti.qualcomm.com>
2026-02-17 14:47:18 -08:00
shaofeiqi
983559d24b opencl: optimize mean and sum_row kernels (#19614)
* opencl: optimize mean and sum_row kernels

* opencl: add comment for max subgroups

* opencl: format

---------

Co-authored-by: Li He <lih@qti.qualcomm.com>
2026-02-17 13:56:09 -08:00
Daniel Bevenius
2b089c7758 model-conversion : add option to print tensor values (#19692)
This commit updates the tensor-info.py script to support the option to
print the first N values of a tensor when displaying its information.

The motivation for this is that it can be useful to inspect some actual
values in addition to the shapes of the tensors.
2026-02-17 20:43:22 +01:00
Aleksander Grygier
afa6bfe4f7 Pre-MCP UI and architecture cleanup (#19685)
* webui: extract non-MCP changes from mcp-mvp review split

* webui: extract additional pre-MCP UI and architecture cleanup

* chore: update webui build output
2026-02-17 13:47:45 +01:00
Talha Can Havadar
ae2d3f28a8 ggml: ggml-cpu: force-no-lto-for-cpu-feats (#19609)
When LTO enabled in build environments it forces all builds to have LTO
in place. But feature detection logic is fragile, and causing Illegal
instruction errors with lto. This disables LTO for the feature
detection code to prevent cross-module optimization from inlining
architecture-specific instructions into the score function. Without this,
LTO can cause SIGILL when loading backends on older CPUs (e.g., loading
power10 backend on power9 crashes before feature check runs).
2026-02-17 13:22:46 +02:00
Georgi Gerganov
ad8207af77 cuda : enable CUDA graphs for MMID 1 <= BS <= 4 (#19645)
* cuda : enable CUDA graphs for MMID BS <= 4

* cont : add stream capture check

Co-authored-by: Oliver Simons <osimons@nvidia.com>

* cont : add MMVQ_MMID_MAX_BATCH_SIZE

---------

Co-authored-by: Oliver Simons <osimons@nvidia.com>
2026-02-17 12:31:49 +02:00
Daniel Bevenius
667b694278 model-conversion : make printing of config values optional (#19681)
* model-conversion : make printing of config values optional

This commit updates run-org-model.py to make the printing of model
configuration values optional.

The motivation for this change is that not all models have these
configuration values defined and those that do not will error when
running this script. With these changes we only print the values if they
exist or a default value.

We could optionally just remove them but it can be useful to see these
values when running the original model.
2026-02-17 10:46:53 +01:00
Sigbjørn Skjæret
e48349a49d ci : bump komac version (#19682) 2026-02-17 09:30:31 +01:00
30 changed files with 842 additions and 507 deletions

View File

@@ -17,7 +17,7 @@ jobs:
- name: Install komac
run: |
cargo binstall komac@2.11.2 -y
cargo binstall komac@2.15.0 -y
- name: Find latest release
id: find_latest_release

View File

@@ -452,34 +452,6 @@ void string_replace_all(std::string & s, const std::string & search, const std::
s = std::move(builder);
}
bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
}
bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
bool has_suffix = string_ends_with(str, suffix);
if (has_suffix) {
str = str.substr(0, str.size() - suffix.size());
}
return has_suffix;
}
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
if (!str.empty() && !stop.empty()) {
const char text_last_char = str.back();
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
if (stop[char_index] == text_last_char) {
const auto current_partial = stop.substr(0, char_index + 1);
if (string_ends_with(str, current_partial)) {
return str.size() - char_index - 1;
}
}
}
}
return std::string::npos;
}
std::string regex_escape(const std::string & s) {
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
return std::regex_replace(s, special_chars, "\\$&");

View File

@@ -670,30 +670,55 @@ static std::vector<T> string_split(const std::string & str, char delim) {
}
template<>
inline std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
inline std::vector<std::string> string_split<std::string>(const std::string & str, char delim)
{
std::vector<std::string> parts;
size_t begin_pos = 0;
size_t separator_pos = input.find(separator);
while (separator_pos != std::string::npos) {
std::string part = input.substr(begin_pos, separator_pos - begin_pos);
size_t delim_pos = str.find(delim);
while (delim_pos != std::string::npos) {
std::string part = str.substr(begin_pos, delim_pos - begin_pos);
parts.emplace_back(part);
begin_pos = separator_pos + 1;
separator_pos = input.find(separator, begin_pos);
begin_pos = delim_pos + 1;
delim_pos = str.find(delim, begin_pos);
}
parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
parts.emplace_back(str.substr(begin_pos));
return parts;
}
inline bool string_starts_with(const std::string & str,
const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
// remove when moving to c++20
inline bool string_starts_with(std::string_view str, std::string_view prefix) {
return str.size() >= prefix.size() &&
str.compare(0, prefix.size(), prefix) == 0;
}
// While we wait for C++20's std::string::ends_with...
bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
bool string_remove_suffix(std::string & str, const std::string_view & suffix);
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
// remove when moving to c++20
inline bool string_ends_with(std::string_view str, std::string_view suffix) {
return str.size() >= suffix.size() &&
str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
}
inline bool string_remove_suffix(std::string & str, std::string_view suffix) {
if (string_ends_with(str, suffix)) {
str.resize(str.size() - suffix.size());
return true;
}
return false;
}
inline size_t string_find_partial_stop(std::string_view str, std::string_view stop) {
if (!str.empty() && !stop.empty()) {
const size_t max_len = std::min(str.size(), stop.size());
const char last_char = str.back();
for (size_t len = max_len; len > 0; --len) {
if (stop[len - 1] == last_char) {
if (string_ends_with(str, stop.substr(0, len))) {
return str.size() - len;
}
}
}
}
return std::string::npos;
}
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);

View File

@@ -42,11 +42,15 @@ def load_model_and_tokenizer(model_path, device="auto"):
config = config.text_config
multimodal = True
print("Vocab size: ", config.vocab_size)
print("Hidden size: ", config.hidden_size)
print("Number of layers: ", config.num_hidden_layers)
print("BOS token id: ", config.bos_token_id)
print("EOS token id: ", config.eos_token_id)
def print_if_exists(label, obj, attr, default="N/A"):
val = getattr(obj, attr) if hasattr(obj, attr) else default
print(f"{label}", val)
print_if_exists("Vocab size: ", config, "vocab_size")
print_if_exists("Hidden size: ", config, "hidden_size")
print_if_exists("Number of layers: ", config, "num_hidden_layers")
print_if_exists("BOS token id: ", config, "bos_token_id")
print_if_exists("EOS token id: ", config, "eos_token_id")
unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
if unreleased_model_name:

View File

@@ -78,7 +78,7 @@ def list_all_tensors(model_path: Path, unique: bool = False):
print(tensor_name)
def print_tensor_info(model_path: Path, tensor_name: str):
def print_tensor_info(model_path: Path, tensor_name: str, num_values: Optional[int] = None):
tensor_file = find_tensor_file(model_path, tensor_name)
if tensor_file is None:
@@ -96,6 +96,12 @@ def print_tensor_info(model_path: Path, tensor_name: str):
print(f"Tensor: {tensor_name}")
print(f"File: {tensor_file}")
print(f"Shape: {shape}")
if num_values is not None:
tensor = f.get_tensor(tensor_name)
print(f"Dtype: {tensor.dtype}")
flat = tensor.flatten()
n = min(num_values, flat.numel())
print(f"Values: {flat[:n].tolist()}")
else:
print(f"Error: Tensor '{tensor_name}' not found in {tensor_file}")
sys.exit(1)
@@ -127,6 +133,15 @@ def main():
action="store_true",
help="List unique tensor patterns in the model (layer numbers replaced with #)"
)
parser.add_argument(
"-n", "--num-values",
nargs="?",
const=10,
default=None,
type=int,
metavar="N",
help="Print the first N values of the tensor flattened (default: 10 if flag is given without a number)"
)
args = parser.parse_args()
@@ -152,7 +167,7 @@ def main():
if args.tensor_name is None:
print("Error: tensor_name is required when not using --list")
sys.exit(1)
print_tensor_info(model_path, args.tensor_name)
print_tensor_info(model_path, args.tensor_name, args.num_values)
if __name__ == "__main__":

View File

@@ -9,6 +9,11 @@ function(ggml_add_cpu_backend_features cpu_name arch)
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
# Disable LTO for the feature detection code to prevent cross-module optimization
# from inlining architecture-specific instructions into the score function.
# Without this, LTO can cause SIGILL when loading backends on older CPUs
# (e.g., loading power10 backend on power9 crashes before feature check runs).
target_compile_options(${GGML_CPU_FEATS_NAME} PRIVATE -fno-lto)
target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME})
endfunction()

View File

@@ -2278,11 +2278,12 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
// [TAG_MUL_MAT_ID_CUDA_GRAPHS]
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
static_assert(MMVQ_MAX_BATCH_SIZE == MMVF_MAX_BATCH_SIZE);
if (ne2 <= MMVQ_MAX_BATCH_SIZE) {
if (ggml_is_quantized(src0->type)) {
if (ne2 <= 4) {
if (ne2 <= MMVQ_MMID_MAX_BATCH_SIZE) {
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
return;
}
@@ -2305,6 +2306,8 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
}
}
// note: this path should not be reached when recording CUDA graphs, because it requires stream synchronization
// TODO: add asserts to verify this. should work with CUDA, HIP, etc.
cudaStream_t stream = ctx.stream();
GGML_ASSERT(nb12 % nb11 == 0);
@@ -2865,15 +2868,6 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
bool use_cuda_graph = true;
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
const std::string delta_net_prefix = "dnet_add";
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
@@ -2888,31 +2882,14 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
#endif
}
if (node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
use_cuda_graph = false; // This node type is not supported by CUDA graph capture
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
#endif
}
if (node->op == GGML_OP_ADD &&
node->src[1] && node->src[1]->ne[1] > 1 &&
(node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) &&
(node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) &&
strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 &&
strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0 &&
strncmp(node->name, delta_net_prefix.c_str(), delta_net_prefix.size()) != 0) {
// disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
// by means of matching node names. See
// https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
// https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773,
// Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
// [TAG_MUL_MAT_ID_CUDA_GRAPHS]
if (node->op == GGML_OP_MUL_MAT_ID && (!ggml_is_quantized(node->src[0]->type) || node->ne[2] > MMVQ_MMID_MAX_BATCH_SIZE)) {
// under these conditions, the mul_mat_id operation will need to synchronize the stream, so we cannot use CUDA graphs
// TODO: figure out a way to enable for larger batch sizes, without hurting performance
// ref: https://github.com/ggml-org/llama.cpp/pull/18958
use_cuda_graph = false;
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
#endif
}

View File

@@ -1,6 +1,7 @@
#include "common.cuh"
#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
#define MMVQ_MMID_MAX_BATCH_SIZE 4 // Max. batch size for which to use MMVQ kernels for MUL_MAT_ID
void ggml_cuda_mul_mat_vec_q(ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, const ggml_cuda_mm_fusion_args_host * fusion = nullptr);

View File

@@ -484,7 +484,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_scale_f32, kernel_scale_f32_4;
cl_kernel kernel_sqr_cont_f32, kernel_sqr_cont_f32_4, kernel_sqr_cont_f16, kernel_sqr_cont_f16_4;
cl_kernel kernel_sqrt_cont_f32, kernel_sqrt_cont_f32_4, kernel_sqrt_cont_f16, kernel_sqrt_cont_f16_4;
cl_kernel kernel_mean_f32;
cl_kernel kernel_mean_f32, kernel_mean_f32_4;
cl_kernel kernel_silu, kernel_silu_4;
cl_kernel kernel_gelu, kernel_gelu_4;
cl_kernel kernel_gelu_erf, kernel_gelu_erf_4;
@@ -543,15 +543,15 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_solve_tri_f32;
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
cl_kernel kernel_argsort_f32_i32;
cl_kernel kernel_sum_rows_f32;
cl_kernel kernel_sum_rows_f32, kernel_sum_rows_f32_4;
cl_kernel kernel_repeat_f32;
cl_kernel kernel_pad;
cl_kernel kernel_tanh_f32, kernel_tanh_f32_4, kernel_tanh_f32_nc;
cl_kernel kernel_tanh_f16, kernel_tanh_f16_4, kernel_tanh_f16_nc;
cl_kernel kernel_expm1_f32_nd;
cl_kernel kernel_expm1_f16_nd;
cl_kernel kernel_softplus_f32_nd;
cl_kernel kernel_softplus_f16_nd;
cl_kernel kernel_expm1_f32, kernel_expm1_f32_4, kernel_expm1_f32_nc;
cl_kernel kernel_expm1_f16, kernel_expm1_f16_4, kernel_expm1_f16_nc;
cl_kernel kernel_softplus_f32, kernel_softplus_f32_4, kernel_softplus_f32_nc;
cl_kernel kernel_softplus_f16, kernel_softplus_f16_4, kernel_softplus_f16_nc;
cl_kernel kernel_upscale;
cl_kernel kernel_upscale_bilinear;
cl_kernel kernel_concat_f32;
@@ -1837,6 +1837,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_mean_f32 = clCreateKernel(prog, "kernel_mean_f32", &err), err));
CL_CHECK((backend_ctx->kernel_mean_f32_4 = clCreateKernel(prog, "kernel_mean_f32_4", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
@@ -1874,6 +1875,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_sum_rows_f32 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32", &err), err));
CL_CHECK((backend_ctx->kernel_sum_rows_f32_4 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32_4", &err), err));
GGML_LOG_CONT(".");
}
@@ -1978,20 +1980,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
#else
const std::string kernel_src = read_file("expm1.cl");
#endif
cl_program prog;
if (!kernel_src.empty()) {
prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_expm1_f32_nd = clCreateKernel(prog, "kernel_expm1_f32_nd", &err), err));
CL_CHECK((backend_ctx->kernel_expm1_f16_nd = clCreateKernel(prog, "kernel_expm1_f16_nd", &err), err));
GGML_LOG_CONT(".");
} else {
GGML_LOG_WARN("ggml_opencl: expm1 kernel source not found or empty. Expm1 operation will not be available.\n");
prog = nullptr;
backend_ctx->kernel_expm1_f32_nd = nullptr;
backend_ctx->kernel_expm1_f16_nd = nullptr;
}
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_expm1_f32 = clCreateKernel(prog, "kernel_expm1_f32", &err), err));
CL_CHECK((backend_ctx->kernel_expm1_f32_4 = clCreateKernel(prog, "kernel_expm1_f32_4", &err), err));
CL_CHECK((backend_ctx->kernel_expm1_f32_nc = clCreateKernel(prog, "kernel_expm1_f32_nc", &err), err));
CL_CHECK((backend_ctx->kernel_expm1_f16 = clCreateKernel(prog, "kernel_expm1_f16", &err), err));
CL_CHECK((backend_ctx->kernel_expm1_f16_4 = clCreateKernel(prog, "kernel_expm1_f16_4", &err), err));
CL_CHECK((backend_ctx->kernel_expm1_f16_nc = clCreateKernel(prog, "kernel_expm1_f16_nc", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
// softplus
@@ -2003,20 +2001,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
#else
const std::string kernel_src = read_file("softplus.cl");
#endif
cl_program prog;
if (!kernel_src.empty()) {
prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_softplus_f32_nd = clCreateKernel(prog, "kernel_softplus_f32_nd", &err), err));
CL_CHECK((backend_ctx->kernel_softplus_f16_nd = clCreateKernel(prog, "kernel_softplus_f16_nd", &err), err));
GGML_LOG_CONT(".");
} else {
GGML_LOG_WARN("ggml_opencl: softplus kernel source not found or empty. Softplus operation will not be available.\n");
prog = nullptr;
backend_ctx->kernel_softplus_f32_nd = nullptr;
backend_ctx->kernel_softplus_f16_nd = nullptr;
}
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_softplus_f32 = clCreateKernel(prog, "kernel_softplus_f32", &err), err));
CL_CHECK((backend_ctx->kernel_softplus_f32_4 = clCreateKernel(prog, "kernel_softplus_f32_4", &err), err));
CL_CHECK((backend_ctx->kernel_softplus_f32_nc = clCreateKernel(prog, "kernel_softplus_f32_nc", &err), err));
CL_CHECK((backend_ctx->kernel_softplus_f16 = clCreateKernel(prog, "kernel_softplus_f16", &err), err));
CL_CHECK((backend_ctx->kernel_softplus_f16_4 = clCreateKernel(prog, "kernel_softplus_f16_4", &err), err));
CL_CHECK((backend_ctx->kernel_softplus_f16_nc = clCreateKernel(prog, "kernel_softplus_f16_nc", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
// upscale
@@ -3463,11 +3457,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
case GGML_UNARY_OP_TANH:
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
case GGML_UNARY_OP_EXPM1:
return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
(op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
case GGML_UNARY_OP_SOFTPLUS:
return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
(op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
default:
return false;
}
@@ -3587,7 +3579,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
}
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_FLASH_ATTN_EXT:
{
const ggml_tensor * q = op->src[0];
@@ -6400,7 +6392,6 @@ static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const
GGML_UNUSED(src1);
GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
GGML_ASSERT(ggml_is_contiguous(src0));
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
@@ -6423,7 +6414,14 @@ static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const
const cl_ulong nb2 = dst->nb[2];
const cl_ulong nb3 = dst->nb[3];
cl_kernel kernel = backend_ctx->kernel_mean_f32;
cl_kernel kernel;
const bool is_c4 = ne00 % 4 == 0;
if (is_c4) {
kernel = backend_ctx->kernel_mean_f32_4;
} else {
kernel = backend_ctx->kernel_mean_f32;
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -6440,7 +6438,7 @@ static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));
size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
size_t global_work_size[] = {64 * (size_t)ne01, (size_t)ne02, (size_t)ne03};
size_t local_work_size[] = {(size_t)64, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
@@ -7388,18 +7386,8 @@ static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, cons
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0_abs = extra0->offset + src0->view_offs;
cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
cl_kernel kernel;
if (dst->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_expm1_f32_nd;
} else if (dst->type == GGML_TYPE_F16) {
kernel = backend_ctx->kernel_expm1_f16_nd;
} else {
GGML_ASSERT(false && "Unsupported type for ggml_cl_expm1");
}
GGML_ASSERT(kernel != nullptr);
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
@@ -7411,70 +7399,74 @@ static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, cons
const cl_ulong nb02 = src0->nb[2];
const cl_ulong nb03 = src0->nb[3];
const int ne10 = dst->ne[0];
const int ne11 = dst->ne[1];
const int ne12 = dst->ne[2];
const int ne13 = dst->ne[3];
const cl_ulong nb0 = dst->nb[0];
const cl_ulong nb1 = dst->nb[1];
const cl_ulong nb2 = dst->nb[2];
const cl_ulong nb3 = dst->nb[3];
const cl_ulong nb10 = dst->nb[0];
const cl_ulong nb11 = dst->nb[1];
const cl_ulong nb12 = dst->nb[2];
const cl_ulong nb13 = dst->nb[3];
cl_kernel kernel;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
size_t global_work_size[3];
if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
return;
}
global_work_size[0] = (size_t)ne10;
global_work_size[1] = (size_t)ne11;
global_work_size[2] = (size_t)ne12;
size_t lws0 = 16, lws1 = 4, lws2 = 1;
if (ne10 < 16) lws0 = ne10;
if (ne11 < 4) lws1 = ne11;
if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
size_t local_work_size[] = {lws0, lws1, lws2};
size_t* local_work_size_ptr = local_work_size;
if (!backend_ctx->non_uniform_workgroups) {
if (global_work_size[0] % local_work_size[0] != 0 ||
global_work_size[1] % local_work_size[1] != 0 ||
global_work_size[2] % local_work_size[2] != 0) {
local_work_size_ptr = NULL;
if (ggml_is_contiguous(src0)) {
// Handle contiguous input
int n = ggml_nelements(dst);
if (n % 4 == 0) {
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_expm1_f32_4;
} else {
kernel = backend_ctx->kernel_expm1_f16_4;
}
n /= 4;
} else {
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_expm1_f32;
} else {
kernel = backend_ctx->kernel_expm1_f16;
}
}
}
if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
size_t * local_work_size_ptr = local_work_size;
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
local_work_size_ptr = nullptr;
}
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
} else {
// Handle non-contiguous input
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_expm1_f32_nc;
} else {
kernel = backend_ctx->kernel_expm1_f16_nc;
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb0));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));
int nth = 64;
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
size_t local_work_size[] = {(size_t)nth, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
}
static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -7490,18 +7482,8 @@ static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, c
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0_abs = extra0->offset + src0->view_offs;
cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
cl_kernel kernel;
if (dst->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_softplus_f32_nd;
} else if (dst->type == GGML_TYPE_F16) {
kernel = backend_ctx->kernel_softplus_f16_nd;
} else {
GGML_ASSERT(false && "Unsupported type for ggml_cl_softplus");
}
GGML_ASSERT(kernel != nullptr);
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
@@ -7513,70 +7495,74 @@ static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, c
const cl_ulong nb02 = src0->nb[2];
const cl_ulong nb03 = src0->nb[3];
const int ne10 = dst->ne[0];
const int ne11 = dst->ne[1];
const int ne12 = dst->ne[2];
const int ne13 = dst->ne[3];
const cl_ulong nb0 = dst->nb[0];
const cl_ulong nb1 = dst->nb[1];
const cl_ulong nb2 = dst->nb[2];
const cl_ulong nb3 = dst->nb[3];
const cl_ulong nb10 = dst->nb[0];
const cl_ulong nb11 = dst->nb[1];
const cl_ulong nb12 = dst->nb[2];
const cl_ulong nb13 = dst->nb[3];
cl_kernel kernel;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
size_t global_work_size[3];
if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
return;
}
global_work_size[0] = (size_t)ne10;
global_work_size[1] = (size_t)ne11;
global_work_size[2] = (size_t)ne12;
size_t lws0 = 16, lws1 = 4, lws2 = 1;
if (ne10 < 16) lws0 = ne10;
if (ne11 < 4) lws1 = ne11;
if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
size_t local_work_size[] = {lws0, lws1, lws2};
size_t* local_work_size_ptr = local_work_size;
if (!backend_ctx->non_uniform_workgroups) {
if (global_work_size[0] % local_work_size[0] != 0 ||
global_work_size[1] % local_work_size[1] != 0 ||
global_work_size[2] % local_work_size[2] != 0) {
local_work_size_ptr = NULL;
if (ggml_is_contiguous(src0)) {
// Handle contiguous input
int n = ggml_nelements(dst);
if (n % 4 == 0) {
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_softplus_f32_4;
} else {
kernel = backend_ctx->kernel_softplus_f16_4;
}
n /= 4;
} else {
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_softplus_f32;
} else {
kernel = backend_ctx->kernel_softplus_f16;
}
}
}
if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
size_t * local_work_size_ptr = local_work_size;
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
local_work_size_ptr = nullptr;
}
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
} else {
// Handle non-contiguous input
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_softplus_f32_nc;
} else {
kernel = backend_ctx->kernel_softplus_f16_nc;
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb0));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));
int nth = 64;
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
size_t local_work_size[] = {(size_t)nth, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
}
static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
@@ -11088,7 +11074,6 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
GGML_UNUSED(src1);
GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
GGML_ASSERT(ggml_is_contiguous(src0));
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
@@ -11111,7 +11096,14 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
const cl_ulong nb2 = dst->nb[2];
const cl_ulong nb3 = dst->nb[3];
cl_kernel kernel = backend_ctx->kernel_sum_rows_f32;
cl_kernel kernel;
const bool is_c4 = ne00 % 4 == 0;
if (is_c4) {
kernel = backend_ctx->kernel_sum_rows_f32_4;
} else {
kernel = backend_ctx->kernel_sum_rows_f32;
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -11128,7 +11120,7 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));
size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
size_t global_work_size[] = {64 * (size_t)ne01, (size_t)ne02, (size_t)ne03};
size_t local_work_size[] = {(size_t)64, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);

View File

@@ -3,80 +3,111 @@
//------------------------------------------------------------------------------
// expm1
//------------------------------------------------------------------------------
kernel void kernel_expm1_f32_nd(
global void * p_src0_base,
ulong off_src0_abs,
global void * p_dst_base,
ulong off_dst_abs,
int ne00,
int ne01,
int ne02,
int ne03,
kernel void kernel_expm1_f32(
global const float * src0,
ulong offset0,
global float * dst,
ulong offsetd
) {
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
dst[get_global_id(0)] = exp(src0[get_global_id(0)]) - 1.0f;
}
kernel void kernel_expm1_f32_4(
global const float4 * src0,
ulong offset0,
global float4 * dst,
ulong offsetd
) {
src0 = (global float4*)((global char*)src0 + offset0);
dst = (global float4*)((global char*)dst + offsetd);
dst[get_global_id(0)] = exp(src0[get_global_id(0)]) - 1.0f;
}
kernel void kernel_expm1_f16(
global const half * src0,
ulong offset0,
global half * dst,
ulong offsetd
) {
src0 = (global half*)((global char*)src0 + offset0);
dst = (global half*)((global char*)dst + offsetd);
dst[get_global_id(0)] = exp(src0[get_global_id(0)]) - 1.0h;
}
kernel void kernel_expm1_f16_4(
global const half4 * src0,
ulong offset0,
global half4 * dst,
ulong offsetd
) {
src0 = (global half4*)((global char*)src0 + offset0);
dst = (global half4*)((global char*)dst + offsetd);
dst[get_global_id(0)] = exp(src0[get_global_id(0)]) - 1.0h;
}
kernel void kernel_expm1_f32_nc(
global const char * src0,
ulong offset0,
global char * dst,
ulong offsetd,
int ne00,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
int ne13,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
int i0 = get_global_id(0);
int i1 = get_global_id(1);
int i2 = get_global_id(2);
src0 = src0 + offset0;
dst = dst + offsetd;
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
for (int i3 = 0; i3 < ne13; ++i3) {
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
const int i3 = get_group_id(2);
const int i2 = get_group_id(1);
const int i1 = get_group_id(0);
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
global const float * x = (global const float *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
global float * y = (global float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
*dst_val_ptr = exp(*src_val_ptr) - 1;
}
*y = exp(*x) - 1.0f;
}
}
kernel void kernel_expm1_f16_nd(
global void * p_src0_base,
ulong off_src0_abs,
global void * p_dst_base,
ulong off_dst_abs,
int ne00,
int ne01,
int ne02,
int ne03,
kernel void kernel_expm1_f16_nc(
global const char * src0,
ulong offset0,
global char * dst,
ulong offsetd,
int ne00,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
int ne13,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
int i0 = get_global_id(0);
int i1 = get_global_id(1);
int i2 = get_global_id(2);
src0 = src0 + offset0;
dst = dst + offsetd;
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
for (int i3 = 0; i3 < ne13; ++i3) {
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
const int i3 = get_group_id(2);
const int i2 = get_group_id(1);
const int i1 = get_group_id(0);
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
global const half * x = (global const half *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
global half * y = (global half *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
*dst_val_ptr = exp(*src_val_ptr) - 1;
}
*y = exp(*x) - 1.0f;
}
}

View File

@@ -1,8 +1,13 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
// Most devices have max workgroup size of 1024, so this is enough for subgroup
// sizes of 16, 32, 64 and 128. Increase this value for smaller subgroups sizes
#define MAX_SUBGROUPS 64
kernel void kernel_mean_f32(
global float * src0,
global char * src0,
ulong offset0,
global float * dst,
global char * dst,
ulong offsetd,
int ne00,
int ne01,
@@ -15,25 +20,121 @@ kernel void kernel_mean_f32(
ulong nb2,
ulong nb3
) {
src0 = (global float *)((global char *)src0 + offset0);
dst = (global float *)((global char *)dst + offsetd);
src0 = src0 + offset0;
dst = dst + offsetd;
int i3 = get_global_id(2);
int i2 = get_global_id(1);
int i1 = get_global_id(0);
const int i3 = get_group_id(2);
const int i2 = get_group_id(1);
const int i1 = get_group_id(0);
const int lid = get_local_id(0);
const int lsize = get_local_size(0);
const uint sg_size = get_sub_group_size();
const uint sg_id = get_sub_group_id();
const uint sg_lid = get_sub_group_local_id();
__local float lmem[MAX_SUBGROUPS];
if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
return;
}
global float * src_row = (global float *) ((global char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
global float * dst_row = (global float *) ((global char *) dst + i1*nb1 + i2*nb2 + i3*nb3);
float row_sum = 0;
for (int i0 = 0; i0 < ne00; i0++) {
row_sum += src_row[i0];
if(sg_id == 0){
lmem[sg_lid] = 0.0f;
}
dst_row[0] = row_sum / ne00;
global float * src_row = (global float *) (src0 + i1*nb01 + i2*nb02 + i3*nb03);
global float * dst_row = (global float *) (dst + i1*nb1 + i2*nb2 + i3*nb3);
float sumf = 0.0f;
for (int i0 = lid; i0 < ne00; i0 += lsize) {
sumf += src_row[i0];
}
sumf = sub_group_reduce_add(sumf);
barrier(CLK_LOCAL_MEM_FENCE);
if(sg_lid == 0){
lmem[sg_id] = sumf;
}
barrier(CLK_LOCAL_MEM_FENCE);
sumf = lmem[sg_lid];
sumf = sub_group_reduce_add(sumf);
if (lid == 0) {
dst_row[0] = sumf / ne00;
}
}
kernel void kernel_mean_f32_4(
global char * src0,
ulong offset0,
global char * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb01,
ulong nb02,
ulong nb03,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = src0 + offset0;
dst = dst + offsetd;
const int i3 = get_group_id(2);
const int i2 = get_group_id(1);
const int i1 = get_group_id(0);
const int lid = get_local_id(0);
const int lsize = get_local_size(0);
const uint sg_size = get_sub_group_size();
const uint sg_id = get_sub_group_id();
const uint sg_lid = get_sub_group_local_id();
__local float lmem[MAX_SUBGROUPS];
if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
return;
}
if(sg_id == 0){
lmem[sg_lid] = 0.0f;
}
global float4 * src_row = (global float4 *) (src0 + i1*nb01 + i2*nb02 + i3*nb03);
global float * dst_row = (global float *) (dst + i1*nb1 + i2*nb2 + i3*nb3);
float4 sum_vec = (float4)0.0f;
for (int i0 = lid; i0 < ne00 / 4; i0 += lsize) {
sum_vec += src_row[i0];
}
float sumf = dot(sum_vec, (float4)(1.0f));
sumf = sub_group_reduce_add(sumf);
barrier(CLK_LOCAL_MEM_FENCE);
if(sg_lid == 0){
lmem[sg_id] = sumf;
}
barrier(CLK_LOCAL_MEM_FENCE);
sumf = lmem[sg_lid];
sumf = sub_group_reduce_add(sumf);
if (lid == 0) {
dst_row[0] = sumf / ne00;
}
}

View File

@@ -3,86 +3,114 @@
//------------------------------------------------------------------------------
// softplus
//------------------------------------------------------------------------------
inline float softplus_f32(float x){
float ax = fabs(x);
float m = fmax(x, 0.0f);
return log1p(exp(-ax)) + m;
kernel void kernel_softplus_f32(
global const float * src0,
ulong offset0,
global float * dst,
ulong offsetd
) {
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
dst[get_global_id(0)] = (src0[get_global_id(0)] > 20.0f) ? src0[get_global_id(0)] : log(1.0f + exp(src0[get_global_id(0)]));
}
kernel void kernel_softplus_f32_nd(
global void * p_src0_base,
ulong off_src0_abs,
global void * p_dst_base,
ulong off_dst_abs,
int ne00,
int ne01,
int ne02,
int ne03,
kernel void kernel_softplus_f32_4(
global const float4 * src0,
ulong offset0,
global float4 * dst,
ulong offsetd
) {
src0 = (global float4*)((global char*)src0 + offset0);
dst = (global float4*)((global char*)dst + offsetd);
dst[get_global_id(0)] = (src0[get_global_id(0)] > 20.0f) ? src0[get_global_id(0)] : log(1.0f + exp(src0[get_global_id(0)]));
}
kernel void kernel_softplus_f16(
global const half * src0,
ulong offset0,
global half * dst,
ulong offsetd
) {
src0 = (global half*)((global char*)src0 + offset0);
dst = (global half*)((global char*)dst + offsetd);
const float x = convert_float(src0[get_global_id(0)]);
dst[get_global_id(0)] = convert_half_rte((x > 20.0f) ? x : log(1.0f + exp(x)));
}
kernel void kernel_softplus_f16_4(
global const half4 * src0,
ulong offset0,
global half4 * dst,
ulong offsetd
) {
src0 = (global half4*)((global char*)src0 + offset0);
dst = (global half4*)((global char*)dst + offsetd);
const float4 x = convert_float4(src0[get_global_id(0)]);
dst[get_global_id(0)] = convert_half4_rte((x > 20.0f) ? x : log(1.0f + exp(x)));
}
kernel void kernel_softplus_f32_nc(
global const char * src0,
ulong offset0,
global char * dst,
ulong offsetd,
int ne00,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
int ne13,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
int i0 = get_global_id(0);
int i1 = get_global_id(1);
int i2 = get_global_id(2);
src0 = src0 + offset0;
dst = dst + offsetd;
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
for (int i3 = 0; i3 < ne13; ++i3) {
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
const int i3 = get_group_id(2);
const int i2 = get_group_id(1);
const int i1 = get_group_id(0);
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
global const float * x = (global const float *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
global float * y = (global float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
*dst_val_ptr = softplus_f32(*src_val_ptr);
}
*y = (*x > 20.0f) ? *x : log(1.0f + exp(*x));
}
}
kernel void kernel_softplus_f16_nd(
global void * p_src0_base,
ulong off_src0_abs,
global void * p_dst_base,
ulong off_dst_abs,
int ne00,
int ne01,
int ne02,
int ne03,
kernel void kernel_softplus_f16_nc(
global const char * src0,
ulong offset0,
global char * dst,
ulong offsetd,
int ne00,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
int ne13,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
int i0 = get_global_id(0);
int i1 = get_global_id(1);
int i2 = get_global_id(2);
src0 = src0 + offset0;
dst = dst + offsetd;
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
for (int i3 = 0; i3 < ne13; ++i3) {
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
const int i3 = get_group_id(2);
const int i2 = get_group_id(1);
const int i1 = get_group_id(0);
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
global const half * hx = (global const half *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
global half * hy = (global half *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
*dst_val_ptr = (half)(softplus_f32((float)(*src_val_ptr)));
}
const float x = convert_float(*hx);
*hy = convert_half_rte((x > 20.0f) ? x : log(1.0f + exp(x)));
}
}

View File

@@ -1,8 +1,13 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
// Most devices have max workgroup size of 1024, so this is enough for subgroup
// sizes of 16, 32, 64 and 128. Increase this value for smaller subgroups sizes
#define MAX_SUBGROUPS 64
kernel void kernel_sum_rows_f32(
global float * src0,
global char * src0,
ulong offset0,
global float * dst,
global char * dst,
ulong offsetd,
int ne00,
int ne01,
@@ -15,25 +20,121 @@ kernel void kernel_sum_rows_f32(
ulong nb2,
ulong nb3
) {
src0 = (global float *)((global char *)src0 + offset0);
dst = (global float *)((global char *)dst + offsetd);
src0 = src0 + offset0;
dst = dst + offsetd;
int i3 = get_global_id(2);
int i2 = get_global_id(1);
int i1 = get_global_id(0);
const int i3 = get_group_id(2);
const int i2 = get_group_id(1);
const int i1 = get_group_id(0);
const int lid = get_local_id(0);
const int lsize = get_local_size(0);
const uint sg_size = get_sub_group_size();
const uint sg_id = get_sub_group_id();
const uint sg_lid = get_sub_group_local_id();
__local float lmem[MAX_SUBGROUPS];
if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
return;
}
global float * src_row = (global float *) ((global char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
global float * dst_row = (global float *) ((global char *) dst + i1*nb1 + i2*nb2 + i3*nb3);
float row_sum = 0;
for (int i0 = 0; i0 < ne00; i0++) {
row_sum += src_row[i0];
if(sg_id == 0){
lmem[sg_lid] = 0.0f;
}
dst_row[0] = row_sum;
global float * src_row = (global float *) (src0 + i1*nb01 + i2*nb02 + i3*nb03);
global float * dst_row = (global float *) (dst + i1*nb1 + i2*nb2 + i3*nb3);
float sumf = 0.0f;
for (int i0 = lid; i0 < ne00; i0 += lsize) {
sumf += src_row[i0];
}
sumf = sub_group_reduce_add(sumf);
barrier(CLK_LOCAL_MEM_FENCE);
if(sg_lid == 0){
lmem[sg_id] = sumf;
}
barrier(CLK_LOCAL_MEM_FENCE);
sumf = lmem[sg_lid];
sumf = sub_group_reduce_add(sumf);
if (lid == 0) {
dst_row[0] = sumf;
}
}
kernel void kernel_sum_rows_f32_4(
global char * src0,
ulong offset0,
global char * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb01,
ulong nb02,
ulong nb03,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = src0 + offset0;
dst = dst + offsetd;
const int i3 = get_group_id(2);
const int i2 = get_group_id(1);
const int i1 = get_group_id(0);
const int lid = get_local_id(0);
const int lsize = get_local_size(0);
const uint sg_size = get_sub_group_size();
const uint sg_id = get_sub_group_id();
const uint sg_lid = get_sub_group_local_id();
__local float lmem[MAX_SUBGROUPS];
if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
return;
}
if(sg_id == 0){
lmem[sg_lid] = 0.0f;
}
global float4 * src_row = (global float4 *) (src0 + i1*nb01 + i2*nb02 + i3*nb03);
global float * dst_row = (global float *) (dst + i1*nb1 + i2*nb2 + i3*nb3);
float4 sum_vec = (float4)0.0f;
for (int i0 = lid; i0 < ne00 / 4; i0 += lsize) {
sum_vec += src_row[i0];
}
float sumf = dot(sum_vec, (float4)(1.0f));
sumf = sub_group_reduce_add(sumf);
barrier(CLK_LOCAL_MEM_FENCE);
if(sg_lid == 0){
lmem[sg_id] = sumf;
}
barrier(CLK_LOCAL_MEM_FENCE);
sumf = lmem[sg_lid];
sumf = sub_group_reduce_add(sumf);
if (lid == 0) {
dst_row[0] = sumf;
}
}

View File

@@ -944,6 +944,7 @@ struct vk_mat_mat_push_constants {
uint32_t M; uint32_t N; uint32_t K;
uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
uint32_t base_work_group_z; uint32_t num_batches;
uint32_t k_split;
uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3;
uint32_t padded_N;
@@ -963,6 +964,7 @@ struct vk_mat_vec_push_constants {
uint32_t batch_stride_b;
uint32_t batch_stride_d;
uint32_t fusion_flags;
uint32_t base_work_group_y;
uint32_t ne02;
uint32_t ne12;
uint32_t broadcast2;
@@ -6773,8 +6775,16 @@ static void ggml_vk_matmul(
uint32_t padded_n) {
VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", padded_n: " << padded_n << ")");
if (split_k == 1) {
const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3, padded_n };
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, batch });
ggml_pipeline_request_descriptor_sets(ctx, pipeline, CEIL_DIV(batch, ctx->device->properties.limits.maxComputeWorkGroupCount[2]));
uint32_t base_work_group_z = 0;
while (base_work_group_z < batch) {
uint32_t groups_z = std::min(batch - base_work_group_z, ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, base_work_group_z, batch, k, ne02, ne12, broadcast2, broadcast3, padded_n };
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, groups_z });
base_work_group_z += groups_z;
}
return;
}
@@ -6788,9 +6798,17 @@ static void ggml_vk_matmul(
uint32_t k_split = CEIL_DIV(k, split_k);
k_split = ROUNDUP_POW2(k_split, 256);
const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k_split, ne02, ne12, broadcast2, broadcast3, padded_n };
// Make sure enough workgroups get assigned for split k to work
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
ggml_pipeline_request_descriptor_sets(ctx, pipeline, CEIL_DIV(batch, ctx->device->properties.limits.maxComputeWorkGroupCount[2]));
uint32_t base_work_group_z = 0;
while (base_work_group_z < batch) {
uint32_t groups_z = std::min(batch - base_work_group_z, ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, base_work_group_z, batch, k_split, ne02, ne12, broadcast2, broadcast3, padded_n };
// Make sure enough workgroups get assigned for split k to work
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, groups_z });
base_work_group_z += groups_z;
}
ggml_vk_sync_buffers(ctx, subctx);
const std::array<uint32_t, 2> pc2 = { (uint32_t)(m * n * batch), split_k };
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2, { m * n * batch, 1, 1 });
@@ -7186,7 +7204,6 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
}
// Request descriptor sets
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
if (qx_needs_dequant) {
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
}
@@ -7484,7 +7501,6 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
if (quantize_y) {
ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1);
}
ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
}
vk_subbuffer d_D = ggml_vk_tensor_subbuffer(ctx, cgraph->nodes[node_idx + ctx->num_additional_fused_ops]);
@@ -7579,22 +7595,29 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS1;
}
// compute
const vk_mat_vec_push_constants pc = {
(uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
stride_batch_x, stride_batch_y, stride_batch_d,
fusion_flags,
(uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
};
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
{
d_X,
d_Y,
d_D,
d_F0,
d_F1,
},
pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
ggml_pipeline_request_descriptor_sets(ctx, dmmv, CEIL_DIV(ne12 * ne13, ctx->device->properties.limits.maxComputeWorkGroupCount[1]));
uint32_t base_work_group_y = 0;
while (base_work_group_y < ne12 * ne13) {
uint32_t groups_y = std::min((uint32_t)(ne12 * ne13) - base_work_group_y, ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
const vk_mat_vec_push_constants pc = {
(uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
stride_batch_x, stride_batch_y, stride_batch_d,
fusion_flags, base_work_group_y,
(uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
};
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
{
d_X,
d_Y,
d_D,
d_F0,
d_F1,
},
pc, { groups_x, groups_y, groups_z });
base_work_group_y += groups_y;
}
if (x_non_contig) {
ctx->prealloc_x_need_sync = true;
@@ -7832,10 +7855,15 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, c
src1->nb[2] <= src1->nb[1] &&
src1->nb[1] <= src1->nb[3] &&
src0->ne[3] == 1 &&
src1->ne[3] == 1) {
src1->ne[3] == 1 &&
src0->ne[1] <= ctx->device->properties.limits.maxComputeWorkGroupCount[1] &&
src1->ne[2] <= ctx->device->properties.limits.maxComputeWorkGroupCount[2]) {
ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, cgraph, node_idx);
} else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1 &&
!ggml_is_permuted(src0) && !ggml_is_permuted(src1)) {
!ggml_is_permuted(src0) && !ggml_is_permuted(src1) &&
src0->ne[3] <= ctx->device->properties.limits.maxComputeWorkGroupCount[0] &&
src0->ne[1] <= ctx->device->properties.limits.maxComputeWorkGroupCount[1] &&
src1->ne[2] <= ctx->device->properties.limits.maxComputeWorkGroupCount[2]) {
ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, cgraph, node_idx);
// mul_mat_vec supports batching ne12*ne13 when ne11==1, or treating ne11 as the batch size (up to four)
// when ne12 and ne13 are one.
@@ -11560,7 +11588,6 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
}
}
ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
if (split_k > 1) {
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
@@ -12069,7 +12096,6 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
// y[i] = i % k;
}
ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
if (split_k > 1) {
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);

View File

@@ -32,6 +32,7 @@ layout (push_constant) uniform parameter
uint expert_i1;
uint nbi1;
#else
uint base_work_group_y;
uint ne02;
uint ne12;
uint broadcast2;
@@ -45,9 +46,9 @@ uint expert_id;
void get_offsets(out uint a_offset, out uint b_offset, out uint d_offset) {
#ifdef MUL_MAT_ID
const uint expert_i0 = gl_GlobalInvocationID.y;
const uint expert_i0 = gl_WorkGroupID.y;
#else
const uint batch_idx = gl_GlobalInvocationID.y;
const uint batch_idx = gl_WorkGroupID.y + p.base_work_group_y;
#endif
#ifndef MUL_MAT_ID

View File

@@ -90,6 +90,8 @@ layout (push_constant) uniform parameter
uint nbi1;
uint ne11;
#else
uint base_work_group_z;
uint num_batches;
uint k_split;
uint ne02;
uint ne12;
@@ -139,7 +141,7 @@ void main() {
const uint ic = gl_WorkGroupID.y;
#ifdef MUL_MAT_ID
const uint expert_idx = gl_GlobalInvocationID.z;
const uint expert_idx = gl_WorkGroupID.z;
if (ic * BN >= data_expert_count[expert_idx]) {
return;
}
@@ -149,7 +151,7 @@ void main() {
#endif
#ifndef MUL_MAT_ID
const uint batch_idx = gl_GlobalInvocationID.z;
const uint batch_idx = gl_WorkGroupID.z + p.base_work_group_z;
const uint i13 = batch_idx / p.ne12;
const uint i12 = batch_idx % p.ne12;
@@ -366,7 +368,7 @@ void main() {
const uint dc = ic * BN + warp_c * WN;
#ifndef MUL_MAT_ID
const uint offsets = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
const uint offsets = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * p.num_batches;
#endif
#ifdef COOPMAT

View File

@@ -53,6 +53,8 @@ layout (push_constant) uniform parameter
uint nbi1;
uint ne11;
#else
uint base_work_group_z;
uint num_batches;
uint k_split;
uint ne02;
uint ne12;
@@ -197,7 +199,7 @@ void main() {
const uint ic = gl_WorkGroupID.y;
#ifdef MUL_MAT_ID
const uint expert_idx = gl_GlobalInvocationID.z;
const uint expert_idx = gl_WorkGroupID.z;
if (ic * BN >= data_expert_count[expert_idx]) {
return;
}
@@ -215,7 +217,7 @@ void main() {
#endif
#ifndef MUL_MAT_ID
const uint batch_idx = gl_GlobalInvocationID.z;
const uint batch_idx = gl_WorkGroupID.z + p.base_work_group_z;
const uint i13 = batch_idx / p.ne12;
const uint i12 = batch_idx % p.ne12;
@@ -255,7 +257,7 @@ void main() {
#else
uint pos_a = batch_idx_a * (p.batch_stride_a / QUANT_K);
uint pos_b = batch_idx * p.batch_stride_b;
uint pos_d = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
uint pos_d = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * p.num_batches;
#endif
uint stride_a = p.stride_a / QUANT_K;

Binary file not shown.

View File

@@ -1,5 +1,5 @@
<script lang="ts">
import { RemoveButton } from '$lib/components/app';
import { ActionIconRemove } from '$lib/components/app';
import { formatFileSize, getFileTypeLabel, getPreviewText, isTextFile } from '$lib/utils';
import { AttachmentType } from '$lib/enums';
@@ -104,7 +104,7 @@
onclick={onClick}
>
<div class="absolute top-2 right-2 opacity-0 transition-opacity group-hover:opacity-100">
<RemoveButton {id} {onRemove} />
<ActionIconRemove {id} {onRemove} />
</div>
<div class="pr-8">
@@ -158,7 +158,7 @@
{#if !readonly}
<div class="absolute top-2 right-2 opacity-0 transition-opacity group-hover:opacity-100">
<RemoveButton {id} {onRemove} />
<ActionIconRemove {id} {onRemove} />
</div>
{/if}
</button>

View File

@@ -1,5 +1,5 @@
<script lang="ts">
import { RemoveButton } from '$lib/components/app';
import { ActionIconRemove } from '$lib/components/app';
interface Props {
id: string;
@@ -58,7 +58,7 @@
<div
class="absolute top-1 right-1 flex items-center justify-center opacity-0 transition-opacity group-hover:opacity-100"
>
<RemoveButton {id} {onRemove} class="text-white" />
<ActionIconRemove {id} {onRemove} class="text-white" />
</div>
{/if}
</div>

View File

@@ -5,6 +5,7 @@
interface Props {
class?: string;
disabled?: boolean;
onInput?: () => void;
onKeydown?: (event: KeyboardEvent) => void;
onPaste?: (event: ClipboardEvent) => void;
placeholder?: string;
@@ -14,6 +15,7 @@
let {
class: className = '',
disabled = false,
onInput,
onKeydown,
onPaste,
placeholder = 'Ask anything...',
@@ -52,7 +54,10 @@
class:cursor-not-allowed={disabled}
{disabled}
onkeydown={onKeydown}
oninput={(event) => autoResizeTextarea(event.currentTarget)}
oninput={(event) => {
autoResizeTextarea(event.currentTarget);
onInput?.();
}}
onpaste={onPaste}
{placeholder}
></textarea>

View File

@@ -14,12 +14,17 @@
</script>
<header
class="md:background-transparent pointer-events-none fixed top-0 right-0 left-0 z-50 flex items-center justify-end bg-background/40 p-4 backdrop-blur-xl duration-200 ease-linear {sidebar.open
class="pointer-events-none fixed top-0 right-0 left-0 z-50 flex items-center justify-end p-4 duration-200 ease-linear {sidebar.open
? 'md:left-[var(--sidebar-width)]'
: ''}"
>
<div class="pointer-events-auto flex items-center space-x-2">
<Button variant="ghost" size="sm" onclick={toggleSettings}>
<Button
variant="ghost"
size="icon"
onclick={toggleSettings}
class="rounded-full backdrop-blur-lg"
>
<Settings class="h-4 w-4" />
</Button>
</div>

View File

@@ -11,7 +11,7 @@
let isCurrentConversationLoading = $derived(isLoading());
let isStreaming = $derived(isChatStreaming());
let hasProcessingData = $derived(processingState.processingState !== null);
let processingDetails = $derived(processingState.getProcessingDetails());
let processingDetails = $derived(processingState.getTechnicalDetails());
let showProcessingInfo = $derived(
isCurrentConversationLoading || isStreaming || config().keepStatsVisible || hasProcessingData
@@ -63,7 +63,7 @@
<div class="chat-processing-info-container pointer-events-none" class:visible={showProcessingInfo}>
<div class="chat-processing-info-content">
{#each processingDetails as detail (detail)}
<span class="chat-processing-info-detail pointer-events-auto">{detail}</span>
<span class="chat-processing-info-detail pointer-events-auto backdrop-blur-sm">{detail}</span>
{/each}
</div>
</div>
@@ -73,7 +73,7 @@
position: sticky;
top: 0;
z-index: 10;
padding: 1.5rem 1rem;
padding: 0 1rem 0.75rem;
opacity: 0;
transform: translateY(50%);
transition:
@@ -100,7 +100,6 @@
color: var(--muted-foreground);
font-size: 0.75rem;
padding: 0.25rem 0.75rem;
background: var(--muted);
border-radius: 0.375rem;
font-family:
ui-monospace, SFMono-Regular, 'SF Mono', Consolas, 'Liberation Mono', Menlo, monospace;

View File

@@ -1,11 +1,10 @@
<script lang="ts">
import { Download, Upload, Trash2 } from '@lucide/svelte';
import { Button } from '$lib/components/ui/button';
import { DialogConversationSelection } from '$lib/components/app';
import { DialogConversationSelection, DialogConfirmation } from '$lib/components/app';
import { createMessageCountMap } from '$lib/utils';
import { conversationsStore, conversations } from '$lib/stores/conversations.svelte';
import { toast } from 'svelte-sonner';
import DialogConfirmation from '$lib/components/app/dialogs/DialogConfirmation.svelte';
let exportedConversations = $state<DatabaseConversation[]>([]);
let importedConversations = $state<DatabaseConversation[]>([]);

View File

@@ -9,7 +9,7 @@
import Input from '$lib/components/ui/input/input.svelte';
import { conversationsStore, conversations } from '$lib/stores/conversations.svelte';
import { chatStore } from '$lib/stores/chat.svelte';
import { getPreviewText } from '$lib/utils/text';
import { getPreviewText } from '$lib/utils';
import ChatSidebarActions from './ChatSidebarActions.svelte';
const sidebar = Sidebar.useSidebar();

View File

@@ -1,6 +1,6 @@
<script lang="ts">
import { Trash2, Pencil, MoreHorizontal, Download, Loader2, Square } from '@lucide/svelte';
import { ActionDropdown } from '$lib/components/app';
import { DropdownMenuActions } from '$lib/components/app';
import * as Tooltip from '$lib/components/ui/tooltip';
import { getAllLoadingChats } from '$lib/stores/chat.svelte';
import { conversationsStore } from '$lib/stores/conversations.svelte';
@@ -128,7 +128,7 @@
{#if renderActionsDropdown}
<div class="actions flex items-center">
<ActionDropdown
<DropdownMenuActions
triggerIcon={MoreHorizontal}
triggerTooltip="More actions"
bind:open={dropdownOpen}

View File

@@ -616,7 +616,7 @@
code={incompleteCodeBlock.code}
language={incompleteCodeBlock.language || 'text'}
disabled={true}
onPreview={(code: string, lang: string) => {
onPreview={(code, lang) => {
previewCode = code;
previewLanguage = lang;
previewDialogOpen = true;

View File

@@ -18,9 +18,13 @@ import { ServerRole } from '$lib/enums';
* - **Default Params**: Server-wide generation defaults
*/
class ServerStore {
// ─────────────────────────────────────────────────────────────────────────────
// State
// ─────────────────────────────────────────────────────────────────────────────
/**
*
*
* State
*
*
*/
props = $state<ApiLlamaCppServerProps | null>(null);
loading = $state(false);
@@ -28,16 +32,22 @@ class ServerStore {
role = $state<ServerRole | null>(null);
private fetchPromise: Promise<void> | null = null;
// ─────────────────────────────────────────────────────────────────────────────
// Getters
// ─────────────────────────────────────────────────────────────────────────────
/**
*
*
* Getters
*
*
*/
get defaultParams(): ApiLlamaCppServerProps['default_generation_settings']['params'] | null {
return this.props?.default_generation_settings?.params || null;
}
get contextSize(): number | null {
return this.props?.default_generation_settings?.n_ctx ?? null;
const nCtx = this.props?.default_generation_settings?.n_ctx;
return typeof nCtx === 'number' ? nCtx : null;
}
get webuiSettings(): Record<string, string | number | boolean> | undefined {
@@ -52,9 +62,13 @@ class ServerStore {
return this.role === ServerRole.MODEL;
}
// ─────────────────────────────────────────────────────────────────────────────
// Data Handling
// ─────────────────────────────────────────────────────────────────────────────
/**
*
*
* Data Handling
*
*
*/
async fetch(): Promise<void> {
if (this.fetchPromise) return this.fetchPromise;
@@ -115,9 +129,13 @@ class ServerStore {
this.fetchPromise = null;
}
// ─────────────────────────────────────────────────────────────────────────────
// Utilities
// ─────────────────────────────────────────────────────────────────────────────
/**
*
*
* Utilities
*
*
*/
private detectRole(props: ApiLlamaCppServerProps): void {
const newRole = props?.role === ServerRole.ROUTER ? ServerRole.ROUTER : ServerRole.MODEL;

View File

@@ -47,18 +47,26 @@ import {
} from '$lib/constants/localstorage-keys';
class SettingsStore {
// ─────────────────────────────────────────────────────────────────────────────
// State
// ─────────────────────────────────────────────────────────────────────────────
/**
*
*
* State
*
*
*/
config = $state<SettingsConfigType>({ ...SETTING_CONFIG_DEFAULT });
theme = $state<string>('auto');
isInitialized = $state(false);
userOverrides = $state<Set<string>>(new Set());
// ─────────────────────────────────────────────────────────────────────────────
// Utilities (private helpers)
// ─────────────────────────────────────────────────────────────────────────────
/**
*
*
* Utilities (private helpers)
*
*
*/
/**
* Helper method to get server defaults with null safety
@@ -76,9 +84,13 @@ class SettingsStore {
}
}
// ─────────────────────────────────────────────────────────────────────────────
// Lifecycle
// ─────────────────────────────────────────────────────────────────────────────
/**
*
*
* Lifecycle
*
*
*/
/**
* Initialize the settings store by loading from localStorage
@@ -130,9 +142,13 @@ class SettingsStore {
this.theme = localStorage.getItem('theme') || 'auto';
}
// ─────────────────────────────────────────────────────────────────────────────
// Config Updates
// ─────────────────────────────────────────────────────────────────────────────
/**
*
*
* Config Updates
*
*
*/
/**
* Update a specific configuration setting
@@ -234,9 +250,13 @@ class SettingsStore {
}
}
// ─────────────────────────────────────────────────────────────────────────────
// Reset
// ─────────────────────────────────────────────────────────────────────────────
/**
*
*
* Reset
*
*
*/
/**
* Reset configuration to defaults
@@ -285,9 +305,13 @@ class SettingsStore {
this.saveConfig();
}
// ─────────────────────────────────────────────────────────────────────────────
// Server Sync
// ─────────────────────────────────────────────────────────────────────────────
/**
*
*
* Server Sync
*
*
*/
/**
* Initialize settings with props defaults when server properties are first loaded
@@ -349,9 +373,13 @@ class SettingsStore {
this.saveConfig();
}
// ─────────────────────────────────────────────────────────────────────────────
// Utilities
// ─────────────────────────────────────────────────────────────────────────────
/**
*
*
* Utilities
*
*
*/
/**
* Get a specific configuration value

View File

@@ -44,8 +44,7 @@
<Story
name="Default"
args={{ class: 'max-w-[56rem] w-[calc(100vw-2rem)]' }}
play={async (context) => {
const { canvas, userEvent } = context;
play={async ({ canvas, userEvent }) => {
const textarea = await canvas.findByRole('textbox');
const submitButton = await canvas.findByRole('button', { name: 'Send' });
@@ -75,8 +74,7 @@
class: 'max-w-[56rem] w-[calc(100vw-2rem)]',
uploadedFiles: fileAttachments
}}
play={async (context) => {
const { canvas } = context;
play={async ({ canvas }) => {
const jpgAttachment = canvas.getByAltText('1.jpg');
const svgAttachment = canvas.getByAltText('hf-logo.svg');
const pdfFileExtension = canvas.getByText('PDF');