mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-26 14:23:22 +02:00
Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d0061be838 | ||
|
|
a569bda445 | ||
|
|
e2f19b320f | ||
|
|
983559d24b | ||
|
|
2b089c7758 | ||
|
|
afa6bfe4f7 | ||
|
|
ae2d3f28a8 | ||
|
|
ad8207af77 | ||
|
|
667b694278 | ||
|
|
e48349a49d |
2
.github/workflows/winget.yml
vendored
2
.github/workflows/winget.yml
vendored
@@ -17,7 +17,7 @@ jobs:
|
||||
|
||||
- name: Install komac
|
||||
run: |
|
||||
cargo binstall komac@2.11.2 -y
|
||||
cargo binstall komac@2.15.0 -y
|
||||
|
||||
- name: Find latest release
|
||||
id: find_latest_release
|
||||
|
||||
@@ -452,34 +452,6 @@ void string_replace_all(std::string & s, const std::string & search, const std::
|
||||
s = std::move(builder);
|
||||
}
|
||||
|
||||
bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
|
||||
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
|
||||
}
|
||||
|
||||
bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
|
||||
bool has_suffix = string_ends_with(str, suffix);
|
||||
if (has_suffix) {
|
||||
str = str.substr(0, str.size() - suffix.size());
|
||||
}
|
||||
return has_suffix;
|
||||
}
|
||||
|
||||
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
|
||||
if (!str.empty() && !stop.empty()) {
|
||||
const char text_last_char = str.back();
|
||||
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
|
||||
if (stop[char_index] == text_last_char) {
|
||||
const auto current_partial = stop.substr(0, char_index + 1);
|
||||
if (string_ends_with(str, current_partial)) {
|
||||
return str.size() - char_index - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return std::string::npos;
|
||||
}
|
||||
|
||||
std::string regex_escape(const std::string & s) {
|
||||
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
|
||||
return std::regex_replace(s, special_chars, "\\$&");
|
||||
|
||||
@@ -670,30 +670,55 @@ static std::vector<T> string_split(const std::string & str, char delim) {
|
||||
}
|
||||
|
||||
template<>
|
||||
inline std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
|
||||
inline std::vector<std::string> string_split<std::string>(const std::string & str, char delim)
|
||||
{
|
||||
std::vector<std::string> parts;
|
||||
size_t begin_pos = 0;
|
||||
size_t separator_pos = input.find(separator);
|
||||
while (separator_pos != std::string::npos) {
|
||||
std::string part = input.substr(begin_pos, separator_pos - begin_pos);
|
||||
size_t delim_pos = str.find(delim);
|
||||
while (delim_pos != std::string::npos) {
|
||||
std::string part = str.substr(begin_pos, delim_pos - begin_pos);
|
||||
parts.emplace_back(part);
|
||||
begin_pos = separator_pos + 1;
|
||||
separator_pos = input.find(separator, begin_pos);
|
||||
begin_pos = delim_pos + 1;
|
||||
delim_pos = str.find(delim, begin_pos);
|
||||
}
|
||||
parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
|
||||
parts.emplace_back(str.substr(begin_pos));
|
||||
return parts;
|
||||
}
|
||||
|
||||
inline bool string_starts_with(const std::string & str,
|
||||
const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
|
||||
return str.rfind(prefix, 0) == 0;
|
||||
// remove when moving to c++20
|
||||
inline bool string_starts_with(std::string_view str, std::string_view prefix) {
|
||||
return str.size() >= prefix.size() &&
|
||||
str.compare(0, prefix.size(), prefix) == 0;
|
||||
}
|
||||
|
||||
// While we wait for C++20's std::string::ends_with...
|
||||
bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
|
||||
bool string_remove_suffix(std::string & str, const std::string_view & suffix);
|
||||
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
|
||||
// remove when moving to c++20
|
||||
inline bool string_ends_with(std::string_view str, std::string_view suffix) {
|
||||
return str.size() >= suffix.size() &&
|
||||
str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
|
||||
}
|
||||
|
||||
inline bool string_remove_suffix(std::string & str, std::string_view suffix) {
|
||||
if (string_ends_with(str, suffix)) {
|
||||
str.resize(str.size() - suffix.size());
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
inline size_t string_find_partial_stop(std::string_view str, std::string_view stop) {
|
||||
if (!str.empty() && !stop.empty()) {
|
||||
const size_t max_len = std::min(str.size(), stop.size());
|
||||
const char last_char = str.back();
|
||||
for (size_t len = max_len; len > 0; --len) {
|
||||
if (stop[len - 1] == last_char) {
|
||||
if (string_ends_with(str, stop.substr(0, len))) {
|
||||
return str.size() - len;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return std::string::npos;
|
||||
}
|
||||
|
||||
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
||||
void string_process_escapes(std::string & input);
|
||||
|
||||
@@ -42,11 +42,15 @@ def load_model_and_tokenizer(model_path, device="auto"):
|
||||
config = config.text_config
|
||||
multimodal = True
|
||||
|
||||
print("Vocab size: ", config.vocab_size)
|
||||
print("Hidden size: ", config.hidden_size)
|
||||
print("Number of layers: ", config.num_hidden_layers)
|
||||
print("BOS token id: ", config.bos_token_id)
|
||||
print("EOS token id: ", config.eos_token_id)
|
||||
def print_if_exists(label, obj, attr, default="N/A"):
|
||||
val = getattr(obj, attr) if hasattr(obj, attr) else default
|
||||
print(f"{label}", val)
|
||||
|
||||
print_if_exists("Vocab size: ", config, "vocab_size")
|
||||
print_if_exists("Hidden size: ", config, "hidden_size")
|
||||
print_if_exists("Number of layers: ", config, "num_hidden_layers")
|
||||
print_if_exists("BOS token id: ", config, "bos_token_id")
|
||||
print_if_exists("EOS token id: ", config, "eos_token_id")
|
||||
|
||||
unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
|
||||
if unreleased_model_name:
|
||||
|
||||
@@ -78,7 +78,7 @@ def list_all_tensors(model_path: Path, unique: bool = False):
|
||||
print(tensor_name)
|
||||
|
||||
|
||||
def print_tensor_info(model_path: Path, tensor_name: str):
|
||||
def print_tensor_info(model_path: Path, tensor_name: str, num_values: Optional[int] = None):
|
||||
tensor_file = find_tensor_file(model_path, tensor_name)
|
||||
|
||||
if tensor_file is None:
|
||||
@@ -96,6 +96,12 @@ def print_tensor_info(model_path: Path, tensor_name: str):
|
||||
print(f"Tensor: {tensor_name}")
|
||||
print(f"File: {tensor_file}")
|
||||
print(f"Shape: {shape}")
|
||||
if num_values is not None:
|
||||
tensor = f.get_tensor(tensor_name)
|
||||
print(f"Dtype: {tensor.dtype}")
|
||||
flat = tensor.flatten()
|
||||
n = min(num_values, flat.numel())
|
||||
print(f"Values: {flat[:n].tolist()}")
|
||||
else:
|
||||
print(f"Error: Tensor '{tensor_name}' not found in {tensor_file}")
|
||||
sys.exit(1)
|
||||
@@ -127,6 +133,15 @@ def main():
|
||||
action="store_true",
|
||||
help="List unique tensor patterns in the model (layer numbers replaced with #)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-n", "--num-values",
|
||||
nargs="?",
|
||||
const=10,
|
||||
default=None,
|
||||
type=int,
|
||||
metavar="N",
|
||||
help="Print the first N values of the tensor flattened (default: 10 if flag is given without a number)"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -152,7 +167,7 @@ def main():
|
||||
if args.tensor_name is None:
|
||||
print("Error: tensor_name is required when not using --list")
|
||||
sys.exit(1)
|
||||
print_tensor_info(model_path, args.tensor_name)
|
||||
print_tensor_info(model_path, args.tensor_name, args.num_values)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -9,6 +9,11 @@ function(ggml_add_cpu_backend_features cpu_name arch)
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
|
||||
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
# Disable LTO for the feature detection code to prevent cross-module optimization
|
||||
# from inlining architecture-specific instructions into the score function.
|
||||
# Without this, LTO can cause SIGILL when loading backends on older CPUs
|
||||
# (e.g., loading power10 backend on power9 crashes before feature check runs).
|
||||
target_compile_options(${GGML_CPU_FEATS_NAME} PRIVATE -fno-lto)
|
||||
target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME})
|
||||
endfunction()
|
||||
|
||||
|
||||
@@ -2278,11 +2278,12 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
|
||||
// [TAG_MUL_MAT_ID_CUDA_GRAPHS]
|
||||
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
static_assert(MMVQ_MAX_BATCH_SIZE == MMVF_MAX_BATCH_SIZE);
|
||||
if (ne2 <= MMVQ_MAX_BATCH_SIZE) {
|
||||
if (ggml_is_quantized(src0->type)) {
|
||||
if (ne2 <= 4) {
|
||||
if (ne2 <= MMVQ_MMID_MAX_BATCH_SIZE) {
|
||||
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
|
||||
return;
|
||||
}
|
||||
@@ -2305,6 +2306,8 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
}
|
||||
}
|
||||
|
||||
// note: this path should not be reached when recording CUDA graphs, because it requires stream synchronization
|
||||
// TODO: add asserts to verify this. should work with CUDA, HIP, etc.
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(nb12 % nb11 == 0);
|
||||
@@ -2865,15 +2868,6 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
|
||||
bool use_cuda_graph = true;
|
||||
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
|
||||
|
||||
const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
|
||||
const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
|
||||
const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
|
||||
const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
|
||||
const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
|
||||
const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
|
||||
const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
|
||||
const std::string delta_net_prefix = "dnet_add";
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
@@ -2888,31 +2882,14 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
|
||||
#endif
|
||||
}
|
||||
|
||||
if (node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
|
||||
use_cuda_graph = false; // This node type is not supported by CUDA graph capture
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (node->op == GGML_OP_ADD &&
|
||||
node->src[1] && node->src[1]->ne[1] > 1 &&
|
||||
(node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) &&
|
||||
(node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) &&
|
||||
strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 &&
|
||||
strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
|
||||
strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
|
||||
strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
|
||||
strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0 &&
|
||||
strncmp(node->name, delta_net_prefix.c_str(), delta_net_prefix.size()) != 0) {
|
||||
// disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
|
||||
// by means of matching node names. See
|
||||
// https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
|
||||
// https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773,
|
||||
// Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
|
||||
// [TAG_MUL_MAT_ID_CUDA_GRAPHS]
|
||||
if (node->op == GGML_OP_MUL_MAT_ID && (!ggml_is_quantized(node->src[0]->type) || node->ne[2] > MMVQ_MMID_MAX_BATCH_SIZE)) {
|
||||
// under these conditions, the mul_mat_id operation will need to synchronize the stream, so we cannot use CUDA graphs
|
||||
// TODO: figure out a way to enable for larger batch sizes, without hurting performance
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/18958
|
||||
use_cuda_graph = false;
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
|
||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
|
||||
#define MMVQ_MMID_MAX_BATCH_SIZE 4 // Max. batch size for which to use MMVQ kernels for MUL_MAT_ID
|
||||
|
||||
void ggml_cuda_mul_mat_vec_q(ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, const ggml_cuda_mm_fusion_args_host * fusion = nullptr);
|
||||
|
||||
@@ -484,7 +484,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_scale_f32, kernel_scale_f32_4;
|
||||
cl_kernel kernel_sqr_cont_f32, kernel_sqr_cont_f32_4, kernel_sqr_cont_f16, kernel_sqr_cont_f16_4;
|
||||
cl_kernel kernel_sqrt_cont_f32, kernel_sqrt_cont_f32_4, kernel_sqrt_cont_f16, kernel_sqrt_cont_f16_4;
|
||||
cl_kernel kernel_mean_f32;
|
||||
cl_kernel kernel_mean_f32, kernel_mean_f32_4;
|
||||
cl_kernel kernel_silu, kernel_silu_4;
|
||||
cl_kernel kernel_gelu, kernel_gelu_4;
|
||||
cl_kernel kernel_gelu_erf, kernel_gelu_erf_4;
|
||||
@@ -543,15 +543,15 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_solve_tri_f32;
|
||||
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
|
||||
cl_kernel kernel_argsort_f32_i32;
|
||||
cl_kernel kernel_sum_rows_f32;
|
||||
cl_kernel kernel_sum_rows_f32, kernel_sum_rows_f32_4;
|
||||
cl_kernel kernel_repeat_f32;
|
||||
cl_kernel kernel_pad;
|
||||
cl_kernel kernel_tanh_f32, kernel_tanh_f32_4, kernel_tanh_f32_nc;
|
||||
cl_kernel kernel_tanh_f16, kernel_tanh_f16_4, kernel_tanh_f16_nc;
|
||||
cl_kernel kernel_expm1_f32_nd;
|
||||
cl_kernel kernel_expm1_f16_nd;
|
||||
cl_kernel kernel_softplus_f32_nd;
|
||||
cl_kernel kernel_softplus_f16_nd;
|
||||
cl_kernel kernel_expm1_f32, kernel_expm1_f32_4, kernel_expm1_f32_nc;
|
||||
cl_kernel kernel_expm1_f16, kernel_expm1_f16_4, kernel_expm1_f16_nc;
|
||||
cl_kernel kernel_softplus_f32, kernel_softplus_f32_4, kernel_softplus_f32_nc;
|
||||
cl_kernel kernel_softplus_f16, kernel_softplus_f16_4, kernel_softplus_f16_nc;
|
||||
cl_kernel kernel_upscale;
|
||||
cl_kernel kernel_upscale_bilinear;
|
||||
cl_kernel kernel_concat_f32;
|
||||
@@ -1837,6 +1837,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mean_f32 = clCreateKernel(prog, "kernel_mean_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_mean_f32_4 = clCreateKernel(prog, "kernel_mean_f32_4", &err), err));
|
||||
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
@@ -1874,6 +1875,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_sum_rows_f32 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_sum_rows_f32_4 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32_4", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
@@ -1978,20 +1980,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
#else
|
||||
const std::string kernel_src = read_file("expm1.cl");
|
||||
#endif
|
||||
cl_program prog;
|
||||
if (!kernel_src.empty()) {
|
||||
prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_expm1_f32_nd = clCreateKernel(prog, "kernel_expm1_f32_nd", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_expm1_f16_nd = clCreateKernel(prog, "kernel_expm1_f16_nd", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
} else {
|
||||
GGML_LOG_WARN("ggml_opencl: expm1 kernel source not found or empty. Expm1 operation will not be available.\n");
|
||||
prog = nullptr;
|
||||
backend_ctx->kernel_expm1_f32_nd = nullptr;
|
||||
backend_ctx->kernel_expm1_f16_nd = nullptr;
|
||||
}
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_expm1_f32 = clCreateKernel(prog, "kernel_expm1_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_expm1_f32_4 = clCreateKernel(prog, "kernel_expm1_f32_4", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_expm1_f32_nc = clCreateKernel(prog, "kernel_expm1_f32_nc", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_expm1_f16 = clCreateKernel(prog, "kernel_expm1_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_expm1_f16_4 = clCreateKernel(prog, "kernel_expm1_f16_4", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_expm1_f16_nc = clCreateKernel(prog, "kernel_expm1_f16_nc", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// softplus
|
||||
@@ -2003,20 +2001,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
#else
|
||||
const std::string kernel_src = read_file("softplus.cl");
|
||||
#endif
|
||||
cl_program prog;
|
||||
if (!kernel_src.empty()) {
|
||||
prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_softplus_f32_nd = clCreateKernel(prog, "kernel_softplus_f32_nd", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_softplus_f16_nd = clCreateKernel(prog, "kernel_softplus_f16_nd", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
} else {
|
||||
GGML_LOG_WARN("ggml_opencl: softplus kernel source not found or empty. Softplus operation will not be available.\n");
|
||||
prog = nullptr;
|
||||
backend_ctx->kernel_softplus_f32_nd = nullptr;
|
||||
backend_ctx->kernel_softplus_f16_nd = nullptr;
|
||||
}
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_softplus_f32 = clCreateKernel(prog, "kernel_softplus_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_softplus_f32_4 = clCreateKernel(prog, "kernel_softplus_f32_4", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_softplus_f32_nc = clCreateKernel(prog, "kernel_softplus_f32_nc", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_softplus_f16 = clCreateKernel(prog, "kernel_softplus_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_softplus_f16_4 = clCreateKernel(prog, "kernel_softplus_f16_4", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_softplus_f16_nc = clCreateKernel(prog, "kernel_softplus_f16_nc", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// upscale
|
||||
@@ -3463,11 +3457,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
case GGML_UNARY_OP_TANH:
|
||||
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
|
||||
case GGML_UNARY_OP_EXPM1:
|
||||
return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
|
||||
(op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
|
||||
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
|
||||
case GGML_UNARY_OP_SOFTPLUS:
|
||||
return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
|
||||
(op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
|
||||
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -3587,7 +3579,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
}
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_MEAN:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
const ggml_tensor * q = op->src[0];
|
||||
@@ -6400,7 +6392,6 @@ static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
GGML_UNUSED(src1);
|
||||
|
||||
GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
@@ -6423,7 +6414,14 @@ static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
const cl_ulong nb2 = dst->nb[2];
|
||||
const cl_ulong nb3 = dst->nb[3];
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_mean_f32;
|
||||
cl_kernel kernel;
|
||||
|
||||
const bool is_c4 = ne00 % 4 == 0;
|
||||
if (is_c4) {
|
||||
kernel = backend_ctx->kernel_mean_f32_4;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_mean_f32;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
@@ -6440,7 +6438,7 @@ static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
|
||||
size_t global_work_size[] = {64 * (size_t)ne01, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)64, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
@@ -7388,18 +7386,8 @@ static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, cons
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0_abs = extra0->offset + src0->view_offs;
|
||||
cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
|
||||
|
||||
cl_kernel kernel;
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_expm1_f32_nd;
|
||||
} else if (dst->type == GGML_TYPE_F16) {
|
||||
kernel = backend_ctx->kernel_expm1_f16_nd;
|
||||
} else {
|
||||
GGML_ASSERT(false && "Unsupported type for ggml_cl_expm1");
|
||||
}
|
||||
GGML_ASSERT(kernel != nullptr);
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
@@ -7411,70 +7399,74 @@ static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, cons
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
const cl_ulong nb03 = src0->nb[3];
|
||||
|
||||
const int ne10 = dst->ne[0];
|
||||
const int ne11 = dst->ne[1];
|
||||
const int ne12 = dst->ne[2];
|
||||
const int ne13 = dst->ne[3];
|
||||
const cl_ulong nb0 = dst->nb[0];
|
||||
const cl_ulong nb1 = dst->nb[1];
|
||||
const cl_ulong nb2 = dst->nb[2];
|
||||
const cl_ulong nb3 = dst->nb[3];
|
||||
|
||||
const cl_ulong nb10 = dst->nb[0];
|
||||
const cl_ulong nb11 = dst->nb[1];
|
||||
const cl_ulong nb12 = dst->nb[2];
|
||||
const cl_ulong nb13 = dst->nb[3];
|
||||
cl_kernel kernel;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
|
||||
|
||||
size_t global_work_size[3];
|
||||
if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
|
||||
return;
|
||||
}
|
||||
global_work_size[0] = (size_t)ne10;
|
||||
global_work_size[1] = (size_t)ne11;
|
||||
global_work_size[2] = (size_t)ne12;
|
||||
|
||||
size_t lws0 = 16, lws1 = 4, lws2 = 1;
|
||||
if (ne10 < 16) lws0 = ne10;
|
||||
if (ne11 < 4) lws1 = ne11;
|
||||
if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
|
||||
|
||||
while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
|
||||
while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
|
||||
while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
|
||||
|
||||
|
||||
size_t local_work_size[] = {lws0, lws1, lws2};
|
||||
|
||||
size_t* local_work_size_ptr = local_work_size;
|
||||
if (!backend_ctx->non_uniform_workgroups) {
|
||||
if (global_work_size[0] % local_work_size[0] != 0 ||
|
||||
global_work_size[1] % local_work_size[1] != 0 ||
|
||||
global_work_size[2] % local_work_size[2] != 0) {
|
||||
local_work_size_ptr = NULL;
|
||||
if (ggml_is_contiguous(src0)) {
|
||||
// Handle contiguous input
|
||||
int n = ggml_nelements(dst);
|
||||
if (n % 4 == 0) {
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_expm1_f32_4;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_expm1_f16_4;
|
||||
}
|
||||
n /= 4;
|
||||
} else {
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_expm1_f32;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_expm1_f16;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
|
||||
size_t global_work_size[] = {(size_t)n, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
size_t * local_work_size_ptr = local_work_size;
|
||||
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
local_work_size_ptr = nullptr;
|
||||
}
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
||||
} else {
|
||||
// Handle non-contiguous input
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_expm1_f32_nc;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_expm1_f16_nc;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));
|
||||
|
||||
int nth = 64;
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
@@ -7490,18 +7482,8 @@ static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0_abs = extra0->offset + src0->view_offs;
|
||||
cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
|
||||
|
||||
cl_kernel kernel;
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_softplus_f32_nd;
|
||||
} else if (dst->type == GGML_TYPE_F16) {
|
||||
kernel = backend_ctx->kernel_softplus_f16_nd;
|
||||
} else {
|
||||
GGML_ASSERT(false && "Unsupported type for ggml_cl_softplus");
|
||||
}
|
||||
GGML_ASSERT(kernel != nullptr);
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
@@ -7513,70 +7495,74 @@ static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
const cl_ulong nb03 = src0->nb[3];
|
||||
|
||||
const int ne10 = dst->ne[0];
|
||||
const int ne11 = dst->ne[1];
|
||||
const int ne12 = dst->ne[2];
|
||||
const int ne13 = dst->ne[3];
|
||||
const cl_ulong nb0 = dst->nb[0];
|
||||
const cl_ulong nb1 = dst->nb[1];
|
||||
const cl_ulong nb2 = dst->nb[2];
|
||||
const cl_ulong nb3 = dst->nb[3];
|
||||
|
||||
const cl_ulong nb10 = dst->nb[0];
|
||||
const cl_ulong nb11 = dst->nb[1];
|
||||
const cl_ulong nb12 = dst->nb[2];
|
||||
const cl_ulong nb13 = dst->nb[3];
|
||||
cl_kernel kernel;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
|
||||
|
||||
size_t global_work_size[3];
|
||||
if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
|
||||
return;
|
||||
}
|
||||
global_work_size[0] = (size_t)ne10;
|
||||
global_work_size[1] = (size_t)ne11;
|
||||
global_work_size[2] = (size_t)ne12;
|
||||
|
||||
size_t lws0 = 16, lws1 = 4, lws2 = 1;
|
||||
if (ne10 < 16) lws0 = ne10;
|
||||
if (ne11 < 4) lws1 = ne11;
|
||||
if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
|
||||
|
||||
while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
|
||||
while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
|
||||
while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
|
||||
|
||||
|
||||
size_t local_work_size[] = {lws0, lws1, lws2};
|
||||
|
||||
size_t* local_work_size_ptr = local_work_size;
|
||||
if (!backend_ctx->non_uniform_workgroups) {
|
||||
if (global_work_size[0] % local_work_size[0] != 0 ||
|
||||
global_work_size[1] % local_work_size[1] != 0 ||
|
||||
global_work_size[2] % local_work_size[2] != 0) {
|
||||
local_work_size_ptr = NULL;
|
||||
if (ggml_is_contiguous(src0)) {
|
||||
// Handle contiguous input
|
||||
int n = ggml_nelements(dst);
|
||||
if (n % 4 == 0) {
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_softplus_f32_4;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_softplus_f16_4;
|
||||
}
|
||||
n /= 4;
|
||||
} else {
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_softplus_f32;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_softplus_f16;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
|
||||
size_t global_work_size[] = {(size_t)n, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
size_t * local_work_size_ptr = local_work_size;
|
||||
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
local_work_size_ptr = nullptr;
|
||||
}
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
||||
} else {
|
||||
// Handle non-contiguous input
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_softplus_f32_nc;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_softplus_f16_nc;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));
|
||||
|
||||
int nth = 64;
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
|
||||
@@ -11088,7 +11074,6 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
GGML_UNUSED(src1);
|
||||
|
||||
GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
@@ -11111,7 +11096,14 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
const cl_ulong nb2 = dst->nb[2];
|
||||
const cl_ulong nb3 = dst->nb[3];
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_sum_rows_f32;
|
||||
cl_kernel kernel;
|
||||
|
||||
const bool is_c4 = ne00 % 4 == 0;
|
||||
if (is_c4) {
|
||||
kernel = backend_ctx->kernel_sum_rows_f32_4;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_sum_rows_f32;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
@@ -11128,7 +11120,7 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
|
||||
size_t global_work_size[] = {64 * (size_t)ne01, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)64, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
@@ -3,80 +3,111 @@
|
||||
//------------------------------------------------------------------------------
|
||||
// expm1
|
||||
//------------------------------------------------------------------------------
|
||||
kernel void kernel_expm1_f32_nd(
|
||||
global void * p_src0_base,
|
||||
ulong off_src0_abs,
|
||||
global void * p_dst_base,
|
||||
ulong off_dst_abs,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
|
||||
kernel void kernel_expm1_f32(
|
||||
global const float * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float*)((global char*)src0 + offset0);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
dst[get_global_id(0)] = exp(src0[get_global_id(0)]) - 1.0f;
|
||||
}
|
||||
|
||||
kernel void kernel_expm1_f32_4(
|
||||
global const float4 * src0,
|
||||
ulong offset0,
|
||||
global float4 * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float4*)((global char*)src0 + offset0);
|
||||
dst = (global float4*)((global char*)dst + offsetd);
|
||||
|
||||
dst[get_global_id(0)] = exp(src0[get_global_id(0)]) - 1.0f;
|
||||
}
|
||||
|
||||
kernel void kernel_expm1_f16(
|
||||
global const half * src0,
|
||||
ulong offset0,
|
||||
global half * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global half*)((global char*)src0 + offset0);
|
||||
dst = (global half*)((global char*)dst + offsetd);
|
||||
|
||||
dst[get_global_id(0)] = exp(src0[get_global_id(0)]) - 1.0h;
|
||||
}
|
||||
|
||||
kernel void kernel_expm1_f16_4(
|
||||
global const half4 * src0,
|
||||
ulong offset0,
|
||||
global half4 * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global half4*)((global char*)src0 + offset0);
|
||||
dst = (global half4*)((global char*)dst + offsetd);
|
||||
|
||||
dst[get_global_id(0)] = exp(src0[get_global_id(0)]) - 1.0h;
|
||||
}
|
||||
|
||||
kernel void kernel_expm1_f32_nc(
|
||||
global const char * src0,
|
||||
ulong offset0,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
int ne13,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
int i0 = get_global_id(0);
|
||||
int i1 = get_global_id(1);
|
||||
int i2 = get_global_id(2);
|
||||
src0 = src0 + offset0;
|
||||
dst = dst + offsetd;
|
||||
|
||||
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
|
||||
for (int i3 = 0; i3 < ne13; ++i3) {
|
||||
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
|
||||
global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
|
||||
const int i3 = get_group_id(2);
|
||||
const int i2 = get_group_id(1);
|
||||
const int i1 = get_group_id(0);
|
||||
|
||||
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
|
||||
global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
|
||||
for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
|
||||
global const float * x = (global const float *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
global float * y = (global float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
*dst_val_ptr = exp(*src_val_ptr) - 1;
|
||||
}
|
||||
*y = exp(*x) - 1.0f;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_expm1_f16_nd(
|
||||
global void * p_src0_base,
|
||||
ulong off_src0_abs,
|
||||
global void * p_dst_base,
|
||||
ulong off_dst_abs,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
kernel void kernel_expm1_f16_nc(
|
||||
global const char * src0,
|
||||
ulong offset0,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
int ne13,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
int i0 = get_global_id(0);
|
||||
int i1 = get_global_id(1);
|
||||
int i2 = get_global_id(2);
|
||||
src0 = src0 + offset0;
|
||||
dst = dst + offsetd;
|
||||
|
||||
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
|
||||
for (int i3 = 0; i3 < ne13; ++i3) {
|
||||
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
|
||||
global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
|
||||
const int i3 = get_group_id(2);
|
||||
const int i2 = get_group_id(1);
|
||||
const int i1 = get_group_id(0);
|
||||
|
||||
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
|
||||
global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
|
||||
for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
|
||||
global const half * x = (global const half *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
global half * y = (global half *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
*dst_val_ptr = exp(*src_val_ptr) - 1;
|
||||
}
|
||||
*y = exp(*x) - 1.0f;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,8 +1,13 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
|
||||
// Most devices have max workgroup size of 1024, so this is enough for subgroup
|
||||
// sizes of 16, 32, 64 and 128. Increase this value for smaller subgroups sizes
|
||||
#define MAX_SUBGROUPS 64
|
||||
kernel void kernel_mean_f32(
|
||||
global float * src0,
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
@@ -15,25 +20,121 @@ kernel void kernel_mean_f32(
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = (global float *)((global char *)src0 + offset0);
|
||||
dst = (global float *)((global char *)dst + offsetd);
|
||||
src0 = src0 + offset0;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i3 = get_global_id(2);
|
||||
int i2 = get_global_id(1);
|
||||
int i1 = get_global_id(0);
|
||||
const int i3 = get_group_id(2);
|
||||
const int i2 = get_group_id(1);
|
||||
const int i1 = get_group_id(0);
|
||||
|
||||
const int lid = get_local_id(0);
|
||||
const int lsize = get_local_size(0);
|
||||
|
||||
const uint sg_size = get_sub_group_size();
|
||||
const uint sg_id = get_sub_group_id();
|
||||
const uint sg_lid = get_sub_group_local_id();
|
||||
|
||||
__local float lmem[MAX_SUBGROUPS];
|
||||
|
||||
if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
global float * src_row = (global float *) ((global char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
|
||||
global float * dst_row = (global float *) ((global char *) dst + i1*nb1 + i2*nb2 + i3*nb3);
|
||||
|
||||
float row_sum = 0;
|
||||
|
||||
for (int i0 = 0; i0 < ne00; i0++) {
|
||||
row_sum += src_row[i0];
|
||||
if(sg_id == 0){
|
||||
lmem[sg_lid] = 0.0f;
|
||||
}
|
||||
|
||||
dst_row[0] = row_sum / ne00;
|
||||
global float * src_row = (global float *) (src0 + i1*nb01 + i2*nb02 + i3*nb03);
|
||||
global float * dst_row = (global float *) (dst + i1*nb1 + i2*nb2 + i3*nb3);
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int i0 = lid; i0 < ne00; i0 += lsize) {
|
||||
sumf += src_row[i0];
|
||||
}
|
||||
|
||||
sumf = sub_group_reduce_add(sumf);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(sg_lid == 0){
|
||||
lmem[sg_id] = sumf;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
sumf = lmem[sg_lid];
|
||||
sumf = sub_group_reduce_add(sumf);
|
||||
|
||||
if (lid == 0) {
|
||||
dst_row[0] = sumf / ne00;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_mean_f32_4(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
dst = dst + offsetd;
|
||||
|
||||
const int i3 = get_group_id(2);
|
||||
const int i2 = get_group_id(1);
|
||||
const int i1 = get_group_id(0);
|
||||
|
||||
const int lid = get_local_id(0);
|
||||
const int lsize = get_local_size(0);
|
||||
|
||||
const uint sg_size = get_sub_group_size();
|
||||
const uint sg_id = get_sub_group_id();
|
||||
const uint sg_lid = get_sub_group_local_id();
|
||||
|
||||
__local float lmem[MAX_SUBGROUPS];
|
||||
|
||||
if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
if(sg_id == 0){
|
||||
lmem[sg_lid] = 0.0f;
|
||||
}
|
||||
|
||||
global float4 * src_row = (global float4 *) (src0 + i1*nb01 + i2*nb02 + i3*nb03);
|
||||
global float * dst_row = (global float *) (dst + i1*nb1 + i2*nb2 + i3*nb3);
|
||||
|
||||
float4 sum_vec = (float4)0.0f;
|
||||
|
||||
for (int i0 = lid; i0 < ne00 / 4; i0 += lsize) {
|
||||
sum_vec += src_row[i0];
|
||||
}
|
||||
|
||||
float sumf = dot(sum_vec, (float4)(1.0f));
|
||||
sumf = sub_group_reduce_add(sumf);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(sg_lid == 0){
|
||||
lmem[sg_id] = sumf;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
sumf = lmem[sg_lid];
|
||||
sumf = sub_group_reduce_add(sumf);
|
||||
|
||||
if (lid == 0) {
|
||||
dst_row[0] = sumf / ne00;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,86 +3,114 @@
|
||||
//------------------------------------------------------------------------------
|
||||
// softplus
|
||||
//------------------------------------------------------------------------------
|
||||
inline float softplus_f32(float x){
|
||||
float ax = fabs(x);
|
||||
float m = fmax(x, 0.0f);
|
||||
return log1p(exp(-ax)) + m;
|
||||
|
||||
kernel void kernel_softplus_f32(
|
||||
global const float * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float*)((global char*)src0 + offset0);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
dst[get_global_id(0)] = (src0[get_global_id(0)] > 20.0f) ? src0[get_global_id(0)] : log(1.0f + exp(src0[get_global_id(0)]));
|
||||
}
|
||||
|
||||
kernel void kernel_softplus_f32_nd(
|
||||
global void * p_src0_base,
|
||||
ulong off_src0_abs,
|
||||
global void * p_dst_base,
|
||||
ulong off_dst_abs,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
kernel void kernel_softplus_f32_4(
|
||||
global const float4 * src0,
|
||||
ulong offset0,
|
||||
global float4 * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float4*)((global char*)src0 + offset0);
|
||||
dst = (global float4*)((global char*)dst + offsetd);
|
||||
|
||||
dst[get_global_id(0)] = (src0[get_global_id(0)] > 20.0f) ? src0[get_global_id(0)] : log(1.0f + exp(src0[get_global_id(0)]));
|
||||
}
|
||||
|
||||
kernel void kernel_softplus_f16(
|
||||
global const half * src0,
|
||||
ulong offset0,
|
||||
global half * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global half*)((global char*)src0 + offset0);
|
||||
dst = (global half*)((global char*)dst + offsetd);
|
||||
|
||||
const float x = convert_float(src0[get_global_id(0)]);
|
||||
dst[get_global_id(0)] = convert_half_rte((x > 20.0f) ? x : log(1.0f + exp(x)));
|
||||
}
|
||||
|
||||
kernel void kernel_softplus_f16_4(
|
||||
global const half4 * src0,
|
||||
ulong offset0,
|
||||
global half4 * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global half4*)((global char*)src0 + offset0);
|
||||
dst = (global half4*)((global char*)dst + offsetd);
|
||||
|
||||
const float4 x = convert_float4(src0[get_global_id(0)]);
|
||||
dst[get_global_id(0)] = convert_half4_rte((x > 20.0f) ? x : log(1.0f + exp(x)));
|
||||
}
|
||||
|
||||
kernel void kernel_softplus_f32_nc(
|
||||
global const char * src0,
|
||||
ulong offset0,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
int ne13,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
int i0 = get_global_id(0);
|
||||
int i1 = get_global_id(1);
|
||||
int i2 = get_global_id(2);
|
||||
src0 = src0 + offset0;
|
||||
dst = dst + offsetd;
|
||||
|
||||
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
|
||||
for (int i3 = 0; i3 < ne13; ++i3) {
|
||||
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
|
||||
global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
|
||||
const int i3 = get_group_id(2);
|
||||
const int i2 = get_group_id(1);
|
||||
const int i1 = get_group_id(0);
|
||||
|
||||
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
|
||||
global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
|
||||
for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
|
||||
global const float * x = (global const float *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
global float * y = (global float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
*dst_val_ptr = softplus_f32(*src_val_ptr);
|
||||
}
|
||||
*y = (*x > 20.0f) ? *x : log(1.0f + exp(*x));
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_softplus_f16_nd(
|
||||
global void * p_src0_base,
|
||||
ulong off_src0_abs,
|
||||
global void * p_dst_base,
|
||||
ulong off_dst_abs,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
kernel void kernel_softplus_f16_nc(
|
||||
global const char * src0,
|
||||
ulong offset0,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
int ne13,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
int i0 = get_global_id(0);
|
||||
int i1 = get_global_id(1);
|
||||
int i2 = get_global_id(2);
|
||||
src0 = src0 + offset0;
|
||||
dst = dst + offsetd;
|
||||
|
||||
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
|
||||
for (int i3 = 0; i3 < ne13; ++i3) {
|
||||
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
|
||||
global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
|
||||
const int i3 = get_group_id(2);
|
||||
const int i2 = get_group_id(1);
|
||||
const int i1 = get_group_id(0);
|
||||
|
||||
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
|
||||
global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
|
||||
for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
|
||||
global const half * hx = (global const half *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
global half * hy = (global half *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
*dst_val_ptr = (half)(softplus_f32((float)(*src_val_ptr)));
|
||||
}
|
||||
const float x = convert_float(*hx);
|
||||
*hy = convert_half_rte((x > 20.0f) ? x : log(1.0f + exp(x)));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,8 +1,13 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
|
||||
// Most devices have max workgroup size of 1024, so this is enough for subgroup
|
||||
// sizes of 16, 32, 64 and 128. Increase this value for smaller subgroups sizes
|
||||
#define MAX_SUBGROUPS 64
|
||||
kernel void kernel_sum_rows_f32(
|
||||
global float * src0,
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
@@ -15,25 +20,121 @@ kernel void kernel_sum_rows_f32(
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = (global float *)((global char *)src0 + offset0);
|
||||
dst = (global float *)((global char *)dst + offsetd);
|
||||
src0 = src0 + offset0;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i3 = get_global_id(2);
|
||||
int i2 = get_global_id(1);
|
||||
int i1 = get_global_id(0);
|
||||
const int i3 = get_group_id(2);
|
||||
const int i2 = get_group_id(1);
|
||||
const int i1 = get_group_id(0);
|
||||
|
||||
const int lid = get_local_id(0);
|
||||
const int lsize = get_local_size(0);
|
||||
|
||||
const uint sg_size = get_sub_group_size();
|
||||
const uint sg_id = get_sub_group_id();
|
||||
const uint sg_lid = get_sub_group_local_id();
|
||||
|
||||
__local float lmem[MAX_SUBGROUPS];
|
||||
|
||||
if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
global float * src_row = (global float *) ((global char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
|
||||
global float * dst_row = (global float *) ((global char *) dst + i1*nb1 + i2*nb2 + i3*nb3);
|
||||
|
||||
float row_sum = 0;
|
||||
|
||||
for (int i0 = 0; i0 < ne00; i0++) {
|
||||
row_sum += src_row[i0];
|
||||
if(sg_id == 0){
|
||||
lmem[sg_lid] = 0.0f;
|
||||
}
|
||||
|
||||
dst_row[0] = row_sum;
|
||||
global float * src_row = (global float *) (src0 + i1*nb01 + i2*nb02 + i3*nb03);
|
||||
global float * dst_row = (global float *) (dst + i1*nb1 + i2*nb2 + i3*nb3);
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int i0 = lid; i0 < ne00; i0 += lsize) {
|
||||
sumf += src_row[i0];
|
||||
}
|
||||
|
||||
sumf = sub_group_reduce_add(sumf);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(sg_lid == 0){
|
||||
lmem[sg_id] = sumf;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
sumf = lmem[sg_lid];
|
||||
sumf = sub_group_reduce_add(sumf);
|
||||
|
||||
if (lid == 0) {
|
||||
dst_row[0] = sumf;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_sum_rows_f32_4(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
dst = dst + offsetd;
|
||||
|
||||
const int i3 = get_group_id(2);
|
||||
const int i2 = get_group_id(1);
|
||||
const int i1 = get_group_id(0);
|
||||
|
||||
const int lid = get_local_id(0);
|
||||
const int lsize = get_local_size(0);
|
||||
|
||||
const uint sg_size = get_sub_group_size();
|
||||
const uint sg_id = get_sub_group_id();
|
||||
const uint sg_lid = get_sub_group_local_id();
|
||||
|
||||
__local float lmem[MAX_SUBGROUPS];
|
||||
|
||||
if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
if(sg_id == 0){
|
||||
lmem[sg_lid] = 0.0f;
|
||||
}
|
||||
|
||||
global float4 * src_row = (global float4 *) (src0 + i1*nb01 + i2*nb02 + i3*nb03);
|
||||
global float * dst_row = (global float *) (dst + i1*nb1 + i2*nb2 + i3*nb3);
|
||||
|
||||
float4 sum_vec = (float4)0.0f;
|
||||
|
||||
for (int i0 = lid; i0 < ne00 / 4; i0 += lsize) {
|
||||
sum_vec += src_row[i0];
|
||||
}
|
||||
|
||||
float sumf = dot(sum_vec, (float4)(1.0f));
|
||||
sumf = sub_group_reduce_add(sumf);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(sg_lid == 0){
|
||||
lmem[sg_id] = sumf;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
sumf = lmem[sg_lid];
|
||||
sumf = sub_group_reduce_add(sumf);
|
||||
|
||||
if (lid == 0) {
|
||||
dst_row[0] = sumf;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -944,6 +944,7 @@ struct vk_mat_mat_push_constants {
|
||||
uint32_t M; uint32_t N; uint32_t K;
|
||||
uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
|
||||
uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
|
||||
uint32_t base_work_group_z; uint32_t num_batches;
|
||||
uint32_t k_split;
|
||||
uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3;
|
||||
uint32_t padded_N;
|
||||
@@ -963,6 +964,7 @@ struct vk_mat_vec_push_constants {
|
||||
uint32_t batch_stride_b;
|
||||
uint32_t batch_stride_d;
|
||||
uint32_t fusion_flags;
|
||||
uint32_t base_work_group_y;
|
||||
uint32_t ne02;
|
||||
uint32_t ne12;
|
||||
uint32_t broadcast2;
|
||||
@@ -6773,8 +6775,16 @@ static void ggml_vk_matmul(
|
||||
uint32_t padded_n) {
|
||||
VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", padded_n: " << padded_n << ")");
|
||||
if (split_k == 1) {
|
||||
const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3, padded_n };
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, batch });
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline, CEIL_DIV(batch, ctx->device->properties.limits.maxComputeWorkGroupCount[2]));
|
||||
|
||||
uint32_t base_work_group_z = 0;
|
||||
while (base_work_group_z < batch) {
|
||||
uint32_t groups_z = std::min(batch - base_work_group_z, ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
|
||||
|
||||
const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, base_work_group_z, batch, k, ne02, ne12, broadcast2, broadcast3, padded_n };
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, groups_z });
|
||||
base_work_group_z += groups_z;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -6788,9 +6798,17 @@ static void ggml_vk_matmul(
|
||||
uint32_t k_split = CEIL_DIV(k, split_k);
|
||||
k_split = ROUNDUP_POW2(k_split, 256);
|
||||
|
||||
const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k_split, ne02, ne12, broadcast2, broadcast3, padded_n };
|
||||
// Make sure enough workgroups get assigned for split k to work
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline, CEIL_DIV(batch, ctx->device->properties.limits.maxComputeWorkGroupCount[2]));
|
||||
|
||||
uint32_t base_work_group_z = 0;
|
||||
while (base_work_group_z < batch) {
|
||||
uint32_t groups_z = std::min(batch - base_work_group_z, ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
|
||||
|
||||
const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, base_work_group_z, batch, k_split, ne02, ne12, broadcast2, broadcast3, padded_n };
|
||||
// Make sure enough workgroups get assigned for split k to work
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, groups_z });
|
||||
base_work_group_z += groups_z;
|
||||
}
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
const std::array<uint32_t, 2> pc2 = { (uint32_t)(m * n * batch), split_k };
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2, { m * n * batch, 1, 1 });
|
||||
@@ -7186,7 +7204,6 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
||||
}
|
||||
|
||||
// Request descriptor sets
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
||||
if (qx_needs_dequant) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
|
||||
}
|
||||
@@ -7484,7 +7501,6 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
if (quantize_y) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1);
|
||||
}
|
||||
ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
|
||||
}
|
||||
|
||||
vk_subbuffer d_D = ggml_vk_tensor_subbuffer(ctx, cgraph->nodes[node_idx + ctx->num_additional_fused_ops]);
|
||||
@@ -7579,22 +7595,29 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
fusion_flags |= MAT_VEC_FUSION_FLAGS_BIAS1;
|
||||
}
|
||||
|
||||
// compute
|
||||
const vk_mat_vec_push_constants pc = {
|
||||
(uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
|
||||
stride_batch_x, stride_batch_y, stride_batch_d,
|
||||
fusion_flags,
|
||||
(uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
|
||||
};
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
|
||||
{
|
||||
d_X,
|
||||
d_Y,
|
||||
d_D,
|
||||
d_F0,
|
||||
d_F1,
|
||||
},
|
||||
pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
|
||||
ggml_pipeline_request_descriptor_sets(ctx, dmmv, CEIL_DIV(ne12 * ne13, ctx->device->properties.limits.maxComputeWorkGroupCount[1]));
|
||||
|
||||
uint32_t base_work_group_y = 0;
|
||||
while (base_work_group_y < ne12 * ne13) {
|
||||
|
||||
uint32_t groups_y = std::min((uint32_t)(ne12 * ne13) - base_work_group_y, ctx->device->properties.limits.maxComputeWorkGroupCount[1]);
|
||||
const vk_mat_vec_push_constants pc = {
|
||||
(uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
|
||||
stride_batch_x, stride_batch_y, stride_batch_d,
|
||||
fusion_flags, base_work_group_y,
|
||||
(uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
|
||||
};
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
|
||||
{
|
||||
d_X,
|
||||
d_Y,
|
||||
d_D,
|
||||
d_F0,
|
||||
d_F1,
|
||||
},
|
||||
pc, { groups_x, groups_y, groups_z });
|
||||
base_work_group_y += groups_y;
|
||||
}
|
||||
|
||||
if (x_non_contig) {
|
||||
ctx->prealloc_x_need_sync = true;
|
||||
@@ -7832,10 +7855,15 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, c
|
||||
src1->nb[2] <= src1->nb[1] &&
|
||||
src1->nb[1] <= src1->nb[3] &&
|
||||
src0->ne[3] == 1 &&
|
||||
src1->ne[3] == 1) {
|
||||
src1->ne[3] == 1 &&
|
||||
src0->ne[1] <= ctx->device->properties.limits.maxComputeWorkGroupCount[1] &&
|
||||
src1->ne[2] <= ctx->device->properties.limits.maxComputeWorkGroupCount[2]) {
|
||||
ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, cgraph, node_idx);
|
||||
} else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1 &&
|
||||
!ggml_is_permuted(src0) && !ggml_is_permuted(src1)) {
|
||||
!ggml_is_permuted(src0) && !ggml_is_permuted(src1) &&
|
||||
src0->ne[3] <= ctx->device->properties.limits.maxComputeWorkGroupCount[0] &&
|
||||
src0->ne[1] <= ctx->device->properties.limits.maxComputeWorkGroupCount[1] &&
|
||||
src1->ne[2] <= ctx->device->properties.limits.maxComputeWorkGroupCount[2]) {
|
||||
ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, cgraph, node_idx);
|
||||
// mul_mat_vec supports batching ne12*ne13 when ne11==1, or treating ne11 as the batch size (up to four)
|
||||
// when ne12 and ne13 are one.
|
||||
@@ -11560,7 +11588,6 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
||||
}
|
||||
}
|
||||
|
||||
ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
|
||||
if (split_k > 1) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
|
||||
|
||||
@@ -12069,7 +12096,6 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
||||
// y[i] = i % k;
|
||||
}
|
||||
|
||||
ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
|
||||
if (split_k > 1) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
|
||||
|
||||
|
||||
@@ -32,6 +32,7 @@ layout (push_constant) uniform parameter
|
||||
uint expert_i1;
|
||||
uint nbi1;
|
||||
#else
|
||||
uint base_work_group_y;
|
||||
uint ne02;
|
||||
uint ne12;
|
||||
uint broadcast2;
|
||||
@@ -45,9 +46,9 @@ uint expert_id;
|
||||
|
||||
void get_offsets(out uint a_offset, out uint b_offset, out uint d_offset) {
|
||||
#ifdef MUL_MAT_ID
|
||||
const uint expert_i0 = gl_GlobalInvocationID.y;
|
||||
const uint expert_i0 = gl_WorkGroupID.y;
|
||||
#else
|
||||
const uint batch_idx = gl_GlobalInvocationID.y;
|
||||
const uint batch_idx = gl_WorkGroupID.y + p.base_work_group_y;
|
||||
#endif
|
||||
|
||||
#ifndef MUL_MAT_ID
|
||||
|
||||
@@ -90,6 +90,8 @@ layout (push_constant) uniform parameter
|
||||
uint nbi1;
|
||||
uint ne11;
|
||||
#else
|
||||
uint base_work_group_z;
|
||||
uint num_batches;
|
||||
uint k_split;
|
||||
uint ne02;
|
||||
uint ne12;
|
||||
@@ -139,7 +141,7 @@ void main() {
|
||||
const uint ic = gl_WorkGroupID.y;
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
const uint expert_idx = gl_GlobalInvocationID.z;
|
||||
const uint expert_idx = gl_WorkGroupID.z;
|
||||
if (ic * BN >= data_expert_count[expert_idx]) {
|
||||
return;
|
||||
}
|
||||
@@ -149,7 +151,7 @@ void main() {
|
||||
#endif
|
||||
|
||||
#ifndef MUL_MAT_ID
|
||||
const uint batch_idx = gl_GlobalInvocationID.z;
|
||||
const uint batch_idx = gl_WorkGroupID.z + p.base_work_group_z;
|
||||
|
||||
const uint i13 = batch_idx / p.ne12;
|
||||
const uint i12 = batch_idx % p.ne12;
|
||||
@@ -366,7 +368,7 @@ void main() {
|
||||
const uint dc = ic * BN + warp_c * WN;
|
||||
|
||||
#ifndef MUL_MAT_ID
|
||||
const uint offsets = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
|
||||
const uint offsets = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * p.num_batches;
|
||||
#endif
|
||||
|
||||
#ifdef COOPMAT
|
||||
|
||||
@@ -53,6 +53,8 @@ layout (push_constant) uniform parameter
|
||||
uint nbi1;
|
||||
uint ne11;
|
||||
#else
|
||||
uint base_work_group_z;
|
||||
uint num_batches;
|
||||
uint k_split;
|
||||
uint ne02;
|
||||
uint ne12;
|
||||
@@ -197,7 +199,7 @@ void main() {
|
||||
const uint ic = gl_WorkGroupID.y;
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
const uint expert_idx = gl_GlobalInvocationID.z;
|
||||
const uint expert_idx = gl_WorkGroupID.z;
|
||||
if (ic * BN >= data_expert_count[expert_idx]) {
|
||||
return;
|
||||
}
|
||||
@@ -215,7 +217,7 @@ void main() {
|
||||
#endif
|
||||
|
||||
#ifndef MUL_MAT_ID
|
||||
const uint batch_idx = gl_GlobalInvocationID.z;
|
||||
const uint batch_idx = gl_WorkGroupID.z + p.base_work_group_z;
|
||||
|
||||
const uint i13 = batch_idx / p.ne12;
|
||||
const uint i12 = batch_idx % p.ne12;
|
||||
@@ -255,7 +257,7 @@ void main() {
|
||||
#else
|
||||
uint pos_a = batch_idx_a * (p.batch_stride_a / QUANT_K);
|
||||
uint pos_b = batch_idx * p.batch_stride_b;
|
||||
uint pos_d = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
|
||||
uint pos_d = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * p.num_batches;
|
||||
#endif
|
||||
|
||||
uint stride_a = p.stride_a / QUANT_K;
|
||||
|
||||
Binary file not shown.
@@ -1,5 +1,5 @@
|
||||
<script lang="ts">
|
||||
import { RemoveButton } from '$lib/components/app';
|
||||
import { ActionIconRemove } from '$lib/components/app';
|
||||
import { formatFileSize, getFileTypeLabel, getPreviewText, isTextFile } from '$lib/utils';
|
||||
import { AttachmentType } from '$lib/enums';
|
||||
|
||||
@@ -104,7 +104,7 @@
|
||||
onclick={onClick}
|
||||
>
|
||||
<div class="absolute top-2 right-2 opacity-0 transition-opacity group-hover:opacity-100">
|
||||
<RemoveButton {id} {onRemove} />
|
||||
<ActionIconRemove {id} {onRemove} />
|
||||
</div>
|
||||
|
||||
<div class="pr-8">
|
||||
@@ -158,7 +158,7 @@
|
||||
|
||||
{#if !readonly}
|
||||
<div class="absolute top-2 right-2 opacity-0 transition-opacity group-hover:opacity-100">
|
||||
<RemoveButton {id} {onRemove} />
|
||||
<ActionIconRemove {id} {onRemove} />
|
||||
</div>
|
||||
{/if}
|
||||
</button>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
<script lang="ts">
|
||||
import { RemoveButton } from '$lib/components/app';
|
||||
import { ActionIconRemove } from '$lib/components/app';
|
||||
|
||||
interface Props {
|
||||
id: string;
|
||||
@@ -58,7 +58,7 @@
|
||||
<div
|
||||
class="absolute top-1 right-1 flex items-center justify-center opacity-0 transition-opacity group-hover:opacity-100"
|
||||
>
|
||||
<RemoveButton {id} {onRemove} class="text-white" />
|
||||
<ActionIconRemove {id} {onRemove} class="text-white" />
|
||||
</div>
|
||||
{/if}
|
||||
</div>
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
interface Props {
|
||||
class?: string;
|
||||
disabled?: boolean;
|
||||
onInput?: () => void;
|
||||
onKeydown?: (event: KeyboardEvent) => void;
|
||||
onPaste?: (event: ClipboardEvent) => void;
|
||||
placeholder?: string;
|
||||
@@ -14,6 +15,7 @@
|
||||
let {
|
||||
class: className = '',
|
||||
disabled = false,
|
||||
onInput,
|
||||
onKeydown,
|
||||
onPaste,
|
||||
placeholder = 'Ask anything...',
|
||||
@@ -52,7 +54,10 @@
|
||||
class:cursor-not-allowed={disabled}
|
||||
{disabled}
|
||||
onkeydown={onKeydown}
|
||||
oninput={(event) => autoResizeTextarea(event.currentTarget)}
|
||||
oninput={(event) => {
|
||||
autoResizeTextarea(event.currentTarget);
|
||||
onInput?.();
|
||||
}}
|
||||
onpaste={onPaste}
|
||||
{placeholder}
|
||||
></textarea>
|
||||
|
||||
@@ -14,12 +14,17 @@
|
||||
</script>
|
||||
|
||||
<header
|
||||
class="md:background-transparent pointer-events-none fixed top-0 right-0 left-0 z-50 flex items-center justify-end bg-background/40 p-4 backdrop-blur-xl duration-200 ease-linear {sidebar.open
|
||||
class="pointer-events-none fixed top-0 right-0 left-0 z-50 flex items-center justify-end p-4 duration-200 ease-linear {sidebar.open
|
||||
? 'md:left-[var(--sidebar-width)]'
|
||||
: ''}"
|
||||
>
|
||||
<div class="pointer-events-auto flex items-center space-x-2">
|
||||
<Button variant="ghost" size="sm" onclick={toggleSettings}>
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
onclick={toggleSettings}
|
||||
class="rounded-full backdrop-blur-lg"
|
||||
>
|
||||
<Settings class="h-4 w-4" />
|
||||
</Button>
|
||||
</div>
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
let isCurrentConversationLoading = $derived(isLoading());
|
||||
let isStreaming = $derived(isChatStreaming());
|
||||
let hasProcessingData = $derived(processingState.processingState !== null);
|
||||
let processingDetails = $derived(processingState.getProcessingDetails());
|
||||
let processingDetails = $derived(processingState.getTechnicalDetails());
|
||||
|
||||
let showProcessingInfo = $derived(
|
||||
isCurrentConversationLoading || isStreaming || config().keepStatsVisible || hasProcessingData
|
||||
@@ -63,7 +63,7 @@
|
||||
<div class="chat-processing-info-container pointer-events-none" class:visible={showProcessingInfo}>
|
||||
<div class="chat-processing-info-content">
|
||||
{#each processingDetails as detail (detail)}
|
||||
<span class="chat-processing-info-detail pointer-events-auto">{detail}</span>
|
||||
<span class="chat-processing-info-detail pointer-events-auto backdrop-blur-sm">{detail}</span>
|
||||
{/each}
|
||||
</div>
|
||||
</div>
|
||||
@@ -73,7 +73,7 @@
|
||||
position: sticky;
|
||||
top: 0;
|
||||
z-index: 10;
|
||||
padding: 1.5rem 1rem;
|
||||
padding: 0 1rem 0.75rem;
|
||||
opacity: 0;
|
||||
transform: translateY(50%);
|
||||
transition:
|
||||
@@ -100,7 +100,6 @@
|
||||
color: var(--muted-foreground);
|
||||
font-size: 0.75rem;
|
||||
padding: 0.25rem 0.75rem;
|
||||
background: var(--muted);
|
||||
border-radius: 0.375rem;
|
||||
font-family:
|
||||
ui-monospace, SFMono-Regular, 'SF Mono', Consolas, 'Liberation Mono', Menlo, monospace;
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
<script lang="ts">
|
||||
import { Download, Upload, Trash2 } from '@lucide/svelte';
|
||||
import { Button } from '$lib/components/ui/button';
|
||||
import { DialogConversationSelection } from '$lib/components/app';
|
||||
import { DialogConversationSelection, DialogConfirmation } from '$lib/components/app';
|
||||
import { createMessageCountMap } from '$lib/utils';
|
||||
import { conversationsStore, conversations } from '$lib/stores/conversations.svelte';
|
||||
import { toast } from 'svelte-sonner';
|
||||
import DialogConfirmation from '$lib/components/app/dialogs/DialogConfirmation.svelte';
|
||||
|
||||
let exportedConversations = $state<DatabaseConversation[]>([]);
|
||||
let importedConversations = $state<DatabaseConversation[]>([]);
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
import Input from '$lib/components/ui/input/input.svelte';
|
||||
import { conversationsStore, conversations } from '$lib/stores/conversations.svelte';
|
||||
import { chatStore } from '$lib/stores/chat.svelte';
|
||||
import { getPreviewText } from '$lib/utils/text';
|
||||
import { getPreviewText } from '$lib/utils';
|
||||
import ChatSidebarActions from './ChatSidebarActions.svelte';
|
||||
|
||||
const sidebar = Sidebar.useSidebar();
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
<script lang="ts">
|
||||
import { Trash2, Pencil, MoreHorizontal, Download, Loader2, Square } from '@lucide/svelte';
|
||||
import { ActionDropdown } from '$lib/components/app';
|
||||
import { DropdownMenuActions } from '$lib/components/app';
|
||||
import * as Tooltip from '$lib/components/ui/tooltip';
|
||||
import { getAllLoadingChats } from '$lib/stores/chat.svelte';
|
||||
import { conversationsStore } from '$lib/stores/conversations.svelte';
|
||||
@@ -128,7 +128,7 @@
|
||||
|
||||
{#if renderActionsDropdown}
|
||||
<div class="actions flex items-center">
|
||||
<ActionDropdown
|
||||
<DropdownMenuActions
|
||||
triggerIcon={MoreHorizontal}
|
||||
triggerTooltip="More actions"
|
||||
bind:open={dropdownOpen}
|
||||
|
||||
@@ -616,7 +616,7 @@
|
||||
code={incompleteCodeBlock.code}
|
||||
language={incompleteCodeBlock.language || 'text'}
|
||||
disabled={true}
|
||||
onPreview={(code: string, lang: string) => {
|
||||
onPreview={(code, lang) => {
|
||||
previewCode = code;
|
||||
previewLanguage = lang;
|
||||
previewDialogOpen = true;
|
||||
|
||||
@@ -18,9 +18,13 @@ import { ServerRole } from '$lib/enums';
|
||||
* - **Default Params**: Server-wide generation defaults
|
||||
*/
|
||||
class ServerStore {
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// State
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
/**
|
||||
*
|
||||
*
|
||||
* State
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
props = $state<ApiLlamaCppServerProps | null>(null);
|
||||
loading = $state(false);
|
||||
@@ -28,16 +32,22 @@ class ServerStore {
|
||||
role = $state<ServerRole | null>(null);
|
||||
private fetchPromise: Promise<void> | null = null;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Getters
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
/**
|
||||
*
|
||||
*
|
||||
* Getters
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
get defaultParams(): ApiLlamaCppServerProps['default_generation_settings']['params'] | null {
|
||||
return this.props?.default_generation_settings?.params || null;
|
||||
}
|
||||
|
||||
get contextSize(): number | null {
|
||||
return this.props?.default_generation_settings?.n_ctx ?? null;
|
||||
const nCtx = this.props?.default_generation_settings?.n_ctx;
|
||||
|
||||
return typeof nCtx === 'number' ? nCtx : null;
|
||||
}
|
||||
|
||||
get webuiSettings(): Record<string, string | number | boolean> | undefined {
|
||||
@@ -52,9 +62,13 @@ class ServerStore {
|
||||
return this.role === ServerRole.MODEL;
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Data Handling
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
/**
|
||||
*
|
||||
*
|
||||
* Data Handling
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
async fetch(): Promise<void> {
|
||||
if (this.fetchPromise) return this.fetchPromise;
|
||||
@@ -115,9 +129,13 @@ class ServerStore {
|
||||
this.fetchPromise = null;
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Utilities
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
/**
|
||||
*
|
||||
*
|
||||
* Utilities
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
private detectRole(props: ApiLlamaCppServerProps): void {
|
||||
const newRole = props?.role === ServerRole.ROUTER ? ServerRole.ROUTER : ServerRole.MODEL;
|
||||
|
||||
@@ -47,18 +47,26 @@ import {
|
||||
} from '$lib/constants/localstorage-keys';
|
||||
|
||||
class SettingsStore {
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// State
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
/**
|
||||
*
|
||||
*
|
||||
* State
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
config = $state<SettingsConfigType>({ ...SETTING_CONFIG_DEFAULT });
|
||||
theme = $state<string>('auto');
|
||||
isInitialized = $state(false);
|
||||
userOverrides = $state<Set<string>>(new Set());
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Utilities (private helpers)
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
/**
|
||||
*
|
||||
*
|
||||
* Utilities (private helpers)
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* Helper method to get server defaults with null safety
|
||||
@@ -76,9 +84,13 @@ class SettingsStore {
|
||||
}
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Lifecycle
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
/**
|
||||
*
|
||||
*
|
||||
* Lifecycle
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* Initialize the settings store by loading from localStorage
|
||||
@@ -130,9 +142,13 @@ class SettingsStore {
|
||||
|
||||
this.theme = localStorage.getItem('theme') || 'auto';
|
||||
}
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Config Updates
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
/**
|
||||
*
|
||||
*
|
||||
* Config Updates
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* Update a specific configuration setting
|
||||
@@ -234,9 +250,13 @@ class SettingsStore {
|
||||
}
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Reset
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
/**
|
||||
*
|
||||
*
|
||||
* Reset
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* Reset configuration to defaults
|
||||
@@ -285,9 +305,13 @@ class SettingsStore {
|
||||
this.saveConfig();
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Server Sync
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
/**
|
||||
*
|
||||
*
|
||||
* Server Sync
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* Initialize settings with props defaults when server properties are first loaded
|
||||
@@ -349,9 +373,13 @@ class SettingsStore {
|
||||
this.saveConfig();
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Utilities
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
/**
|
||||
*
|
||||
*
|
||||
* Utilities
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* Get a specific configuration value
|
||||
|
||||
@@ -44,8 +44,7 @@
|
||||
<Story
|
||||
name="Default"
|
||||
args={{ class: 'max-w-[56rem] w-[calc(100vw-2rem)]' }}
|
||||
play={async (context) => {
|
||||
const { canvas, userEvent } = context;
|
||||
play={async ({ canvas, userEvent }) => {
|
||||
const textarea = await canvas.findByRole('textbox');
|
||||
const submitButton = await canvas.findByRole('button', { name: 'Send' });
|
||||
|
||||
@@ -75,8 +74,7 @@
|
||||
class: 'max-w-[56rem] w-[calc(100vw-2rem)]',
|
||||
uploadedFiles: fileAttachments
|
||||
}}
|
||||
play={async (context) => {
|
||||
const { canvas } = context;
|
||||
play={async ({ canvas }) => {
|
||||
const jpgAttachment = canvas.getByAltText('1.jpg');
|
||||
const svgAttachment = canvas.getByAltText('hf-logo.svg');
|
||||
const pdfFileExtension = canvas.getByText('PDF');
|
||||
|
||||
Reference in New Issue
Block a user