mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-05 13:53:23 +02:00
Compare commits
24 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8733e0cf6e | ||
|
|
814f795e06 | ||
|
|
d879433824 | ||
|
|
13b0a04597 | ||
|
|
bba9d945c1 | ||
|
|
bc4e1128f7 | ||
|
|
39e73ae0d6 | ||
|
|
1f73301b63 | ||
|
|
4773d7a02f | ||
|
|
6c7fd67b64 | ||
|
|
141a908a59 | ||
|
|
32916a4907 | ||
|
|
ffc727203a | ||
|
|
91a86a6f35 | ||
|
|
f4ed10b69c | ||
|
|
1e333d5bba | ||
|
|
2f54e348ad | ||
|
|
2356fb1d53 | ||
|
|
764b85627b | ||
|
|
15a28ec8c7 | ||
|
|
a7366faa5b | ||
|
|
9070365020 | ||
|
|
233461f812 | ||
|
|
b34c859146 |
8
.github/workflows/docker.yml
vendored
8
.github/workflows/docker.yml
vendored
@@ -36,10 +36,14 @@ jobs:
|
||||
matrix:
|
||||
config:
|
||||
# Multi-stage build
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
|
||||
# Note: the arm64 images are failing, which prevents the amd64 images from being built
|
||||
# https://github.com/ggml-org/llama.cpp/issues/11888
|
||||
#- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
|
||||
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
# Note: the intel images are failing due to an out of disk space error
|
||||
# - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
|
||||
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
|
||||
|
||||
@@ -38,15 +38,6 @@
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"name": "arm64-windows-msvc", "hidden": true,
|
||||
"architecture": { "value": "arm64", "strategy": "external" },
|
||||
"toolset": { "value": "host=x64", "strategy": "external" },
|
||||
"cacheVariables": {
|
||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"name": "arm64-windows-llvm", "hidden": true,
|
||||
"architecture": { "value": "arm64", "strategy": "external" },
|
||||
@@ -73,10 +64,6 @@
|
||||
{ "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
|
||||
{ "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
|
||||
|
||||
{ "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
||||
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
|
||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
|
||||
|
||||
{ "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
|
||||
{ "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
|
||||
{ "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
|
||||
|
||||
5
Makefile
5
Makefile
@@ -1187,11 +1187,6 @@ llama-cli: tools/main/main.cpp \
|
||||
@echo '==== Run ./llama-cli -h for help. ===='
|
||||
@echo
|
||||
|
||||
llama-infill: examples/infill/infill.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-run: tools/run/run.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
|
||||
@@ -1283,7 +1283,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.use_color = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
||||
add_opt(common_arg(
|
||||
{"-t", "--threads"}, "N",
|
||||
string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
|
||||
@@ -1416,7 +1416,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
add_opt(common_arg(
|
||||
{"-n", "--predict", "--n-predict"}, "N",
|
||||
string_format(
|
||||
ex == LLAMA_EXAMPLE_MAIN || ex == LLAMA_EXAMPLE_INFILL
|
||||
ex == LLAMA_EXAMPLE_MAIN
|
||||
? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
|
||||
: "number of tokens to predict (default: %d, -1 = infinity)",
|
||||
params.n_predict),
|
||||
@@ -1655,7 +1655,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.input_prefix = value;
|
||||
params.enable_chat_template = false;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
add_opt(common_arg(
|
||||
{"--in-suffix"}, "STRING",
|
||||
"string to suffix after user inputs with (default: empty)",
|
||||
@@ -1663,7 +1663,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.input_suffix = value;
|
||||
params.enable_chat_template = false;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
add_opt(common_arg(
|
||||
{"--no-warmup"},
|
||||
"skip warming up the model with an empty run",
|
||||
@@ -1680,7 +1680,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.spm_infill = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--samplers"}, "SAMPLERS",
|
||||
string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
|
||||
@@ -2892,7 +2892,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.simple_io = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
add_opt(common_arg(
|
||||
{"--positive-file"}, "FNAME",
|
||||
string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
|
||||
|
||||
@@ -66,7 +66,6 @@ enum llama_example {
|
||||
LLAMA_EXAMPLE_COMMON,
|
||||
LLAMA_EXAMPLE_SPECULATIVE,
|
||||
LLAMA_EXAMPLE_MAIN,
|
||||
LLAMA_EXAMPLE_INFILL,
|
||||
LLAMA_EXAMPLE_EMBEDDING,
|
||||
LLAMA_EXAMPLE_PERPLEXITY,
|
||||
LLAMA_EXAMPLE_RETRIEVAL,
|
||||
@@ -96,6 +95,7 @@ enum common_sampler_type {
|
||||
COMMON_SAMPLER_TYPE_XTC = 8,
|
||||
COMMON_SAMPLER_TYPE_INFILL = 9,
|
||||
COMMON_SAMPLER_TYPE_PENALTIES = 10,
|
||||
COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
|
||||
};
|
||||
|
||||
// dimensionality reduction methods, used by cvector-generator
|
||||
@@ -161,6 +161,7 @@ struct common_params_sampling {
|
||||
std::vector<enum common_sampler_type> samplers = {
|
||||
COMMON_SAMPLER_TYPE_PENALTIES,
|
||||
COMMON_SAMPLER_TYPE_DRY,
|
||||
COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
|
||||
COMMON_SAMPLER_TYPE_TOP_K,
|
||||
COMMON_SAMPLER_TYPE_TYPICAL_P,
|
||||
COMMON_SAMPLER_TYPE_TOP_P,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "sampling.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <unordered_map>
|
||||
@@ -229,51 +230,48 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
params.logit_bias.data()));
|
||||
|
||||
if (params.mirostat == 0) {
|
||||
if (params.top_n_sigma >= 0) {
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp (params.temp));
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
|
||||
} else {
|
||||
for (const auto & cnstr : params.samplers) {
|
||||
switch (cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_DRY:
|
||||
{
|
||||
std::vector<const char *> c_breakers;
|
||||
c_breakers.reserve(params.dry_sequence_breakers.size());
|
||||
for (const auto & str : params.dry_sequence_breakers) {
|
||||
c_breakers.push_back(str.c_str());
|
||||
}
|
||||
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||
for (const auto & cnstr : params.samplers) {
|
||||
switch (cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_DRY:
|
||||
{
|
||||
std::vector<const char *> c_breakers;
|
||||
c_breakers.reserve(params.dry_sequence_breakers.size());
|
||||
for (const auto & str : params.dry_sequence_breakers) {
|
||||
c_breakers.push_back(str.c_str());
|
||||
}
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_K:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_MIN_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_XTC:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_INFILL:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_PENALTIES:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown sampler type");
|
||||
}
|
||||
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||
}
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_K:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_MIN_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_XTC:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_INFILL:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_PENALTIES:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown sampler type");
|
||||
}
|
||||
}
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
|
||||
@@ -475,6 +473,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
|
||||
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
|
||||
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
|
||||
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
|
||||
case COMMON_SAMPLER_TYPE_XTC: return 'x';
|
||||
@@ -490,6 +489,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
|
||||
case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
|
||||
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
|
||||
case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
|
||||
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
|
||||
@@ -504,6 +504,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
||||
{ "dry", COMMON_SAMPLER_TYPE_DRY },
|
||||
{ "top_k", COMMON_SAMPLER_TYPE_TOP_K },
|
||||
{ "top_p", COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
||||
{ "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
|
||||
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||
@@ -517,6 +518,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
||||
std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
|
||||
{ "top-k", COMMON_SAMPLER_TYPE_TOP_K },
|
||||
{ "top-p", COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
||||
{ "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
@@ -533,14 +535,16 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
||||
auto sampler = sampler_canonical_name_map.find(name);
|
||||
if (sampler != sampler_canonical_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
} else {
|
||||
if (allow_alt_names) {
|
||||
sampler = sampler_alt_name_map.find(name);
|
||||
if (sampler != sampler_alt_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (allow_alt_names) {
|
||||
sampler = sampler_alt_name_map.find(name);
|
||||
if (sampler != sampler_alt_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
|
||||
}
|
||||
|
||||
return samplers;
|
||||
@@ -552,6 +556,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
|
||||
@@ -566,6 +571,8 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
|
||||
const auto sampler = sampler_name_map.find(c);
|
||||
if (sampler != sampler_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
} else {
|
||||
LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2761,6 +2761,13 @@ class Qwen2MoeModel(TextModel):
|
||||
if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
|
||||
self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
|
||||
logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
|
||||
# YaRN is not enabled by default
|
||||
# To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "yarn":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
@@ -3908,6 +3915,16 @@ class Gemma3VisionModel(VisionModel):
|
||||
# default values below are taken from HF tranformers code
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
# calculate proj_scale_factor (used by tinygemma3 test model)
|
||||
image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
|
||||
n_per_side = int(image_seq_length ** 0.5)
|
||||
image_size = self.hparams["image_size"]
|
||||
patch_size = self.hparams["patch_size"]
|
||||
proj_scale_factor = (image_size // patch_size) // n_per_side
|
||||
if proj_scale_factor > 0 and proj_scale_factor != 4:
|
||||
# we only need to write this if it's not the default value
|
||||
# in this case, we are converting a test model
|
||||
self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
del bid, new_name, n_dims # unused
|
||||
@@ -3921,6 +3938,9 @@ class Gemma3VisionModel(VisionModel):
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
if "vision_model.head." in name:
|
||||
return [] # skip redundant tensors for tinygemma3
|
||||
|
||||
if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
|
||||
or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
|
||||
# process vision tensors
|
||||
|
||||
@@ -21,7 +21,6 @@ else()
|
||||
add_subdirectory(gguf-hash)
|
||||
add_subdirectory(gguf)
|
||||
add_subdirectory(gritlm)
|
||||
add_subdirectory(infill)
|
||||
add_subdirectory(lookahead)
|
||||
add_subdirectory(lookup)
|
||||
add_subdirectory(parallel)
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
set(TARGET llama-infill)
|
||||
add_executable(${TARGET} infill.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
@@ -1,47 +0,0 @@
|
||||
# llama.cpp/example/infill
|
||||
|
||||
This example shows how to use the infill mode with Code Llama models supporting infill mode.
|
||||
Currently the 7B and 13B models support infill mode.
|
||||
|
||||
Infill supports most of the options available in the main example.
|
||||
|
||||
For further information have a look at the main README.md in llama.cpp/example/main/README.md
|
||||
|
||||
## Common Options
|
||||
|
||||
In this section, we cover the most commonly used options for running the `infill` program with the LLaMA models:
|
||||
|
||||
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
|
||||
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
|
||||
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
|
||||
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
|
||||
- `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
|
||||
|
||||
## Input Prompts
|
||||
|
||||
The `infill` program provides several ways to interact with the LLaMA models using input prompts:
|
||||
|
||||
- `--in-prefix PROMPT_BEFORE_CURSOR`: Provide the prefix directly as a command-line option.
|
||||
- `--in-suffix PROMPT_AFTER_CURSOR`: Provide the suffix directly as a command-line option.
|
||||
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
|
||||
|
||||
## Interaction
|
||||
|
||||
The `infill` program offers a seamless way to interact with LLaMA models, allowing users to receive real-time infill suggestions. The interactive mode can be triggered using `--interactive`, and `--interactive-first`
|
||||
|
||||
### Interaction Options
|
||||
|
||||
- `-i, --interactive`: Run the program in interactive mode, allowing users to get real time code suggestions from model.
|
||||
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
|
||||
- `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
|
||||
|
||||
### Example
|
||||
|
||||
Download a model that supports infill, for example CodeLlama:
|
||||
```console
|
||||
scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
|
||||
```
|
||||
|
||||
```bash
|
||||
./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
|
||||
```
|
||||
@@ -1,590 +0,0 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "console.h"
|
||||
#include "sampling.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
#include <signal.h>
|
||||
#include <unistd.h>
|
||||
#elif defined (_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <signal.h>
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
static llama_context ** g_ctx;
|
||||
static llama_model ** g_model;
|
||||
static common_sampler ** g_smpl;
|
||||
static common_params * g_params;
|
||||
static std::vector<llama_token> * g_input_tokens;
|
||||
static std::ostringstream * g_output_ss;
|
||||
static std::vector<llama_token> * g_output_tokens;
|
||||
|
||||
static bool is_interacting = false;
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
static void sigint_handler(int signo) {
|
||||
if (signo == SIGINT) {
|
||||
if (!is_interacting) {
|
||||
is_interacting = true;
|
||||
} else {
|
||||
console::cleanup();
|
||||
LOG("\n");
|
||||
common_perf_print(*g_ctx, *g_smpl);
|
||||
|
||||
// make sure all logs are flushed
|
||||
LOG("Interrupted by user\n");
|
||||
common_log_pause(common_log_main());
|
||||
|
||||
_exit(130);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
g_params = ¶ms;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
common_init();
|
||||
|
||||
auto & sparams = params.sampling;
|
||||
|
||||
console::init(params.simple_io, params.use_color);
|
||||
atexit([]() { console::cleanup(); });
|
||||
|
||||
if (params.logits_all) {
|
||||
LOG_ERR("\n************\n");
|
||||
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
||||
LOG_ERR("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.embedding) {
|
||||
LOG_ERR("\n************\n");
|
||||
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
||||
LOG_ERR("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.n_ctx != 0 && params.n_ctx < 8) {
|
||||
LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
|
||||
params.n_ctx = 8;
|
||||
}
|
||||
|
||||
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
|
||||
LOG_ERR("\n************\n");
|
||||
LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
|
||||
LOG_ERR("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.rope_freq_base != 0.0) {
|
||||
LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
||||
}
|
||||
|
||||
if (params.rope_freq_scale != 0.0) {
|
||||
LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||
}
|
||||
|
||||
LOG_INF("%s: llama backend init\n", __func__);
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
llama_model * model = nullptr;
|
||||
llama_context * ctx = nullptr;
|
||||
common_sampler * smpl = nullptr;
|
||||
|
||||
g_model = &model;
|
||||
g_ctx = &ctx;
|
||||
g_smpl = &smpl;
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||
common_init_result llama_init = common_init_from_params(params);
|
||||
|
||||
model = llama_init.model.get();
|
||||
ctx = llama_init.context.get();
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
const int n_ctx_train = llama_model_n_ctx_train(model);
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
LOG_DBG("n_ctx: %d\n", n_ctx);
|
||||
|
||||
if (n_ctx > n_ctx_train) {
|
||||
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
||||
}
|
||||
const bool add_bos = llama_vocab_get_add_bos(vocab);
|
||||
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
|
||||
|
||||
std::vector<llama_token> embd_inp;
|
||||
std::vector<llama_token> embd_end;
|
||||
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
|
||||
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
|
||||
|
||||
GGML_ASSERT(llama_vocab_fim_pre(vocab) >= 0);
|
||||
GGML_ASSERT(llama_vocab_fim_suf(vocab) >= 0);
|
||||
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
|
||||
|
||||
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
||||
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
||||
if (add_bos) {
|
||||
embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
|
||||
}
|
||||
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
||||
|
||||
const llama_token middle_token = llama_vocab_fim_mid(vocab);
|
||||
if (middle_token >= 0) {
|
||||
embd_inp.push_back(middle_token);
|
||||
}
|
||||
|
||||
LOG_DBG("add_bos: %d\n", add_bos);
|
||||
LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
|
||||
LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
|
||||
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
|
||||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
embd_inp.push_back(llama_vocab_bos(vocab));
|
||||
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
|
||||
}
|
||||
|
||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// number of tokens to keep when resetting context
|
||||
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
|
||||
params.n_keep = (int)embd_inp.size();
|
||||
}
|
||||
|
||||
LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
|
||||
LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
|
||||
|
||||
// enable interactive mode if interactive start is specified
|
||||
if (params.interactive_first) {
|
||||
params.interactive = true;
|
||||
}
|
||||
|
||||
if (params.verbose_prompt) {
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||
LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
|
||||
if (params.n_keep > 0) {
|
||||
LOG_INF("%s: static prompt based on n_keep: '", __func__);
|
||||
for (int i = 0; i < params.n_keep; i++) {
|
||||
LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
LOG_CNT("'\n");
|
||||
}
|
||||
LOG_INF("\n");
|
||||
}
|
||||
|
||||
if (params.interactive) {
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = sigint_handler;
|
||||
sigemptyset (&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
#elif defined (_WIN32)
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
|
||||
LOG_INF("%s: interactive mode on.\n", __func__);
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG_INF("Input prefix with BOS\n");
|
||||
}
|
||||
|
||||
if (!params.input_prefix.empty()) {
|
||||
LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
}
|
||||
|
||||
if (!params.input_suffix.empty()) {
|
||||
LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
}
|
||||
}
|
||||
smpl = common_sampler_init(model, sparams);
|
||||
|
||||
LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
|
||||
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
|
||||
LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
|
||||
|
||||
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
|
||||
LOG_INF("\n");
|
||||
LOG_INF("\n##### Infill mode #####\n\n");
|
||||
if (params.interactive) {
|
||||
const char *control_message;
|
||||
if (params.multiline_input) {
|
||||
control_message = " - To return control to LLaMA, end your input with '\\'.\n"
|
||||
" - To return control without starting a new line, end your input with '/'.\n";
|
||||
} else {
|
||||
control_message = " - Press Return to return control to LLaMA.\n"
|
||||
" - To return control without starting a new line, end your input with '/'.\n"
|
||||
" - If you want to submit another line, end your input with '\\'.\n";
|
||||
}
|
||||
LOG_INF("== Running in interactive mode. ==\n");
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
LOG_INF( " - Press Ctrl+C to interject at any time.\n");
|
||||
#endif
|
||||
LOG_INF( "%s\n", control_message);
|
||||
|
||||
is_interacting = params.interactive_first;
|
||||
}
|
||||
|
||||
bool input_echo = true;
|
||||
|
||||
int n_past = 0;
|
||||
int n_remain = params.n_predict;
|
||||
int n_consumed = 0;
|
||||
|
||||
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
||||
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
||||
std::ostringstream output_ss; g_output_ss = &output_ss;
|
||||
|
||||
// the first thing we will do is to output the prompt, so set color accordingly
|
||||
console::set_display(console::prompt);
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
|
||||
while (n_remain != 0 || params.interactive) {
|
||||
// predict
|
||||
if (!embd.empty()) {
|
||||
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
|
||||
// --prompt or --file which uses the same value.
|
||||
int max_embd_size = n_ctx - 4;
|
||||
|
||||
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
|
||||
if ((int) embd.size() > max_embd_size) {
|
||||
const int skipped_tokens = (int) embd.size() - max_embd_size;
|
||||
embd.resize(max_embd_size);
|
||||
|
||||
console::set_display(console::error);
|
||||
LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||
console::set_display(console::reset);
|
||||
}
|
||||
|
||||
// infinite text generation via context swapping
|
||||
// if we run out of context:
|
||||
// - take the n_keep first tokens from the original prompt (via n_past)
|
||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||
if (n_past + (int) embd.size() > n_ctx) {
|
||||
if (params.n_predict == -2) {
|
||||
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||
break;
|
||||
}
|
||||
|
||||
const int n_left = n_past - params.n_keep - 1;
|
||||
const int n_discard = n_left/2;
|
||||
|
||||
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
LOG_DBG("after swap: n_past = %d\n", n_past);
|
||||
|
||||
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
|
||||
|
||||
}
|
||||
|
||||
// evaluate tokens in batches
|
||||
// embd is typically prepared beforehand to fit within a batch, but not always
|
||||
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
|
||||
int n_eval = (int) embd.size() - i;
|
||||
if (n_eval > params.n_batch) {
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
|
||||
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
n_past += n_eval;
|
||||
|
||||
LOG_DBG("n_past = %d\n", n_past);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
embd.clear();
|
||||
|
||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||
const llama_token id = common_sampler_sample(smpl, ctx, -1);
|
||||
|
||||
common_sampler_accept(smpl, id, true);
|
||||
|
||||
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
|
||||
|
||||
embd.push_back(id);
|
||||
|
||||
// echo this to console
|
||||
input_echo = true;
|
||||
|
||||
// decrement remaining sampling budget
|
||||
--n_remain;
|
||||
|
||||
LOG_DBG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
// some user input remains from prompt or interaction, forward it to processing
|
||||
LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||
while ((int) embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
|
||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||
// for the prompt, we don't apply grammar rules
|
||||
common_sampler_accept(smpl, embd_inp[n_consumed], false);
|
||||
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.n_batch) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// display text
|
||||
if (input_echo) {
|
||||
for (auto id : embd) {
|
||||
const std::string token_str = common_token_to_piece(ctx, id);
|
||||
LOG("%s", token_str.c_str());
|
||||
|
||||
if (embd.size() > 1) {
|
||||
input_tokens.push_back(id);
|
||||
} else {
|
||||
output_tokens.push_back(id);
|
||||
output_ss << token_str;
|
||||
}
|
||||
}
|
||||
}
|
||||
// reset color to default if we there is no pending user input
|
||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||
console::set_display(console::reset);
|
||||
}
|
||||
|
||||
// if not currently processing queued inputs;
|
||||
if ((int) embd_inp.size() <= n_consumed) {
|
||||
// deal with eot token in infill mode
|
||||
if ((common_sampler_last(smpl) == llama_vocab_eot(vocab) || is_interacting) && params.interactive){
|
||||
if (is_interacting && !params.interactive_first) {
|
||||
// print an eot token
|
||||
LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
|
||||
}
|
||||
LOG("\n");
|
||||
console::set_display(console::user_input);
|
||||
std::string buffer;
|
||||
std::string line;
|
||||
bool another_line=true;
|
||||
// set a new prefix via stdin
|
||||
do {
|
||||
another_line = console::readline(line, params.multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
// check if we got an empty line, if so we use the old input
|
||||
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
params.input_prefix = buffer;
|
||||
}
|
||||
buffer.clear();
|
||||
// set a new suffix via stdin
|
||||
do {
|
||||
another_line = console::readline(line, params.multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
// check if we got an empty line
|
||||
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
params.input_suffix = buffer;
|
||||
}
|
||||
buffer.clear();
|
||||
// done taking input, reset color
|
||||
console::set_display(console::reset);
|
||||
|
||||
if (params.escape) {
|
||||
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
|
||||
string_process_escapes(params.input_prefix);
|
||||
string_process_escapes(params.input_suffix);
|
||||
}
|
||||
|
||||
// tokenize new prefix and suffix
|
||||
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
|
||||
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
|
||||
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
|
||||
|
||||
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
||||
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
||||
if (add_bos) {
|
||||
embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
|
||||
}
|
||||
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
||||
|
||||
if (middle_token >= 0) {
|
||||
embd_inp.push_back(middle_token);
|
||||
}
|
||||
|
||||
embd.clear();
|
||||
n_remain = params.n_predict;
|
||||
n_past = 0;
|
||||
n_consumed = 0;
|
||||
is_interacting = false;
|
||||
}
|
||||
// deal with end of generation tokens in interactive mode
|
||||
else if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
|
||||
LOG_DBG("found EOS token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
|
||||
is_interacting = true;
|
||||
LOG("\n");
|
||||
console::set_display(console::user_input);
|
||||
}
|
||||
}
|
||||
|
||||
if (n_past > 0 && is_interacting && !params.interactive) {
|
||||
LOG_DBG("waiting for user input\n");
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG_DBG("adding input prefix BOS token\n");
|
||||
embd_inp.push_back(llama_vocab_bos(vocab));
|
||||
}
|
||||
|
||||
std::string buffer;
|
||||
if (!params.input_prefix.empty()) {
|
||||
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
buffer += params.input_prefix;
|
||||
LOG("%s", buffer.c_str());
|
||||
}
|
||||
|
||||
std::string line;
|
||||
bool another_line = true;
|
||||
do {
|
||||
another_line = console::readline(line, params.multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
|
||||
// done taking input, reset color
|
||||
console::set_display(console::reset);
|
||||
|
||||
// Add tokens to embd only if the input buffer is non-empty
|
||||
// Entering a empty line lets the user pass control back
|
||||
if (buffer.length() > 1) {
|
||||
// append input suffix if any
|
||||
if (!params.input_suffix.empty()) {
|
||||
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
buffer += params.input_suffix;
|
||||
LOG("%s", params.input_suffix.c_str());
|
||||
}
|
||||
|
||||
LOG_DBG("buffer: '%s'\n", buffer.c_str());
|
||||
|
||||
const size_t original_size = embd_inp.size();
|
||||
|
||||
const auto line_inp = common_tokenize(ctx, buffer, false);
|
||||
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
|
||||
|
||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||
|
||||
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
||||
const llama_token token = embd_inp[i];
|
||||
output_tokens.push_back(token);
|
||||
output_ss << common_token_to_piece(ctx, token);
|
||||
}
|
||||
|
||||
n_remain -= line_inp.size();
|
||||
LOG_DBG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
LOG_DBG("empty line, passing control back\n");
|
||||
}
|
||||
|
||||
input_echo = false; // do not echo this again
|
||||
}
|
||||
|
||||
if (n_past > 0) {
|
||||
if (is_interacting) {
|
||||
common_sampler_reset(smpl);
|
||||
}
|
||||
is_interacting = false;
|
||||
}
|
||||
}
|
||||
|
||||
// end of generation
|
||||
if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !params.interactive) {
|
||||
break;
|
||||
}
|
||||
|
||||
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
|
||||
// We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
|
||||
if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
|
||||
n_remain = params.n_predict;
|
||||
is_interacting = true;
|
||||
}
|
||||
}
|
||||
if (!params.interactive && n_remain <= 0) {
|
||||
LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
|
||||
}
|
||||
|
||||
LOG("\n");
|
||||
common_perf_print(ctx, smpl);
|
||||
|
||||
common_sampler_free(smpl);
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -366,6 +366,8 @@ if (MSVC)
|
||||
/wd4005 # Macro redefinition
|
||||
/wd4244 # Conversion from one type to another type, possible loss of data
|
||||
/wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data
|
||||
/wd4996 # Disable POSIX deprecation warnings
|
||||
/wd4702 # Unreachable code warnings
|
||||
)
|
||||
function(disable_msvc_warnings target_name)
|
||||
if(TARGET ${target_name})
|
||||
|
||||
@@ -38,7 +38,7 @@ extern "C" {
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
|
||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
|
||||
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
||||
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
|
||||
|
||||
@@ -59,7 +59,7 @@ extern "C" {
|
||||
GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
|
||||
@@ -673,11 +673,15 @@ extern "C" {
|
||||
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
||||
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
||||
|
||||
// returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
|
||||
GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
|
||||
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
|
||||
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
|
||||
|
||||
// returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
|
||||
GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
|
||||
|
||||
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
|
||||
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
|
||||
|
||||
|
||||
@@ -214,7 +214,7 @@ add_library(ggml
|
||||
target_link_libraries(ggml PUBLIC ggml-base)
|
||||
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
target_link_libraries(ggml PRIVATE dl stdc++fs)
|
||||
target_link_libraries(ggml PRIVATE dl)
|
||||
endif()
|
||||
|
||||
function(ggml_add_backend_library backend)
|
||||
|
||||
@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||
return SIZE_MAX;
|
||||
}
|
||||
|
||||
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
||||
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
|
||||
// get_alloc_size is optional, defaults to ggml_nbytes
|
||||
if (buft->iface.get_alloc_size) {
|
||||
size_t size = buft->iface.get_alloc_size(buft, tensor);
|
||||
@@ -152,7 +152,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
|
||||
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
|
||||
}
|
||||
|
||||
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
|
||||
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
||||
}
|
||||
|
||||
|
||||
@@ -72,8 +72,6 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic ignored "-Woverlength-strings"
|
||||
#elif defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
@@ -20,12 +20,6 @@
|
||||
#define GROUP_MAX_EPS_IQ1_M 1e-7f
|
||||
#define GROUP_MAX_EPS_IQ1_S 1e-12f
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid warnings for hundreds of casts
|
||||
// we should just be careful :)
|
||||
#pragma warning(disable: 4244 4267)
|
||||
#endif
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
|
||||
|
||||
@@ -50,19 +50,6 @@
|
||||
#include "llamafile/sgemm.h"
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid hundreds of casts
|
||||
// we should just be careful :)
|
||||
#pragma warning(disable: 4244 4267)
|
||||
|
||||
// disable POSIX deprecation warnings
|
||||
// these functions are never going away, anyway
|
||||
#pragma warning(disable: 4996)
|
||||
|
||||
// unreachable code because of multiple instances of code after GGML_ABORT
|
||||
#pragma warning(disable: 4702)
|
||||
#endif
|
||||
|
||||
// Note: once we move threading into a separate C++ file
|
||||
// will use std::hardware_destructive_interference_size instead of hardcoding it here
|
||||
// and we'll use C++ attribute syntax.
|
||||
|
||||
@@ -8,19 +8,6 @@
|
||||
|
||||
#include <float.h>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid hundreds of casts
|
||||
// we should just be careful :)
|
||||
#pragma warning(disable: 4244 4267)
|
||||
|
||||
// disable POSIX deprecation warnings
|
||||
// these functions are never going away, anyway
|
||||
#pragma warning(disable: 4996)
|
||||
|
||||
// unreachable code because of multiple instances of code after GGML_ABORT
|
||||
#pragma warning(disable: 4702)
|
||||
#endif
|
||||
|
||||
// ggml_compute_forward_dup
|
||||
|
||||
static void ggml_compute_forward_dup_same_cont(
|
||||
|
||||
@@ -2,12 +2,6 @@
|
||||
|
||||
#include <cassert>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid hundreds of casts
|
||||
// we should just be careful :)
|
||||
#pragma warning(disable: 4244 4267)
|
||||
#endif
|
||||
|
||||
// precomputed gelu table for f16 (128 KB)
|
||||
ggml_fp16_t ggml_table_gelu_f16[1 << 16];
|
||||
|
||||
|
||||
@@ -12,12 +12,30 @@ if (CUDAToolkit_FOUND)
|
||||
# 61 == Pascal, __dp4a instruction (per-byte integer dot product)
|
||||
# 70 == V100, FP16 tensor cores
|
||||
# 75 == Turing, int8 tensor cores
|
||||
# 80 == Ampere, asynchronous data loading, faster tensor core instructions
|
||||
# 86 == RTX 3000, needs CUDA v11.1
|
||||
# 89 == RTX 4000, needs CUDA v11.8
|
||||
#
|
||||
# XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run
|
||||
# XX-real == compile CUDA code as device code for this specific architecture
|
||||
# no suffix == compile as both PTX and device code
|
||||
#
|
||||
# The default behavior for a non-native is to build virtual architectures as needed to cover all features needed
|
||||
# for best performance and to also build real architectures for the most commonly used GPUs.
|
||||
if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
|
||||
set(CMAKE_CUDA_ARCHITECTURES "native")
|
||||
elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75;80")
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real")
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real")
|
||||
endif()
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "50;61;70;75;80")
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
|
||||
set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real")
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||
|
||||
@@ -130,10 +130,6 @@ static int ggml_cuda_highest_compiled_arch(const int arch) {
|
||||
|
||||
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#define GGML_CUDA_MAX_STREAMS 8
|
||||
|
||||
[[noreturn]]
|
||||
|
||||
@@ -719,6 +719,7 @@ void launch_fattn(
|
||||
size_t nb23 = V->nb[3];
|
||||
|
||||
if (need_f16_K && K->type != GGML_TYPE_F16) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(K));
|
||||
K_f16.alloc(ggml_nelements(K));
|
||||
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type);
|
||||
to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream);
|
||||
@@ -733,6 +734,7 @@ void launch_fattn(
|
||||
}
|
||||
|
||||
if (need_f16_V && V->type != GGML_TYPE_F16) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(V));
|
||||
V_f16.alloc(ggml_nelements(V));
|
||||
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
|
||||
to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream);
|
||||
|
||||
@@ -555,8 +555,8 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer
|
||||
|
||||
if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
|
||||
// initialize padding to 0 to avoid possible NaN values
|
||||
size_t original_size = ggml_nbytes(tensor);
|
||||
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
||||
const size_t original_size = ggml_nbytes(tensor);
|
||||
const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
||||
|
||||
if (padded_size > original_size) {
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
@@ -679,6 +679,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
|
||||
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
||||
GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor));
|
||||
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
||||
}
|
||||
}
|
||||
@@ -800,6 +801,7 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff
|
||||
|
||||
static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
|
||||
|
||||
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
||||
@@ -851,6 +853,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
|
||||
|
||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
||||
|
||||
@@ -889,6 +892,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
|
||||
|
||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
||||
|
||||
@@ -970,6 +974,7 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buf
|
||||
|
||||
static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
|
||||
|
||||
size_t total_size = 0;
|
||||
|
||||
@@ -1531,6 +1536,8 @@ static void ggml_cuda_op_mul_mat(
|
||||
|
||||
// If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
|
||||
if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
|
||||
GGML_ASSERT(!src0->view_src);
|
||||
const size_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
|
||||
const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
|
||||
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
|
||||
@@ -2062,9 +2069,11 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
}
|
||||
|
||||
ggml_tensor src0_slice = *src0;
|
||||
src0_slice.ne[2] = 1;
|
||||
src0_slice.nb[3] = src0_slice.nb[2];
|
||||
src0_slice.data = (char *) src0->data + i02*nb02;
|
||||
src0_slice.ne[2] = 1;
|
||||
src0_slice.nb[3] = src0_slice.nb[2];
|
||||
src0_slice.op = GGML_OP_VIEW;
|
||||
src0_slice.view_src = dst->src[0]; // non-const pointer to src0
|
||||
src0_slice.data = (char *) src0->data + i02*nb02;
|
||||
|
||||
ggml_tensor src1_slice;
|
||||
memset(&src1_slice, 0, sizeof(src1_slice));
|
||||
|
||||
@@ -89,6 +89,17 @@ void ggml_cuda_mul_mat_q(
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
// If src0 is a temporary compute buffer, clear any potential padding.
|
||||
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
|
||||
GGML_ASSERT(!src0->view_src);
|
||||
const size_t size_data = ggml_nbytes(src0);
|
||||
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
|
||||
if (size_alloc > size_data) {
|
||||
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
||||
|
||||
const int64_t s01 = src0->nb[1] / ts_src0;
|
||||
@@ -118,7 +129,7 @@ void ggml_cuda_mul_mat_q(
|
||||
|
||||
const mmq_args args = {
|
||||
src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d,
|
||||
ne00, ne01, ne1, s01, s1,
|
||||
ne00, ne01, ne1, s01, ne11, s1,
|
||||
ne02, ne12, s02, s12, s2,
|
||||
ne03, ne13, s03, s13, s3,
|
||||
use_stream_k};
|
||||
@@ -202,7 +213,7 @@ void ggml_cuda_mul_mat_q(
|
||||
// Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
|
||||
const mmq_args args = {
|
||||
src0_d, src0->type, (const int *) src1_q8_1.ptr, ids_dst_dev, expert_bounds_dev, dst_d,
|
||||
ne00, ne01, ne_get_rows, s01, s1,
|
||||
ne00, ne01, ne_get_rows, s01, ne_get_rows, s1,
|
||||
ne02, ne02, s02, s12, s2,
|
||||
ne03, ne13, s03, s13, s3,
|
||||
use_stream_k};
|
||||
@@ -241,7 +252,7 @@ void ggml_cuda_op_mul_mat_q(
|
||||
ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
|
||||
const mmq_args args = {
|
||||
src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
|
||||
ne00, row_diff, src1_ncols, stride01, nrows_dst,
|
||||
ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst,
|
||||
1, 1, 0, 0, 0,
|
||||
1, 1, 0, 0, 0,
|
||||
use_stream_k};
|
||||
|
||||
@@ -2522,7 +2522,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check, bool fixup>
|
||||
static __device__ __forceinline__ void mul_mat_q_process_tile(
|
||||
const char * __restrict__ x, const int offset_x, const int * __restrict__ y,
|
||||
const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup,
|
||||
const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst,
|
||||
const int stride_row_x, const int ncols_y, const int stride_col_dst,
|
||||
const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) {
|
||||
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
@@ -2606,7 +2606,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
||||
static __global__ void mul_mat_q(
|
||||
const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
|
||||
const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
|
||||
const int ncols_x, const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst,
|
||||
const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
|
||||
const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
|
||||
|
||||
@@ -2619,8 +2619,8 @@ static __global__ void mul_mat_q(
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
|
||||
const int ntx = (ncols_y + mmq_x - 1) / mmq_x; // Number of tiles x
|
||||
const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
|
||||
const int ntx = (ncols_dst + mmq_x - 1) / mmq_x; // Number of tiles x
|
||||
const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
|
||||
|
||||
// Initialize the ids for writing back data with just the index.
|
||||
// For regular matrix multiplications this is never changed.
|
||||
@@ -2648,8 +2648,8 @@ static __global__ void mul_mat_q(
|
||||
|
||||
// Defaults for regular matrix multiplication:
|
||||
int col_low = 0;
|
||||
int col_high = ncols_y;
|
||||
int col_diff = ncols_y;
|
||||
int col_high = ncols_dst;
|
||||
int col_diff = ncols_dst;
|
||||
int offset_y = wt*stride_sample_y + zt*stride_channel_y;
|
||||
int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
|
||||
|
||||
@@ -2689,7 +2689,7 @@ static __global__ void mul_mat_q(
|
||||
|
||||
constexpr bool fixup = false;
|
||||
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
|
||||
tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
|
||||
return;
|
||||
}
|
||||
@@ -2720,8 +2720,8 @@ static __global__ void mul_mat_q(
|
||||
|
||||
// Defaults for regular matrix multiplication:
|
||||
int col_low = 0;
|
||||
int col_high = ncols_y;
|
||||
int col_diff = ncols_y;
|
||||
int col_high = ncols_dst;
|
||||
int col_diff = ncols_dst;
|
||||
int offset_y = wt*stride_sample_y + zt*stride_channel_y;
|
||||
int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
|
||||
|
||||
@@ -2767,7 +2767,7 @@ static __global__ void mul_mat_q(
|
||||
|
||||
constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
|
||||
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
|
||||
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
|
||||
|
||||
kbc += blocks_per_ne00;
|
||||
@@ -2792,8 +2792,8 @@ static __global__ void mul_mat_q(
|
||||
|
||||
// Defaults for regular matrix multiplication:
|
||||
int col_low = 0;
|
||||
int col_high = ncols_y;
|
||||
int col_diff = ncols_y;
|
||||
int col_high = ncols_dst;
|
||||
int col_diff = ncols_dst;
|
||||
int offset_y = wt*stride_sample_y + zt*stride_channel_y;
|
||||
int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
|
||||
|
||||
@@ -2834,7 +2834,7 @@ static __global__ void mul_mat_q(
|
||||
|
||||
constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
|
||||
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
|
||||
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
|
||||
}
|
||||
|
||||
@@ -2842,7 +2842,7 @@ static __global__ void mul_mat_q(
|
||||
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
||||
static __global__ void mul_mat_q_stream_k_fixup(
|
||||
const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile,
|
||||
const int ncols_x, const int nrows_x, const int ncols_y, const int stride_col_dst,
|
||||
const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_col_dst,
|
||||
const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst) {
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
@@ -2851,8 +2851,8 @@ static __global__ void mul_mat_q_stream_k_fixup(
|
||||
|
||||
float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f};
|
||||
|
||||
const int ntx = (ncols_y + mmq_x - 1) / mmq_x;
|
||||
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
const int ntx = (ncols_dst + mmq_x - 1) / mmq_x;
|
||||
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
|
||||
const int bidx0 = blockIdx.x;
|
||||
|
||||
@@ -2925,8 +2925,8 @@ static __global__ void mul_mat_q_stream_k_fixup(
|
||||
const int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y;
|
||||
dst += offset_dst;
|
||||
|
||||
const int i_max = nrows_x - it*mmq_y - 1;
|
||||
const int j_max = ncols_y - jt*mmq_x - 1;
|
||||
const int i_max = nrows_x - it*mmq_y - 1;
|
||||
const int j_max = ncols_dst - jt*mmq_x - 1;
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
|
||||
@@ -2989,7 +2989,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
|
||||
|
||||
struct mmq_args {
|
||||
const char * x; ggml_type type_x; const int * y; const int32_t * ids_dst; const int32_t * expert_bounds; float * dst;
|
||||
int64_t ncols_x; int64_t nrows_x; int64_t ncols_y; int64_t stride_row_x; int64_t nrows_dst;
|
||||
int64_t ncols_x; int64_t nrows_x; int64_t ncols_dst; int64_t stride_row_x; int64_t ncols_y; int64_t nrows_dst;
|
||||
int64_t nchannels_x; int64_t nchannels_y; int64_t stride_channel_x; int64_t stride_channel_y; int64_t stride_channel_dst;
|
||||
int64_t nsamples_x; int64_t nsamples_y; int64_t stride_sample_x; int64_t stride_sample_y; int64_t stride_sample_dst;
|
||||
bool use_stream_k;
|
||||
@@ -3025,8 +3025,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
|
||||
const int nty = (args.nrows_x + mmq_y - 1) / mmq_y;
|
||||
const int ntx = (args.ncols_y + mmq_x - 1) / mmq_x;
|
||||
const int nty = (args.nrows_x + mmq_y - 1) / mmq_y;
|
||||
const int ntx = (args.ncols_dst + mmq_x - 1) / mmq_x;
|
||||
const int ntzw = args.nchannels_y * args.nsamples_y;
|
||||
const dim3 block_nums_xy_tiling(nty, ntx, ntzw);
|
||||
|
||||
@@ -3040,14 +3040,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
constexpr bool need_check = false;
|
||||
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
|
||||
} else {
|
||||
constexpr bool need_check = true;
|
||||
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
|
||||
}
|
||||
@@ -3068,7 +3068,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
|
||||
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
|
||||
|
||||
@@ -3077,14 +3077,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
}
|
||||
|
||||
mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y,
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
|
||||
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst);
|
||||
} else {
|
||||
constexpr bool need_check = true;
|
||||
|
||||
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
|
||||
|
||||
@@ -3093,7 +3093,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
}
|
||||
|
||||
mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y,
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
|
||||
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -513,6 +513,17 @@ void ggml_cuda_mul_mat_vec_q(
|
||||
const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
// If src0 is a temporary compute buffer, clear any potential padding.
|
||||
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
|
||||
GGML_ASSERT(!src0->view_src);
|
||||
const size_t size_data = ggml_nbytes(src0);
|
||||
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
|
||||
if (size_alloc > size_data) {
|
||||
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
||||
ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1);
|
||||
{
|
||||
|
||||
@@ -163,6 +163,7 @@ void quantize_mmq_q8_1_cuda(
|
||||
const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
|
||||
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
|
||||
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
GGML_ASSERT(ne0 % (4*QK8_1) == 0);
|
||||
|
||||
const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
|
||||
|
||||
@@ -19,12 +19,6 @@
|
||||
#define GROUP_MAX_EPS_IQ1_M 1e-7f
|
||||
#define GROUP_MAX_EPS_IQ1_S 1e-12f
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid warnings for hundreds of casts
|
||||
// we should just be careful :)
|
||||
#pragma warning(disable: 4244 4267)
|
||||
#endif
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
// reference implementation for deterministic creation of model files
|
||||
|
||||
@@ -80,10 +80,6 @@ extern int g_ggml_sycl_disable_optimize;
|
||||
// max batch size to use MMQ kernels when tensor cores are available
|
||||
#define MMQ_MAX_BATCH_SIZE 32
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable : 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
// dmmv = dequantize_mul_mat_vec
|
||||
#ifndef GGML_SYCL_DMMV_X
|
||||
#define GGML_SYCL_DMMV_X 32
|
||||
@@ -118,17 +114,12 @@ static void crash() {
|
||||
GGML_ABORT("SYCL error");
|
||||
}
|
||||
|
||||
#define SYCL_CHECK(err) \
|
||||
do { \
|
||||
auto err_ = (err); \
|
||||
if (err_ != 0) \
|
||||
ggml_sycl_error( \
|
||||
#err, \
|
||||
__func__, \
|
||||
__FILE__, \
|
||||
__LINE__, \
|
||||
"Meet error in this line code!"); \
|
||||
} while (0)
|
||||
#define SYCL_CHECK(err) \
|
||||
do { \
|
||||
auto err_ = (err); \
|
||||
if (err_ != 0) \
|
||||
ggml_sycl_error(#err, __func__, __FILE__, __LINE__, "Exception caught in this line of code."); \
|
||||
} while (0)
|
||||
|
||||
#if DPCT_COMPAT_RT_VERSION >= 11100
|
||||
#define GGML_SYCL_ASSUME(x) __builtin_assume(x)
|
||||
|
||||
@@ -437,41 +437,52 @@ static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
static void convert_unary_nc(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01,
|
||||
const int64_t ne02, const int64_t s01, const int64_t s02, const int64_t s03,
|
||||
const sycl::nd_item<3> & item_ct1) {
|
||||
|
||||
const int64_t work_group_size = item_ct1.get_local_range(2);
|
||||
const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
|
||||
const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
|
||||
|
||||
const int64_t i01 = item_ct1.get_group(1);
|
||||
const int64_t i02 = item_ct1.get_group(0) % ne02;
|
||||
const int64_t i03 = item_ct1.get_group(0) / ne02;
|
||||
|
||||
// make each work-item deal with more elements since sycl global range can not exceed max int
|
||||
const src_t * x = (const src_t *) vx;
|
||||
for (int64_t i = global_id; i < k; i += work_group_size * item_ct1.get_group_range(2)) {
|
||||
y[i] = x[i];
|
||||
const src_t * x = static_cast<const src_t *>(vx);
|
||||
const int64_t ix = i03 * s03 + i02 * s02 + i01 * s01;
|
||||
const int64_t iy = ((i03 * ne02 + i02) * ne01 + i01) * ne00;
|
||||
|
||||
#pragma unroll
|
||||
for (int64_t i00 = global_id; i00 < ne00; i00 += work_group_size * item_ct1.get_group_range(2)) {
|
||||
y[iy + i00] = static_cast<dst_t>(x[ix + i00]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static void convert_unary_sycl(const void *__restrict__ vx,
|
||||
dst_t *__restrict__ y, const int64_t k,
|
||||
dpct::queue_ptr stream) {
|
||||
const int64_t num_blocks = (k + SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / SYCL_DEQUANTIZE_BLOCK_SIZE;
|
||||
static void convert_unary_nc_sycl(const void * __restrict__ vx, dst_t * __restrict__ y,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
|
||||
const int64_t s01, const int64_t s02, const int64_t s03, dpct::queue_ptr queue) {
|
||||
dpct::has_capability_or_fail(queue->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
sycl::range<3> global_size(ne02 * ne03, ne01, ceil_div(ne00, SYCL_DEQUANTIZE_BLOCK_SIZE));
|
||||
|
||||
// decrease global range when it exceeds the max int
|
||||
int64_t local_size = downsample_sycl_global_range(num_blocks, SYCL_DEQUANTIZE_BLOCK_SIZE);
|
||||
sycl::range<3> block_nums(1, 1, num_blocks);
|
||||
sycl::range<3> local_range(1, 1, local_size);
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
// TODO: Downsample logic is separated from the kernel, a rewrite is desirable
|
||||
int64_t downsized_workgroup = downsample_sycl_global_range(global_size[0], SYCL_DEQUANTIZE_BLOCK_SIZE);
|
||||
sycl::range<3> workgroup_size(1, 1, downsized_workgroup);
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * local_range, local_range),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
convert_unary<src_t>(vx, y, k, item_ct1);
|
||||
});
|
||||
}
|
||||
queue->parallel_for(sycl::nd_range<3>(global_size * workgroup_size, workgroup_size), [=](sycl::nd_item<3> item_ct1) {
|
||||
convert_unary_nc<src_t>(vx, y, ne00, ne01, ne02, s01, s02, s03, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst) {
|
||||
template <typename src_t, typename dst_t>
|
||||
static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr queue) {
|
||||
convert_unary_nc_sycl<src_t>(vx, y, k, 1, 1, 1, k, k, k, queue);
|
||||
}
|
||||
|
||||
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
if (dst->src[0]->extra &&
|
||||
@@ -574,3 +585,12 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
to_fp16_nc_sycl_t get_to_fp16_nc_sycl(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_nc_sycl<float>;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
// Copyright (C) 2025 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
@@ -16,12 +16,19 @@
|
||||
#include "common.hpp"
|
||||
|
||||
template <typename T>
|
||||
using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
|
||||
int64_t k, dpct::queue_ptr stream);
|
||||
typedef to_t_sycl_t<float> to_fp32_sycl_t;
|
||||
using to_t_sycl_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, dpct::queue_ptr stream);
|
||||
typedef to_t_sycl_t<float> to_fp32_sycl_t;
|
||||
typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
|
||||
|
||||
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst);
|
||||
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst);
|
||||
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst);
|
||||
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor * dst);
|
||||
|
||||
#endif // GGML_SYCL_CONVERT_HPP
|
||||
// Nc = Non-contiguous
|
||||
template <typename T>
|
||||
using to_t_nc_sycl_t = void (*)(const void * x, T * y, int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
|
||||
int64_t s01, int64_t s02, int64_t s03, dpct::queue_ptr queue);
|
||||
|
||||
typedef to_t_nc_sycl_t<sycl::half> to_fp16_nc_sycl_t;
|
||||
to_fp16_nc_sycl_t get_to_fp16_nc_sycl(ggml_type type);
|
||||
|
||||
#endif // GGML_SYCL_CONVERT_HPP
|
||||
|
||||
@@ -193,7 +193,7 @@ static void ggml_check_sycl() try {
|
||||
|
||||
if (!initialized) {
|
||||
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
|
||||
g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
|
||||
g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
|
||||
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
|
||||
GGML_LOG_INFO("Running with Environment Variables:\n");
|
||||
@@ -338,7 +338,7 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
assert(tensor->view_src->buffer->buft == buffer->buft);
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_Q4_0) {
|
||||
if (tensor->type == GGML_TYPE_Q4_0 && !g_ggml_sycl_disable_optimize) {
|
||||
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
||||
tensor->extra = extra;
|
||||
ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
|
||||
@@ -2694,35 +2694,31 @@ catch (sycl::exception const &exc) {
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
static void k_compute_batched_ptrs(const sycl::half *src0_as_f16,
|
||||
const sycl::half *src1_as_f16, char *dst,
|
||||
const void **ptrs_src, void **ptrs_dst,
|
||||
int64_t ne12, int64_t ne13, int64_t ne23,
|
||||
size_t nb02, size_t nb03, size_t nb12,
|
||||
size_t nb13, size_t nbd2, size_t nbd3,
|
||||
int64_t r2, int64_t r3,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
int64_t i13 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
int64_t i12 = item_ct1.get_group(1) * item_ct1.get_local_range(1) +
|
||||
item_ct1.get_local_id(1);
|
||||
static void k_compute_batched_ptrs(const sycl::half * src0_as_f16, const sycl::half * src1_as_f16, char * dst,
|
||||
const void ** ptrs_src, void ** ptrs_dst, int64_t ne12, int64_t ne13, int64_t ne23,
|
||||
size_t nb02, size_t nb03, size_t nb12, size_t nb13, size_t nbd2, size_t nbd3,
|
||||
int64_t r2, int64_t r3, const sycl::nd_item<3> & item_ct1) {
|
||||
const int64_t i13 = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2);
|
||||
const int64_t i12 = item_ct1.get_group(1) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);
|
||||
|
||||
if (i13 >= ne13 || i12 >= ne12) {
|
||||
return;
|
||||
}
|
||||
|
||||
int64_t i03 = i13 / r3;
|
||||
int64_t i02 = i12 / r2;
|
||||
const int64_t i03 = i13 / r3;
|
||||
const int64_t i02 = i12 / r2;
|
||||
|
||||
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
|
||||
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
|
||||
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
|
||||
const uint8_t * src0_bytes = reinterpret_cast<const uint8_t *>(src0_as_f16);
|
||||
const uint8_t * src1_bytes = reinterpret_cast<const uint8_t *>(src1_as_f16);
|
||||
uint8_t * dst_bytes = reinterpret_cast<uint8_t *>(dst);
|
||||
|
||||
ptrs_src[0 * ne23 + i12 + i13 * ne12] = src0_bytes + i02 * nb02 + i03 * nb03;
|
||||
ptrs_src[1 * ne23 + i12 + i13 * ne12] = src1_bytes + i12 * nb12 + i13 * nb13;
|
||||
ptrs_dst[0 * ne23 + i12 + i13 * ne12] = dst_bytes + i12 * nbd2 + i13 * nbd3;
|
||||
}
|
||||
|
||||
static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
|
||||
const ggml_tensor *src0,
|
||||
const ggml_tensor *src1,
|
||||
ggml_tensor *dst) try {
|
||||
static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor * src0,
|
||||
const ggml_tensor * src1, ggml_tensor * dst) try {
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
|
||||
@@ -2730,102 +2726,100 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
// TODO: see https://github.com/ggml-org/llama.cpp/pull/13155
|
||||
// Batched mul_mat requires a rewrite to support both oneDNN and non-contiguous dst
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
|
||||
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
||||
queue_ptr main_stream = ctx.stream();;
|
||||
queue_ptr queue = ctx.stream();
|
||||
|
||||
void * src0_ddq = src0->data;
|
||||
sycl::half *src0_as_f16 = (sycl::half *)src0_ddq;
|
||||
float * src1_ddf = (float *) src1->data;
|
||||
float * dst_ddf = (float *) dst->data;
|
||||
dpct::has_capability_or_fail(queue->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
const sycl::half * src0_f16 = static_cast<const sycl::half *>(src0->data);
|
||||
float * dst_ddf = static_cast<float *>(dst->data);
|
||||
|
||||
const sycl::half * src1_f16 = static_cast<const sycl::half *>(src1->data);
|
||||
const size_t type_size_src1 = ggml_type_size(src1->type);
|
||||
GGML_ASSERT(nb10 == type_size_src1);
|
||||
|
||||
// SRC1 strides
|
||||
int64_t s11 = nb11 / type_size_src1;
|
||||
int64_t s12 = nb12 / type_size_src1;
|
||||
int64_t s13 = nb13 / type_size_src1;
|
||||
ggml_sycl_pool_alloc<sycl::half> src1_f16_alloc(ctx.pool());
|
||||
|
||||
// convert src1 to fp16
|
||||
ggml_sycl_pool_alloc<sycl::half> src1_f16_alloc(ctx.pool());
|
||||
if (src1->type != GGML_TYPE_F16) {
|
||||
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
|
||||
const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type);
|
||||
GGML_ASSERT(to_fp16_nc_sycl != nullptr);
|
||||
const int64_t ne_src1 = ggml_nelements(src1);
|
||||
src1_f16_alloc.alloc(ne_src1);
|
||||
GGML_ASSERT(to_fp16_sycl != nullptr);
|
||||
to_fp16_sycl(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
|
||||
to_fp16_nc_sycl(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, queue);
|
||||
|
||||
src1_f16 = src1_f16_alloc.get();
|
||||
s11 = ne10;
|
||||
s12 = ne11 * s11;
|
||||
s13 = ne12 * s12;
|
||||
}
|
||||
sycl::half *src1_f16 = src1->type == GGML_TYPE_F16 ? (sycl::half *)src1_ddf
|
||||
: src1_f16_alloc.get();
|
||||
|
||||
char * dst_t;
|
||||
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool());
|
||||
char * dst_t = reinterpret_cast<char *>(dst_ddf);
|
||||
|
||||
dpct::library_data_t cu_compute_type = dpct::library_data_t::real_float;
|
||||
dpct::library_data_t cu_data_type = dpct::library_data_t::real_float;
|
||||
dpct::library_data_t mkl_compute_type = dpct::library_data_t::real_float;
|
||||
dpct::library_data_t mkl_data_type = dpct::library_data_t::real_float;
|
||||
|
||||
// dst strides
|
||||
size_t nbd2 = dst->nb[2];
|
||||
size_t nbd3 = dst->nb[3];
|
||||
|
||||
const float alpha_f32 = 1.0f;
|
||||
const float beta_f32 = 0.0f;
|
||||
const float beta_f32 = 0.0f;
|
||||
|
||||
const void * alpha = &alpha_f32;
|
||||
const void * beta = &beta_f32;
|
||||
|
||||
dst_t = (char *) dst_ddf;
|
||||
|
||||
GGML_ASSERT(ne12 % ne02 == 0);
|
||||
GGML_ASSERT(ne13 % ne03 == 0);
|
||||
|
||||
// broadcast factors
|
||||
const int64_t r2 = ne12/ne02;
|
||||
const int64_t r3 = ne13/ne03;
|
||||
const int64_t r2 = ne12 / ne02;
|
||||
const int64_t r3 = ne13 / ne03;
|
||||
|
||||
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
|
||||
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
||||
*main_stream, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||
(const char *) src0_as_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
|
||||
(const char *) src1_f16, dpct::library_data_t::real_half, nb11 / nb10, nb12 / nb10, beta, (char *) dst_t,
|
||||
cu_data_type, ne01, nb2 / nb0, ne12 * ne13, cu_compute_type)));
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
|
||||
oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
|
||||
src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_t,
|
||||
mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
|
||||
} else {
|
||||
const int ne23 = ne12*ne13;
|
||||
const int ne23 = ne12 * ne13;
|
||||
|
||||
ggml_sycl_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
|
||||
ggml_sycl_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23);
|
||||
ggml_sycl_pool_alloc<const void *> ptrs_src(ctx.pool(), 2 * ne23);
|
||||
ggml_sycl_pool_alloc<void *> ptrs_dst(ctx.pool(), 1 * ne23);
|
||||
ggml_sycl_pool_alloc<matrix_info_t<float>> matrix_info(ctx.host_pool(), 1);
|
||||
|
||||
sycl::range<3> block_dims(1, ne12, ne13);
|
||||
/*
|
||||
DPCT1049:47: The work-group size passed to the SYCL kernel may exceed
|
||||
the limit. To get the device limit, query
|
||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||
*/
|
||||
{
|
||||
dpct::has_capability_or_fail(main_stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
main_stream->submit([&](sycl::handler &cgh) {
|
||||
const void **ptrs_src_get = ptrs_src.get();
|
||||
void **ptrs_dst_get = ptrs_dst.get();
|
||||
size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : nb12 / 2;
|
||||
size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : nb13 / 2;
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
k_compute_batched_ptrs(
|
||||
src0_as_f16, src1_f16,
|
||||
dst_t, ptrs_src_get,
|
||||
ptrs_dst_get, ne12, ne13, ne23,
|
||||
nb02, nb03, nb12_scaled, nb13_scaled,
|
||||
nbd2, nbd3, r2, r3, item_ct1);
|
||||
});
|
||||
queue->submit([&](sycl::handler & cgh) {
|
||||
const void ** ptrs_src_get = ptrs_src.get();
|
||||
void ** ptrs_dst_get = ptrs_dst.get();
|
||||
size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half);
|
||||
size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : s13 * sizeof(sycl::half);
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
k_compute_batched_ptrs(src0_f16, src1_f16, dst_t, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02,
|
||||
nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1);
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
||||
*main_stream, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||
*queue, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||
(const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
|
||||
(const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, nb11 / nb10, beta,
|
||||
(void **) (ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, cu_compute_type, matrix_info.get())));
|
||||
(const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, s11, beta,
|
||||
(void **) (ptrs_dst.get() + 0 * ne23), mkl_data_type, ne0, ne23, mkl_compute_type, matrix_info.get())));
|
||||
}
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
} catch (const sycl::exception & exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
|
||||
@@ -2966,7 +2960,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
||||
// The kernel from the if path is faster for that specific case, but does not support all mul mats.
|
||||
ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
|
||||
}
|
||||
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||
// KQV single-batch
|
||||
ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
|
||||
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
||||
@@ -3873,9 +3867,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
if (a->ne[3] != b->ne[3]) {
|
||||
return false;
|
||||
}
|
||||
if (!ggml_is_contiguous(b)) {
|
||||
return false;
|
||||
}
|
||||
ggml_type a_type = a->type;
|
||||
if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS ||
|
||||
a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ3_S ||
|
||||
|
||||
@@ -1299,6 +1299,10 @@ bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
|
||||
return ggml_is_contiguous_n(tensor, 2);
|
||||
}
|
||||
|
||||
bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor) {
|
||||
return ggml_nbytes(tensor) == ggml_nelements(tensor) * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
|
||||
}
|
||||
|
||||
bool ggml_is_permuted(const struct ggml_tensor * tensor) {
|
||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||
|
||||
|
||||
@@ -1,7 +0,0 @@
|
||||
# pyright: reportUnusedImport=false
|
||||
|
||||
from .gguf_convert_endian import main as gguf_convert_endian_entrypoint
|
||||
from .gguf_dump import main as gguf_dump_entrypoint
|
||||
from .gguf_set_metadata import main as gguf_set_metadata_entrypoint
|
||||
from .gguf_new_metadata import main as gguf_new_metadata_entrypoint
|
||||
from .gguf_editor_gui import main as gguf_editor_gui_entrypoint
|
||||
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "gguf"
|
||||
version = "0.16.2"
|
||||
version = "0.16.3"
|
||||
description = "Read and write ML models in GGUF for GGML"
|
||||
authors = ["GGML <ggml@ggml.ai>"]
|
||||
packages = [
|
||||
@@ -36,8 +36,8 @@ requires = ["poetry-core>=1.0.0"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.poetry.scripts]
|
||||
gguf-convert-endian = "gguf.scripts:gguf_convert_endian_entrypoint"
|
||||
gguf-dump = "gguf.scripts:gguf_dump_entrypoint"
|
||||
gguf-set-metadata = "gguf.scripts:gguf_set_metadata_entrypoint"
|
||||
gguf-new-metadata = "gguf.scripts:gguf_new_metadata_entrypoint"
|
||||
gguf-editor-gui = "gguf.scripts:gguf_editor_gui_entrypoint"
|
||||
gguf-convert-endian = "gguf.scripts.gguf_convert_endian:main"
|
||||
gguf-dump = "gguf.scripts.gguf_dump:main"
|
||||
gguf-set-metadata = "gguf.scripts.gguf_set_metadata:main"
|
||||
gguf-new-metadata = "gguf.scripts.gguf_new_metadata:main"
|
||||
gguf-editor-gui = "gguf.scripts.gguf_editor_gui:main"
|
||||
|
||||
@@ -40,5 +40,6 @@ build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.poetry.scripts]
|
||||
llama-convert-hf-to-gguf = "convert_hf_to_gguf:main"
|
||||
llama-convert-lora-to-gguf = "convert_lora_to_gguf:main"
|
||||
llama-convert-llama-ggml-to-gguf = "convert_llama_ggml_to_gguf:main"
|
||||
llama-ggml-vk-generate-shaders = "ggml_vk_generate_shaders:main"
|
||||
|
||||
@@ -1 +1 @@
|
||||
0482de9c63b9134eb462c7732888c0ee0dbc2755
|
||||
b59bddafe278877dfa22a80e53a637513862babb
|
||||
|
||||
@@ -782,7 +782,7 @@ ggml_tensor * llm_graph_context::build_ffn(
|
||||
} break;
|
||||
}
|
||||
|
||||
if (type_gate == LLM_FFN_PAR) {
|
||||
if (gate && type_gate == LLM_FFN_PAR) {
|
||||
cur = ggml_mul(ctx0, cur, tmp);
|
||||
cb(cur, "ffn_gate_par", il);
|
||||
}
|
||||
|
||||
@@ -3510,7 +3510,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
|
||||
// output
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
||||
// if output is NULL, init from the input tok embed
|
||||
if (output == NULL) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
@@ -4788,7 +4792,7 @@ struct llm_build_deci : public llm_graph_context {
|
||||
}
|
||||
|
||||
// FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
|
||||
if (n_head == 0 && n_ff == 0) {
|
||||
if (n_ff == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
@@ -1750,23 +1750,35 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
|
||||
static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||
const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
|
||||
|
||||
if (ctx->n <= 0.0f || cur_p->size <= 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
// find max logit and calculate mean
|
||||
float max = cur_p->data[0].logit;
|
||||
float logits_sum = 0;
|
||||
size_t valid_count = 0;
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
if (cur_p->data[i].logit > max) {
|
||||
max = cur_p->data[i].logit;
|
||||
// Only count non-negative infinity values
|
||||
if (cur_p->data[i].logit != -INFINITY) {
|
||||
if (cur_p->data[i].logit > max) {
|
||||
max = cur_p->data[i].logit;
|
||||
}
|
||||
logits_sum += cur_p->data[i].logit;
|
||||
valid_count++;
|
||||
}
|
||||
logits_sum += cur_p->data[i].logit;
|
||||
}
|
||||
float mean = logits_sum/cur_p->size;
|
||||
float mean = valid_count > 0 ? logits_sum/valid_count : 0;
|
||||
|
||||
// calculate standard deviation
|
||||
float acc = 0;
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
acc += pow(cur_p->data[i].logit - mean, 2);
|
||||
// Skip -infinity in std calculation
|
||||
if (cur_p->data[i].logit != -INFINITY) {
|
||||
acc += pow(cur_p->data[i].logit - mean, 2);
|
||||
}
|
||||
}
|
||||
float std = sqrt(acc/cur_p->size);
|
||||
float std = valid_count > 0 ? sqrt(acc/valid_count) : 0;
|
||||
|
||||
//apply mask
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
|
||||
@@ -360,7 +360,7 @@ int main(void) {
|
||||
test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 4, 7, {});
|
||||
|
||||
test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.571429f, 0.428571f, 0.0f, 0.0f}, 1.00f);
|
||||
test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.00f);
|
||||
test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.00f); // top_n_sigma == 0 now represents a no-op rather than greedy decoding as of PR#13345
|
||||
test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 3.00f);
|
||||
|
||||
test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f);
|
||||
|
||||
2650
tools/mtmd/clip.cpp
2650
tools/mtmd/clip.cpp
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -146,6 +146,7 @@ struct slot_params {
|
||||
{"top_k", sampling.top_k},
|
||||
{"top_p", sampling.top_p},
|
||||
{"min_p", sampling.min_p},
|
||||
{"top_n_sigma", sampling.top_n_sigma},
|
||||
{"xtc_probability", sampling.xtc_probability},
|
||||
{"xtc_threshold", sampling.xtc_threshold},
|
||||
{"typical_p", sampling.typ_p},
|
||||
@@ -248,6 +249,7 @@ struct server_task {
|
||||
params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
|
||||
params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
|
||||
params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p);
|
||||
params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma);
|
||||
params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability);
|
||||
params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold);
|
||||
params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p);
|
||||
|
||||
@@ -157,6 +157,9 @@ export default function ChatScreen() {
|
||||
clearExtraContext();
|
||||
};
|
||||
|
||||
// for vscode context
|
||||
textarea.refOnSubmit.current = sendNewMessage;
|
||||
|
||||
const handleEditMessage = async (msg: Message, content: string) => {
|
||||
if (!viewingChat) return;
|
||||
setCurrNodeId(msg.id);
|
||||
|
||||
@@ -37,6 +37,7 @@ export interface ChatTextareaApi {
|
||||
setValue: (value: string) => void;
|
||||
focus: () => void;
|
||||
ref: React.RefObject<HTMLTextAreaElement>;
|
||||
refOnSubmit: React.MutableRefObject<(() => void) | null>; // Submit handler
|
||||
onInput: (event: React.FormEvent<HTMLTextAreaElement>) => void; // Input handler
|
||||
}
|
||||
|
||||
@@ -46,6 +47,7 @@ export interface ChatTextareaApi {
|
||||
export function useChatTextarea(initValue: string): ChatTextareaApi {
|
||||
const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
|
||||
const textareaRef = useRef<HTMLTextAreaElement>(null);
|
||||
const onSubmitRef = useRef<(() => void) | null>(null);
|
||||
|
||||
// Effect to set initial value and height on mount or when initValue changes
|
||||
useEffect(() => {
|
||||
@@ -91,6 +93,7 @@ export function useChatTextarea(initValue: string): ChatTextareaApi {
|
||||
}
|
||||
},
|
||||
ref: textareaRef,
|
||||
refOnSubmit: onSubmitRef,
|
||||
onInput: handleInput,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -33,6 +33,9 @@ export const useVSCodeContext = (textarea: ChatTextareaApi) => {
|
||||
});
|
||||
}
|
||||
textarea.focus();
|
||||
setTimeout(() => {
|
||||
textarea.refOnSubmit.current?.();
|
||||
}, 10); // wait for setExtraContext to finish
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user