mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-30 16:47:31 +03:00
Compare commits
20 Commits
b3608
...
gg/metal-f
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a95225cdfd | ||
|
|
7a3df798fc | ||
|
|
e5edb210cd | ||
|
|
0c41e03ceb | ||
|
|
f12ceaca0c | ||
|
|
436787f170 | ||
|
|
93bc3839f9 | ||
|
|
f91fc5639b | ||
|
|
e11bd856d5 | ||
|
|
8f824ffe8e | ||
|
|
3ba780e2a8 | ||
|
|
a07c32ea54 | ||
|
|
11b84eb457 | ||
|
|
1731d4238f | ||
|
|
a1631e53f6 | ||
|
|
fc54ef0d1c | ||
|
|
b40eb84895 | ||
|
|
f63f603c87 | ||
|
|
8455340b87 | ||
|
|
2f3c1466ff |
@@ -28,6 +28,7 @@
|
||||
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
|
||||
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
|
||||
{ "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
|
||||
{ "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
|
||||
|
||||
{
|
||||
"name": "arm64-windows-msvc", "hidden": true,
|
||||
@@ -60,6 +61,8 @@
|
||||
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
|
||||
|
||||
{ "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
|
||||
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
|
||||
{ "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
|
||||
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
|
||||
{ "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
|
||||
]
|
||||
}
|
||||
|
||||
@@ -106,6 +106,7 @@ Typically finetunes of the base models below are supported as well.
|
||||
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
|
||||
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
|
||||
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
|
||||
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
|
||||
|
||||
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
|
||||
|
||||
|
||||
29
ci/run.sh
29
ci/run.sh
@@ -13,6 +13,9 @@
|
||||
# # with SYCL support
|
||||
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
#
|
||||
# # with VULKAN support
|
||||
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
#
|
||||
|
||||
if [ -z "$2" ]; then
|
||||
echo "usage: $0 <output-dir> <mnt-dir>"
|
||||
@@ -40,7 +43,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
||||
@@ -52,6 +55,10 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
|
||||
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
|
||||
fi
|
||||
## helpers
|
||||
|
||||
# download a file if it does not exist or if it is outdated
|
||||
@@ -107,7 +114,7 @@ function gg_run_ctest_debug {
|
||||
gg_check_build_requirements
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
@@ -138,7 +145,7 @@ function gg_run_ctest_release {
|
||||
gg_check_build_requirements
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
@@ -266,7 +273,6 @@ function gg_sum_ctest_with_model_release {
|
||||
}
|
||||
|
||||
# open_llama_7b_v2
|
||||
# requires: GG_BUILD_CUDA
|
||||
|
||||
function gg_run_open_llama_7b_v2 {
|
||||
cd ${SRC}
|
||||
@@ -290,8 +296,8 @@ function gg_run_open_llama_7b_v2 {
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
@@ -425,7 +431,7 @@ function gg_run_pythia_1_4b {
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
@@ -535,7 +541,6 @@ function gg_sum_pythia_1_4b {
|
||||
}
|
||||
|
||||
# pythia_2_8b
|
||||
# requires: GG_BUILD_CUDA
|
||||
|
||||
function gg_run_pythia_2_8b {
|
||||
cd ${SRC}
|
||||
@@ -556,8 +561,8 @@ function gg_run_pythia_2_8b {
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
@@ -692,7 +697,7 @@ function gg_run_embd_bge_small {
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
@@ -761,7 +766,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
fi
|
||||
|
||||
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
|
||||
if [ -z ${GG_BUILD_CUDA} ]; then
|
||||
if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
|
||||
test $ret -eq 0 && gg_run pythia_1_4b
|
||||
else
|
||||
test $ret -eq 0 && gg_run pythia_2_8b
|
||||
|
||||
@@ -77,6 +77,41 @@
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
//
|
||||
// Environment variable utils
|
||||
//
|
||||
|
||||
template<typename T>
|
||||
static typename std::enable_if<std::is_same<T, std::string>::value, void>::type
|
||||
get_env(std::string name, T & target) {
|
||||
char * value = std::getenv(name.c_str());
|
||||
target = value ? std::string(value) : target;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, void>::type
|
||||
get_env(std::string name, T & target) {
|
||||
char * value = std::getenv(name.c_str());
|
||||
target = value ? std::stoi(value) : target;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static typename std::enable_if<std::is_floating_point<T>::value, void>::type
|
||||
get_env(std::string name, T & target) {
|
||||
char * value = std::getenv(name.c_str());
|
||||
target = value ? std::stof(value) : target;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static typename std::enable_if<std::is_same<T, bool>::value, void>::type
|
||||
get_env(std::string name, T & target) {
|
||||
char * value = std::getenv(name.c_str());
|
||||
if (value) {
|
||||
std::string val(value);
|
||||
target = val == "1" || val == "true";
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// CPU utils
|
||||
//
|
||||
@@ -220,12 +255,6 @@ int32_t cpu_get_num_math() {
|
||||
// CLI argument parsing
|
||||
//
|
||||
|
||||
void gpt_params_handle_hf_token(gpt_params & params) {
|
||||
if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
|
||||
params.hf_token = std::getenv("HF_TOKEN");
|
||||
}
|
||||
}
|
||||
|
||||
void gpt_params_handle_model_default(gpt_params & params) {
|
||||
if (!params.hf_repo.empty()) {
|
||||
// short-hand to avoid specifying --hf-file -> default it to --model
|
||||
@@ -273,7 +302,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
|
||||
gpt_params_handle_model_default(params);
|
||||
|
||||
gpt_params_handle_hf_token(params);
|
||||
if (params.hf_token.empty()) {
|
||||
get_env("HF_TOKEN", params.hf_token);
|
||||
}
|
||||
|
||||
if (params.escape) {
|
||||
string_process_escapes(params.prompt);
|
||||
@@ -293,6 +324,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
return true;
|
||||
}
|
||||
|
||||
void gpt_params_parse_from_env(gpt_params & params) {
|
||||
// we only care about server-related params for now
|
||||
get_env("LLAMA_ARG_MODEL", params.model);
|
||||
get_env("LLAMA_ARG_THREADS", params.n_threads);
|
||||
get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
|
||||
get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
|
||||
get_env("LLAMA_ARG_BATCH", params.n_batch);
|
||||
get_env("LLAMA_ARG_UBATCH", params.n_ubatch);
|
||||
get_env("LLAMA_ARG_N_GPU_LAYERS", params.n_gpu_layers);
|
||||
get_env("LLAMA_ARG_THREADS_HTTP", params.n_threads_http);
|
||||
get_env("LLAMA_ARG_CHAT_TEMPLATE", params.chat_template);
|
||||
get_env("LLAMA_ARG_N_PREDICT", params.n_predict);
|
||||
get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics);
|
||||
get_env("LLAMA_ARG_ENDPOINT_SLOTS", params.endpoint_slots);
|
||||
get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
|
||||
get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
|
||||
get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold);
|
||||
}
|
||||
|
||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
const auto params_org = params; // the example can modify the default params
|
||||
|
||||
@@ -851,7 +901,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
|
||||
if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
|
||||
CHECK_ARG
|
||||
params.n_gpu_layers_draft = std::stoi(argv[i]);
|
||||
if (!llama_supports_gpu_offload()) {
|
||||
@@ -1811,13 +1861,19 @@ std::string string_get_sortable_timestamp() {
|
||||
|
||||
void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||
if (search.empty()) {
|
||||
return; // Avoid infinite loop if 'search' is an empty string
|
||||
return;
|
||||
}
|
||||
std::string builder;
|
||||
builder.reserve(s.length());
|
||||
size_t pos = 0;
|
||||
while ((pos = s.find(search, pos)) != std::string::npos) {
|
||||
s.replace(pos, search.length(), replace);
|
||||
pos += replace.length();
|
||||
size_t last_pos = 0;
|
||||
while ((pos = s.find(search, last_pos)) != std::string::npos) {
|
||||
builder.append(s, last_pos, pos - last_pos);
|
||||
builder.append(replace);
|
||||
last_pos = pos + search.length();
|
||||
}
|
||||
builder.append(s, last_pos, std::string::npos);
|
||||
s = std::move(builder);
|
||||
}
|
||||
|
||||
void string_process_escapes(std::string & input) {
|
||||
|
||||
@@ -267,7 +267,7 @@ struct gpt_params {
|
||||
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
|
||||
};
|
||||
|
||||
void gpt_params_handle_hf_token(gpt_params & params);
|
||||
void gpt_params_parse_from_env(gpt_params & params);
|
||||
void gpt_params_handle_model_default(gpt_params & params);
|
||||
|
||||
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
|
||||
|
||||
@@ -63,6 +63,7 @@ class Model:
|
||||
model_name: str | None
|
||||
metadata_override: Path | None
|
||||
dir_model_card: Path
|
||||
is_lora: bool
|
||||
|
||||
# subclasses should define this!
|
||||
model_arch: gguf.MODEL_ARCH
|
||||
@@ -70,7 +71,7 @@ class Model:
|
||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
|
||||
use_temp_file: bool = False, eager: bool = False,
|
||||
metadata_override: Path | None = None, model_name: str | None = None,
|
||||
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
|
||||
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, is_lora: bool = False):
|
||||
if type(self) is Model:
|
||||
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
|
||||
|
||||
@@ -92,6 +93,7 @@ class Model:
|
||||
self.metadata_override = metadata_override
|
||||
self.model_name = model_name
|
||||
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
|
||||
self.is_lora = is_lora # true if model is used inside convert_lora_to_gguf.py
|
||||
|
||||
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
|
||||
if self.ftype == gguf.LlamaFileType.GUESSED:
|
||||
@@ -295,6 +297,7 @@ class Model:
|
||||
gguf.MODEL_TENSOR.FFN_GATE_INP,
|
||||
gguf.MODEL_TENSOR.POS_EMBD,
|
||||
gguf.MODEL_TENSOR.TOKEN_TYPES,
|
||||
gguf.MODEL_TENSOR.SSM_CONV1D,
|
||||
)
|
||||
)
|
||||
or not name.endswith(".weight")
|
||||
@@ -1592,7 +1595,8 @@ class LlamaModel(Model):
|
||||
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
|
||||
rope_factors.append(1 / ((1 - smooth) / factor + smooth))
|
||||
|
||||
self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
|
||||
if not self.is_lora:
|
||||
self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
|
||||
|
||||
super().prepare_tensors()
|
||||
|
||||
@@ -2139,8 +2143,9 @@ class Phi3MiniModel(Model):
|
||||
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
|
||||
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
|
||||
|
||||
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
|
||||
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
|
||||
if not self.is_lora:
|
||||
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
|
||||
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
|
||||
|
||||
|
||||
@Model.register("PlamoForCausalLM")
|
||||
@@ -2711,7 +2716,7 @@ class StarCoder2Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.STARCODER2
|
||||
|
||||
|
||||
@Model.register("MambaForCausalLM", "MambaLMHeadModel")
|
||||
@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
|
||||
class MambaModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.MAMBA
|
||||
|
||||
@@ -2742,7 +2747,10 @@ class MambaModel(Model):
|
||||
# ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
|
||||
dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
|
||||
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
|
||||
|
||||
use_dt_b_c_norm = False
|
||||
# For falconmamba we do apply RMS norm on B / DT and C layers
|
||||
if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",):
|
||||
use_dt_b_c_norm = True
|
||||
# Fail early for models which don't have a block expansion factor of 2
|
||||
assert d_inner == 2 * d_model
|
||||
|
||||
@@ -2750,12 +2758,13 @@ class MambaModel(Model):
|
||||
self.gguf_writer.add_embedding_length(d_model)
|
||||
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
|
||||
self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
|
||||
self.gguf_writer.add_block_count(self.hparams["n_layer"])
|
||||
self.gguf_writer.add_block_count(self.block_count)
|
||||
self.gguf_writer.add_ssm_conv_kernel(d_conv)
|
||||
self.gguf_writer.add_ssm_inner_size(d_inner)
|
||||
self.gguf_writer.add_ssm_state_size(d_state)
|
||||
self.gguf_writer.add_ssm_time_step_rank(dt_rank)
|
||||
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
|
||||
self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
_tok_embd = None
|
||||
@@ -2782,23 +2791,6 @@ class MambaModel(Model):
|
||||
|
||||
return [(new_name, data_torch)]
|
||||
|
||||
def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
|
||||
if bid is not None and new_name in (
|
||||
self.format_tensor_name(
|
||||
n, bid, ".weight" if name.endswith(".weight") else ""
|
||||
)
|
||||
for n in [
|
||||
gguf.MODEL_TENSOR.SSM_CONV1D,
|
||||
gguf.MODEL_TENSOR.SSM_X,
|
||||
gguf.MODEL_TENSOR.SSM_DT,
|
||||
gguf.MODEL_TENSOR.SSM_A,
|
||||
gguf.MODEL_TENSOR.SSM_D,
|
||||
]
|
||||
):
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
|
||||
@Model.register("CohereForCausalLM")
|
||||
class CommandR2Model(Model):
|
||||
@@ -3792,7 +3784,7 @@ class ExaoneModel(Model):
|
||||
def set_gguf_parameters(self):
|
||||
hparams = self.hparams
|
||||
|
||||
assert(hparams["activation_function"] == "silu")
|
||||
assert (hparams["activation_function"] == "silu")
|
||||
|
||||
max_position_embeddings = hparams["max_position_embeddings"]
|
||||
embed_dim = hparams["hidden_size"]
|
||||
@@ -3851,12 +3843,13 @@ class ExaoneModel(Model):
|
||||
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
|
||||
rope_factors.append(1 / ((1 - smooth) / factor + smooth))
|
||||
|
||||
self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
|
||||
if not self.is_lora:
|
||||
self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
|
||||
|
||||
super().prepare_tensors()
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
# tree of lazy tensors
|
||||
class LazyTorchTensor(gguf.LazyBase):
|
||||
|
||||
@@ -386,6 +386,7 @@ if __name__ == '__main__':
|
||||
dry_run=args.dry_run,
|
||||
dir_lora_model=dir_lora,
|
||||
lora_alpha=alpha,
|
||||
is_lora=True,
|
||||
)
|
||||
|
||||
logger.info("Exporting model...")
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
|
||||
|
||||
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
|
||||
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
|
||||
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
|
||||
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
|
||||
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
|
||||
|
||||
@@ -28,10 +28,6 @@
|
||||
|
||||
The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).
|
||||
|
||||
When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
|
||||
|
||||
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
|
||||
|
||||
## Recommended Release
|
||||
|
||||
The SYCL backend would be broken by some PRs due to no online CI.
|
||||
@@ -45,6 +41,10 @@ The following release is verified with good quality:
|
||||
|
||||
## News
|
||||
|
||||
|
||||
- 2024.8
|
||||
- Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
|
||||
|
||||
- 2024.5
|
||||
- Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
|
||||
- Arch Linux is verified successfully.
|
||||
@@ -196,7 +196,7 @@ Please follow the instructions for downloading and installing the Toolkit for Li
|
||||
|
||||
Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
|
||||
|
||||
Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.
|
||||
Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI oneDNN for Intel GPUs.
|
||||
|
||||
- **Adding support to Nvidia GPUs**
|
||||
|
||||
@@ -255,8 +255,6 @@ or
|
||||
# Export relevant ENV variables
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
# Build LLAMA with MKL BLAS acceleration for intel GPU
|
||||
|
||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
|
||||
@@ -20,6 +20,10 @@
|
||||
#include "ggml-cann.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_VULKAN
|
||||
#include "ggml-vulkan.h"
|
||||
#endif
|
||||
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#include "stb_image.h"
|
||||
|
||||
@@ -212,13 +216,19 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
|
||||
|
||||
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||
if (search.empty()) {
|
||||
return; // Avoid infinite loop if 'search' is an empty string
|
||||
return;
|
||||
}
|
||||
std::string builder;
|
||||
builder.reserve(s.length());
|
||||
size_t pos = 0;
|
||||
while ((pos = s.find(search, pos)) != std::string::npos) {
|
||||
s.replace(pos, search.length(), replace);
|
||||
pos += replace.length();
|
||||
size_t last_pos = 0;
|
||||
while ((pos = s.find(search, last_pos)) != std::string::npos) {
|
||||
builder.append(s, last_pos, pos - last_pos);
|
||||
builder.append(replace);
|
||||
last_pos = pos + search.length();
|
||||
}
|
||||
builder.append(s, last_pos, std::string::npos);
|
||||
s = std::move(builder);
|
||||
}
|
||||
|
||||
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
||||
@@ -1108,7 +1118,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
}
|
||||
}
|
||||
|
||||
clip_ctx * new_clip = new clip_ctx;
|
||||
clip_ctx * new_clip = new clip_ctx{};
|
||||
|
||||
// update projector type
|
||||
{
|
||||
@@ -1142,6 +1152,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
LOG_TEE("%s: CLIP using CANN backend\n", __func__);
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_VULKAN
|
||||
new_clip->backend = ggml_backend_vk_init(0);
|
||||
LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
|
||||
#endif
|
||||
|
||||
if (!new_clip->backend) {
|
||||
new_clip->backend = ggml_backend_cpu_init();
|
||||
|
||||
@@ -104,7 +104,7 @@ static void usage(const char * executable) {
|
||||
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
|
||||
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
|
||||
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
|
||||
printf(" --keep-split: will generate quatized model in the same shards as input");
|
||||
printf(" --keep-split: will generate quantized model in the same shards as input\n");
|
||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
|
||||
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
|
||||
|
||||
@@ -247,6 +247,25 @@ logging:
|
||||
--log-append Don't truncate the old log file.
|
||||
```
|
||||
|
||||
Available environment variables (if specified, these variables will override parameters specified in arguments):
|
||||
|
||||
- `LLAMA_CACHE` (cache directory, used by `--hf-repo`)
|
||||
- `HF_TOKEN` (Hugging Face access token, used when accessing a gated model with `--hf-repo`)
|
||||
- `LLAMA_ARG_MODEL`
|
||||
- `LLAMA_ARG_THREADS`
|
||||
- `LLAMA_ARG_CTX_SIZE`
|
||||
- `LLAMA_ARG_N_PARALLEL`
|
||||
- `LLAMA_ARG_BATCH`
|
||||
- `LLAMA_ARG_UBATCH`
|
||||
- `LLAMA_ARG_N_GPU_LAYERS`
|
||||
- `LLAMA_ARG_THREADS_HTTP`
|
||||
- `LLAMA_ARG_CHAT_TEMPLATE`
|
||||
- `LLAMA_ARG_N_PREDICT`
|
||||
- `LLAMA_ARG_ENDPOINT_METRICS`
|
||||
- `LLAMA_ARG_ENDPOINT_SLOTS`
|
||||
- `LLAMA_ARG_EMBEDDINGS`
|
||||
- `LLAMA_ARG_FLASH_ATTN`
|
||||
- `LLAMA_ARG_DEFRAG_THOLD`
|
||||
|
||||
## Build
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -2507,6 +2507,9 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// parse arguments from environment variables
|
||||
gpt_params_parse_from_env(params);
|
||||
|
||||
// TODO: not great to use extern vars
|
||||
server_log_json = params.log_json;
|
||||
server_verbose = params.verbosity > 0;
|
||||
|
||||
@@ -1760,7 +1760,8 @@ extern "C" {
|
||||
struct ggml_tensor * v,
|
||||
struct ggml_tensor * mask,
|
||||
float scale,
|
||||
float max_bias);
|
||||
float max_bias,
|
||||
float logit_softcap);
|
||||
|
||||
GGML_API void ggml_flash_attn_ext_set_prec(
|
||||
struct ggml_tensor * a,
|
||||
@@ -1777,10 +1778,8 @@ extern "C" {
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_ssm_conv(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * s,
|
||||
struct ggml_tensor * x,
|
||||
struct ggml_tensor * c,
|
||||
struct ggml_tensor * sq);
|
||||
struct ggml_tensor * sx,
|
||||
struct ggml_tensor * c);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_ssm_scan(
|
||||
struct ggml_context * ctx,
|
||||
@@ -1789,8 +1788,7 @@ extern "C" {
|
||||
struct ggml_tensor * dt,
|
||||
struct ggml_tensor * A,
|
||||
struct ggml_tensor * B,
|
||||
struct ggml_tensor * C,
|
||||
struct ggml_tensor * sq);
|
||||
struct ggml_tensor * C);
|
||||
|
||||
// partition into non-overlapping windows with padding if needed
|
||||
// example:
|
||||
|
||||
@@ -549,6 +549,13 @@ if (GGML_SYCL)
|
||||
file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
|
||||
list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
|
||||
|
||||
find_package(DNNL)
|
||||
message("-- DNNL found:" ${DNNL_FOUND})
|
||||
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
|
||||
else()
|
||||
add_compile_definitions(GGML_SYCL_DNNL=0)
|
||||
endif()
|
||||
if (WIN32)
|
||||
find_package(IntelSYCL REQUIRED)
|
||||
find_package(MKL REQUIRED)
|
||||
@@ -561,6 +568,9 @@ if (GGML_SYCL)
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
|
||||
endif()
|
||||
endif()
|
||||
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
list(APPEND GGML_EXTRA_LIBS DNNL::dnnl)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_RPC)
|
||||
|
||||
@@ -22,6 +22,7 @@ typedef void (* fattn_kernel_t)(
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
@@ -657,11 +658,17 @@ void launch_fattn(
|
||||
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
|
||||
const int shmem = 0;
|
||||
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
float logit_softcap = 0.0f;
|
||||
|
||||
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
|
||||
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
|
||||
memcpy(&logit_softcap, (float *) KQV->op_params + 2, sizeof(float));
|
||||
|
||||
if (logit_softcap != 0.0f) {
|
||||
scale /= logit_softcap;
|
||||
}
|
||||
|
||||
const uint32_t n_head = Q->ne[2];
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
||||
@@ -675,7 +682,7 @@ void launch_fattn(
|
||||
V_data,
|
||||
mask ? ((const char *) mask->data) : nullptr,
|
||||
(parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
|
||||
scale, max_bias, m0, m1, n_head_log2,
|
||||
scale, max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
||||
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
#define FATTN_KQ_STRIDE_TILE_F16 64
|
||||
|
||||
template<int D, int ncols, int nwarps, int parallel_blocks> // D == head size
|
||||
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
@@ -20,6 +20,7 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
@@ -44,6 +45,12 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
#ifdef FP16_AVAILABLE
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
|
||||
@@ -154,7 +161,13 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
|
||||
half sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
|
||||
half sum;
|
||||
if (use_logit_softcap) {
|
||||
const float2 tmp = __half22float2(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
|
||||
sum = logit_softcap * tanhf(tmp.x + tmp.y);
|
||||
} else {
|
||||
sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
|
||||
}
|
||||
sum += mask ? slopeh*maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
|
||||
|
||||
kqmax_new[j_KQ_0/nwarps] = ggml_cuda_hmax(kqmax_new[j_KQ_0/nwarps], sum);
|
||||
@@ -270,20 +283,20 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
#endif // FP16_AVAILABLE
|
||||
}
|
||||
|
||||
template <int cols_per_block, int parallel_blocks>
|
||||
template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
|
||||
void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
switch (Q->ne[0]) {
|
||||
case 64: {
|
||||
constexpr int D = 64;
|
||||
constexpr int nwarps = 8;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
|
||||
} break;
|
||||
case 128: {
|
||||
constexpr int D = 128;
|
||||
constexpr int nwarps = 8;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
|
||||
} break;
|
||||
default: {
|
||||
@@ -296,24 +309,45 @@ void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_ten
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
const int32_t precision = KQV->op_params[2];
|
||||
const int32_t precision = KQV->op_params[3];
|
||||
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
||||
|
||||
float logit_softcap;
|
||||
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
|
||||
|
||||
if (Q->ne[1] <= 16) {
|
||||
constexpr int cols_per_block = 16;
|
||||
constexpr int parallel_blocks = 4;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 32) {
|
||||
constexpr int cols_per_block = 32;
|
||||
constexpr int parallel_blocks = 4;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 32;
|
||||
constexpr int parallel_blocks = 1;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
#define FATTN_KQ_STRIDE_TILE_F32 32
|
||||
|
||||
template<int D, int ncols, int nwarps, int parallel_blocks> // D == head size
|
||||
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
@@ -20,6 +20,7 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
@@ -43,6 +44,12 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
|
||||
@@ -151,6 +158,10 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||
|
||||
if (use_logit_softcap) {
|
||||
sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] = logit_softcap * tanhf(sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
|
||||
}
|
||||
|
||||
sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
|
||||
|
||||
kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
|
||||
@@ -267,20 +278,20 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
}
|
||||
}
|
||||
|
||||
template <int cols_per_block, int parallel_blocks>
|
||||
template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
|
||||
void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
switch (Q->ne[0]) {
|
||||
case 64: {
|
||||
constexpr int D = 64;
|
||||
constexpr int nwarps = 8;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
|
||||
} break;
|
||||
case 128: {
|
||||
constexpr int D = 128;
|
||||
constexpr int nwarps = 8;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
|
||||
} break;
|
||||
default: {
|
||||
@@ -290,23 +301,45 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
float logit_softcap;
|
||||
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
|
||||
|
||||
if (Q->ne[1] <= 16) {
|
||||
constexpr int cols_per_block = 16;
|
||||
constexpr int parallel_blocks = 4;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 32) {
|
||||
constexpr int cols_per_block = 32;
|
||||
constexpr int parallel_blocks = 4;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 32;
|
||||
constexpr int parallel_blocks = 1;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
|
||||
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V> // D == head size
|
||||
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(D, 1)
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
@@ -17,6 +17,7 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
@@ -41,6 +42,12 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
#ifdef FP16_AVAILABLE
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
constexpr vec_dot_KQ_f16_t vec_dot_KQ = get_vec_dot_KQ_f16<D>(type_K);
|
||||
@@ -190,6 +197,11 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
half sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_h2[j], Q_i32[j], Q_ds[j]);
|
||||
sum = warp_reduce_sum(sum);
|
||||
|
||||
if (use_logit_softcap) {
|
||||
sum = logit_softcap*tanhf(sum);
|
||||
}
|
||||
|
||||
sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
|
||||
|
||||
if (ncols == 1) {
|
||||
@@ -286,10 +298,10 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
#endif // FP16_AVAILABLE
|
||||
}
|
||||
|
||||
template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V>
|
||||
template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
|
||||
void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
constexpr int nwarps = D/WARP_SIZE;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks, type_K, type_V>;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>;
|
||||
constexpr bool need_f16_K = D != 128;
|
||||
constexpr bool need_f16_V = D != 128 && D != 64;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, need_f16_K, need_f16_V);
|
||||
@@ -297,48 +309,81 @@ void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx,
|
||||
|
||||
template <int D, ggml_type type_K, ggml_type type_V>
|
||||
void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_tensor * KQV = dst;
|
||||
ggml_tensor * Q = dst->src[0];
|
||||
ggml_tensor * K = dst->src[1];
|
||||
ggml_tensor * V = dst->src[2];
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
|
||||
const int32_t precision = KQV->op_params[2];
|
||||
const int32_t precision = KQV->op_params[3];
|
||||
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
||||
|
||||
GGML_ASSERT(K->type == type_K);
|
||||
GGML_ASSERT(V->type == type_V);
|
||||
|
||||
float logit_softcap;
|
||||
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
|
||||
|
||||
if (Q->ne[1] == 1) {
|
||||
constexpr int cols_per_block = 1;
|
||||
constexpr int parallel_blocks = 4;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] == 2) {
|
||||
constexpr int cols_per_block = 2;
|
||||
constexpr int parallel_blocks = 4;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 4) {
|
||||
constexpr int cols_per_block = 4;
|
||||
constexpr int parallel_blocks = 4;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 8) {
|
||||
constexpr int cols_per_block = 8;
|
||||
constexpr int parallel_blocks = 4;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 8;
|
||||
constexpr int parallel_blocks = 1;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
}
|
||||
|
||||
#define DECL_FATTN_VEC_F16_CASE(D, type_K, type_V) \
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
|
||||
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V> // D == head size
|
||||
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(D, 1)
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
@@ -17,6 +17,7 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
@@ -40,6 +41,12 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
constexpr vec_dot_KQ_f32_t vec_dot_KQ = get_vec_dot_KQ_f32<D>(type_K);
|
||||
@@ -180,6 +187,11 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
float sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_f2[j], Q_i32[j], Q_ds[j]);
|
||||
sum = warp_reduce_sum(sum);
|
||||
|
||||
if (use_logit_softcap) {
|
||||
sum = logit_softcap*tanhf(sum);
|
||||
}
|
||||
|
||||
sum += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
|
||||
|
||||
kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum);
|
||||
@@ -267,10 +279,10 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
}
|
||||
}
|
||||
|
||||
template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V>
|
||||
template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
|
||||
void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
constexpr int nwarps = D/WARP_SIZE;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks, type_K, type_V>;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>;
|
||||
constexpr bool need_f16_K = D != 128;
|
||||
constexpr bool need_f16_V = D != 128 && D != 64;
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, need_f16_K, need_f16_V);
|
||||
@@ -278,44 +290,78 @@ void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx,
|
||||
|
||||
template <int D, ggml_type type_K, ggml_type type_V>
|
||||
void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_tensor * Q = dst->src[0];
|
||||
ggml_tensor * K = dst->src[1];
|
||||
ggml_tensor * V = dst->src[2];
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
|
||||
GGML_ASSERT(K->type == type_K);
|
||||
GGML_ASSERT(V->type == type_V);
|
||||
|
||||
float logit_softcap;
|
||||
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
|
||||
|
||||
if (Q->ne[1] == 1) {
|
||||
constexpr int cols_per_block = 1;
|
||||
constexpr int parallel_blocks = 4;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] == 2) {
|
||||
constexpr int cols_per_block = 2;
|
||||
constexpr int parallel_blocks = 4;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 4) {
|
||||
constexpr int cols_per_block = 4;
|
||||
constexpr int parallel_blocks = 4;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 8) {
|
||||
constexpr int cols_per_block = 8;
|
||||
constexpr int parallel_blocks = 4;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 8;
|
||||
constexpr int parallel_blocks = 1;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
|
||||
}
|
||||
}
|
||||
|
||||
#define DECL_FATTN_VEC_F32_CASE(D, type_K, type_V) \
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
#endif // FP16_MMA_AVAILABLE
|
||||
|
||||
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
|
||||
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
|
||||
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap>
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
@@ -22,6 +22,7 @@ static __global__ void flash_attn_ext_f16(
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
@@ -46,6 +47,12 @@ static __global__ void flash_attn_ext_f16(
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
#ifdef FP16_MMA_AVAILABLE
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
const int ic0 = ncols*(blockIdx.x / parallel_blocks); // Index of the first Q/QKV column to work on.
|
||||
@@ -85,6 +92,8 @@ static __global__ void flash_attn_ext_f16(
|
||||
const half slopeh = __float2half(slopef);
|
||||
const half2 slope2 = make_half2(slopef, slopef);
|
||||
|
||||
const half2 logit_softcap_2 = make_half2(logit_softcap, logit_softcap);
|
||||
|
||||
frag_b Q_b[D/16][ncols/frag_n];
|
||||
|
||||
// A single buffer for temporarily holding tiles of KQ and VKQ parts:
|
||||
@@ -194,6 +203,10 @@ static __global__ void flash_attn_ext_f16(
|
||||
const int k = k0 + threadIdx.x;
|
||||
|
||||
KQ_f_tmp[k0/WARP_SIZE] = KQ_f[j*kqs_padded + k];
|
||||
|
||||
if (use_logit_softcap) {
|
||||
KQ_f_tmp[k0/WARP_SIZE] = logit_softcap*tanhf(KQ_f_tmp[k0/WARP_SIZE]);
|
||||
}
|
||||
}
|
||||
|
||||
float KQ_max_new = KQ_max_f[j0/nwarps];
|
||||
@@ -237,6 +250,15 @@ static __global__ void flash_attn_ext_f16(
|
||||
const int k = k0 + threadIdx.x;
|
||||
|
||||
KQ2_tmp[k0/WARP_SIZE] = KQ2[j*(kqs_padded/2) + k];
|
||||
|
||||
if (use_logit_softcap) {
|
||||
// There is no dedicated tangens hyperbolicus function for half2.
|
||||
KQ2_tmp[k0/WARP_SIZE] = h2exp(KQ2_tmp[k0/WARP_SIZE]*make_half2(2.0f, 2.0f));
|
||||
KQ2_tmp[k0/WARP_SIZE] = (KQ2_tmp[k0/WARP_SIZE] - make_half2(1.0f, 1.0f))
|
||||
/(KQ2_tmp[k0/WARP_SIZE] + make_half2(1.0f, 1.0f));
|
||||
|
||||
KQ2_tmp[k0/WARP_SIZE] *= logit_softcap_2;
|
||||
}
|
||||
}
|
||||
|
||||
half2 KQ_max_new = KQ_max_h2[j0/nwarps];
|
||||
@@ -427,7 +449,8 @@ static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed.");
|
||||
|
||||
template <int D, int cols_per_block, typename KQ_acc_t>
|
||||
void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
constexpr int nwarps = 4;
|
||||
|
||||
@@ -435,20 +458,50 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
|
||||
const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3];
|
||||
const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
|
||||
|
||||
float logit_softcap;
|
||||
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
|
||||
|
||||
if (4*blocks_num_pb1 < 2*nsm) {
|
||||
constexpr int parallel_blocks = 4;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
|
||||
fattn_kernel_t fattn_kernel;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
fattn_kernel = flash_attn_ext_f16<
|
||||
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
fattn_kernel = flash_attn_ext_f16<
|
||||
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
|
||||
}
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
|
||||
return;
|
||||
}
|
||||
if (2*blocks_num_pb1 < 2*nsm) {
|
||||
constexpr int parallel_blocks = 2;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
|
||||
fattn_kernel_t fattn_kernel;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
fattn_kernel = flash_attn_ext_f16<
|
||||
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
fattn_kernel = flash_attn_ext_f16<
|
||||
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
|
||||
}
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
|
||||
return;
|
||||
}
|
||||
constexpr int parallel_blocks = 1;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
|
||||
fattn_kernel_t fattn_kernel;
|
||||
if (logit_softcap == 0.0f) {
|
||||
constexpr bool use_logit_softcap = false;
|
||||
fattn_kernel = flash_attn_ext_f16<
|
||||
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
fattn_kernel = flash_attn_ext_f16<
|
||||
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
|
||||
}
|
||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
|
||||
}
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
const int32_t precision = KQV->op_params[2];
|
||||
const int32_t precision = KQV->op_params[3];
|
||||
|
||||
if (precision != GGML_PREC_DEFAULT) {
|
||||
if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
|
||||
@@ -301,7 +301,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
|
||||
ggml_cuda_set_device(ctx.device);
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
const int32_t precision = KQV->op_params[2];
|
||||
const int32_t precision = KQV->op_params[3];
|
||||
|
||||
// On AMD the tile kernels perform poorly, use the vec kernel instead:
|
||||
if (cc >= CC_OFFSET_AMD) {
|
||||
|
||||
@@ -2624,9 +2624,14 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
|
||||
float scale;
|
||||
float max_bias;
|
||||
float logit_softcap;
|
||||
memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale));
|
||||
memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
|
||||
memcpy(&logit_softcap, ((int32_t *) dst->op_params) + 2, sizeof(logit_softcap));
|
||||
|
||||
memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale));
|
||||
memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
|
||||
if (logit_softcap != 0.0f) {
|
||||
scale /= logit_softcap;
|
||||
}
|
||||
|
||||
const uint32_t n_head = src0->ne[2];
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
||||
@@ -2677,30 +2682,31 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
} else {
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:3];
|
||||
}
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:4];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
|
||||
[encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:11];
|
||||
[encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:12];
|
||||
[encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:14];
|
||||
[encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb21 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&nb22 length:sizeof(uint64_t) atIndex:18];
|
||||
[encoder setBytes:&nb23 length:sizeof(uint64_t) atIndex:19];
|
||||
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:20];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:21];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:22];
|
||||
[encoder setBytes:&scale length:sizeof( float) atIndex:23];
|
||||
[encoder setBytes:&max_bias length:sizeof( float) atIndex:24];
|
||||
[encoder setBytes:&m0 length:sizeof(m0) atIndex:25];
|
||||
[encoder setBytes:&m1 length:sizeof(m1) atIndex:26];
|
||||
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:4];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
|
||||
[encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:11];
|
||||
[encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:12];
|
||||
[encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:14];
|
||||
[encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb21 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&nb22 length:sizeof(uint64_t) atIndex:18];
|
||||
[encoder setBytes:&nb23 length:sizeof(uint64_t) atIndex:19];
|
||||
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:20];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:21];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:22];
|
||||
[encoder setBytes:&scale length:sizeof( float) atIndex:23];
|
||||
[encoder setBytes:&max_bias length:sizeof( float) atIndex:24];
|
||||
[encoder setBytes:&m0 length:sizeof(m0) atIndex:25];
|
||||
[encoder setBytes:&m1 length:sizeof(m1) atIndex:26];
|
||||
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27];
|
||||
[encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:28];
|
||||
|
||||
if (!use_vec_kernel) {
|
||||
// half8x8 kernel
|
||||
|
||||
@@ -1976,6 +1976,7 @@ typedef void (flash_attn_ext_f16_t)(
|
||||
constant float & m0,
|
||||
constant float & m1,
|
||||
constant uint32_t & n_head_log2,
|
||||
constant float & logit_softcap,
|
||||
threadgroup half * shared,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
@@ -2014,6 +2015,7 @@ kernel void kernel_flash_attn_ext_f16(
|
||||
constant float & m0,
|
||||
constant float & m1,
|
||||
constant uint32_t & n_head_log2,
|
||||
constant float & logit_softcap,
|
||||
threadgroup half * shared [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
@@ -2142,14 +2144,21 @@ kernel void kernel_flash_attn_ext_f16(
|
||||
const short tx = tiisg%4;
|
||||
const short ty = tiisg/4;
|
||||
|
||||
if (mask != q) {
|
||||
// mqk = mqk*scale + mask*slope
|
||||
ss[8*cc + ty*TF + 2*tx + 0] = scale*ss[8*cc + ty*TF + 2*tx + 0] + slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 0];
|
||||
ss[8*cc + ty*TF + 2*tx + 1] = scale*ss[8*cc + ty*TF + 2*tx + 1] + slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 1];
|
||||
} else {
|
||||
if (iq1 + ty < ne01) {
|
||||
// mqk = mqk*scale
|
||||
ss[8*cc + ty*TF + 2*tx + 0] *= scale;
|
||||
ss[8*cc + ty*TF + 2*tx + 1] *= scale;
|
||||
|
||||
if (logit_softcap != 0.0f) {
|
||||
ss[8*cc + ty*TF + 2*tx + 0] = logit_softcap*precise::tanh(ss[8*cc + ty*TF + 2*tx + 0]);
|
||||
ss[8*cc + ty*TF + 2*tx + 1] = logit_softcap*precise::tanh(ss[8*cc + ty*TF + 2*tx + 1]);
|
||||
}
|
||||
|
||||
if (mask != q) {
|
||||
// mqk = mqk + mask*slope
|
||||
ss[8*cc + ty*TF + 2*tx + 0] += slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 0];
|
||||
ss[8*cc + ty*TF + 2*tx + 1] += slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2345,6 +2354,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
||||
constant float & m0,
|
||||
constant float & m1,
|
||||
constant uint32_t & n_head_log2,
|
||||
constant float & logit_softcap,
|
||||
threadgroup half * shared [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
@@ -2479,7 +2489,13 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
||||
|
||||
// mqk = mqk*scale + mask*slope
|
||||
if (tiisg == 0) {
|
||||
mqk = mqk*scale + ((mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f);
|
||||
mqk *= scale;
|
||||
|
||||
if (logit_softcap != 0.0f) {
|
||||
mqk = logit_softcap*precise::tanh(mqk);
|
||||
}
|
||||
|
||||
mqk += (mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f;
|
||||
|
||||
ss4[cc] = mqk;
|
||||
}
|
||||
|
||||
@@ -38,6 +38,7 @@
|
||||
|
||||
#include "ggml-sycl/backend.hpp"
|
||||
#include "ggml-sycl/presets.hpp"
|
||||
#include "ggml-sycl/gemm.hpp"
|
||||
|
||||
bool ggml_sycl_loaded(void);
|
||||
void ggml_sycl_free_data(struct ggml_tensor * tensor);
|
||||
@@ -2482,6 +2483,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||
|
||||
const sycl::half alpha_f16 = 1.0f;
|
||||
const sycl::half beta_f16 = 0.0f;
|
||||
#if !GGML_SYCL_DNNL
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
|
||||
*stream, oneapi::mkl::transpose::trans,
|
||||
oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
|
||||
@@ -2491,6 +2493,13 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||
dpct::library_data_t::real_half)));
|
||||
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
|
||||
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
|
||||
#else
|
||||
auto dnnl_stream = ctx.stream_dnnl(stream);
|
||||
DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
|
||||
src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(), dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>());
|
||||
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
|
||||
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
|
||||
#endif
|
||||
}
|
||||
else {
|
||||
// GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp32 path\n");
|
||||
@@ -2513,13 +2522,18 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
|
||||
#if !GGML_SYCL_DNNL
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
|
||||
*stream, oneapi::mkl::transpose::trans,
|
||||
oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
|
||||
dpct::get_value(&alpha, *stream), src0_ddf_i, ne00,
|
||||
src1_ddf1_i, ne10, dpct::get_value(&beta, *stream),
|
||||
dst_dd_i, ldc)));
|
||||
#else
|
||||
auto dnnl_stream = ctx.stream_dnnl(stream);
|
||||
DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
|
||||
src0_ddf_i, DnnlGemmWrapper::to_dt<float>(), dst_dd_i, DnnlGemmWrapper::to_dt<float>());
|
||||
#endif
|
||||
}
|
||||
(void) dst;
|
||||
(void) src1_ddq_i;
|
||||
|
||||
@@ -19,6 +19,10 @@
|
||||
#include "dpct/helper.hpp"
|
||||
#include "ggml-sycl.h"
|
||||
#include "presets.hpp"
|
||||
#if GGML_SYCL_DNNL
|
||||
#include "dnnl.hpp"
|
||||
#include "dnnl_sycl.hpp"
|
||||
#endif
|
||||
|
||||
#define GGML_COMMON_DECL_SYCL
|
||||
#define GGML_COMMON_IMPL_SYCL
|
||||
@@ -277,6 +281,52 @@ struct ggml_backend_sycl_context {
|
||||
return stream(device, 0);
|
||||
}
|
||||
|
||||
#if GGML_SYCL_DNNL
|
||||
dnnl::engine make_engine(sycl::queue* q) {
|
||||
// Get the device associated with the queue
|
||||
sycl::device dev = q->get_device();
|
||||
// Get the context associated with the queue
|
||||
sycl::context ctx = q->get_context();
|
||||
const dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx);
|
||||
return eng;
|
||||
}
|
||||
|
||||
std::unordered_map<sycl::queue*, dnnl::stream> stream_map;
|
||||
std::unordered_map<sycl::queue*, dnnl::engine> engine_map;
|
||||
dnnl::stream stream_dnnl(int device, int _stream) {
|
||||
auto q = stream(device, _stream);
|
||||
return stream_dnnl(q);
|
||||
}
|
||||
dnnl::engine engine_dnnl(sycl::queue* qptr) {
|
||||
auto it = engine_map.find(qptr);
|
||||
if (it == engine_map.end()) {
|
||||
auto eng = make_engine(qptr);
|
||||
engine_map[qptr] = eng;
|
||||
return eng;
|
||||
}
|
||||
else
|
||||
{
|
||||
return it->second;
|
||||
}
|
||||
}
|
||||
dnnl::stream stream_dnnl(sycl::queue* qptr) {
|
||||
auto it = stream_map.find(qptr);
|
||||
if (it == stream_map.end()) {
|
||||
auto eng = engine_dnnl(qptr);
|
||||
auto stream = dnnl::sycl_interop::make_stream(eng, *qptr);
|
||||
stream_map[qptr] = stream;
|
||||
return stream;
|
||||
}
|
||||
else
|
||||
{
|
||||
return it->second;
|
||||
}
|
||||
}
|
||||
dnnl::stream stream_dnnl() {
|
||||
return stream_dnnl(device, 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
// pool
|
||||
std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
|
||||
|
||||
|
||||
101
ggml/src/ggml-sycl/gemm.hpp
Normal file
101
ggml/src/ggml-sycl/gemm.hpp
Normal file
@@ -0,0 +1,101 @@
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2024 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#ifndef GGML_SYCL_GEMM_HPP
|
||||
#define GGML_SYCL_GEMM_HPP
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
||||
#include "ggml-sycl.h"
|
||||
|
||||
#if GGML_SYCL_DNNL
|
||||
|
||||
#include "dnnl.hpp"
|
||||
#include "dnnl_sycl.hpp"
|
||||
|
||||
class DnnlGemmWrapper {
|
||||
public:
|
||||
using dt = dnnl::memory::data_type;
|
||||
using tag = dnnl::memory::format_tag;
|
||||
|
||||
template<typename T>
|
||||
static constexpr dt to_dt() {
|
||||
if constexpr (std::is_same_v<T, float>) return dt::f32;
|
||||
else if constexpr (std::is_same_v<T, sycl::half>) return dt::f16;
|
||||
else static_assert(0);
|
||||
}
|
||||
|
||||
static inline void row_gemm(sycl::queue& q, bool a_trans,
|
||||
bool b_trans, int m, int n, int k,
|
||||
const void* a, dt at, const void* b, dt bt, void* c, dt ct)
|
||||
{
|
||||
// Get the device associated with the queue
|
||||
sycl::device dev = q.get_device();
|
||||
// Get the context associated with the queue
|
||||
sycl::context ctx = q.get_context();
|
||||
const dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx);
|
||||
const dnnl::stream stream = dnnl::sycl_interop::make_stream(eng, q);
|
||||
dnnl::memory::dims a_dims = { m, k };
|
||||
dnnl::memory::dims b_dims = { k, n };
|
||||
dnnl::memory::dims c_dims = { m, n };
|
||||
const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
|
||||
const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
|
||||
const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
|
||||
auto a_mem = dnnl::memory(a_in_md, eng, (void*)a);
|
||||
auto b_mem = dnnl::memory(b_in_md, eng, (void*)b);
|
||||
auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
|
||||
auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
|
||||
|
||||
// Create the primitive.
|
||||
auto matmul_prim = dnnl::matmul(matmul_pd);
|
||||
// Primitive arguments.
|
||||
std::unordered_map<int, dnnl::memory> matmul_args;
|
||||
matmul_args.insert({ DNNL_ARG_SRC, a_mem });
|
||||
matmul_args.insert({ DNNL_ARG_WEIGHTS, b_mem });
|
||||
matmul_args.insert({ DNNL_ARG_DST, c_mem });
|
||||
|
||||
matmul_prim.execute(stream, matmul_args);
|
||||
}
|
||||
|
||||
|
||||
static inline void row_gemm(const dnnl::stream& stream, bool a_trans,
|
||||
bool b_trans, int m, int n, int k,
|
||||
const void* a, dt at, const void* b, dt bt, void* c, dt ct)
|
||||
{
|
||||
auto const eng = stream.get_engine();
|
||||
dnnl::memory::dims a_dims = { m, k };
|
||||
dnnl::memory::dims b_dims = { k, n };
|
||||
dnnl::memory::dims c_dims = { m, n };
|
||||
const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
|
||||
const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
|
||||
const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
|
||||
auto a_mem = dnnl::memory(a_in_md, eng, (void*)a);
|
||||
auto b_mem = dnnl::memory(b_in_md, eng, (void*)b);
|
||||
auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
|
||||
auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
|
||||
|
||||
// Create the primitive.
|
||||
auto matmul_prim = dnnl::matmul(matmul_pd);
|
||||
// Primitive arguments.
|
||||
std::unordered_map<int, dnnl::memory> matmul_args;
|
||||
matmul_args.insert({ DNNL_ARG_SRC, a_mem });
|
||||
matmul_args.insert({ DNNL_ARG_WEIGHTS, b_mem });
|
||||
matmul_args.insert({ DNNL_ARG_DST, c_mem });
|
||||
|
||||
matmul_prim.execute(stream, matmul_args);
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#endif // GGML_SYCL_GEMM_HPP
|
||||
@@ -180,6 +180,7 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
|
||||
vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_acc_f32;
|
||||
vk_pipeline pipeline_add_f32, pipeline_add_f16_f32_f16;
|
||||
vk_pipeline pipeline_mul_f32;
|
||||
vk_pipeline pipeline_div_f32;
|
||||
@@ -1687,6 +1688,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_div_f32, "div_f32", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
@@ -3971,6 +3974,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_get_rows_f32[src0->type];
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_ACC:
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_acc_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_ADD:
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_add_f32;
|
||||
@@ -4463,6 +4471,28 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
|
||||
}, dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
|
||||
|
||||
int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
|
||||
int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
|
||||
// int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
|
||||
int offset = dst->op_params[3] / 4; // offset in bytes
|
||||
|
||||
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ACC, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size,
|
||||
d_offset,
|
||||
0.0f, 0.0f, offset,
|
||||
}, dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
||||
@@ -5621,6 +5651,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
case GGML_OP_REPEAT:
|
||||
case GGML_OP_GET_ROWS:
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_CONCAT:
|
||||
@@ -5668,6 +5699,10 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
case GGML_OP_REPEAT:
|
||||
ggml_vk_repeat(ctx, compute_ctx, src0, node, dryrun);
|
||||
|
||||
break;
|
||||
case GGML_OP_ACC:
|
||||
ggml_vk_acc(ctx, compute_ctx, src0, src1, node, dryrun);
|
||||
|
||||
break;
|
||||
case GGML_OP_GET_ROWS:
|
||||
ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node, dryrun);
|
||||
@@ -5808,6 +5843,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
|
||||
|
||||
switch (tensor->op) {
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_GET_ROWS:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
@@ -6539,6 +6575,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
||||
case GGML_OP_GROUP_NORM:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_CONCAT:
|
||||
@@ -6995,6 +7032,8 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
|
||||
tensor_clone = ggml_repeat(ggml_ctx, src0_clone, src1_clone);
|
||||
} else if (tensor->op == GGML_OP_ADD) {
|
||||
tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone);
|
||||
} else if (tensor->op == GGML_OP_ACC) {
|
||||
tensor_clone = ggml_acc(ggml_ctx, src0_clone, src1_clone, tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]);
|
||||
} else if (tensor->op == GGML_OP_NORM) {
|
||||
tensor_clone = ggml_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
|
||||
} else if (tensor->op == GGML_OP_GROUP_NORM) {
|
||||
|
||||
306
ggml/src/ggml.c
306
ggml/src/ggml.c
@@ -7095,7 +7095,8 @@ struct ggml_tensor * ggml_flash_attn_ext(
|
||||
struct ggml_tensor * v,
|
||||
struct ggml_tensor * mask,
|
||||
float scale,
|
||||
float max_bias) {
|
||||
float max_bias,
|
||||
float logit_softcap) {
|
||||
GGML_ASSERT(ggml_can_mul_mat(k, q));
|
||||
// TODO: check if vT can be multiplied by (k*qT)
|
||||
|
||||
@@ -7122,7 +7123,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
|
||||
int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
||||
|
||||
float params[] = { scale, max_bias };
|
||||
float params[] = { scale, max_bias, logit_softcap };
|
||||
ggml_set_op_params(result, params, sizeof(params));
|
||||
|
||||
result->op = GGML_OP_FLASH_ATTN_EXT;
|
||||
@@ -7142,7 +7143,7 @@ void ggml_flash_attn_ext_set_prec(
|
||||
|
||||
const int32_t prec_i32 = (int32_t) prec;
|
||||
|
||||
ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second
|
||||
ggml_set_op_params_i32(a, 3, prec_i32); // scale is on first pos, max_bias on second
|
||||
}
|
||||
|
||||
// ggml_flash_attn_back
|
||||
@@ -7229,43 +7230,34 @@ struct ggml_tensor * ggml_flash_attn_back(
|
||||
|
||||
struct ggml_tensor * ggml_ssm_conv(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * s,
|
||||
struct ggml_tensor * x,
|
||||
struct ggml_tensor * c,
|
||||
struct ggml_tensor * sq) {
|
||||
GGML_ASSERT(ggml_is_3d(s));
|
||||
GGML_ASSERT(ggml_is_matrix(x));
|
||||
struct ggml_tensor * sx,
|
||||
struct ggml_tensor * c) {
|
||||
GGML_ASSERT(ggml_is_3d(sx));
|
||||
GGML_ASSERT(ggml_is_matrix(c));
|
||||
GGML_ASSERT(ggml_is_matrix(sq));
|
||||
GGML_ASSERT(sq->type == GGML_TYPE_I32);
|
||||
|
||||
const int64_t d_conv = c->ne[0];
|
||||
const int64_t d_inner = c->ne[1];
|
||||
const int64_t n_tokens = x->ne[1];
|
||||
const int64_t n_kv = s->ne[2];
|
||||
const int64_t d_conv = c->ne[0];
|
||||
const int64_t d_inner = c->ne[1];
|
||||
const int64_t n_t = sx->ne[0] - d_conv + 1; // tokens per sequence
|
||||
const int64_t n_s = sx->ne[2];
|
||||
|
||||
GGML_ASSERT( s->ne[0] == d_conv - 1);
|
||||
GGML_ASSERT( s->ne[1] == d_inner);
|
||||
GGML_ASSERT( x->ne[0] == d_inner);
|
||||
GGML_ASSERT(sq->ne[0] == n_kv);
|
||||
GGML_ASSERT(sq->ne[1] == n_tokens);
|
||||
// TODO: maybe support other strides than 1?
|
||||
GGML_ASSERT(sx->ne[0] == d_conv - 1 + n_t);
|
||||
GGML_ASSERT(sx->ne[1] == d_inner);
|
||||
GGML_ASSERT(n_t >= 0);
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
if (s->grad || x->grad || c->grad || sq->grad) {
|
||||
if (sx->grad || c->grad) {
|
||||
GGML_ABORT("fatal error"); // TODO: implement
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
// 2-in-1 concatenated x and conv_states, {d_inner, n_tokens} with {d_conv, d_inner, n_kv}
|
||||
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, (d_inner*n_tokens) + (d_conv*d_inner*n_kv));
|
||||
struct ggml_tensor * result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_t, n_s);
|
||||
|
||||
result->op = GGML_OP_SSM_CONV;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = s;
|
||||
result->src[1] = x;
|
||||
result->src[2] = c;
|
||||
result->src[3] = sq;
|
||||
result->src[0] = sx;
|
||||
result->src[1] = c;
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -7279,39 +7271,42 @@ struct ggml_tensor * ggml_ssm_scan(
|
||||
struct ggml_tensor * dt,
|
||||
struct ggml_tensor * A,
|
||||
struct ggml_tensor * B,
|
||||
struct ggml_tensor * C,
|
||||
struct ggml_tensor * sq) {
|
||||
struct ggml_tensor * C) {
|
||||
GGML_ASSERT(ggml_is_contiguous(s));
|
||||
GGML_ASSERT(ggml_is_contiguous(x));
|
||||
GGML_ASSERT(ggml_is_contiguous(dt));
|
||||
GGML_ASSERT(ggml_is_contiguous(A));
|
||||
GGML_ASSERT(sq->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(ggml_is_matrix(A));
|
||||
GGML_ASSERT(ggml_is_3d(B));
|
||||
GGML_ASSERT(ggml_is_3d(s));
|
||||
GGML_ASSERT(B->nb[0] == ggml_type_size(B->type));
|
||||
GGML_ASSERT(C->nb[0] == ggml_type_size(C->type));
|
||||
GGML_ASSERT(ggml_are_same_shape(x, dt));
|
||||
GGML_ASSERT(ggml_are_same_shape(B, C));
|
||||
|
||||
{
|
||||
const int64_t d_state = s->ne[0];
|
||||
const int64_t d_inner = s->ne[1];
|
||||
const int64_t n_tokens = x->ne[1];
|
||||
const int64_t d_state = s->ne[0];
|
||||
const int64_t d_inner = s->ne[1];
|
||||
const int64_t n_seq_tokens = x->ne[1];
|
||||
const int64_t n_seqs = x->ne[2];
|
||||
|
||||
GGML_ASSERT(s->ne[2] == n_seqs);
|
||||
GGML_ASSERT(x->ne[0] == d_inner);
|
||||
GGML_ASSERT(A->ne[0] == d_state);
|
||||
GGML_ASSERT(A->ne[1] == d_inner);
|
||||
GGML_ASSERT(B->ne[0] == d_state);
|
||||
GGML_ASSERT(B->ne[1] == n_tokens);
|
||||
GGML_ASSERT(C->ne[0] == d_state);
|
||||
GGML_ASSERT(C->ne[1] == n_tokens);
|
||||
GGML_ASSERT(B->ne[1] == n_seq_tokens);
|
||||
GGML_ASSERT(B->ne[2] == n_seqs);
|
||||
}
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad || sq->grad) {
|
||||
if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad) {
|
||||
GGML_ABORT("fatal error"); // TODO: implement
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
// 2-in-1 concatenated y and ssm_states, {d_inner, n_tokens} with {d_state, d_inner, n_kv}
|
||||
// concatenated y + ssm_states
|
||||
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + ggml_nelements(s));
|
||||
|
||||
result->op = GGML_OP_SSM_SCAN;
|
||||
@@ -7322,7 +7317,6 @@ struct ggml_tensor * ggml_ssm_scan(
|
||||
result->src[3] = A;
|
||||
result->src[4] = B;
|
||||
result->src[5] = C;
|
||||
result->src[6] = sq;
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -10995,11 +10989,6 @@ static void ggml_compute_forward_concat_f32(
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
// TODO: support for transposed / permuted tensors
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
GGML_ASSERT(nb00 == sizeof(float));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
const int32_t dim = ggml_get_op_params_i32(dst, 0);
|
||||
|
||||
GGML_ASSERT(dim >= 0 && dim < 4);
|
||||
@@ -15283,11 +15272,17 @@ static void ggml_compute_forward_flash_attn_ext_f16(
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
float logit_softcap = 0.0f;
|
||||
|
||||
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
|
||||
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
|
||||
memcpy(&logit_softcap, (float *) dst->op_params + 2, sizeof(float));
|
||||
|
||||
if (logit_softcap != 0) {
|
||||
scale /= logit_softcap;
|
||||
}
|
||||
|
||||
const uint32_t n_head = neq2;
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
|
||||
@@ -15351,7 +15346,13 @@ static void ggml_compute_forward_flash_attn_ext_f16(
|
||||
const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3);
|
||||
kq_vec_dot(D, &s, 0, k_data, 0, Q_q, 0, 1);
|
||||
|
||||
s = s*scale + mv; // scale KQ value and apply mask
|
||||
s = s*scale; // scale KQ value
|
||||
|
||||
if (logit_softcap != 0.0f) {
|
||||
s = logit_softcap*tanhf(s);
|
||||
}
|
||||
|
||||
s += mv; // apply mask
|
||||
|
||||
const float Mold = M;
|
||||
|
||||
@@ -15360,7 +15361,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
|
||||
|
||||
const char * v_data = ((const char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3));
|
||||
|
||||
if (v->type== GGML_TYPE_F16) {
|
||||
if (v->type == GGML_TYPE_F16) {
|
||||
if (s > M) {
|
||||
// s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f
|
||||
M = s;
|
||||
@@ -15427,7 +15428,7 @@ static void ggml_compute_forward_flash_attn_ext(
|
||||
const struct ggml_tensor * v,
|
||||
const struct ggml_tensor * mask,
|
||||
struct ggml_tensor * dst) {
|
||||
switch (dst->op_params[2]) {
|
||||
switch (dst->op_params[3]) {
|
||||
case GGML_PREC_DEFAULT:
|
||||
case GGML_PREC_F32:
|
||||
{
|
||||
@@ -15782,27 +15783,22 @@ static void ggml_compute_forward_flash_attn_back(
|
||||
static void ggml_compute_forward_ssm_conv_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
const struct ggml_tensor * src0 = dst->src[0]; // conv_state
|
||||
const struct ggml_tensor * src1 = dst->src[1]; // x
|
||||
const struct ggml_tensor * src2 = dst->src[2]; // conv1d.weight
|
||||
const struct ggml_tensor * src3 = dst->src[3]; // state_seq
|
||||
const struct ggml_tensor * src0 = dst->src[0]; // conv_x
|
||||
const struct ggml_tensor * src1 = dst->src[1]; // conv1d.weight
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int nc = src2->ne[0]; // d_conv
|
||||
const int nr = src0->ne[1]; // d_inner
|
||||
const int n_t = src1->ne[1]; // n_tokens
|
||||
const int n_kv = src0->ne[2]; // max number of sequences in the batch
|
||||
const int nc = src1->ne[0]; // d_conv
|
||||
const int ncs = src0->ne[0]; // d_conv - 1 + n_t
|
||||
const int nr = src0->ne[1]; // d_inner
|
||||
const int n_t = dst->ne[1]; // tokens per sequence
|
||||
const int n_s = dst->ne[2]; // number of sequences in the batch
|
||||
|
||||
GGML_ASSERT((nr*n_t) + (nc*nr*n_kv) == ggml_nelements(dst));
|
||||
GGML_ASSERT( dst->ne[0] == nr);
|
||||
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src1->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src2->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src3->nb[0] == sizeof(int32_t));
|
||||
GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
|
||||
// for use with the destination state offset between sequences
|
||||
GGML_ASSERT(src2->nb[2] == src2->ne[1]*src2->ne[0]*sizeof(float));
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
@@ -15812,76 +15808,29 @@ static void ggml_compute_forward_ssm_conv_f32(
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
const int ir = ir1 - ir0;
|
||||
|
||||
if (n_kv > 1) {
|
||||
// multiple sequences means it's hard to know when it's the first time a state is read,
|
||||
// so copy them all over to the destination, just to be sure.
|
||||
for (int i3 = 0; i3 < n_kv; ++i3) {
|
||||
float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]));
|
||||
float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + i3*(src2->nb[2]) + nr*n_t*sizeof(float));
|
||||
// can't use memcpy because of d_conv vs d_conv - 1
|
||||
for (int i3 = 0; i3 < n_s; ++i3) {
|
||||
for (int i2 = 0; i2 < n_t; ++i2) {
|
||||
// {d_conv - 1 + n_t, d_inner, n_seqs}
|
||||
// sliding window
|
||||
const float * s = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i2*(src0->nb[0]) + i3*(src0->nb[2])); // {d_conv, d_inner, n_s}
|
||||
const float * c = (const float *) ((const char *) src1->data + ir0*(src1->nb[1])); // {d_conv, d_inner}
|
||||
float * x = (float *) ((char *) dst->data + ir0*(dst->nb[0]) + i2*(dst->nb[1]) + i3*(dst->nb[2])); // {d_inner, n_t, n_s}
|
||||
|
||||
// TODO: transpose the output for smaller strides for big batches?
|
||||
// d_inner
|
||||
for (int i1 = 0; i1 < ir; ++i1) {
|
||||
for (int i0 = 0; i0 < nc - 1; ++i0) {
|
||||
// copy s0 to last (d_conv - 1) columns of s
|
||||
s[1 + i0 + i1*nc] = s0[i0 + i1*(nc - 1)];
|
||||
// rowwise dot product
|
||||
// NOTE: not using ggml_vec_dot_f32, because its sum is in double precision
|
||||
float sumf = 0.0f;
|
||||
|
||||
// d_conv
|
||||
for (int i0 = 0; i0 < nc; ++i0) {
|
||||
sumf += s[i0 + i1*ncs] * c[i0 + i1*nc];
|
||||
}
|
||||
x[i1] = sumf;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int i2 = 0; i2 < n_t; ++i2) {
|
||||
int32_t * sq = (int32_t *) ((char *) src3->data + i2*(src3->nb[1])); // {n_kv, n_tokens}
|
||||
float * x = (float *) ((char *) dst->data + ir0*sizeof(float) + i2*(nr*sizeof(float))); // {d_inner, n_tokens}
|
||||
float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + sq[0]*(src2->nb[2]) + nr*n_t*sizeof(float)); // {d_conv, d_inner, n_kv}
|
||||
float * s0; // {d_conv - 1, d_inner, n_kv}
|
||||
float * x0 = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
|
||||
float * c = (float *) ((char *) src2->data + ir0*(src2->nb[1])); // {d_conv, d_inner}
|
||||
int ne0s0;
|
||||
|
||||
GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv);
|
||||
|
||||
// avoid needing to copy the state for the first token
|
||||
if (i2 == 0) {
|
||||
s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_conv - 1, d_inner, n_kv}
|
||||
ne0s0 = src0->ne[0];
|
||||
} else {
|
||||
// the source is the last (d_conv - 1) columns of the destination
|
||||
s0 = s + 1;
|
||||
ne0s0 = nc;
|
||||
}
|
||||
|
||||
// d_inner
|
||||
for (int i1 = 0; i1 < ir; ++i1) {
|
||||
// shift state left
|
||||
for (int i0 = 0; i0 < nc - 1; ++i0) {
|
||||
s[i0 + i1*nc] = s0[i0 + i1*ne0s0];
|
||||
}
|
||||
// insert x on the last column
|
||||
s[(nc - 1) + i1*nc] = x0[i1];
|
||||
}
|
||||
|
||||
// handle copies when there are multiple output states
|
||||
for (int i3 = 1; i3 < n_kv; ++i3) {
|
||||
int32_t seq = sq[i3];
|
||||
if (0 <= seq && seq < n_kv) {
|
||||
float * s1 = s + (seq - sq[0])*nc*nr;
|
||||
memcpy(s1, s, nc*ir*sizeof(float));
|
||||
} else {
|
||||
// stop at negative or too big seq_ids
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// it seems a little faster when this is separate from the state shift
|
||||
for (int i1 = 0; i1 < ir; ++i1) {
|
||||
// rowwise dot product
|
||||
float sumf = 0.0f;
|
||||
for (int i0 = 0; i0 < nc; ++i0) {
|
||||
int i = i0 + i1*nc;
|
||||
sumf += s[i] * c[i];
|
||||
}
|
||||
x[i1] = sumf;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_ssm_conv(
|
||||
@@ -15910,15 +15859,14 @@ static void ggml_compute_forward_ssm_scan_f32(
|
||||
const struct ggml_tensor * src3 = dst->src[3]; // A
|
||||
const struct ggml_tensor * src4 = dst->src[4]; // B
|
||||
const struct ggml_tensor * src5 = dst->src[5]; // C
|
||||
const struct ggml_tensor * src6 = dst->src[6]; // sq
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int64_t nc = src0->ne[0]; // d_state
|
||||
const int64_t nr = src0->ne[1]; // d_inner
|
||||
const int64_t n_t = src1->ne[1]; // number of tokens in the batch
|
||||
const int64_t n_kv = src0->ne[2]; // max number of sequences in the batch
|
||||
const int64_t nc = src0->ne[0]; // d_state
|
||||
const int64_t nr = src0->ne[1]; // d_inner
|
||||
const int64_t n_t = src1->ne[1]; // number of tokens per sequence
|
||||
const int64_t n_s = src0->ne[2]; // number of sequences in the batch
|
||||
|
||||
GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst));
|
||||
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||
@@ -15927,12 +15875,12 @@ static void ggml_compute_forward_ssm_scan_f32(
|
||||
GGML_ASSERT(src3->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src4->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src5->nb[0] == sizeof(float));
|
||||
// required for the dot product between s and C, and when copying the states
|
||||
// required for the dot product between s and C
|
||||
GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
|
||||
// required for per-sequence offsets for states
|
||||
GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float));
|
||||
// required to get correct offset for state destination (i.e. src1->nb[2])
|
||||
GGML_ASSERT(src1->nb[2] == src1->ne[0]*src1->ne[1]*sizeof(float));
|
||||
// required to get correct offset for state destination (i.e. src1->nb[3])
|
||||
GGML_ASSERT(src1->nb[3] == src1->ne[0]*src1->ne[1]*src1->ne[2]*sizeof(float));
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
@@ -15942,64 +15890,36 @@ static void ggml_compute_forward_ssm_scan_f32(
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
const int ir = ir1 - ir0;
|
||||
|
||||
if (n_kv > 1) {
|
||||
// it's hard to know if the source states have already been copied
|
||||
// when there are multiple, so copy them already.
|
||||
for (int i3 = 0; i3 < n_kv; ++i3) {
|
||||
float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]));
|
||||
float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[2]);
|
||||
memcpy(s, s0, nc*ir*sizeof(float));
|
||||
}
|
||||
}
|
||||
for (int i3 = 0; i3 < n_s; ++i3) {
|
||||
for (int i2 = 0; i2 < n_t; ++i2) {
|
||||
const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
|
||||
const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
|
||||
const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
|
||||
const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
|
||||
const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
|
||||
const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
|
||||
float * y = (float *) ((char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
|
||||
float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}
|
||||
|
||||
for (int i2 = 0; i2 < n_t; ++i2) {
|
||||
int32_t * sq = (int32_t *) ((char *) src6->data + i2*(src6->nb[1])); // {n_kv, n_tokens}
|
||||
float * y = (float *) ((char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
|
||||
float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2]) + src1->nb[2]); // {d_state, d_inner, n_kv}
|
||||
float * s0;
|
||||
float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
|
||||
float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1])); // {d_inner, n_tokens}
|
||||
float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
|
||||
float * B = (float *) ((char *) src4->data + i2*(src4->nb[1])); // {d_state, n_tokens}
|
||||
float * C = (float *) ((char *) src5->data + i2*(src5->nb[1])); // {d_state, n_tokens}
|
||||
// use the output as the source for the next token-wise iterations
|
||||
if (i2 > 0) { s0 = s; }
|
||||
|
||||
GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv);
|
||||
|
||||
// avoid needing to copy the state for the first token
|
||||
if (i2 == 0) {
|
||||
s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_state, d_inner, n_kv}
|
||||
} else {
|
||||
// otherwise the source is the same as the destination
|
||||
s0 = s;
|
||||
}
|
||||
|
||||
// d_inner
|
||||
for (int i1 = 0; i1 < ir; ++i1) {
|
||||
// ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
|
||||
float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
|
||||
float x_dt = x[i1] * dt_soft_plus;
|
||||
float sumf = 0.0f;
|
||||
// d_state
|
||||
for (int i0 = 0; i0 < nc; ++i0) {
|
||||
int i = i0 + i1*nc;
|
||||
// state = prev_state * dA + dB * x
|
||||
float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
|
||||
// y = rowwise_dotprod(state, C)
|
||||
sumf += state * C[i0];
|
||||
s[i] = state;
|
||||
}
|
||||
y[i1] = sumf;
|
||||
}
|
||||
|
||||
// handle copies when there are multiple output states
|
||||
for (int i3 = 1; i3 < n_kv; ++i3) {
|
||||
int32_t seq = sq[i3];
|
||||
if (0 <= seq && seq < n_kv) {
|
||||
float * s1 = s + (seq - sq[0])*nc*nr;
|
||||
memcpy(s1, s, nc*ir*sizeof(float));
|
||||
} else {
|
||||
// stop at negative or too big seq_ids
|
||||
break;
|
||||
// d_inner
|
||||
for (int i1 = 0; i1 < ir; ++i1) {
|
||||
// ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
|
||||
float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
|
||||
float x_dt = x[i1] * dt_soft_plus;
|
||||
float sumf = 0.0f;
|
||||
// d_state
|
||||
for (int i0 = 0; i0 < nc; ++i0) {
|
||||
int i = i0 + i1*nc;
|
||||
// state = prev_state * dA + dB * x
|
||||
float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
|
||||
// y = rowwise_dotprod(state, C)
|
||||
sumf += state * C[i0];
|
||||
s[i] = state;
|
||||
}
|
||||
y[i1] = sumf;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
24
ggml/src/vulkan-shaders/acc.comp
Normal file
24
ggml/src/vulkan-shaders/acc.comp
Normal file
@@ -0,0 +1,24 @@
|
||||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
#include "generic_binary_head.comp"
|
||||
|
||||
void main() {
|
||||
const uint idx = gl_GlobalInvocationID.x;
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint offset = p.param3;
|
||||
const uint src1_i = idx - offset;
|
||||
const uint oz = src1_i / p.nb02;
|
||||
const uint oy = (src1_i - (oz * p.nb02)) / p.nb01;
|
||||
const uint ox = src1_i % p.nb01;
|
||||
|
||||
if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) {
|
||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
|
||||
} else {
|
||||
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -368,6 +368,10 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
|
||||
string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [] {
|
||||
string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
|
||||
}));
|
||||
|
||||
@@ -130,6 +130,7 @@ class Keys:
|
||||
INNER_SIZE = "{arch}.ssm.inner_size"
|
||||
STATE_SIZE = "{arch}.ssm.state_size"
|
||||
TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
|
||||
DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"
|
||||
|
||||
class Tokenizer:
|
||||
MODEL = "tokenizer.ggml.model"
|
||||
@@ -1372,6 +1373,7 @@ KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL
|
||||
KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE
|
||||
KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE
|
||||
KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
|
||||
KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS
|
||||
|
||||
# tokenization
|
||||
KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
|
||||
|
||||
@@ -730,6 +730,9 @@ class GGUFWriter:
|
||||
def add_ssm_time_step_rank(self, value: int) -> None:
|
||||
self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value)
|
||||
|
||||
def add_ssm_dt_b_c_rms(self, value: bool) -> None:
|
||||
self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value)
|
||||
|
||||
def add_tokenizer_model(self, model: str) -> None:
|
||||
self.add_string(Keys.Tokenizer.MODEL, model)
|
||||
|
||||
|
||||
@@ -511,6 +511,9 @@ extern "C" {
|
||||
// to the decoder to start generating output sequence. For other models, it returns -1.
|
||||
LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
|
||||
|
||||
// Returns true if the model is recurrent (like Mamba, RWKV, etc.)
|
||||
LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
|
||||
|
||||
// Returns 0 on success
|
||||
LLAMA_API uint32_t llama_model_quantize(
|
||||
const char * fname_inp,
|
||||
|
||||
@@ -31,11 +31,17 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
|
||||
|
||||
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||
if (search.empty()) {
|
||||
return; // Avoid infinite loop if 'search' is an empty string
|
||||
return;
|
||||
}
|
||||
std::string builder;
|
||||
builder.reserve(s.length());
|
||||
size_t pos = 0;
|
||||
while ((pos = s.find(search, pos)) != std::string::npos) {
|
||||
s.replace(pos, search.length(), replace);
|
||||
pos += replace.length();
|
||||
size_t last_pos = 0;
|
||||
while ((pos = s.find(search, last_pos)) != std::string::npos) {
|
||||
builder.append(s, last_pos, pos - last_pos);
|
||||
builder.append(replace);
|
||||
last_pos = pos + search.length();
|
||||
}
|
||||
builder.append(s, last_pos, std::string::npos);
|
||||
s = std::move(builder);
|
||||
}
|
||||
|
||||
@@ -321,6 +321,21 @@ private:
|
||||
|
||||
// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
|
||||
|
||||
template<typename T, typename Container = std::vector<T>, typename Compare = std::less<typename Container::value_type>>
|
||||
class llama_priority_queue : public std::priority_queue<T, Container, Compare> {
|
||||
public:
|
||||
using std::priority_queue<T, Container, Compare>::priority_queue;
|
||||
|
||||
T pop_move() {
|
||||
T item = std::move(this->c.front());
|
||||
std::pop_heap(this->c.begin(), this->c.end(), this->comp);
|
||||
this->c.pop_back();
|
||||
return item;
|
||||
}
|
||||
|
||||
void pop() = delete;
|
||||
};
|
||||
|
||||
struct llm_bigram_bpe {
|
||||
struct comparator {
|
||||
bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
|
||||
@@ -329,7 +344,7 @@ struct llm_bigram_bpe {
|
||||
};
|
||||
|
||||
using queue_storage = std::vector<llm_bigram_bpe>;
|
||||
using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
|
||||
using queue = llama_priority_queue<llm_bigram_bpe, queue_storage, comparator>;
|
||||
llm_symbol::index left;
|
||||
llm_symbol::index right;
|
||||
std::string text;
|
||||
@@ -520,8 +535,7 @@ struct llm_tokenizer_bpe {
|
||||
|
||||
// build token(s)
|
||||
while (!work_queue.empty()) {
|
||||
auto bigram = work_queue.top();
|
||||
work_queue.pop();
|
||||
auto bigram = work_queue.pop_move();
|
||||
|
||||
auto & left_symbol = symbols[bigram.left];
|
||||
auto & right_symbol = symbols[bigram.right];
|
||||
|
||||
1537
src/llama.cpp
1537
src/llama.cpp
File diff suppressed because it is too large
Load Diff
@@ -1652,19 +1652,20 @@ struct test_flash_attn_ext : public test_case {
|
||||
const bool mask; // use mask
|
||||
|
||||
const float max_bias; // ALiBi
|
||||
const float logit_softcap; // Gemma 2
|
||||
|
||||
const ggml_type type_KV;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR7(hs, nh, kv, nb, mask, max_bias, type_KV);
|
||||
return VARS_TO_STR8(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV);
|
||||
}
|
||||
|
||||
double max_nmse_err() override {
|
||||
return 5e-4;
|
||||
}
|
||||
|
||||
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
|
||||
: hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), type_KV(type_KV) {}
|
||||
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
|
||||
: hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV));
|
||||
@@ -1673,7 +1674,7 @@ struct test_flash_attn_ext : public test_case {
|
||||
ggml_tensor * k = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
|
||||
ggml_tensor * v = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
|
||||
ggml_tensor * m = mask ? ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr;
|
||||
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias);
|
||||
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias, logit_softcap);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
@@ -2437,11 +2438,14 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
for (bool mask : { true, false } ) {
|
||||
for (float max_bias : { 0.0f, 8.0f }) {
|
||||
if (!mask && max_bias > 0.0f) continue;
|
||||
for (int nh : { 32, }) {
|
||||
for (int kv : { 512, 1024, }) {
|
||||
for (int nb : { 1, 2, 4, 8, }) {
|
||||
for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
|
||||
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, type_KV));
|
||||
for (float logit_softcap : {0.0f, 10.0f}) {
|
||||
if (hs != 128 && logit_softcap != 0.0f) continue;
|
||||
for (int nh : { 32, }) {
|
||||
for (int kv : { 512, 1024, }) {
|
||||
for (int nb : { 1, 2, 4, 8, }) {
|
||||
for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
|
||||
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2483,7 +2487,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
}
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
return false;
|
||||
}
|
||||
|
||||
static void usage(char ** argv) {
|
||||
|
||||
@@ -14,7 +14,7 @@ MODELS_REPO_URL=https://huggingface.co/ggml-org/$MODELS_REPO
|
||||
# Clone the Hugging Face repository if the directory does not exist
|
||||
if [ ! -d "$MODELS_REPO" ]; then
|
||||
echo "Cloning the Hugging Face repository..."
|
||||
git clone $MODELS_REPO_URL
|
||||
git clone $MODELS_REPO_URL --depth 1
|
||||
else
|
||||
echo "Repository already exists. Skipping clone."
|
||||
fi
|
||||
|
||||
Reference in New Issue
Block a user