mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-05 13:53:23 +02:00
Compare commits
28 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bf5a81df37 | ||
|
|
88954f7fbd | ||
|
|
ed67bcb24f | ||
|
|
eddcb5238b | ||
|
|
be6d7c0791 | ||
|
|
4b0eff3df5 | ||
|
|
8a4bad50a8 | ||
|
|
68504f0970 | ||
|
|
f19bf99c01 | ||
|
|
3a7ac5300a | ||
|
|
96952e7181 | ||
|
|
79167d9e49 | ||
|
|
b115105f05 | ||
|
|
de280085e7 | ||
|
|
b841d07408 | ||
|
|
64cf50a0ed | ||
|
|
938943cdbf | ||
|
|
751fcfc6c3 | ||
|
|
46e47417aa | ||
|
|
e7e6487ba0 | ||
|
|
063d99ad11 | ||
|
|
081fe431aa | ||
|
|
d94c6e0ccb | ||
|
|
566daa5a5b | ||
|
|
6f11a83e4e | ||
|
|
e093dd2382 | ||
|
|
50e05353e8 | ||
|
|
628154492a |
@@ -14,7 +14,9 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
||||
echo "GGML_SYCL_F16 is set" && \
|
||||
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
|
||||
fi && \
|
||||
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
|
||||
echo "Building with static libs" && \
|
||||
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
|
||||
${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
|
||||
cmake --build build --config Release --target llama-cli
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
|
||||
|
||||
@@ -14,6 +14,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
||||
echo "GGML_SYCL_F16 is set" && \
|
||||
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
|
||||
fi && \
|
||||
echo "Building with dynamic libs" && \
|
||||
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
|
||||
cmake --build build --config Release --target llama-server
|
||||
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
"llama-embedding"
|
||||
"llama-server"
|
||||
"llama-quantize"
|
||||
"llama-train-text-from-scratch"
|
||||
];
|
||||
mkApp = name: {
|
||||
type = "app";
|
||||
|
||||
@@ -13,8 +13,6 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
|
||||
./llama-quantize "$@"
|
||||
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
|
||||
./llama-cli "$@"
|
||||
elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
|
||||
./llama-finetune "$@"
|
||||
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
|
||||
echo "Converting PTH to GGML..."
|
||||
for i in `ls $1/$2/ggml-model-f16.bin*`; do
|
||||
@@ -36,8 +34,6 @@ else
|
||||
echo " ex: --outtype f16 \"/models/7B/\" "
|
||||
echo " --quantize (-q): Optimize with quantization process ggml"
|
||||
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
|
||||
echo " --finetune (-f): Run finetune command to create a lora finetune of the model"
|
||||
echo " See documentation for finetune for command-line parameters"
|
||||
echo " --all-in-one (-a): Execute --convert & --quantize"
|
||||
echo " ex: \"/models/\" 7B"
|
||||
echo " --server (-s): Run a model on the server"
|
||||
|
||||
@@ -1,12 +1,17 @@
|
||||
# Pull requests
|
||||
# Pull requests (for contributors)
|
||||
|
||||
- Always squash-merge the PR before merging
|
||||
- Use the following format for your final commit: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
|
||||
- Test your changes:
|
||||
- Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
|
||||
- Execute [the full CI locally on your machine](ci/README.md) before publishing
|
||||
- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
|
||||
- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your conveience
|
||||
- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
|
||||
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
|
||||
|
||||
# Pull requests (for collaborators)
|
||||
|
||||
- Squash-merge PRs
|
||||
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
|
||||
- Optionally, pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
|
||||
|
||||
# Coding guidelines
|
||||
|
||||
|
||||
64
Makefile
64
Makefile
@@ -11,7 +11,6 @@ BUILD_TARGETS = \
|
||||
llama-embedding \
|
||||
llama-eval-callback \
|
||||
llama-export-lora \
|
||||
llama-finetune \
|
||||
llama-gbnf-validator \
|
||||
llama-gguf \
|
||||
llama-gguf-hash \
|
||||
@@ -37,7 +36,6 @@ BUILD_TARGETS = \
|
||||
llama-simple \
|
||||
llama-speculative \
|
||||
llama-tokenize \
|
||||
llama-train-text-from-scratch \
|
||||
llama-vdot \
|
||||
llama-cvector-generator \
|
||||
tests/test-c.o
|
||||
@@ -64,13 +62,13 @@ TEST_TARGETS = \
|
||||
tests/test-tokenizer-1-spm
|
||||
|
||||
# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
|
||||
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
||||
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
|
||||
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
|
||||
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm
|
||||
retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm
|
||||
|
||||
# Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
|
||||
# We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
|
||||
LEGACY_TARGETS_BUILD = main quantize perplexity embedding server finetune
|
||||
LEGACY_TARGETS_BUILD = main quantize perplexity embedding server
|
||||
|
||||
# Deprecation aliases
|
||||
ifdef LLAMA_CUBLAS
|
||||
@@ -876,6 +874,9 @@ OBJ_GGML += \
|
||||
|
||||
OBJ_LLAMA = \
|
||||
src/llama.o \
|
||||
src/llama-vocab.o \
|
||||
src/llama-grammar.o \
|
||||
src/llama-sampling.o \
|
||||
src/unicode.o \
|
||||
src/unicode-data.o
|
||||
|
||||
@@ -1055,6 +1056,10 @@ src/unicode-data.o: \
|
||||
|
||||
src/llama.o: \
|
||||
src/llama.cpp \
|
||||
src/llama-impl.h \
|
||||
src/llama-vocab.h \
|
||||
src/llama-grammar.h \
|
||||
src/llama-sampling.h \
|
||||
src/unicode.h \
|
||||
include/llama.h \
|
||||
ggml/include/ggml-cuda.h \
|
||||
@@ -1064,6 +1069,29 @@ src/llama.o: \
|
||||
ggml/include/ggml-backend.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
src/llama-vocab.o: \
|
||||
src/llama-vocab.cpp \
|
||||
src/llama-vocab.h \
|
||||
src/llama-impl.h \
|
||||
include/llama.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
src/llama-grammar.o: \
|
||||
src/llama-grammar.cpp \
|
||||
src/llama-grammar.h \
|
||||
src/llama-impl.h \
|
||||
src/llama-vocab.h \
|
||||
src/llama-sampling.h \
|
||||
include/llama.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
src/llama-sampling.o: \
|
||||
src/llama-sampling.cpp \
|
||||
src/llama-sampling.h \
|
||||
src/llama-impl.h \
|
||||
include/llama.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
$(LIB_LLAMA): \
|
||||
$(OBJ_LLAMA) \
|
||||
$(LIB_GGML)
|
||||
@@ -1266,11 +1294,6 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
|
||||
$(OBJ_GGML) $(OBJ_LLAMA)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
@@ -1286,13 +1309,8 @@ llama-baby-llama: examples/baby-llama/baby-llama.cpp \
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-finetune: examples/finetune/finetune.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-export-lora: examples/export-lora/export-lora.cpp \
|
||||
$(OBJ_GGML) common/log.h
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
@@ -1439,7 +1457,7 @@ run-benchmark-matmult: llama-benchmark-matmult
|
||||
.PHONY: run-benchmark-matmult swift
|
||||
|
||||
tests/test-llama-grammar: tests/test-llama-grammar.cpp \
|
||||
$(OBJ_GGML) $(OBJ_COMMON) src/unicode.o src/unicode-data.o
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
@@ -1548,7 +1566,7 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
|
||||
# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
|
||||
#
|
||||
# Mark legacy binary targets as .PHONY so that they are always checked.
|
||||
.PHONY: main quantize perplexity embedding server finetune
|
||||
.PHONY: main quantize perplexity embedding server
|
||||
|
||||
# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
|
||||
# Eventually we will want to remove these target from building all the time.
|
||||
@@ -1591,13 +1609,3 @@ ifneq (,$(wildcard embedding))
|
||||
@echo " Remove the 'embedding' binary to remove this warning."
|
||||
@echo "#########"
|
||||
endif
|
||||
|
||||
finetune: examples/deprecation-warning/deprecation-warning.cpp
|
||||
ifneq (,$(wildcard finetune))
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
@echo "#########"
|
||||
@echo "WARNING: The 'finetune' binary is deprecated. Please use 'llama-finetune' instead."
|
||||
@echo " Remove the 'finetune' binary to remove this warning."
|
||||
@echo "#########"
|
||||
endif
|
||||
|
||||
@@ -4,6 +4,9 @@ import PackageDescription
|
||||
|
||||
var sources = [
|
||||
"src/llama.cpp",
|
||||
"src/llama-vocab.cpp",
|
||||
"src/llama-grammar.cpp",
|
||||
"src/llama-sampling.cpp",
|
||||
"src/unicode.cpp",
|
||||
"src/unicode-data.cpp",
|
||||
"ggml/src/ggml.c",
|
||||
|
||||
@@ -138,6 +138,7 @@ Typically finetunes of the base models below are supported as well.
|
||||
|
||||
Unless otherwise noted these projects are open-source with permissive licensing:
|
||||
|
||||
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
|
||||
- [iohub/collama](https://github.com/iohub/coLLaMA)
|
||||
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
|
||||
- [nat/openplayground](https://github.com/nat/openplayground)
|
||||
@@ -181,6 +182,9 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
||||
|
||||
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
|
||||
|
||||
**Games:**
|
||||
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
|
||||
|
||||
## Demo
|
||||
|
||||
<details>
|
||||
|
||||
@@ -694,11 +694,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
|
||||
return true;
|
||||
}
|
||||
if (arg == "--lora-base") {
|
||||
CHECK_ARG
|
||||
params.lora_base = argv[i];
|
||||
return true;
|
||||
}
|
||||
if (arg == "--control-vector") {
|
||||
CHECK_ARG
|
||||
params.control_vectors.push_back({ 1.0f, argv[i], });
|
||||
@@ -1274,6 +1269,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
CHECK_ARG
|
||||
params.out_file = argv[i];
|
||||
params.cvector_outfile = argv[i];
|
||||
params.lora_outfile = argv[i];
|
||||
return true;
|
||||
}
|
||||
if (arg == "-ofreq" || arg == "--output-frequency") {
|
||||
@@ -1583,9 +1579,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||
options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
|
||||
"advanced option to override model metadata by key. may be specified multiple times.\n"
|
||||
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
|
||||
options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" });
|
||||
options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
|
||||
options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
|
||||
options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" });
|
||||
options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
|
||||
options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
|
||||
"note: this argument can be repeated to add multiple control vectors" });
|
||||
options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
|
||||
@@ -1676,6 +1671,13 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||
options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
|
||||
options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
|
||||
|
||||
options.push_back({ "export-lora" });
|
||||
options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
|
||||
options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
|
||||
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
|
||||
options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
|
||||
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
|
||||
|
||||
printf("usage: %s [options]\n", argv[0]);
|
||||
|
||||
for (const auto & o : options) {
|
||||
@@ -2721,7 +2723,7 @@ std::string llama_chat_format_single(const struct llama_model * model,
|
||||
const llama_chat_msg & new_msg,
|
||||
bool add_ass) {
|
||||
std::ostringstream ss;
|
||||
auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
|
||||
auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
|
||||
std::vector<llama_chat_msg> chat_new(past_msg);
|
||||
// if the past_msg ends with a newline, we must preserve it in the formatted version
|
||||
if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
|
||||
@@ -3166,7 +3168,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
||||
}
|
||||
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
|
||||
}
|
||||
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
|
||||
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
||||
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
|
||||
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
|
||||
|
||||
@@ -128,7 +128,6 @@ struct gpt_params {
|
||||
|
||||
// TODO: avoid tuple, use struct
|
||||
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
|
||||
std::string lora_base = ""; // base model path for the lora adapter
|
||||
|
||||
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
|
||||
|
||||
@@ -255,6 +254,8 @@ struct gpt_params {
|
||||
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
|
||||
|
||||
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
||||
|
||||
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
|
||||
};
|
||||
|
||||
void gpt_params_handle_hf_token(gpt_params & params);
|
||||
|
||||
@@ -330,7 +330,7 @@ static llama_token llama_sampling_sample_impl(
|
||||
llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
|
||||
|
||||
// Apply grammar constraints to the single token
|
||||
llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
|
||||
llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array);
|
||||
|
||||
// Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
|
||||
bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
|
||||
@@ -421,7 +421,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
|
||||
|
||||
// apply grammar checks before sampling logic
|
||||
if (apply_grammar && ctx_sampling->grammar != NULL) {
|
||||
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
|
||||
llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p);
|
||||
}
|
||||
|
||||
return cur_p;
|
||||
@@ -455,6 +455,6 @@ void llama_sampling_accept(
|
||||
ctx_sampling->prev.push_back(id);
|
||||
|
||||
if (ctx_sampling->grammar != NULL && apply_grammar) {
|
||||
llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
|
||||
llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -239,6 +239,10 @@ class Model:
|
||||
self.gguf_writer.add_expert_used_count(n_experts_used)
|
||||
logger.info(f"gguf: experts used count = {n_experts_used}")
|
||||
|
||||
if (head_dim := self.hparams.get("head_dim")) is not None:
|
||||
self.gguf_writer.add_key_length(head_dim)
|
||||
self.gguf_writer.add_value_length(head_dim)
|
||||
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
logger.info(f"gguf: file type = {self.ftype}")
|
||||
|
||||
@@ -590,9 +594,15 @@ class Model:
|
||||
if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
|
||||
# ref: https://huggingface.co/core42/jais-13b
|
||||
res = "jais"
|
||||
if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
|
||||
# ref: https://huggingface.co/WisdomShell/CodeShell-7B
|
||||
res = "codeshell"
|
||||
if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
|
||||
# ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
|
||||
res = "tekken"
|
||||
if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
|
||||
# ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
|
||||
res = "smollm"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -733,7 +743,7 @@ class Model:
|
||||
added_tokens_json = json.load(f)
|
||||
for key in added_tokens_json:
|
||||
token_id = added_tokens_json[key]
|
||||
if (token_id >= vocab_size):
|
||||
if token_id >= vocab_size:
|
||||
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
||||
continue
|
||||
|
||||
@@ -1481,7 +1491,12 @@ class LlamaModel(Model):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
|
||||
|
||||
if "head_dim" in hparams:
|
||||
rope_dim = hparams["head_dim"]
|
||||
else:
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
@@ -1996,7 +2011,7 @@ class Phi3MiniModel(Model):
|
||||
|
||||
for key in added_tokens_json:
|
||||
token_id = added_tokens_json[key]
|
||||
if (token_id >= vocab_size):
|
||||
if token_id >= vocab_size:
|
||||
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
||||
continue
|
||||
|
||||
@@ -2069,10 +2084,11 @@ class Phi3MiniModel(Model):
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dims)
|
||||
self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
|
||||
|
||||
# write rope scaling for long context (128k) model
|
||||
rope_scaling = self.find_hparam(['rope_scaling'], True)
|
||||
if (rope_scaling is None):
|
||||
if rope_scaling is None:
|
||||
return
|
||||
|
||||
scale = max_pos_embds / orig_max_pos_embds
|
||||
@@ -2719,7 +2735,7 @@ class JinaBertV2Model(BertModel):
|
||||
|
||||
yield name, data
|
||||
|
||||
def set_vocab(self, *args, **kwargs):
|
||||
def set_vocab(self):
|
||||
tokenizer_class = 'BertTokenizer'
|
||||
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_class = json.load(f)['tokenizer_class']
|
||||
@@ -2867,7 +2883,7 @@ class ArcticModel(Model):
|
||||
added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
|
||||
for token_id, token_json in added_tokens_decoder.items():
|
||||
token_id = int(token_id)
|
||||
if (token_id >= vocab_size):
|
||||
if token_id >= vocab_size:
|
||||
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
||||
continue
|
||||
|
||||
@@ -3116,7 +3132,7 @@ class T5Model(Model):
|
||||
added_tokens_json = json.load(f)
|
||||
for key in added_tokens_json:
|
||||
token_id = added_tokens_json[key]
|
||||
if (token_id >= vocab_size):
|
||||
if token_id >= vocab_size:
|
||||
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
||||
continue
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@ class TOKENIZER_TYPE(IntEnum):
|
||||
|
||||
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
|
||||
# will be updated with time - contributions welcome
|
||||
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
|
||||
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
|
||||
|
||||
if len(sys.argv) == 2:
|
||||
token = sys.argv[1]
|
||||
@@ -91,7 +91,9 @@ models = [
|
||||
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
|
||||
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
|
||||
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
|
||||
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
|
||||
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
|
||||
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
|
||||
]
|
||||
|
||||
|
||||
@@ -100,8 +102,8 @@ def download_file_with_auth(url, token, save_path):
|
||||
response = sess.get(url, headers=headers)
|
||||
response.raise_for_status()
|
||||
os.makedirs(os.path.dirname(save_path), exist_ok=True)
|
||||
with open(save_path, 'wb') as f:
|
||||
f.write(response.content)
|
||||
with open(save_path, 'wb') as downloaded_file:
|
||||
downloaded_file.write(response.content)
|
||||
logger.info(f"File {save_path} downloaded successfully")
|
||||
|
||||
|
||||
@@ -160,7 +162,7 @@ for model in models:
|
||||
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
|
||||
continue # Skip to the next model if the tokenizer can't be loaded
|
||||
|
||||
chktok = tokenizer.encode(chktxt)
|
||||
chktok = tokenizer.encode(CHK_TXT)
|
||||
chkhsh = sha256(str(chktok).encode()).hexdigest()
|
||||
|
||||
logger.info(f"model: {name}")
|
||||
@@ -192,7 +194,7 @@ src_func = f"""
|
||||
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
|
||||
# use in llama.cpp to implement the same pre-tokenizer
|
||||
|
||||
chktxt = {repr(chktxt)}
|
||||
chktxt = {repr(CHK_TXT)}
|
||||
|
||||
chktok = tokenizer.encode(chktxt)
|
||||
chkhsh = sha256(str(chktok).encode()).hexdigest()
|
||||
@@ -288,7 +290,7 @@ tests = [
|
||||
"333333333",
|
||||
"Cửa Việt", # llama-bpe fails on this
|
||||
" discards",
|
||||
chktxt,
|
||||
CHK_TXT,
|
||||
]
|
||||
|
||||
# write the tests to ./models/ggml-vocab-{name}.gguf.inp
|
||||
|
||||
@@ -132,6 +132,10 @@ class Tensor:
|
||||
|
||||
|
||||
class GGMLModel:
|
||||
|
||||
file_format: GGMLFormat
|
||||
format_version: int
|
||||
|
||||
def __init__(self):
|
||||
self.hyperparameters = None
|
||||
self.vocab = None
|
||||
@@ -290,7 +294,7 @@ class GGMLToGGUF:
|
||||
if self.vocab_override is not None:
|
||||
vo = self.vocab_override
|
||||
logger.info('* Adding vocab item(s)')
|
||||
for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
|
||||
for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
|
||||
tokens.append(vbytes)
|
||||
scores.append(score)
|
||||
toktypes.append(ttype)
|
||||
|
||||
@@ -293,31 +293,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
|
||||
```sh
|
||||
./build/bin/llama-ls-sycl-device
|
||||
```
|
||||
A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
|
||||
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
|
||||
```
|
||||
found 6 SYCL devices:
|
||||
found 2 SYCL devices:
|
||||
|
||||
| | | |Compute |Max compute|Max work|Max sub| |
|
||||
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|
||||
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
|
||||
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
|
||||
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
|
||||
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
|
||||
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
|
||||
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
|
||||
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
|
||||
```
|
||||
|
||||
| Attribute | Note |
|
||||
|------------------------|-------------------------------------------------------------|
|
||||
| compute capability 1.3 | Level-zero driver/runtime, recommended |
|
||||
| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |
|
||||
|
||||
4. Launch inference
|
||||
|
||||
There are two device selection modes:
|
||||
|
||||
- Single device: Use one device target specified by the user.
|
||||
- Multiple devices: Automatically select the devices with the same largest Max compute-units.
|
||||
- Multiple devices: Automatically choose the devices with the same backend.
|
||||
|
||||
In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
|
||||
|
||||
| Device selection | Parameter |
|
||||
|------------------|----------------------------------------|
|
||||
@@ -474,33 +469,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
|
||||
build\bin\ls-sycl-device.exe
|
||||
```
|
||||
|
||||
The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following:
|
||||
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
|
||||
```
|
||||
found 6 SYCL devices:
|
||||
found 2 SYCL devices:
|
||||
| | | |Compute |Max compute|Max work|Max sub| |
|
||||
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|
||||
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
|
||||
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
|
||||
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
|
||||
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
|
||||
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
|
||||
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
|
||||
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
|
||||
|
||||
```
|
||||
|
||||
| Attribute | Note |
|
||||
|------------------------|-----------------------------------------------------------|
|
||||
| compute capability 1.3 | Level-zero running time, recommended |
|
||||
| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |
|
||||
|
||||
|
||||
4. Launch inference
|
||||
|
||||
There are two device selection modes:
|
||||
|
||||
- Single device: Use one device assigned by user.
|
||||
- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
|
||||
- Single device: Use one device assigned by user. Default device id is 0.
|
||||
- Multiple devices: Automatically choose the devices with the same backend.
|
||||
|
||||
In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
|
||||
|
||||
| Device selection | Parameter |
|
||||
|------------------|----------------------------------------|
|
||||
|
||||
@@ -16,7 +16,7 @@ In order to build llama.cpp you have four different options.
|
||||
make
|
||||
```
|
||||
|
||||
- On Windows:
|
||||
- On Windows (x86/x64 only, arm64 requires cmake):
|
||||
|
||||
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
|
||||
2. Extract `w64devkit` on your pc.
|
||||
@@ -60,6 +60,17 @@ In order to build llama.cpp you have four different options.
|
||||
cmake -B build -G "Xcode"
|
||||
cmake --build build --config Debug
|
||||
```
|
||||
- Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
|
||||
- Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
|
||||
- Tab Workload: Desktop-development with C++
|
||||
- Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
|
||||
- Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
|
||||
- For Windows on ARM (arm64, WoA) build with:
|
||||
```bash
|
||||
cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
|
||||
cmake --build build-arm64-windows-llvm-release
|
||||
```
|
||||
Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
|
||||
|
||||
- Using `gmake` (FreeBSD):
|
||||
|
||||
|
||||
@@ -21,7 +21,6 @@ else()
|
||||
add_subdirectory(embedding)
|
||||
add_subdirectory(eval-callback)
|
||||
add_subdirectory(export-lora)
|
||||
add_subdirectory(finetune)
|
||||
add_subdirectory(gbnf-validator)
|
||||
add_subdirectory(gguf-hash)
|
||||
add_subdirectory(gguf-split)
|
||||
@@ -53,5 +52,4 @@ else()
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(speculative)
|
||||
add_subdirectory(tokenize)
|
||||
add_subdirectory(train-text-from-scratch)
|
||||
endif()
|
||||
|
||||
@@ -13,7 +13,6 @@ Please update all scripts and workflows to use the new binary names.
|
||||
| server | llama-server |
|
||||
| llama-bench | llama-bench |
|
||||
| embedding | llama-embedding |
|
||||
| finetune | llama-finetune |
|
||||
| quantize | llama-quantize |
|
||||
| tokenize | llama-tokenize |
|
||||
| export-lora | llama-export-lora |
|
||||
@@ -45,7 +44,6 @@ Please update all scripts and workflows to use the new binary names.
|
||||
| save-load-state | llama-save-load-state |
|
||||
| simple | llama-simple |
|
||||
| speculative | llama-speculative |
|
||||
| train-text-from-scratch | llama-train-text-from-scratch |
|
||||
| vdot | llama-vdot |
|
||||
| tests/test-c.o | tests/test-c.o |
|
||||
|
||||
|
||||
@@ -6,12 +6,11 @@ Apply LORA adapters to base model and export the resulting model.
|
||||
usage: llama-export-lora [options]
|
||||
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
-m FNAME, --model-base FNAME model path from which to load base model (default '')
|
||||
-o FNAME, --model-out FNAME path to save exported model (default '')
|
||||
-l FNAME, --lora FNAME apply LoRA adapter
|
||||
-s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S
|
||||
-t N, --threads N number of threads to use during computation (default: 4)
|
||||
-m, --model model path from which to load base model (default '')
|
||||
--lora FNAME path to LoRA adapter (can be repeated to use multiple adapters)
|
||||
--lora-scaled FNAME S path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)
|
||||
-t, --threads N number of threads to use during computation (default: 4)
|
||||
-o, --output FNAME output file (default: 'ggml-lora-merged-f16.gguf')
|
||||
```
|
||||
|
||||
For example:
|
||||
@@ -20,7 +19,15 @@ For example:
|
||||
./bin/llama-export-lora \
|
||||
-m open-llama-3b-v2-q8_0.gguf \
|
||||
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
|
||||
-l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
|
||||
--lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf
|
||||
```
|
||||
|
||||
Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters.
|
||||
Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters:
|
||||
|
||||
```bash
|
||||
./bin/llama-export-lora \
|
||||
-m your_base_model.gguf \
|
||||
-o your_merged_model.gguf \
|
||||
--lora-scaled lora_task_A.gguf 0.5 \
|
||||
--lora-scaled lora_task_B.gguf 0.5
|
||||
```
|
||||
|
||||
@@ -1,465 +1,406 @@
|
||||
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <fstream>
|
||||
|
||||
struct lora_info {
|
||||
std::string filename;
|
||||
static bool g_verbose = false;
|
||||
|
||||
static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
|
||||
int id = gguf_find_key(ctx_gguf, key.c_str());
|
||||
return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
|
||||
}
|
||||
|
||||
static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) {
|
||||
int id = gguf_find_key(ctx_gguf, key.c_str());
|
||||
return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id);
|
||||
}
|
||||
|
||||
static void zeros(std::ofstream & file, size_t n) {
|
||||
char zero = 0;
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
file.write(&zero, 1);
|
||||
}
|
||||
}
|
||||
|
||||
static std::string ggml_ne_string(const ggml_tensor * t) {
|
||||
std::string str;
|
||||
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
||||
str += std::to_string(t->ne[i]);
|
||||
if (i + 1 < GGML_MAX_DIMS) {
|
||||
str += ", ";
|
||||
}
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ ctx_ggml,
|
||||
};
|
||||
struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
|
||||
if (!ctx_gguf) {
|
||||
throw std::runtime_error("failed to load input GGUF from " + fname);
|
||||
}
|
||||
return ctx_gguf;
|
||||
}
|
||||
|
||||
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||
std::string result;
|
||||
for (size_t pos = 0; ; pos += search.length()) {
|
||||
auto new_pos = s.find(search, pos);
|
||||
if (new_pos == std::string::npos) {
|
||||
result += s.substr(pos, s.size() - pos);
|
||||
break;
|
||||
}
|
||||
result += s.substr(pos, new_pos - pos) + replace;
|
||||
pos = new_pos;
|
||||
}
|
||||
s = std::move(result);
|
||||
}
|
||||
|
||||
struct file_input {
|
||||
struct ggml_context * ctx_meta = nullptr;
|
||||
struct gguf_context * ctx_gguf = nullptr;
|
||||
std::ifstream f_in;
|
||||
std::map<std::string, ggml_tensor *> tensors;
|
||||
float alpha;
|
||||
float scale;
|
||||
|
||||
file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) {
|
||||
if (!f_in.is_open()) {
|
||||
throw std::runtime_error("failed to open input gguf from " + fname);
|
||||
}
|
||||
|
||||
ctx_gguf = load_gguf(fname, &ctx_meta);
|
||||
alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha");
|
||||
printf("%s: loaded gguf from %s\n", __func__, fname.c_str());
|
||||
|
||||
for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) {
|
||||
std::string name(cur->name);
|
||||
tensors[name] = cur;
|
||||
if (g_verbose) {
|
||||
printf("%s: %s\n", __func__, cur->name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ggml_tensor * get_tensor(std::string name) {
|
||||
if (tensors.find(name) == tensors.end()) {
|
||||
return nullptr;
|
||||
}
|
||||
return tensors[name];
|
||||
}
|
||||
|
||||
void read_tensor_data(std::string name, std::vector<uint8_t> & buf) {
|
||||
if (tensors.find(name) == tensors.end()) {
|
||||
throw std::runtime_error("cannot find tensor with name: " + name);
|
||||
}
|
||||
auto len = ggml_nbytes(tensors[name]);
|
||||
if (buf.size() < len) {
|
||||
buf.resize(len);
|
||||
}
|
||||
auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file
|
||||
auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
|
||||
f_in.seekg(offset);
|
||||
f_in.read((char* )buf.data(), len);
|
||||
}
|
||||
|
||||
~file_input() {
|
||||
gguf_free(ctx_gguf);
|
||||
ggml_free(ctx_meta);
|
||||
}
|
||||
};
|
||||
|
||||
struct export_lora_params {
|
||||
std::string fn_model_base;
|
||||
std::string fn_model_out;
|
||||
std::vector<struct lora_info> lora;
|
||||
struct lora_merge_ctx {
|
||||
// input base model + adapters
|
||||
file_input base_model;
|
||||
std::vector<std::unique_ptr<file_input>> adapters;
|
||||
|
||||
// for computing merged tensor
|
||||
int n_threads;
|
||||
};
|
||||
ggml_backend_t backend = nullptr;
|
||||
ggml_gallocr_t allocr = nullptr;
|
||||
std::vector<uint8_t> read_buf;
|
||||
|
||||
struct lora_data {
|
||||
struct lora_info info;
|
||||
std::vector<uint8_t> data;
|
||||
struct ggml_context * ctx;
|
||||
// output file
|
||||
struct gguf_context * ctx_out;
|
||||
struct ggml_context * ctx_out_ggml;
|
||||
std::ofstream fout;
|
||||
|
||||
uint32_t lora_r;
|
||||
uint32_t lora_alpha;
|
||||
};
|
||||
lora_merge_ctx(
|
||||
std::string & base_fname,
|
||||
std::vector<std::tuple<std::string, float>> & lora_files,
|
||||
std::string & outfile,
|
||||
int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
|
||||
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
|
||||
|
||||
struct llama_file {
|
||||
// use FILE * so we don't have to re-open the file to mmap
|
||||
FILE * fp;
|
||||
size_t size;
|
||||
if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) {
|
||||
throw std::runtime_error("split model is not yet supported");
|
||||
}
|
||||
|
||||
llama_file(const char * fname, const char * mode) {
|
||||
fp = std::fopen(fname, mode);
|
||||
if (fp == NULL) {
|
||||
size = 0;
|
||||
for (auto lora_inp : lora_files) {
|
||||
auto fname = std::get<0>(lora_inp);
|
||||
auto scale = std::get<1>(lora_inp);
|
||||
std::unique_ptr<file_input> adapter(new file_input(fname, scale));
|
||||
check_metadata_lora(adapter.get());
|
||||
adapters.push_back(std::move(adapter));
|
||||
}
|
||||
|
||||
ctx_out = gguf_init_empty();
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
ctx_out_ggml = ggml_init(params);
|
||||
backend = ggml_backend_cpu_init();
|
||||
allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
|
||||
}
|
||||
|
||||
void check_metadata_lora(file_input * adapter) {
|
||||
auto general_type = get_kv_str(adapter->ctx_gguf, "general.type");
|
||||
if (general_type != "adapter") {
|
||||
throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
|
||||
}
|
||||
|
||||
auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type");
|
||||
if (adapter_type != "lora") {
|
||||
throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
|
||||
}
|
||||
|
||||
auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture");
|
||||
auto general_arch_lora = get_kv_str(adapter->ctx_gguf, "general.architecture");
|
||||
if (general_arch_base != general_arch_lora) {
|
||||
throw std::runtime_error("model arch and LoRA arch mismatch");
|
||||
}
|
||||
}
|
||||
|
||||
ggml_type get_out_tensor_type(struct ggml_tensor * t) {
|
||||
if (t->type == GGML_TYPE_F32) {
|
||||
return GGML_TYPE_F32;
|
||||
} else {
|
||||
seek(0, SEEK_END);
|
||||
size = tell();
|
||||
seek(0, SEEK_SET);
|
||||
return GGML_TYPE_F16;
|
||||
}
|
||||
}
|
||||
|
||||
size_t tell() const {
|
||||
#ifdef _WIN32
|
||||
__int64 ret = _ftelli64(fp);
|
||||
#else
|
||||
long ret = std::ftell(fp);
|
||||
#endif
|
||||
GGML_ASSERT(ret != -1); // this really shouldn't fail
|
||||
return (size_t) ret;
|
||||
}
|
||||
void run_merge() {
|
||||
// prepare metadata
|
||||
gguf_set_kv(ctx_out, base_model.ctx_gguf);
|
||||
// output is forced to f16 for now
|
||||
gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16);
|
||||
|
||||
void seek(size_t offset, int whence) {
|
||||
#ifdef _WIN32
|
||||
int ret = _fseeki64(fp, (__int64) offset, whence);
|
||||
#else
|
||||
int ret = std::fseek(fp, (long) offset, whence);
|
||||
#endif
|
||||
GGML_ASSERT(ret == 0); // same
|
||||
}
|
||||
|
||||
void read_raw(void * ptr, size_t size) {
|
||||
if (size == 0) {
|
||||
return;
|
||||
// check if all lora adapters have the same tensors
|
||||
// TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777
|
||||
static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once.";
|
||||
if (adapters.size() > 1) {
|
||||
for (size_t i = 1; i < adapters.size(); ++i) {
|
||||
if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) {
|
||||
throw std::runtime_error(err_no_subset_adapter);
|
||||
}
|
||||
for (auto & it : adapters[i]->tensors) {
|
||||
if (adapters[0]->get_tensor(it.first) == nullptr) {
|
||||
throw std::runtime_error(err_no_subset_adapter);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
errno = 0;
|
||||
std::size_t ret = std::fread(ptr, size, 1, fp);
|
||||
if (ferror(fp)) {
|
||||
die_fmt("read error: %s", strerror(errno));
|
||||
|
||||
// if true, this tensor can be lora-merged. if false, we skip merging and just copy data to outfile
|
||||
std::vector<std::pair<struct ggml_tensor *, bool>> base_tensors;
|
||||
for (auto & it : base_model.tensors) {
|
||||
bool t_a = true;
|
||||
bool t_b = true;
|
||||
for (auto & adapter : adapters) {
|
||||
t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a");
|
||||
t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
|
||||
}
|
||||
auto base_tensor = it.second;
|
||||
struct ggml_tensor * out_tensor;
|
||||
if (!t_a && !t_b) {
|
||||
// only copy
|
||||
out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
|
||||
ggml_set_name(out_tensor, base_tensor->name);
|
||||
base_tensors.push_back(std::make_pair(out_tensor, false));
|
||||
} else if (t_a && t_b) {
|
||||
// need merging
|
||||
out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
|
||||
out_tensor->type = get_out_tensor_type(base_tensor);
|
||||
ggml_set_name(out_tensor, base_tensor->name);
|
||||
base_tensors.push_back(std::make_pair(out_tensor, true));
|
||||
} else {
|
||||
throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
|
||||
}
|
||||
gguf_add_tensor(ctx_out, out_tensor);
|
||||
}
|
||||
if (ret != 1) {
|
||||
die("unexpectedly reached end of file");
|
||||
|
||||
// placeholder for the meta data
|
||||
{
|
||||
size_t meta_size = gguf_get_meta_size(ctx_out);
|
||||
zeros(fout, meta_size);
|
||||
}
|
||||
}
|
||||
|
||||
std::uint32_t read_u32() {
|
||||
std::uint32_t ret;
|
||||
read_raw(&ret, sizeof(ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::string read_string(std::uint32_t len) {
|
||||
std::vector<char> chars(len);
|
||||
read_raw(chars.data(), len);
|
||||
return std::string(chars.data(), len);
|
||||
}
|
||||
|
||||
void write_raw(const void * ptr, size_t size) {
|
||||
if (size == 0) {
|
||||
return;
|
||||
// process base model tensors
|
||||
size_t n_merged = 0;
|
||||
for (auto & it : base_tensors) {
|
||||
if (it.second) {
|
||||
merge_tensor(it.first);
|
||||
n_merged++;
|
||||
} else {
|
||||
copy_tensor(it.first);
|
||||
}
|
||||
}
|
||||
errno = 0;
|
||||
size_t ret = std::fwrite(ptr, size, 1, fp);
|
||||
if (ret != 1) {
|
||||
die_fmt("write error: %s", strerror(errno));
|
||||
|
||||
// write output metadata
|
||||
{
|
||||
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
|
||||
gguf_get_meta_data(ctx_out, data.data());
|
||||
fout.seekp(0);
|
||||
fout.write((const char *)data.data(), data.size());
|
||||
}
|
||||
|
||||
printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
|
||||
printf("%s : wrote %ld tensors to output file\n", __func__, base_tensors.size());
|
||||
}
|
||||
|
||||
void write_u32(std::uint32_t val) {
|
||||
write_raw(&val, sizeof(val));
|
||||
void copy_tensor(struct ggml_tensor * base) {
|
||||
printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
|
||||
size_t len = ggml_nbytes(base);
|
||||
base_model.read_tensor_data(base->name, read_buf);
|
||||
fout.write((char* )read_buf.data(), len);
|
||||
zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
|
||||
}
|
||||
|
||||
bool eof() {
|
||||
return tell() >= size;
|
||||
}
|
||||
void merge_tensor(struct ggml_tensor * base) {
|
||||
std::string name_base(base->name);
|
||||
std::string name_lora_a = name_base + ".lora_a";
|
||||
std::string name_lora_b = name_base + ".lora_b";
|
||||
|
||||
~llama_file() {
|
||||
if (fp) {
|
||||
std::fclose(fp);
|
||||
printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
|
||||
|
||||
// context for input tensor
|
||||
std::vector<struct ggml_tensor *> inp_a(adapters.size());
|
||||
std::vector<struct ggml_tensor *> inp_b(adapters.size());
|
||||
struct ggml_init_params params {
|
||||
/*.mem_size =*/ ggml_tensor_overhead()*(1+adapters.size()*2),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
struct ggml_context * ctx = ggml_init(params);
|
||||
|
||||
// alloc tensors
|
||||
struct ggml_tensor * inp = ggml_dup_tensor(ctx, base);
|
||||
for (size_t i = 0; i < adapters.size(); ++i) {
|
||||
auto t_a = adapters[i]->get_tensor(name_lora_a);
|
||||
auto t_b = adapters[i]->get_tensor(name_lora_b);
|
||||
inp_a[i] = ggml_dup_tensor(ctx, t_a);
|
||||
inp_b[i] = ggml_dup_tensor(ctx, t_b);
|
||||
}
|
||||
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
|
||||
|
||||
// load data to backend buffer
|
||||
base_model.read_tensor_data(name_base, read_buf);
|
||||
ggml_backend_tensor_set(inp, read_buf.data(), 0, ggml_nbytes(inp));
|
||||
for (size_t i = 0; i < adapters.size(); ++i) {
|
||||
adapters[i]->read_tensor_data(name_lora_a, read_buf);
|
||||
ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
|
||||
adapters[i]->read_tensor_data(name_lora_b, read_buf);
|
||||
ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i]));
|
||||
}
|
||||
|
||||
// build graph
|
||||
struct ggml_cgraph * gf;
|
||||
{
|
||||
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
|
||||
static std::vector<uint8_t> buf(buf_size);
|
||||
struct ggml_init_params params0 = {
|
||||
/*.mem_size =*/ buf_size,
|
||||
/*.mem_buffer =*/ buf.data(),
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
struct ggml_context * ctx0 = ggml_init(params0);
|
||||
gf = ggml_new_graph(ctx0);
|
||||
struct ggml_tensor * cur = inp;
|
||||
for (size_t i = 0; i < adapters.size(); ++i) {
|
||||
struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, inp_a[i]));
|
||||
struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, inp_b[i]);
|
||||
// scale
|
||||
const float alpha = adapters[i]->alpha;
|
||||
const float rank = (float) inp_b[i]->ne[0];
|
||||
const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
|
||||
delta = ggml_scale(ctx0, delta, scale);
|
||||
cur = ggml_add(ctx0, cur, delta);
|
||||
printf("%s : + merging from adapter[%ld]\n", __func__, i);
|
||||
printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
|
||||
}
|
||||
cur = ggml_cast(ctx0, cur, get_out_tensor_type(base));
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
ggml_free(ctx0);
|
||||
}
|
||||
|
||||
// compute
|
||||
{
|
||||
ggml_gallocr_alloc_graph(allocr, gf);
|
||||
ggml_backend_cpu_set_n_threads(backend, n_threads);
|
||||
ggml_backend_graph_compute(backend, gf);
|
||||
}
|
||||
|
||||
// write data to output file
|
||||
{
|
||||
auto result = gf->nodes[gf->n_nodes - 1];
|
||||
size_t len = ggml_nbytes(result);
|
||||
if (read_buf.size() < len) {
|
||||
read_buf.resize(len);
|
||||
}
|
||||
ggml_backend_tensor_get(result, read_buf.data(), 0, len);
|
||||
fout.write((char* )read_buf.data(), len);
|
||||
zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
|
||||
}
|
||||
|
||||
ggml_free(ctx);
|
||||
ggml_backend_buffer_free(buffer);
|
||||
}
|
||||
|
||||
~lora_merge_ctx() {
|
||||
ggml_gallocr_free(allocr);
|
||||
ggml_backend_free(backend);
|
||||
gguf_free(ctx_out);
|
||||
ggml_free(ctx_out_ggml);
|
||||
}
|
||||
};
|
||||
|
||||
static struct export_lora_params get_default_export_lora_params() {
|
||||
struct export_lora_params result;
|
||||
result.fn_model_base = "";
|
||||
result.fn_model_out = "";
|
||||
result.n_threads = GGML_DEFAULT_N_THREADS;
|
||||
return result;
|
||||
}
|
||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
|
||||
static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) {
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -m FNAME, --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base.c_str());
|
||||
fprintf(stderr, " -o FNAME, --model-out FNAME path to save exported model (default '%s')\n", params->fn_model_out.c_str());
|
||||
fprintf(stderr, " -l FNAME, --lora FNAME apply LoRA adapter\n");
|
||||
fprintf(stderr, " -s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S\n");
|
||||
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params->n_threads);
|
||||
}
|
||||
|
||||
static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) {
|
||||
bool invalid_param = false;
|
||||
std::string arg;
|
||||
struct export_lora_params default_params = get_default_export_lora_params();
|
||||
const std::string arg_prefix = "--";
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
|
||||
std::replace(arg.begin(), arg.end(), '_', '-');
|
||||
}
|
||||
|
||||
if (arg == "-m" || arg == "--model-base") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params->fn_model_base = argv[i];
|
||||
} else if (arg == "-o" || arg == "--model-out") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params->fn_model_out = argv[i];
|
||||
} else if (arg == "-l" || arg == "--lora") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
struct lora_info lora;
|
||||
lora.filename = argv[i];
|
||||
lora.scale = 1.0f;
|
||||
params->lora.push_back(lora);
|
||||
} else if (arg == "-s" || arg == "--lora-scaled") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
struct lora_info lora;
|
||||
lora.filename = argv[i];
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
lora.scale = std::stof(argv[i]);
|
||||
params->lora.push_back(lora);
|
||||
} else if (arg == "-t" || arg == "--threads") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params->n_threads = std::stoi(argv[i]);
|
||||
if (params->n_threads <= 0) {
|
||||
params->n_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
export_lora_print_usage(argc, argv, &default_params);
|
||||
exit(0);
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str());
|
||||
export_lora_print_usage(argc, argv, &default_params);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if (params->fn_model_base == default_params.fn_model_base) {
|
||||
fprintf(stderr, "error: please specify a filename for model-base.\n");
|
||||
export_lora_print_usage(argc, argv, &default_params);
|
||||
exit(1);
|
||||
}
|
||||
if (params->fn_model_out == default_params.fn_model_out) {
|
||||
fprintf(stderr, "error: please specify a filename for model-out.\n");
|
||||
export_lora_print_usage(argc, argv, &default_params);
|
||||
exit(1);
|
||||
}
|
||||
if (invalid_param) {
|
||||
fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str());
|
||||
export_lora_print_usage(argc, argv, &default_params);
|
||||
exit(1);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static void free_lora(struct lora_data * lora) {
|
||||
if (lora->ctx != NULL) {
|
||||
ggml_free(lora->ctx);
|
||||
}
|
||||
delete lora;
|
||||
}
|
||||
|
||||
static struct lora_data * load_lora(struct lora_info * info) {
|
||||
struct lora_data * result = new struct lora_data;
|
||||
result->info = *info;
|
||||
result->ctx = NULL;
|
||||
result->lora_r = 1;
|
||||
result->lora_alpha = 1;
|
||||
|
||||
struct llama_file file(info->filename.c_str(), "rb");
|
||||
if (file.fp == NULL) {
|
||||
fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
|
||||
info->filename.c_str());
|
||||
free_lora(result);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct ggml_init_params params_ggml;
|
||||
params_ggml.mem_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE;
|
||||
params_ggml.mem_buffer = NULL;
|
||||
params_ggml.no_alloc = true;
|
||||
result->ctx = ggml_init(params_ggml);
|
||||
|
||||
uint32_t magic = file.read_u32();
|
||||
if (magic != LLAMA_FILE_MAGIC_GGLA) {
|
||||
die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
|
||||
}
|
||||
uint32_t version = file.read_u32();
|
||||
if (version != 1) {
|
||||
die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str());
|
||||
}
|
||||
result->lora_r = file.read_u32();
|
||||
result->lora_alpha = file.read_u32();
|
||||
// read tensor infos from file
|
||||
std::vector<char> name_buf;
|
||||
std::vector<struct ggml_tensor *> tensors;
|
||||
std::vector<size_t> tensors_offset;
|
||||
size_t total_nbytes_pad = 0;
|
||||
while(!file.eof()) {
|
||||
int64_t ne[4] = {1,1,1,1};
|
||||
uint32_t n_dims = file.read_u32();
|
||||
uint32_t namelen = file.read_u32();
|
||||
uint32_t type = file.read_u32();
|
||||
for (uint32_t k = 0; k < n_dims; ++k) {
|
||||
ne[k] = (int64_t)file.read_u32();
|
||||
}
|
||||
name_buf.clear();
|
||||
name_buf.resize(namelen + 1, '\0');
|
||||
file.read_raw(name_buf.data(), namelen);
|
||||
file.seek((0-file.tell()) & 31, SEEK_CUR);
|
||||
size_t offset = file.tell();
|
||||
struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne);
|
||||
ggml_set_name(tensor, name_buf.data());
|
||||
size_t nbytes = ggml_nbytes(tensor);
|
||||
size_t nbytes_pad = ggml_nbytes_pad(tensor);
|
||||
total_nbytes_pad += nbytes_pad;
|
||||
tensors.push_back(tensor);
|
||||
tensors_offset.push_back(offset);
|
||||
file.seek(nbytes, SEEK_CUR);
|
||||
}
|
||||
// read tensor data
|
||||
result->data.resize(total_nbytes_pad);
|
||||
size_t data_offset = 0;
|
||||
for (size_t i = 0; i < tensors.size(); ++i) {
|
||||
struct ggml_tensor * tensor = tensors[i];
|
||||
size_t offset = tensors_offset[i];
|
||||
size_t nbytes = ggml_nbytes(tensor);
|
||||
size_t nbytes_pad = ggml_nbytes_pad(tensor);
|
||||
file.seek(offset, SEEK_SET);
|
||||
tensor->data = result->data.data() + data_offset;
|
||||
file.read_raw(tensor->data, nbytes);
|
||||
data_offset += nbytes_pad;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
static struct ggml_cgraph * build_graph_lora(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * tensor,
|
||||
struct ggml_tensor * lora_a,
|
||||
struct ggml_tensor * lora_b,
|
||||
float scaling
|
||||
) {
|
||||
struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
|
||||
if (scaling != 1.0f) {
|
||||
ab = ggml_scale(ctx, ab, scaling);
|
||||
}
|
||||
struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
|
||||
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx);
|
||||
ggml_build_forward_expand (gf, res);
|
||||
return gf;
|
||||
}
|
||||
|
||||
static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) {
|
||||
if (lora->ctx == NULL) {
|
||||
return false;
|
||||
}
|
||||
std::string name = ggml_get_name(tensor);
|
||||
std::string name_a = name + std::string(".loraA");
|
||||
std::string name_b = name + std::string(".loraB");
|
||||
struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str());
|
||||
struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str());
|
||||
if (lora_a == NULL || lora_b == NULL) {
|
||||
return false;
|
||||
}
|
||||
|
||||
float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
|
||||
params.mem_buffer = NULL;
|
||||
params.no_alloc = true;
|
||||
struct ggml_context * ctx = NULL;
|
||||
struct ggml_gallocr * alloc = NULL;
|
||||
struct ggml_cgraph * gf = NULL;
|
||||
|
||||
ctx = ggml_init(params);
|
||||
alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
|
||||
gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
|
||||
|
||||
ggml_gallocr_alloc_graph(alloc, gf);
|
||||
|
||||
struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
|
||||
static std::vector<uint8_t> data_work;
|
||||
data_work.resize(cplan.work_size);
|
||||
cplan.work_data = data_work.data();
|
||||
|
||||
ggml_graph_compute(gf, &cplan);
|
||||
|
||||
ggml_gallocr_free(alloc);
|
||||
ggml_free(ctx);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void export_lora(struct export_lora_params * params) {
|
||||
// load all loras
|
||||
std::vector<struct lora_data *> loras;
|
||||
for (size_t i = 0; i < params->lora.size(); ++i) {
|
||||
struct lora_data * lora = load_lora(¶ms->lora[i]);
|
||||
if (lora != NULL) {
|
||||
loras.push_back(lora);
|
||||
}
|
||||
}
|
||||
if (loras.size() == 0) {
|
||||
fprintf(stderr, "warning: no lora adapters will be applied.\n");
|
||||
}
|
||||
|
||||
// open input file
|
||||
struct llama_file fin(params->fn_model_base.c_str(), "rb");
|
||||
if (!fin.fp) {
|
||||
die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str());
|
||||
}
|
||||
|
||||
// open base model gguf, read tensors without their data
|
||||
struct ggml_context * ctx_in;
|
||||
struct gguf_init_params params_gguf;
|
||||
params_gguf.no_alloc = true;
|
||||
params_gguf.ctx = &ctx_in;
|
||||
struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);
|
||||
|
||||
// create new gguf
|
||||
struct gguf_context * gguf_out = gguf_init_empty();
|
||||
|
||||
// copy meta data from base model: kv and tensors
|
||||
gguf_set_kv(gguf_out, gguf_in);
|
||||
int n_tensors = gguf_get_n_tensors(gguf_in);
|
||||
for (int i=0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(gguf_in, i);
|
||||
struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
|
||||
gguf_add_tensor(gguf_out, tensor);
|
||||
}
|
||||
|
||||
// create output file
|
||||
struct llama_file fout(params->fn_model_out.c_str(), "wb");
|
||||
if (!fout.fp) {
|
||||
die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str());
|
||||
}
|
||||
|
||||
// write gguf meta data
|
||||
std::vector<uint8_t> meta;
|
||||
meta.resize(gguf_get_meta_size(gguf_out));
|
||||
gguf_get_meta_data(gguf_out, meta.data());
|
||||
fout.write_raw(meta.data(), meta.size());
|
||||
|
||||
std::vector<uint8_t> data;
|
||||
std::vector<uint8_t> padding;
|
||||
for (int i=0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(gguf_in, i);
|
||||
struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
|
||||
|
||||
// read tensor data
|
||||
data.resize(ggml_nbytes(tensor));
|
||||
tensor->data = data.data();
|
||||
size_t offset = gguf_get_tensor_offset(gguf_in, i);
|
||||
fin.seek(offset + meta.size(), SEEK_SET);
|
||||
fin.read_raw(data.data(), data.size());
|
||||
|
||||
// apply all loras
|
||||
for (size_t k = 0; k < loras.size(); ++k) {
|
||||
apply_lora(tensor, loras[k], params->n_threads);
|
||||
}
|
||||
|
||||
// write tensor data + padding
|
||||
padding.clear();
|
||||
padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0);
|
||||
|
||||
GGML_ASSERT(fout.tell() == offset + meta.size());
|
||||
// fout.seek(offset + meta.size(), SEEK_SET);
|
||||
fout.write_raw(data.data(), data.size());
|
||||
fout.write_raw(padding.data(), padding.size());
|
||||
|
||||
if (i % 2 == 0) {
|
||||
printf(".");
|
||||
}
|
||||
}
|
||||
printf("\nexample usage:\n");
|
||||
printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
|
||||
printf("\nNOTE: output model is F16\n");
|
||||
printf("\n");
|
||||
|
||||
// close gguf
|
||||
gguf_free(gguf_out);
|
||||
gguf_free(gguf_in);
|
||||
|
||||
// free loras
|
||||
for (size_t i = 0; i < loras.size(); ++i) {
|
||||
free_lora(loras[i]);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
struct export_lora_params params = get_default_export_lora_params();
|
||||
gpt_params params;
|
||||
|
||||
if (!export_lora_params_parse(argc, argv, ¶ms)) {
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
export_lora(¶ms);
|
||||
g_verbose = (params.verbosity == 1);
|
||||
try {
|
||||
lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads);
|
||||
ctx.run_merge();
|
||||
} catch (const std::exception & err) {
|
||||
fprintf(stderr, "%s\n", err.what());
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
printf("done, output file is %s\n", params.lora_outfile.c_str());
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
set(TARGET llama-finetune)
|
||||
add_executable(${TARGET} finetune.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
@@ -1,90 +0,0 @@
|
||||
# finetune
|
||||
|
||||
Basic usage instructions:
|
||||
|
||||
```bash
|
||||
# get training data
|
||||
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
|
||||
|
||||
# finetune LORA adapter
|
||||
./bin/llama-finetune \
|
||||
--model-base open-llama-3b-v2-q8_0.gguf \
|
||||
--checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
|
||||
--checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
|
||||
--lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \
|
||||
--train-data "shakespeare.txt" \
|
||||
--save-every 10 \
|
||||
--threads 6 --adam-iter 30 --batch 4 --ctx 64 \
|
||||
--use-checkpointing
|
||||
|
||||
# predict
|
||||
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
|
||||
```
|
||||
|
||||
**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
|
||||
The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output.
|
||||
So in above example after 10 iterations these files will be written:
|
||||
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
|
||||
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
|
||||
- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin
|
||||
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
|
||||
|
||||
After 10 more iterations:
|
||||
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf
|
||||
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
|
||||
- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin
|
||||
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
|
||||
|
||||
Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
|
||||
|
||||
llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
|
||||
These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.
|
||||
|
||||
In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.
|
||||
|
||||
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
|
||||
|
||||
```bash
|
||||
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
|
||||
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
|
||||
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
|
||||
```
|
||||
|
||||
You can change how strong each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`.
|
||||
|
||||
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
|
||||
|
||||
```bash
|
||||
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
|
||||
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
|
||||
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
|
||||
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
|
||||
```
|
||||
|
||||
The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
|
||||
|
||||
Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
|
||||
If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
|
||||
|
||||
The default LORA rank can be specified with `--lora-r N`.
|
||||
The LORA rank can be configured for each model tensor type separately with these command line options:
|
||||
|
||||
```bash
|
||||
--lora-r N LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default 4)
|
||||
--rank-att-norm N LORA rank for attention norm tensor (default 1)
|
||||
--rank-ffn-norm N LORA rank for feed-forward norm tensor (default 1)
|
||||
--rank-out-norm N LORA rank for output norm tensor (default 1)
|
||||
--rank-tok-embd N LORA rank for token embeddings tensor (default 4)
|
||||
--rank-out N LORA rank for output tensor (default 4)
|
||||
--rank-wq N LORA rank for wq tensor (default 4)
|
||||
--rank-wk N LORA rank for wk tensor (default 4)
|
||||
--rank-wv N LORA rank for wv tensor (default 4)
|
||||
--rank-wo N LORA rank for wo tensor (default 4)
|
||||
--rank-ffn_gate N LORA rank for ffn_gate tensor (default 4)
|
||||
--rank-ffn_down N LORA rank for ffn_down tensor (default 4)
|
||||
--rank-ffn_up N LORA rank for ffn_up tensor (default 4)
|
||||
```
|
||||
|
||||
The LORA rank of 'norm' tensors should always be 1.
|
||||
|
||||
To see all available options use `llama-finetune --help`.
|
||||
@@ -1,487 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# finetune checkpoint --> gguf conversion
|
||||
|
||||
import argparse
|
||||
import gguf
|
||||
import struct
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
# gguf constants
|
||||
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
|
||||
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
|
||||
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
|
||||
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
|
||||
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
|
||||
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
|
||||
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
|
||||
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
|
||||
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
|
||||
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
|
||||
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
|
||||
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
|
||||
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
|
||||
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
|
||||
|
||||
LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
|
||||
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
|
||||
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
|
||||
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
|
||||
|
||||
LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
|
||||
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
|
||||
LLM_KV_TRAINING_TYPE = "training.type"
|
||||
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
|
||||
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
|
||||
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
|
||||
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
|
||||
|
||||
LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"
|
||||
LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"
|
||||
LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"
|
||||
LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"
|
||||
LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"
|
||||
LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"
|
||||
LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"
|
||||
LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"
|
||||
LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"
|
||||
LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"
|
||||
LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"
|
||||
LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"
|
||||
|
||||
class Tensor:
|
||||
def __init__(self, dtype='f', ne=None):
|
||||
if ne is None:
|
||||
ne = []
|
||||
self.dtype = dtype
|
||||
self.ne = ne
|
||||
self.nbytes = 0
|
||||
if self.dtype == 'f':
|
||||
if len(self.ne) == 0:
|
||||
self.nbytes = 0
|
||||
else:
|
||||
self.nbytes = int(np.prod(self.ne)) * 4
|
||||
else:
|
||||
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
||||
|
||||
def load(self, data, offset):
|
||||
nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
assert(nd == len(self.ne))
|
||||
ne = []
|
||||
for d in range(nd):
|
||||
n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
ne.append(n)
|
||||
|
||||
if tuple(ne) != tuple(self.ne):
|
||||
raise ValueError(f"Tensor.load: Expected number of elements {str(self.ne)} does not match what is read from file {str(ne)}")
|
||||
|
||||
if self.dtype == 'f':
|
||||
assert(dtype == 0)
|
||||
else:
|
||||
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
||||
|
||||
self.name = bytes(data[offset:offset+namelen]); offset += namelen
|
||||
# 32-byte alignment
|
||||
offset += (0 - offset) & 31
|
||||
self.data = data[offset:offset+self.nbytes]
|
||||
offset += self.nbytes
|
||||
return offset
|
||||
|
||||
def max_storage_size(self):
|
||||
result = 0
|
||||
result += 4 # nd
|
||||
result += 4 # namelen
|
||||
result += 4 # dtype
|
||||
result += len(self.ne)*8 # ne
|
||||
result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
|
||||
result += 31 # 32-byte alignment
|
||||
result += self.nbytes
|
||||
return result
|
||||
|
||||
def save_gguf(self, gguf_writer, name):
|
||||
gguf_writer.add_tensor(
|
||||
name=name,
|
||||
tensor=self.data,
|
||||
raw_shape=np.array(list(reversed(self.ne))),
|
||||
raw_dtype=gguf.GGMLQuantizationType.F32)
|
||||
|
||||
class OptimizationContext:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def load(self, data, offset):
|
||||
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
|
||||
offset += 4
|
||||
|
||||
if self.version != 1:
|
||||
raise ValueError('Invalid version of optimization context in checkpoint file')
|
||||
|
||||
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
|
||||
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
|
||||
|
||||
self.adam_m = Tensor('f', [self.nx])
|
||||
self.adam_v = Tensor('f', [self.nx])
|
||||
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
|
||||
self.lbfgs_x = Tensor('f', [self.nx])
|
||||
self.lbfgs_xp = Tensor('f', [self.nx])
|
||||
self.lbfgs_g = Tensor('f', [self.nx])
|
||||
self.lbfgs_gp = Tensor('f', [self.nx])
|
||||
self.lbfgs_d = Tensor('f', [self.nx])
|
||||
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
|
||||
# forgot to save type in version 1:
|
||||
# guess self.type from number of remaining bytes
|
||||
size_type_0 = 12 + sum([t.max_storage_size() for t in
|
||||
[self.adam_m, self.adam_v]
|
||||
+([self.adam_pf] if (self.past > 0) else [])])
|
||||
size_type_1 = 24 + sum([t.max_storage_size() for t in
|
||||
[self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
|
||||
self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
|
||||
self.lbfgs_lmal, self.lbfgs_lmys,
|
||||
self.lbfgs_lms, self.lbfgs_lmy]
|
||||
+([self.lbfgs_pf] if (self.past > 0) else [])])
|
||||
# due to alignment padding the size might not by exact
|
||||
# but the difference in size for both types is significant,
|
||||
# so we can just use whichever is closest
|
||||
remaining = len(data) - offset
|
||||
if abs(remaining - size_type_0) < abs(remaining - size_type_1):
|
||||
self.type = 0
|
||||
else:
|
||||
self.type = 1
|
||||
|
||||
if self.type == 0:
|
||||
offset = self.adam_m.load(data, offset)
|
||||
offset = self.adam_v.load(data, offset)
|
||||
offset = self.adam_pf.load(data,offset)
|
||||
|
||||
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
elif self.type == 1:
|
||||
offset = self.lbfgs_x.load(data, offset)
|
||||
offset = self.lbfgs_xp.load(data, offset)
|
||||
offset = self.lbfgs_g.load(data, offset)
|
||||
offset = self.lbfgs_gp.load(data, offset)
|
||||
offset = self.lbfgs_d.load(data, offset)
|
||||
offset = self.lbfgs_pf.load(data, offset)
|
||||
offset = self.lbfgs_lmal.load(data, offset)
|
||||
offset = self.lbfgs_lmys.load(data, offset)
|
||||
offset = self.lbfgs_lms.load(data, offset)
|
||||
offset = self.lbfgs_lmy.load(data, offset)
|
||||
|
||||
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
else:
|
||||
raise ValueError(f"Invalid optimizer type '{self.type}'")
|
||||
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
|
||||
gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
|
||||
gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
|
||||
|
||||
if self.type == 0:
|
||||
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
|
||||
|
||||
self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
|
||||
self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
|
||||
if self.past > 0:
|
||||
self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
|
||||
|
||||
elif self.type == 1:
|
||||
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
|
||||
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
|
||||
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
|
||||
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
|
||||
|
||||
self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
|
||||
self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
|
||||
self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
|
||||
self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
|
||||
self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
|
||||
if self.past > 0:
|
||||
self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
|
||||
self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
|
||||
self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
|
||||
self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
|
||||
self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
|
||||
else:
|
||||
raise ValueError('Unknown optimizer type')
|
||||
|
||||
class LoraParams:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def load(self, data, offset):
|
||||
self.n_rank_attention_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_wq = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_wk = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_wv = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_wo = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_ffn_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_w1 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_w2 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_w3 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_tok_embeddings = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_output = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, self.n_rank_tok_embeddings)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, self.n_rank_norm)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT, self.n_rank_output)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_NORM, self.n_rank_attention_norm)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_Q, self.n_rank_wq)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_K, self.n_rank_wk)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_V, self.n_rank_wv)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, self.n_rank_wo)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_NORM, self.n_rank_ffn_norm)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_GATE, self.n_rank_w1)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, self.n_rank_w2)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_UP, self.n_rank_w3)
|
||||
|
||||
class ModelParams:
|
||||
def __init__(self, n_ff = None):
|
||||
self.n_ff = n_ff
|
||||
|
||||
def load(self, data, offset):
|
||||
self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
return offset
|
||||
|
||||
def get_n_ff(self):
|
||||
if self.n_ff is None:
|
||||
# struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
|
||||
return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
|
||||
else:
|
||||
return self.n_ff
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
# self.n_vocab not saved
|
||||
gguf_writer.add_embedding_length(self.n_embd)
|
||||
gguf_writer.add_head_count(self.n_head)
|
||||
gguf_writer.add_block_count(self.n_layer)
|
||||
gguf_writer.add_rope_dimension_count(self.n_rot)
|
||||
gguf_writer.add_feed_forward_length(self.get_n_ff())
|
||||
|
||||
def tensor_name(key, bid=None, suffix=".weight"):
|
||||
return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix
|
||||
|
||||
class Layer:
|
||||
def __init__(self, params, lora_params, bid):
|
||||
self.bid = bid
|
||||
self.att_norm_a = Tensor('f', [lora_params.n_rank_attention_norm, params.n_embd])
|
||||
self.att_norm_b = Tensor('f', [lora_params.n_rank_attention_norm, 1])
|
||||
self.wq_a = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
|
||||
self.wq_b = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
|
||||
self.wk_a = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
|
||||
self.wk_b = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
|
||||
self.wv_a = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
|
||||
self.wv_b = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
|
||||
self.wo_a = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
|
||||
self.wo_b = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
|
||||
self.ffn_norm_a = Tensor('f', [lora_params.n_rank_ffn_norm, params.n_embd])
|
||||
self.ffn_norm_b = Tensor('f', [lora_params.n_rank_ffn_norm, 1])
|
||||
self.w1_a = Tensor('f', [lora_params.n_rank_w1, params.n_embd])
|
||||
self.w1_b = Tensor('f', [lora_params.n_rank_w1, params.get_n_ff()])
|
||||
self.w2_a = Tensor('f', [lora_params.n_rank_w2, params.get_n_ff()])
|
||||
self.w2_b = Tensor('f', [lora_params.n_rank_w2, params.n_embd])
|
||||
self.w3_a = Tensor('f', [lora_params.n_rank_w3, params.n_embd])
|
||||
self.w3_b = Tensor('f', [lora_params.n_rank_w3, params.get_n_ff()])
|
||||
|
||||
def load(self, data, offset):
|
||||
offset = self.att_norm_a.load(data, offset)
|
||||
offset = self.att_norm_b.load(data, offset)
|
||||
offset = self.wq_a.load(data, offset)
|
||||
offset = self.wq_b.load(data, offset)
|
||||
offset = self.wk_a.load(data, offset)
|
||||
offset = self.wk_b.load(data, offset)
|
||||
offset = self.wv_a.load(data, offset)
|
||||
offset = self.wv_b.load(data, offset)
|
||||
offset = self.wo_a.load(data, offset)
|
||||
offset = self.wo_b.load(data, offset)
|
||||
offset = self.ffn_norm_a.load(data, offset)
|
||||
offset = self.ffn_norm_b.load(data, offset)
|
||||
offset = self.w1_a.load(data, offset)
|
||||
offset = self.w1_b.load(data, offset)
|
||||
offset = self.w2_a.load(data, offset)
|
||||
offset = self.w2_b.load(data, offset)
|
||||
offset = self.w3_a.load(data, offset)
|
||||
offset = self.w3_b.load(data, offset)
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
self.att_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_a"))
|
||||
self.att_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_b"))
|
||||
self.wq_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_a"))
|
||||
self.wq_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_b"))
|
||||
self.wk_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_a"))
|
||||
self.wk_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_b"))
|
||||
self.wv_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_a"))
|
||||
self.wv_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_b"))
|
||||
self.wo_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_a"))
|
||||
self.wo_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_b"))
|
||||
self.ffn_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_a"))
|
||||
self.ffn_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_b"))
|
||||
self.w1_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_a"))
|
||||
self.w1_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_b"))
|
||||
self.w2_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_a"))
|
||||
self.w2_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_b"))
|
||||
self.w3_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_a"))
|
||||
self.w3_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_b"))
|
||||
|
||||
class LoraModel:
|
||||
def __init__(self, n_ff = None):
|
||||
self.params = ModelParams(n_ff = n_ff)
|
||||
self.lora_params = LoraParams()
|
||||
self.layers = []
|
||||
|
||||
def load(self, data, offset):
|
||||
offset = self.params.load(data, offset)
|
||||
offset = self.lora_params.load(data, offset)
|
||||
|
||||
self.tok_embd_a = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_embd])
|
||||
self.tok_embd_b = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_vocab])
|
||||
self.norm_a = Tensor('f', [self.lora_params.n_rank_norm, self.params.n_embd])
|
||||
self.norm_b = Tensor('f', [self.lora_params.n_rank_norm, 1])
|
||||
self.output_a = Tensor('f', [self.lora_params.n_rank_output, self.params.n_embd])
|
||||
self.output_b = Tensor('f', [self.lora_params.n_rank_output, self.params.n_vocab])
|
||||
|
||||
offset = self.tok_embd_a.load(data, offset)
|
||||
offset = self.tok_embd_b.load(data, offset)
|
||||
offset = self.norm_a.load(data, offset)
|
||||
offset = self.norm_b.load(data, offset)
|
||||
offset = self.output_a.load(data, offset)
|
||||
offset = self.output_b.load(data, offset)
|
||||
|
||||
self.layers.clear()
|
||||
for bid in range(self.params.n_layer):
|
||||
layer = Layer(self.params, self.lora_params, bid)
|
||||
offset = layer.load(data, offset)
|
||||
self.layers.append(layer)
|
||||
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
self.params.save_gguf(gguf_writer)
|
||||
self.lora_params.save_gguf(gguf_writer)
|
||||
|
||||
self.tok_embd_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_a"))
|
||||
self.tok_embd_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_b"))
|
||||
self.norm_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_a"))
|
||||
self.norm_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_b"))
|
||||
self.output_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_a"))
|
||||
self.output_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_b"))
|
||||
|
||||
for layer in self.layers:
|
||||
layer.save_gguf(gguf_writer)
|
||||
|
||||
class LoraCheckpoint:
|
||||
def __init__(self, n_ff = None):
|
||||
self.model = LoraModel(n_ff = n_ff)
|
||||
self.opt_ctx = OptimizationContext()
|
||||
|
||||
def load(self, data, offset):
|
||||
magic = bytes(reversed(data[offset:offset + 4])); offset += 4
|
||||
if magic != b'ggcl':
|
||||
raise ValueError(f"File header magic indicates, that this is no finetune-lora checkpoint file. Expected 'ggcl', Got '{str(magic)}'")
|
||||
|
||||
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
if self.version != 0:
|
||||
raise ValueError('Invalid version of checkpoint file')
|
||||
|
||||
self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
offset = self.model.load(data, offset)
|
||||
offset = self.opt_ctx.load(data, offset)
|
||||
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
|
||||
gguf_writer.add_layer_norm_rms_eps(1e-5)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
|
||||
gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
|
||||
self.model.save_gguf(gguf_writer)
|
||||
self.opt_ctx.save_gguf(gguf_writer)
|
||||
|
||||
def handle_args():
|
||||
parser = argparse.ArgumentParser(description = 'Convert finetune checkpoints to GGUF')
|
||||
parser.add_argument('--input', '-i', type = Path, help = 'Input finetune checkpoint filename', required=True)
|
||||
parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required=True)
|
||||
parser.add_argument('--ff', type = int, help = "Feedforward size, if not provided compute from n_mult. Provide this if you get 'ValueError: Tensor.load: Expected number of elements does not match what is read from file'", required=False)
|
||||
return parser.parse_args()
|
||||
|
||||
def main():
|
||||
cfg = handle_args()
|
||||
print(cfg)
|
||||
data = np.memmap(cfg.input, mode = 'r')
|
||||
chk = LoraCheckpoint(n_ff = cfg.ff)
|
||||
offset = 0
|
||||
offset = chk.load(data, offset)
|
||||
# we should have read all available data
|
||||
assert(offset == len(data))
|
||||
|
||||
gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
|
||||
chk.save_gguf(gguf_writer)
|
||||
print(" gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print(" gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
print(" gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
gguf_writer.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,34 +0,0 @@
|
||||
#!/bin/bash
|
||||
cd `dirname $0`
|
||||
cd ../..
|
||||
|
||||
EXE="./llama-finetune"
|
||||
|
||||
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
|
||||
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
|
||||
|
||||
# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
|
||||
MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "llama-cli --lora" with GPU inferencing.
|
||||
|
||||
while getopts "dg" opt; do
|
||||
case $opt in
|
||||
d)
|
||||
DEBUGGER="gdb --args"
|
||||
;;
|
||||
g)
|
||||
EXE="./build/bin/Release/finetune"
|
||||
GPUARG="--gpu-layers 25"
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
$DEBUGGER $EXE \
|
||||
--model-base $MODEL \
|
||||
$GPUARG \
|
||||
--checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \
|
||||
--checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
|
||||
--lora-out lora-ol3b-shakespeare-ITERATION.bin \
|
||||
--train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \
|
||||
--save-every 10 \
|
||||
--threads 10 --adam-iter 30 --batch 4 --ctx 64 \
|
||||
--use-checkpointing
|
||||
@@ -16,20 +16,25 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st
|
||||
auto decoded = decode_utf8(input_str, {});
|
||||
const auto & code_points = decoded.first;
|
||||
|
||||
const llama_grammar_rules & rules = llama_grammar_get_rules (grammar);
|
||||
llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
|
||||
|
||||
size_t pos = 0;
|
||||
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
||||
auto prev_stacks = grammar->stacks;
|
||||
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
|
||||
if (grammar->stacks.empty()) {
|
||||
const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
|
||||
|
||||
llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
|
||||
|
||||
if (cur_stacks.empty()) {
|
||||
error_pos = pos;
|
||||
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
|
||||
grammar->stacks = prev_stacks;
|
||||
cur_stacks = prev_stacks;
|
||||
return false;
|
||||
}
|
||||
++pos;
|
||||
}
|
||||
|
||||
for (const auto & stack : grammar->stacks) {
|
||||
for (const auto & stack : cur_stacks) {
|
||||
if (stack.empty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# llama.cpp/examples/imatrix
|
||||
|
||||
Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantum models.
|
||||
Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantized models.
|
||||
More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
|
||||
|
||||
## Usage
|
||||
|
||||
@@ -124,6 +124,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
|
||||
auto formatted = llama_chat_format_single(
|
||||
model, g_params->chat_template, chat_msgs, new_msg, role == "user");
|
||||
chat_msgs.push_back({role, content});
|
||||
LOG("formatted: %s\n", formatted.c_str());
|
||||
return formatted;
|
||||
}
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/
|
||||
Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
|
||||
|
||||
**Features:**
|
||||
* LLM inference of F16 and quantum models on GPU and CPU
|
||||
* LLM inference of F16 and quantized models on GPU and CPU
|
||||
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
|
||||
* Parallel decoding with multi-user support
|
||||
* Continuous batching
|
||||
@@ -444,7 +444,7 @@ node index.js
|
||||
|
||||
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
|
||||
|
||||
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
|
||||
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
|
||||
By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
|
||||
|
||||
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
|
||||
|
||||
@@ -225,7 +225,7 @@
|
||||
throw new Error("already running");
|
||||
}
|
||||
controller.value = new AbortController();
|
||||
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: URL.parse('.', document.baseURI).href })) {
|
||||
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) {
|
||||
const data = chunk.data;
|
||||
if (data.stop) {
|
||||
while (
|
||||
|
||||
@@ -479,7 +479,7 @@
|
||||
throw new Error("already running");
|
||||
}
|
||||
controller.value = new AbortController();
|
||||
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: URL.parse('.', document.baseURI).href })) {
|
||||
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) {
|
||||
const data = chunk.data;
|
||||
|
||||
if (data.stop) {
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
set(TARGET llama-train-text-from-scratch)
|
||||
add_executable(${TARGET} train-text-from-scratch.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
@@ -1,27 +0,0 @@
|
||||
# train-text-from-scratch
|
||||
|
||||
Basic usage instructions:
|
||||
|
||||
```bash
|
||||
# get training data
|
||||
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
|
||||
|
||||
# train
|
||||
./bin/llama-train-text-from-scratch \
|
||||
--vocab-model ../models/ggml-vocab-llama.gguf \
|
||||
--ctx 64 --embd 256 --head 8 --layer 16 \
|
||||
--checkpoint-in chk-shakespeare-256x16-LATEST.gguf \
|
||||
--checkpoint-out chk-shakespeare-256x16-ITERATION.gguf \
|
||||
--model-out ggml-shakespeare-256x16-f32-ITERATION.gguf \
|
||||
--train-data "shakespeare.txt" \
|
||||
-t 6 -b 16 --seed 1 --adam-iter 256 \
|
||||
--no-checkpointing
|
||||
|
||||
# predict
|
||||
./bin/llama-cli -m ggml-shakespeare-256x16-f32.gguf
|
||||
```
|
||||
|
||||
Output files will be saved every N iterations (config with `--save-every N`).
|
||||
The pattern "ITERATION" in the output filenames will be replaced with the iteration number and "LATEST" for the latest output.
|
||||
|
||||
To train GGUF models just pass them to `--checkpoint-in FN`.
|
||||
@@ -1,499 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# train-text-from-scratch checkpoint --> gguf conversion
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py'))
|
||||
import gguf
|
||||
|
||||
# gguf constants
|
||||
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
|
||||
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
|
||||
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
|
||||
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
|
||||
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
|
||||
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
|
||||
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
|
||||
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
|
||||
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
|
||||
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
|
||||
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
|
||||
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
|
||||
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
|
||||
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
|
||||
|
||||
LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
|
||||
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
|
||||
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
|
||||
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
|
||||
|
||||
LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
|
||||
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
|
||||
LLM_KV_TRAINING_TYPE = "training.type"
|
||||
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
|
||||
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
|
||||
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
|
||||
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
|
||||
|
||||
class Tensor:
|
||||
def __init__(self, dtype='f', ne=None):
|
||||
if ne is None:
|
||||
ne = []
|
||||
self.dtype = dtype
|
||||
self.ne = ne
|
||||
self.nbytes = 0
|
||||
if self.dtype == 'f':
|
||||
if len(self.ne) == 0:
|
||||
self.nbytes = 0
|
||||
else:
|
||||
self.nbytes = int(np.prod(self.ne)) * 4
|
||||
else:
|
||||
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
||||
|
||||
def load(self, data, offset):
|
||||
nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
assert(nd == len(self.ne))
|
||||
ne = []
|
||||
for d in range(nd):
|
||||
n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
ne.append(n)
|
||||
|
||||
assert(tuple(ne) == tuple(self.ne))
|
||||
|
||||
if self.dtype == 'f':
|
||||
assert(dtype == 0)
|
||||
else:
|
||||
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
||||
|
||||
self.name = bytes(data[offset:offset+namelen]); offset += namelen
|
||||
# 32-byte alignment
|
||||
offset += (0 - offset) & 31
|
||||
self.data = data[offset:offset+self.nbytes]
|
||||
offset += self.nbytes
|
||||
return offset
|
||||
|
||||
def max_storage_size(self):
|
||||
result = 0
|
||||
result += 4 # nd
|
||||
result += 4 # namelen
|
||||
result += 4 # dtype
|
||||
result += len(self.ne)*8 # ne
|
||||
result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
|
||||
result += 31 # 32-byte alignment
|
||||
result += self.nbytes
|
||||
return result
|
||||
|
||||
def save_gguf(self, gguf_writer, name):
|
||||
gguf_writer.add_tensor(
|
||||
name=name,
|
||||
tensor=self.data,
|
||||
raw_shape=np.array(list(reversed(self.ne))),
|
||||
raw_dtype=gguf.GGMLQuantizationType.F32)
|
||||
|
||||
class OptimizationParamsV0:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def load(self, data, offset):
|
||||
self.type = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_threads = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.delta = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.print_forward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
|
||||
self.print_backward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
|
||||
self.adam_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_sched = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_decay = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_alpha = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_beta1 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_beta2 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_eps_f = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_eps_g = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_max_linesearch = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_ftol = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_wolfe = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_min_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_max_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_linesearch = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
return offset
|
||||
|
||||
class OptimizationContext:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def load(self, data, offset):
|
||||
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
|
||||
offset += 4
|
||||
|
||||
if self.version == 0:
|
||||
params = OptimizationParamsV0()
|
||||
offset = params.load(data, offset)
|
||||
self.past = params.past
|
||||
self.lbfgs_m = params.lbfgs_m
|
||||
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
|
||||
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
|
||||
self.type = params.type
|
||||
|
||||
self.adam_m = Tensor('f', [self.nx])
|
||||
self.adam_v = Tensor('f', [self.nx])
|
||||
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
|
||||
self.lbfgs_x = Tensor('f', [self.nx])
|
||||
self.lbfgs_xp = Tensor('f', [self.nx])
|
||||
self.lbfgs_g = Tensor('f', [self.nx])
|
||||
self.lbfgs_gp = Tensor('f', [self.nx])
|
||||
self.lbfgs_d = Tensor('f', [self.nx])
|
||||
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
|
||||
if self.type == 0:
|
||||
# these tensors are stored, but we don't need their data
|
||||
x = Tensor('f', [self.nx])
|
||||
g = Tensor('f', [self.nx])
|
||||
g2 = Tensor('f', [self.nx])
|
||||
mh = Tensor('f', [self.nx])
|
||||
vh = Tensor('f', [self.nx])
|
||||
|
||||
offset = x.load(data, offset)
|
||||
offset = g.load(data, offset)
|
||||
offset = g2.load(data, offset)
|
||||
offset = self.adam_m.load(data, offset)
|
||||
offset = self.adam_v.load(data, offset)
|
||||
offset = mh.load(data, offset)
|
||||
offset = vh.load(data, offset)
|
||||
offset = self.adam_pf.load(data, offset)
|
||||
|
||||
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
elif self.type == 1:
|
||||
offset = self.lbfgs_x.load(data, offset)
|
||||
offset = self.lbfgs_xp.load(data, offset)
|
||||
offset = self.lbfgs_g.load(data, offset)
|
||||
offset = self.lbfgs_gp.load(data, offset)
|
||||
offset = self.lbfgs_d.load(data, offset)
|
||||
offset = self.lbfgs_pf.load(data, offset)
|
||||
offset = self.lbfgs_lmal.load(data, offset)
|
||||
offset = self.lbfgs_lmys.load(data, offset)
|
||||
offset = self.lbfgs_lms.load(data, offset)
|
||||
offset = self.lbfgs_lmy.load(data, offset)
|
||||
|
||||
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
else:
|
||||
raise ValueError('Unknown optimizer type')
|
||||
|
||||
|
||||
elif self.version == 1:
|
||||
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
|
||||
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
|
||||
|
||||
self.adam_m = Tensor('f', [self.nx])
|
||||
self.adam_v = Tensor('f', [self.nx])
|
||||
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
|
||||
self.lbfgs_x = Tensor('f', [self.nx])
|
||||
self.lbfgs_xp = Tensor('f', [self.nx])
|
||||
self.lbfgs_g = Tensor('f', [self.nx])
|
||||
self.lbfgs_gp = Tensor('f', [self.nx])
|
||||
self.lbfgs_d = Tensor('f', [self.nx])
|
||||
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
|
||||
# forgot to save type in version 1:
|
||||
# guess self.type from number of remaining bytes
|
||||
size_type_0 = 12 + sum([t.max_storage_size() for t in
|
||||
[self.adam_m, self.adam_v]
|
||||
+([self.adam_pf] if (self.past > 0) else [])])
|
||||
size_type_1 = 24 + sum([t.max_storage_size() for t in
|
||||
[self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
|
||||
self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
|
||||
self.lbfgs_lmal, self.lbfgs_lmys,
|
||||
self.lbfgs_lms, self.lbfgs_lmy]
|
||||
+([self.lbfgs_pf] if (self.past > 0) else [])])
|
||||
# due to alignment padding the size might not by exact
|
||||
# but the difference in size for both types is significant,
|
||||
# so we can just use whichever is closest
|
||||
remaining = len(data) - offset
|
||||
if abs(remaining - size_type_0) < abs(remaining - size_type_1):
|
||||
self.type = 0
|
||||
else:
|
||||
self.type = 1
|
||||
|
||||
if self.type == 0:
|
||||
offset = self.adam_m.load(data, offset)
|
||||
offset = self.adam_v.load(data, offset)
|
||||
offset = self.adam_pf.load(data,offset)
|
||||
|
||||
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
elif self.type == 1:
|
||||
offset = self.lbfgs_x.load(data, offset)
|
||||
offset = self.lbfgs_xp.load(data, offset)
|
||||
offset = self.lbfgs_g.load(data, offset)
|
||||
offset = self.lbfgs_gp.load(data, offset)
|
||||
offset = self.lbfgs_d.load(data, offset)
|
||||
offset = self.lbfgs_pf.load(data, offset)
|
||||
offset = self.lbfgs_lmal.load(data, offset)
|
||||
offset = self.lbfgs_lmys.load(data, offset)
|
||||
offset = self.lbfgs_lms.load(data, offset)
|
||||
offset = self.lbfgs_lmy.load(data, offset)
|
||||
|
||||
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
else:
|
||||
raise ValueError('Invalid version of checkpoint file')
|
||||
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
|
||||
gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
|
||||
gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
|
||||
|
||||
if self.type == 0:
|
||||
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
|
||||
|
||||
self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
|
||||
self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
|
||||
if self.past > 0:
|
||||
self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
|
||||
|
||||
elif self.type == 1:
|
||||
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
|
||||
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
|
||||
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
|
||||
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
|
||||
|
||||
self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
|
||||
self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
|
||||
self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
|
||||
self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
|
||||
self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
|
||||
if self.past > 0:
|
||||
self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
|
||||
self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
|
||||
self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
|
||||
self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
|
||||
self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
|
||||
else:
|
||||
raise ValueError('Unknown optimizer type')
|
||||
|
||||
class ModelParams:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def load(self, data, offset):
|
||||
self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
return offset
|
||||
|
||||
def get_n_ff(self):
|
||||
# struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
|
||||
return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
# self.n_vocab not saved
|
||||
gguf_writer.add_embedding_length(self.n_embd)
|
||||
gguf_writer.add_head_count(self.n_head)
|
||||
gguf_writer.add_block_count(self.n_layer)
|
||||
gguf_writer.add_rope_dimension_count(self.n_rot)
|
||||
gguf_writer.add_feed_forward_length(self.get_n_ff())
|
||||
|
||||
def tensor_name(key, bid=None):
|
||||
return gguf.TENSOR_NAMES[key].format(bid=bid) + ".weight"
|
||||
|
||||
class Layer:
|
||||
def __init__(self, params, bid):
|
||||
self.bid = bid
|
||||
self.att_norm = Tensor('f', [params.n_embd])
|
||||
self.wq = Tensor('f', [params.n_embd, params.n_embd])
|
||||
self.wk = Tensor('f', [params.n_embd, params.n_embd])
|
||||
self.wv = Tensor('f', [params.n_embd, params.n_embd])
|
||||
self.wo = Tensor('f', [params.n_embd, params.n_embd])
|
||||
self.ffn_norm = Tensor('f', [params.n_embd])
|
||||
self.w1 = Tensor('f', [params.n_embd, params.get_n_ff()])
|
||||
self.w2 = Tensor('f', [params.get_n_ff(), params.n_embd])
|
||||
self.w3 = Tensor('f', [params.n_embd, params.get_n_ff()])
|
||||
|
||||
def load(self, data, offset):
|
||||
offset = self.att_norm.load(data, offset)
|
||||
offset = self.wq.load(data, offset)
|
||||
offset = self.wk.load(data, offset)
|
||||
offset = self.wv.load(data, offset)
|
||||
offset = self.wo.load(data, offset)
|
||||
offset = self.ffn_norm.load(data, offset)
|
||||
offset = self.w1.load(data, offset)
|
||||
offset = self.w2.load(data, offset)
|
||||
offset = self.w3.load(data, offset)
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
self.att_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid))
|
||||
self.wq.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid))
|
||||
self.wk.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid))
|
||||
self.wv.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid))
|
||||
self.wo.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid))
|
||||
self.ffn_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid))
|
||||
self.w1.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid))
|
||||
self.w2.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid))
|
||||
self.w3.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid))
|
||||
|
||||
class Model:
|
||||
def __init__(self):
|
||||
self.params = ModelParams()
|
||||
self.layers = []
|
||||
|
||||
def load(self, data, offset):
|
||||
offset = self.params.load(data, offset)
|
||||
|
||||
self.tok_embd = Tensor('f', [self.params.n_embd, self.params.n_vocab])
|
||||
self.norm = Tensor('f', [self.params.n_embd])
|
||||
self.output = Tensor('f', [self.params.n_embd, self.params.n_vocab])
|
||||
|
||||
offset = self.tok_embd.load(data, offset)
|
||||
offset = self.norm.load(data, offset)
|
||||
offset = self.output.load(data, offset)
|
||||
|
||||
self.layers.clear()
|
||||
for bid in range(self.params.n_layer):
|
||||
layer = Layer(self.params, bid)
|
||||
offset = layer.load(data, offset)
|
||||
self.layers.append(layer)
|
||||
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
self.params.save_gguf(gguf_writer)
|
||||
|
||||
self.tok_embd.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD))
|
||||
self.norm.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM))
|
||||
self.output.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT))
|
||||
|
||||
for layer in self.layers:
|
||||
layer.save_gguf(gguf_writer)
|
||||
|
||||
class Checkpoint:
|
||||
def __init__(self):
|
||||
self.model = Model()
|
||||
self.opt_ctx = OptimizationContext()
|
||||
|
||||
def load(self, data, offset):
|
||||
magic = bytes(reversed(data[offset:offset + 4])); offset += 4
|
||||
if magic != b'ggcp':
|
||||
raise ValueError(f"File header magic indicates, that this is no checkpoint file. Expected 'ggcp', Got '{str(magic)}'")
|
||||
|
||||
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
if self.version != 0:
|
||||
raise ValueError('Invalid version of checkpoint file')
|
||||
|
||||
self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
offset = self.model.load(data, offset)
|
||||
offset = self.opt_ctx.load(data, offset)
|
||||
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
|
||||
gguf_writer.add_layer_norm_rms_eps(1e-5)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
|
||||
gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
|
||||
self.model.save_gguf(gguf_writer)
|
||||
self.opt_ctx.save_gguf(gguf_writer)
|
||||
|
||||
def handle_args():
|
||||
parser = argparse.ArgumentParser(description = 'Convert train-text-from-scratch checkpoints to GGUF')
|
||||
parser.add_argument('--input', '-i', type = Path, help = 'Input train checkpoint filename', required=True)
|
||||
parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename', required=True)
|
||||
return parser.parse_args()
|
||||
|
||||
def main():
|
||||
cfg = handle_args()
|
||||
data = np.memmap(cfg.input, mode = 'r')
|
||||
chk = Checkpoint()
|
||||
offset = 0
|
||||
offset = chk.load(data, offset)
|
||||
# we should have read all available data
|
||||
assert(offset == len(data))
|
||||
|
||||
gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
|
||||
chk.save_gguf(gguf_writer)
|
||||
print(" gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print(" gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
print(" gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
gguf_writer.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
File diff suppressed because it is too large
Load Diff
@@ -2400,6 +2400,7 @@ extern "C" {
|
||||
GGML_API int ggml_cpu_has_vsx (void);
|
||||
GGML_API int ggml_cpu_has_matmul_int8(void);
|
||||
GGML_API int ggml_cpu_has_cann (void);
|
||||
GGML_API int ggml_cpu_has_llamafile (void);
|
||||
|
||||
//
|
||||
// Internal types and functions exposed for tests and benchmarks
|
||||
|
||||
@@ -467,15 +467,18 @@ if (GGML_SYCL)
|
||||
message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
|
||||
endif()
|
||||
|
||||
if ( NOT DEFINED ENV{ONEAPI_ROOT})
|
||||
message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
|
||||
check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
|
||||
if ( DEFINED ENV{ONEAPI_ROOT})
|
||||
message(STATUS "Using oneAPI Release SYCL compiler (icpx).")
|
||||
elseif(SUPPORTS_SYCL)
|
||||
message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}.
|
||||
If you expected the oneAPI Release compiler, please install oneAPI & source it, like:
|
||||
source /opt/intel/oneapi/setvars.sh")
|
||||
else()
|
||||
message(FATAL_ERROR, "C++ compiler lacks SYCL support.")
|
||||
endif()
|
||||
#todo: AOT
|
||||
|
||||
find_package(IntelSYCL REQUIRED)
|
||||
find_package(MKL REQUIRED)
|
||||
|
||||
message(STATUS "SYCL found")
|
||||
#todo: AOT
|
||||
|
||||
list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL)
|
||||
|
||||
@@ -487,11 +490,9 @@ if (GGML_SYCL)
|
||||
add_compile_definitions(GGML_SYCL_FORCE_MMQ)
|
||||
endif()
|
||||
|
||||
add_compile_options(-I./) #include DPCT
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
|
||||
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
|
||||
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
|
||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
||||
else()
|
||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
|
||||
@@ -504,14 +505,14 @@ if (GGML_SYCL)
|
||||
list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
|
||||
|
||||
if (WIN32)
|
||||
find_package(IntelSYCL REQUIRED)
|
||||
find_package(MKL REQUIRED)
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
|
||||
else()
|
||||
add_compile_options(-I/${SYCL_INCLUDE_DIR})
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
|
||||
|
||||
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -392,7 +392,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
|
||||
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
|
||||
"__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
|
||||
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
||||
#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
|
||||
const void * b_ptr = vx;
|
||||
const void * a_ptr = vy;
|
||||
float * res_ptr = s;
|
||||
@@ -501,7 +501,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
||||
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
|
||||
}
|
||||
#endif
|
||||
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
|
||||
const void * b_ptr = vx;
|
||||
const void * a_ptr = vy;
|
||||
float * res_ptr = s;
|
||||
@@ -613,7 +613,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
|
||||
if (svcntw() == 8) {
|
||||
const void * b_ptr = vx;
|
||||
const void * a_ptr = vy;
|
||||
@@ -753,7 +753,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
|
||||
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
|
||||
"__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
|
||||
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
||||
#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
|
||||
const void * b_ptr = vx;
|
||||
const void * a_ptr = vy;
|
||||
float * res_ptr = s;
|
||||
@@ -1271,7 +1271,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
||||
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
|
||||
}
|
||||
#endif
|
||||
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
|
||||
const void * b_ptr = vx;
|
||||
const void * a_ptr = vy;
|
||||
float * res_ptr = s;
|
||||
@@ -1727,7 +1727,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
|
||||
if (svcntw() == 8) {
|
||||
const void * b_ptr = vx;
|
||||
const void * a_ptr = vy;
|
||||
|
||||
@@ -459,7 +459,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
|
||||
|
||||
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
|
||||
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
|
||||
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
||||
#elif defined(RDNA3)
|
||||
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
|
||||
|
||||
@@ -267,7 +267,7 @@ struct ggml_backend_sycl_context {
|
||||
|
||||
queue_ptr stream(int device, int stream) {
|
||||
if (qptrs[device][stream] == nullptr) {
|
||||
qptrs[device][stream] = &(dpct::get_current_device().default_queue());
|
||||
qptrs[device][stream] = &(dpct::get_device(device).default_queue());
|
||||
}
|
||||
return qptrs[device][stream];
|
||||
}
|
||||
|
||||
@@ -588,7 +588,7 @@ namespace dpct
|
||||
out = prop;
|
||||
}
|
||||
|
||||
/// dpct device extension
|
||||
/// dpct device extension
|
||||
class device_ext : public sycl::device {
|
||||
typedef std::mutex mutex_type;
|
||||
|
||||
@@ -697,7 +697,7 @@ namespace dpct
|
||||
std::unique_lock<mutex_type> lock(m_mutex);
|
||||
lock.unlock();
|
||||
for (auto &q : _queues) {
|
||||
q.wait_and_throw();
|
||||
q.wait_and_throw();
|
||||
}
|
||||
// Guard the destruct of current_queues to make sure the ref count is
|
||||
// safe.
|
||||
@@ -734,7 +734,12 @@ namespace dpct
|
||||
|
||||
void destroy_queue(sycl::queue queue) {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
_queues.clear();
|
||||
_queues.erase(std::remove_if(_queues.begin(), _queues.end(),
|
||||
[=](const sycl::queue &q) -> bool
|
||||
{
|
||||
return q == queue;
|
||||
}),
|
||||
_queues.end());
|
||||
}
|
||||
void set_saved_queue(sycl::queue q) {
|
||||
std::lock_guard<mutex_type> lock(m_mutex);
|
||||
@@ -764,13 +769,13 @@ namespace dpct
|
||||
if (enable_exception_handler) {
|
||||
eh = exception_handler;
|
||||
}
|
||||
auto q = sycl::queue(*this, eh,
|
||||
sycl::property_list(
|
||||
_queues.push_back(sycl::queue(
|
||||
*this, eh,
|
||||
sycl::property_list(
|
||||
#ifdef DPCT_PROFILING_ENABLED
|
||||
sycl::property::queue::enable_profiling(),
|
||||
sycl::property::queue::enable_profiling(),
|
||||
#endif
|
||||
properties...));
|
||||
_queues.push_back(q);
|
||||
properties...)));
|
||||
|
||||
return _queues.back();
|
||||
}
|
||||
@@ -783,8 +788,8 @@ namespace dpct
|
||||
if (enable_exception_handler) {
|
||||
eh = exception_handler;
|
||||
}
|
||||
_queues.push_back(
|
||||
sycl::queue(device, eh,
|
||||
_queues.push_back(sycl::queue(
|
||||
device, eh,
|
||||
sycl::property_list(
|
||||
#ifdef DPCT_PROFILING_ENABLED
|
||||
sycl::property::queue::enable_profiling(),
|
||||
@@ -855,15 +860,75 @@ namespace dpct
|
||||
unsigned int get_device_id(const sycl::device &dev)
|
||||
{
|
||||
unsigned int id = 0;
|
||||
for (auto dev_item : _devs)
|
||||
for (auto &dev_item : _devs)
|
||||
{
|
||||
if (*dev_item == dev)
|
||||
{
|
||||
break;
|
||||
return id;
|
||||
}
|
||||
id++;
|
||||
}
|
||||
return id;
|
||||
return -1;
|
||||
}
|
||||
|
||||
inline std::string get_preferred_gpu_platform_name() {
|
||||
std::string result;
|
||||
|
||||
std::string filter = "level-zero";
|
||||
char* env = getenv("ONEAPI_DEVICE_SELECTOR");
|
||||
if (env) {
|
||||
if (std::strstr(env, "level_zero")) {
|
||||
filter = "level-zero";
|
||||
}
|
||||
else if (std::strstr(env, "opencl")) {
|
||||
filter = "opencl";
|
||||
}
|
||||
else if (std::strstr(env, "cuda")) {
|
||||
filter = "cuda";
|
||||
}
|
||||
else if (std::strstr(env, "hip")) {
|
||||
filter = "hip";
|
||||
}
|
||||
else {
|
||||
throw std::runtime_error("invalid device filter: " + std::string(env));
|
||||
}
|
||||
}
|
||||
|
||||
auto plaform_list = sycl::platform::get_platforms();
|
||||
|
||||
for (const auto& platform : plaform_list) {
|
||||
auto devices = platform.get_devices();
|
||||
auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
|
||||
return d.is_gpu();
|
||||
});
|
||||
|
||||
if (gpu_dev == devices.end()) {
|
||||
// cout << "platform [" << platform_name
|
||||
// << "] does not contain GPU devices, skipping\n";
|
||||
continue;
|
||||
}
|
||||
|
||||
auto platform_name = platform.get_info<sycl::info::platform::name>();
|
||||
std::string platform_name_low_case;
|
||||
platform_name_low_case.resize(platform_name.size());
|
||||
|
||||
std::transform(
|
||||
platform_name.begin(), platform_name.end(), platform_name_low_case.begin(), ::tolower);
|
||||
|
||||
if (platform_name_low_case.find(filter) == std::string::npos) {
|
||||
// cout << "platform [" << platform_name
|
||||
// << "] does not match with requested "
|
||||
// << filter << ", skipping\n";
|
||||
continue;
|
||||
}
|
||||
|
||||
result = platform_name;
|
||||
}
|
||||
|
||||
if (result.empty())
|
||||
throw std::runtime_error("can not find preferred GPU platform");
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template <class DeviceSelector>
|
||||
@@ -930,10 +995,15 @@ namespace dpct
|
||||
// Keep track of the number of devices per backend
|
||||
std::map<sycl::backend, size_t> DeviceNums;
|
||||
std::map<std::string, std::vector<sycl::device>> backend_devices;
|
||||
auto preferred_platform_name = get_preferred_gpu_platform_name();
|
||||
|
||||
while (!Platforms.empty()) {
|
||||
auto Platform = Platforms.back();
|
||||
Platforms.pop_back();
|
||||
auto platform_name = Platform.get_info<sycl::info::platform::name>();
|
||||
if (platform_name.compare(preferred_platform_name) != 0) {
|
||||
continue;
|
||||
}
|
||||
auto devices = Platform.get_devices();
|
||||
std::string backend_type = get_device_backend_and_type(devices[0]);
|
||||
for (const auto &device : devices) {
|
||||
@@ -1989,6 +2059,11 @@ namespace dpct
|
||||
return dev_mgr::instance().current_device();
|
||||
}
|
||||
|
||||
static inline device_ext &get_device(unsigned int id)
|
||||
{
|
||||
return dev_mgr::instance().get_device(id);
|
||||
}
|
||||
|
||||
static inline sycl::queue &get_in_order_queue()
|
||||
{
|
||||
return dev_mgr::instance().current_device().in_order_queue();
|
||||
|
||||
@@ -152,7 +152,8 @@ static void soft_max_f32_sycl(const float * x, const float * mask,
|
||||
|
||||
const sycl::range<3> block_dims(1, 1, nth);
|
||||
const sycl::range<3> block_nums(1, 1, nrows_x);
|
||||
const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
|
||||
const size_t n_val_tmp = nth / WARP_SIZE;
|
||||
const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + n_val_tmp);
|
||||
|
||||
const uint32_t n_head_kv = nrows_x/nrows_y;
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
|
||||
|
||||
@@ -38,8 +38,6 @@
|
||||
#define VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI 1
|
||||
#define VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE 2
|
||||
|
||||
#define VK_NUM_TYPES 16
|
||||
|
||||
#define GGML_VK_MAX_NODES 8192
|
||||
|
||||
#define MAX_VK_BUFFERS 256
|
||||
@@ -162,23 +160,23 @@ struct vk_device_struct {
|
||||
vk_matmul_pipeline pipeline_matmul_f16_f32;
|
||||
vk_pipeline pipeline_matmul_split_k_reduce;
|
||||
|
||||
vk_matmul_pipeline pipeline_dequant_mul_mat_mat[VK_NUM_TYPES];
|
||||
vk_matmul_pipeline pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT];
|
||||
|
||||
vk_matmul_pipeline pipeline_matmul_id_f32;
|
||||
vk_matmul_pipeline pipeline_matmul_id_f16;
|
||||
vk_matmul_pipeline pipeline_matmul_id_f16_f32;
|
||||
|
||||
vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[VK_NUM_TYPES];
|
||||
vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT];
|
||||
|
||||
vk_pipeline pipeline_dequant[VK_NUM_TYPES];
|
||||
vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[VK_NUM_TYPES];
|
||||
vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[VK_NUM_TYPES];
|
||||
vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[VK_NUM_TYPES];
|
||||
vk_pipeline pipeline_dequant[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_COUNT];
|
||||
|
||||
vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
|
||||
vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
|
||||
vk_pipeline pipeline_get_rows[VK_NUM_TYPES];
|
||||
vk_pipeline pipeline_get_rows_f32[VK_NUM_TYPES];
|
||||
vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_mul_f32;
|
||||
vk_pipeline pipeline_div_f32;
|
||||
vk_pipeline pipeline_add_f32;
|
||||
@@ -1059,25 +1057,6 @@ static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& event
|
||||
);
|
||||
}
|
||||
|
||||
static bool ggml_vk_build_shader(ggml_type type) {
|
||||
switch(type) {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_load_shaders(vk_device& device) {
|
||||
VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
|
||||
|
||||
@@ -1112,6 +1091,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
|
||||
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
device->pipeline_matmul_id_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
@@ -1126,6 +1106,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
|
||||
if (device->fp16) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
|
||||
@@ -1226,6 +1207,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->l, "matmul_iq4_nl_f32_l", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->m, "matmul_iq4_nl_f32_m", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->s, "matmul_iq4_nl_f32_s", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_l, "matmul_iq4_nl_f32_aligned_l", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_m, "matmul_iq4_nl_f32_aligned_m", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_s, "matmul_iq4_nl_f32_aligned_s", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
|
||||
@@ -1316,6 +1304,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->l, "matmul_id_iq4_nl_f32_l", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->m, "matmul_id_iq4_nl_f32_m", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->s, "matmul_id_iq4_nl_f32_s", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_l, "matmul_id_iq4_nl_f32_aligned_l", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_m, "matmul_id_iq4_nl_f32_aligned_m", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_s, "matmul_id_iq4_nl_f32_aligned_s", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
||||
} else {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
|
||||
@@ -1415,6 +1410,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->l, "matmul_iq4_nl_f32_l", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->m, "matmul_iq4_nl_f32_m", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->s, "matmul_iq4_nl_f32_s", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_l, "matmul_iq4_nl_f32_aligned_l", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_m, "matmul_iq4_nl_f32_aligned_m", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_s, "matmul_iq4_nl_f32_aligned_s", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
|
||||
@@ -1505,6 +1507,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->l, "matmul_id_iq4_nl_f32_l", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->m, "matmul_id_iq4_nl_f32_m", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->s, "matmul_id_iq4_nl_f32_s", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_l, "matmul_id_iq4_nl_f32_aligned_l", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_m, "matmul_id_iq4_nl_f32_aligned_m", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_s, "matmul_id_iq4_nl_f32_aligned_s", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
|
||||
}
|
||||
|
||||
// mul mat vec
|
||||
@@ -1520,6 +1529,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
@@ -1533,6 +1543,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
@@ -1546,6 +1557,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
|
||||
// dequant shaders
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
||||
@@ -1559,6 +1571,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_NL], "dequant_iq4_nl", dequant_iq4_nl_len, dequant_iq4_nl_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
|
||||
|
||||
// get_rows
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
||||
@@ -1568,6 +1581,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
||||
@@ -1576,6 +1590,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
|
||||
|
||||
@@ -2087,6 +2102,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
break;
|
||||
default:
|
||||
return nullptr;
|
||||
@@ -2123,6 +2139,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
break;
|
||||
default:
|
||||
return nullptr;
|
||||
@@ -2148,6 +2165,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
break;
|
||||
default:
|
||||
return nullptr;
|
||||
@@ -2181,6 +2199,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
break;
|
||||
default:
|
||||
return nullptr;
|
||||
@@ -2206,6 +2225,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
break;
|
||||
default:
|
||||
return nullptr;
|
||||
@@ -3431,7 +3451,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
|
||||
|
||||
const uint64_t nei0 = ids->ne[0];
|
||||
const uint64_t nei1 = ids->ne[1];
|
||||
GGML_ASSERT(nei0 * nei1 <= 2048);
|
||||
GGML_ASSERT(nei0 * nei1 <= 3072);
|
||||
|
||||
const uint32_t nbi1 = ids->nb[1];
|
||||
const uint32_t nbi2 = ids->nb[2];
|
||||
@@ -3443,8 +3463,6 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
|
||||
|
||||
const uint64_t n_as = ne02;
|
||||
|
||||
GGML_ASSERT(n_as <= 8);
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
@@ -4623,22 +4641,22 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
||||
}
|
||||
}
|
||||
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it);
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx->device, p, num_it);
|
||||
if (split_k > 1) {
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
|
||||
|
||||
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
|
||||
// Resize buffer
|
||||
if (ctx->prealloc_split_k != nullptr) {
|
||||
ggml_vk_destroy_buffer(ctx->prealloc_split_k);
|
||||
}
|
||||
ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
}
|
||||
}
|
||||
|
||||
vk_buffer d_X = ggml_vk_create_buffer_check(ctx, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_Y = ggml_vk_create_buffer_check(ctx, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_D = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_D = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
|
||||
X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne);
|
||||
Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne);
|
||||
@@ -4665,12 +4683,12 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
||||
}
|
||||
}
|
||||
|
||||
ggml_vk_buffer_write(ctx, d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
|
||||
ggml_vk_buffer_write(ctx, d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
|
||||
ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
|
||||
ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
|
||||
|
||||
vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
||||
for (size_t i = 0; i < num_it; i++) {
|
||||
ggml_vk_ctx_begin(ctx, subctx);
|
||||
ggml_vk_ctx_begin(ctx->device, subctx);
|
||||
ggml_vk_matmul(
|
||||
ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k),
|
||||
m, n, k,
|
||||
@@ -4689,7 +4707,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
||||
double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|
||||
|
||||
// copy dst to host
|
||||
ggml_vk_buffer_read(ctx, d_D, 0, d, sizeof(float) * d_ne);
|
||||
ggml_vk_buffer_read(d_D, 0, d, sizeof(float) * d_ne);
|
||||
|
||||
float * d_chk = (float *) malloc(sizeof(float) * d_ne);
|
||||
|
||||
@@ -4765,7 +4783,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
||||
|
||||
if (split_k > 1) {
|
||||
float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
|
||||
ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
|
||||
ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
|
||||
|
||||
std::cerr << "d_buf0: " << std::endl << std::endl;
|
||||
ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
|
||||
@@ -4785,8 +4803,8 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
||||
|
||||
free(d_chk);
|
||||
|
||||
ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
|
||||
ggml_vk_queue_cleanup(ctx, ctx->device->compute_queue);
|
||||
ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
|
||||
ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
|
||||
|
||||
ggml_vk_destroy_buffer(d_X);
|
||||
ggml_vk_destroy_buffer(d_Y);
|
||||
@@ -4834,90 +4852,23 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) {
|
||||
VK_LOG_DEBUG("ggml_vk_test_transfer(" << ne << ")");
|
||||
// Check transfers are correct
|
||||
vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
|
||||
float * x;
|
||||
float * y;
|
||||
if (pinned) {
|
||||
x = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne);
|
||||
y = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne);
|
||||
} else {
|
||||
x = (float *) malloc(sizeof(float) * ne);
|
||||
y = (float *) malloc(sizeof(float) * ne);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < ne; i++) {
|
||||
x[i] = rand() / (float)RAND_MAX;
|
||||
}
|
||||
|
||||
vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
||||
ggml_vk_ctx_begin(ctx, subctx);
|
||||
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
|
||||
ggml_vk_buffer_write_async(ctx, subctx, buffer, 0, x, sizeof(float) * ne);
|
||||
|
||||
for (auto& cpy : subctx->in_memcpys) {
|
||||
memcpy(cpy.dst, cpy.src, cpy.n);
|
||||
}
|
||||
subctx->in_memcpys.clear();
|
||||
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, ctx->fence);
|
||||
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
|
||||
ctx->device->device.resetFences({ ctx->fence });
|
||||
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
double ms_to_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|
||||
|
||||
ggml_vk_ctx_begin(ctx, subctx);
|
||||
|
||||
begin = std::chrono::high_resolution_clock::now();
|
||||
|
||||
ggml_vk_buffer_read_async(ctx, subctx, buffer, 0, y, sizeof(float) * ne);
|
||||
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, ctx->fence);
|
||||
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
|
||||
ctx->device->device.resetFences({ ctx->fence });
|
||||
|
||||
for (auto& cpy : subctx->out_memcpys) {
|
||||
memcpy(cpy.dst, cpy.src, cpy.n);
|
||||
}
|
||||
subctx->out_memcpys.clear();
|
||||
|
||||
end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
double ms_from_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|
||||
|
||||
double avg_err = 0.0;
|
||||
for (size_t i = 0; i < ne; i++) {
|
||||
avg_err += std::fabs(x[i] - y[i]);
|
||||
}
|
||||
|
||||
double kb = ne * sizeof(float) / 1024.0;
|
||||
|
||||
std::cerr << "TEST TRANSFER " << kb << " KB to_gpu " << ms_to_gpu << "ms (" << kb / ms_to_gpu * 1000.0 / 1024.0 << " MB/s) from_gpu " << ms_from_gpu << "ms (" << kb / ms_from_gpu * 1000.0 / 1024.0 << " MB/s) avg_err=" << avg_err / ne << std::endl;
|
||||
|
||||
ggml_vk_destroy_buffer(buffer);
|
||||
|
||||
if (pinned) {
|
||||
ggml_vk_host_free(ctx, x);
|
||||
ggml_vk_host_free(ctx, y);
|
||||
} else {
|
||||
free(x);
|
||||
free(y);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) {
|
||||
ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr);
|
||||
}
|
||||
|
||||
static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant) {
|
||||
if (quant == GGML_TYPE_F32) {
|
||||
memcpy(to, from, sizeof(float) * ne);
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_type_traits_t tt = ggml_internal_get_type_traits(quant);
|
||||
|
||||
ggml_to_float_t dequant_fn = tt.to_float;
|
||||
|
||||
dequant_fn(from, to, ne);
|
||||
}
|
||||
|
||||
static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
|
||||
VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")");
|
||||
const size_t x_sz = sizeof(float) * ne;
|
||||
@@ -4925,24 +4876,26 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
||||
const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
|
||||
float * x = (float *) malloc(x_sz);
|
||||
void * qx = malloc(qx_sz);
|
||||
vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer x_buf = ggml_vk_create_buffer_check(ctx, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer x_buf = ggml_vk_create_buffer_check(ctx->device, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
float * x_ref = (float *) malloc(x_sz);
|
||||
ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16);
|
||||
|
||||
for (size_t i = 0; i < ne; i++) {
|
||||
x[i] = rand() / (float)RAND_MAX;
|
||||
}
|
||||
|
||||
vk_pipeline p = ctx->device->pipeline_dequant[quant];
|
||||
vk_pipeline p = ggml_vk_get_to_fp16(ctx, quant);
|
||||
|
||||
ggml_vk_quantize_data(x, qx, ne, quant);
|
||||
ggml_vk_dequantize_data(qx, x_ref, ne, quant);
|
||||
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx, p, 1);
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx->device, p, 1);
|
||||
|
||||
ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz);
|
||||
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
|
||||
|
||||
vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
||||
ggml_vk_ctx_begin(ctx, subctx);
|
||||
ggml_vk_ctx_begin(ctx->device, subctx);
|
||||
const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
|
||||
ggml_vk_ctx_end(subctx);
|
||||
@@ -4956,13 +4909,13 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
double ms_dequant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|
||||
ggml_vk_buffer_read(ctx, x_buf, 0, x_chk, x_sz_f16);
|
||||
ggml_vk_buffer_read(x_buf, 0, x_chk, x_sz_f16);
|
||||
|
||||
int first_err = -1;
|
||||
|
||||
double avg_err = 0.0;
|
||||
for (size_t i = 0; i < ne; i++) {
|
||||
double error = std::fabs(x[i] - ggml_fp16_to_fp32(x_chk[i]));
|
||||
double error = std::fabs(x_ref[i] - ggml_fp16_to_fp32(x_chk[i]));
|
||||
avg_err += error;
|
||||
|
||||
if (first_err < 0 && error > 0.05) {
|
||||
@@ -4982,7 +4935,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
||||
}
|
||||
std::cerr << std::endl << "Expected result: " << std::endl << std::endl;
|
||||
for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) {
|
||||
std::cerr << x[i] << ", ";
|
||||
std::cerr << x_ref[i] << ", ";
|
||||
}
|
||||
std::cerr << std::endl;
|
||||
}
|
||||
@@ -4992,6 +4945,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
||||
|
||||
free(x);
|
||||
free(qx);
|
||||
free(x_ref);
|
||||
free(x_chk);
|
||||
}
|
||||
|
||||
@@ -5040,9 +4994,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
||||
float * x = (float *) malloc(x_sz);
|
||||
float * y = (float *) malloc(y_sz);
|
||||
void * qx = malloc(qx_sz);
|
||||
vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer y_buf = ggml_vk_create_buffer_check(ctx, y_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_buf = ggml_vk_create_buffer_check(ctx, d_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer y_buf = ggml_vk_create_buffer_check(ctx->device, y_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_buf = ggml_vk_create_buffer_check(ctx->device, d_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
float * d = (float *) malloc(d_sz);
|
||||
float * d_chk = (float *) malloc(d_sz);
|
||||
|
||||
@@ -5057,25 +5011,25 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
||||
y[i] = (i % k == i / k) ? 1.0f : 0.0f;
|
||||
}
|
||||
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it);
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx->device, p, num_it);
|
||||
if (split_k > 1) {
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
|
||||
|
||||
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
|
||||
// Resize buffer
|
||||
if (ctx->prealloc_split_k != nullptr) {
|
||||
ggml_vk_destroy_buffer(ctx->prealloc_split_k);
|
||||
}
|
||||
ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz);
|
||||
ggml_vk_buffer_write(ctx, y_buf, 0, y, y_sz);
|
||||
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
|
||||
ggml_vk_buffer_write(y_buf, 0, y, y_sz);
|
||||
|
||||
vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
||||
for (size_t i = 0; i < num_it; i++) {
|
||||
ggml_vk_ctx_begin(ctx, subctx);
|
||||
ggml_vk_ctx_begin(ctx->device, subctx);
|
||||
ggml_vk_matmul(
|
||||
ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k),
|
||||
m, n, k,
|
||||
@@ -5094,7 +5048,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
double time_ms = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|
||||
ggml_vk_buffer_read(ctx, d_buf, 0, d, d_sz);
|
||||
ggml_vk_buffer_read(d_buf, 0, d, d_sz);
|
||||
|
||||
ggml_init_params iparams = {
|
||||
/*.mem_size =*/ 1024*1024*1024,
|
||||
@@ -5149,7 +5103,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
||||
|
||||
if (split_k > 1) {
|
||||
float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
|
||||
ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
|
||||
ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
|
||||
|
||||
std::cerr << "d_buf0: " << std::endl << std::endl;
|
||||
ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
|
||||
@@ -5302,12 +5256,9 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
|
||||
|
||||
static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
||||
#if defined(GGML_VULKAN_RUN_TESTS)
|
||||
ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
|
||||
ctx->staging = ggml_vk_create_buffer_check(ctx->device, 100ul * 1024ul * 1024ul,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
ggml_vk_test_transfer(ctx, 8192 * 1000, false);
|
||||
ggml_vk_test_transfer(ctx, 8192 * 1000, true);
|
||||
|
||||
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32);
|
||||
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0);
|
||||
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1);
|
||||
@@ -5319,85 +5270,90 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
||||
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_K);
|
||||
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K);
|
||||
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K);
|
||||
ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_IQ4_NL);
|
||||
|
||||
ggml_vk_test_matmul<ggml_fp16_t, ggml_fp16_t>(ctx, 512, 512, 100, 32, 100, 1, 2);
|
||||
|
||||
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 0);
|
||||
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 1);
|
||||
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 2);
|
||||
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 0);
|
||||
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 1);
|
||||
ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 2);
|
||||
// ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 0);
|
||||
// ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 1);
|
||||
// ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 2);
|
||||
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_0);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_0);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_0);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_0);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_0);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0);
|
||||
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_1);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_1);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_1);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1);
|
||||
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_0);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_0);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_0);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0);
|
||||
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_1);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_1);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_1);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1);
|
||||
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q8_0);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q8_0);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q8_0);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
|
||||
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
|
||||
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
|
||||
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
|
||||
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
|
||||
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
|
||||
// ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
|
||||
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_IQ4_NL);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_IQ4_NL);
|
||||
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_IQ4_NL);
|
||||
|
||||
std::cerr << std::endl;
|
||||
|
||||
@@ -5429,9 +5385,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
||||
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0);
|
||||
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1);
|
||||
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2);
|
||||
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0);
|
||||
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1);
|
||||
ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2);
|
||||
// ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0);
|
||||
// ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1);
|
||||
// ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2);
|
||||
std::cerr << std::endl;
|
||||
}
|
||||
|
||||
@@ -6263,6 +6219,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
@@ -6291,6 +6248,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
||||
@@ -22005,6 +22005,14 @@ int ggml_cpu_has_cann(void) {
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_llamafile(void) {
|
||||
#if defined(GGML_USE_LLAMAFILE)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_gpublas(void) {
|
||||
return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
|
||||
}
|
||||
|
||||
@@ -58,3 +58,11 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||
return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_IQ4_NL)
|
||||
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||
const float d = float(data_a[a_offset + ib].d);
|
||||
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
|
||||
return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
|
||||
}
|
||||
#endif
|
||||
|
||||
30
ggml/src/vulkan-shaders/dequant_iq4_nl.comp
Normal file
30
ggml/src/vulkan-shaders/dequant_iq4_nl.comp
Normal file
@@ -0,0 +1,30 @@
|
||||
#version 450
|
||||
|
||||
#include "dequant_head.comp"
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer A {block_iq4_nl data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
|
||||
|
||||
const uint tid = gl_LocalInvocationID.x % 64;
|
||||
const uint il = tid/32;
|
||||
const uint ir = tid%32;
|
||||
const uint ib = 32*i + ir;
|
||||
if (ib >= p.nel / 32) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint q_idx = 8*il;
|
||||
const uint b_idx = 1024*i + 32*ir + q_idx;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
|
||||
[[unroll]] for (uint l = 0; l < 8; ++l) {
|
||||
data_b[b_idx + l + 0] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]);
|
||||
data_b[b_idx + l + 16] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >> 4]);
|
||||
}
|
||||
}
|
||||
@@ -18,15 +18,13 @@ void main() {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint b_idx = 1024*i + 32*ir + 8*il;
|
||||
const uint q_idx = 8*il;
|
||||
const uint b_idx = 1024*i + 32*ir + q_idx;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const float dm = -8.0f * d;
|
||||
|
||||
const uint q_idx = 8*il;
|
||||
|
||||
[[unroll]] for (uint l = 0; l < 8; ++l) {
|
||||
data_b[b_idx + l + 0] = D_TYPE(d * (data_a[ib].qs[q_idx + l] & 0xF) + dm);
|
||||
data_b[b_idx + l + 16] = D_TYPE(d * (data_a[ib].qs[q_idx + l] >> 4) + dm);
|
||||
data_b[b_idx + l + 0] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] & 0xF) - 8.0f));
|
||||
data_b[b_idx + l + 16] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] >> 4) - 8.0f));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -71,7 +71,7 @@ shared FLOAT_TYPE buf_a[BM * (BK+1)];
|
||||
shared FLOAT_TYPE buf_b[BN * (BK+1)];
|
||||
|
||||
#ifdef MUL_MAT_ID
|
||||
shared u16vec2 row_ids[2048];
|
||||
shared u16vec2 row_ids[3072];
|
||||
#endif
|
||||
|
||||
void main() {
|
||||
@@ -380,6 +380,19 @@ void main() {
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32));
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
|
||||
#elif defined(DATA_A_IQ4_NL)
|
||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a;
|
||||
|
||||
const uint ib = idx / 16;
|
||||
const uint iqs = idx & 0xF;
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const uint vui = uint(data_a[ib].qs[iqs]);
|
||||
const vec2 v = vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
||||
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
|
||||
#endif
|
||||
}
|
||||
[[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
|
||||
|
||||
@@ -177,3 +177,24 @@ struct block_q6_K
|
||||
|
||||
#define A_TYPE block_q6_K
|
||||
#endif
|
||||
|
||||
// IQuants
|
||||
|
||||
#if defined(DATA_A_IQ4_NL)
|
||||
#extension GL_EXT_shader_16bit_storage : require
|
||||
#define QUANT_K 32
|
||||
#define QUANT_R 2
|
||||
|
||||
struct block_iq4_nl
|
||||
{
|
||||
float16_t d;
|
||||
uint8_t qs[QUANT_K/2];
|
||||
};
|
||||
|
||||
#define A_TYPE block_iq4_nl
|
||||
|
||||
const int8_t kvalues_iq4nl[16] = {
|
||||
int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
|
||||
int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
|
||||
};
|
||||
#endif
|
||||
|
||||
@@ -52,7 +52,8 @@ const std::vector<std::string> type_names = {
|
||||
"q3_k",
|
||||
"q4_k",
|
||||
"q5_k",
|
||||
"q6_k"
|
||||
"q6_k",
|
||||
"iq4_nl"
|
||||
};
|
||||
|
||||
void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str) {
|
||||
|
||||
@@ -93,6 +93,8 @@ extern "C" {
|
||||
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
|
||||
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
|
||||
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
||||
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
||||
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
||||
};
|
||||
|
||||
// note: these values should be synchronized with ggml_rope
|
||||
@@ -527,12 +529,16 @@ extern "C" {
|
||||
struct llama_lora_adapter * adapter,
|
||||
float scale);
|
||||
|
||||
// Remove a LoRA adapter from given context
|
||||
// Remove a specific LoRA adapter from given context
|
||||
// Return -1 if the adapter is not present in the context
|
||||
LLAMA_API int32_t llama_lora_adapter_remove(
|
||||
struct llama_context * ctx,
|
||||
struct llama_lora_adapter * adapter);
|
||||
|
||||
// Remove all LoRA adapters from given context
|
||||
LLAMA_API void llama_lora_adapter_clear(
|
||||
struct llama_context * ctx);
|
||||
|
||||
// Manually free a LoRA adapter
|
||||
// Note: loaded adapters will be free when the associated model is deleted
|
||||
LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
|
||||
@@ -904,10 +910,10 @@ extern "C" {
|
||||
LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
|
||||
|
||||
// Returns -1 if unknown, 1 for true or 0 for false.
|
||||
LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
|
||||
|
||||
// Returns -1 if unknown, 1 for true or 0 for false.
|
||||
LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
|
||||
|
||||
// Codellama infill tokens
|
||||
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
|
||||
@@ -963,6 +969,10 @@ extern "C" {
|
||||
bool remove_special,
|
||||
bool unparse_special);
|
||||
|
||||
//
|
||||
// Chat templates
|
||||
//
|
||||
|
||||
/// Apply chat template. Inspired by hf apply_chat_template() on python.
|
||||
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
|
||||
/// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
|
||||
@@ -1001,6 +1011,23 @@ extern "C" {
|
||||
|
||||
LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
|
||||
|
||||
/// @details Apply constraints from grammar
|
||||
LLAMA_API void llama_grammar_sample(
|
||||
const struct llama_grammar * grammar,
|
||||
const struct llama_context * ctx,
|
||||
llama_token_data_array * candidates);
|
||||
LLAMA_API DEPRECATED(void llama_sample_grammar(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
const struct llama_grammar * grammar),
|
||||
"use llama_grammar_sample instead");
|
||||
|
||||
/// @details Accepts the sampled token into the grammar
|
||||
LLAMA_API void llama_grammar_accept_token(
|
||||
struct llama_grammar * grammar,
|
||||
struct llama_context * ctx,
|
||||
llama_token token);
|
||||
|
||||
//
|
||||
// Sampling functions
|
||||
//
|
||||
@@ -1082,12 +1109,6 @@ extern "C" {
|
||||
llama_token_data_array * candidates,
|
||||
float temp);
|
||||
|
||||
/// @details Apply constraints from grammar
|
||||
LLAMA_API void llama_sample_grammar(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
const struct llama_grammar * grammar);
|
||||
|
||||
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
||||
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
||||
@@ -1125,12 +1146,6 @@ extern "C" {
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates);
|
||||
|
||||
/// @details Accepts the sampled token into the grammar
|
||||
LLAMA_API void llama_grammar_accept_token(
|
||||
struct llama_context * ctx,
|
||||
struct llama_grammar * grammar,
|
||||
llama_token token);
|
||||
|
||||
//
|
||||
// Model split
|
||||
//
|
||||
@@ -1173,38 +1188,45 @@ extern "C" {
|
||||
|
||||
struct ggml_tensor;
|
||||
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
|
||||
struct llama_context * ctx
|
||||
);
|
||||
|
||||
struct llama_partial_utf8 {
|
||||
uint32_t value; // bit value so far (unshifted)
|
||||
int n_remain; // num bytes remaining; -1 indicates invalid sequence
|
||||
};
|
||||
|
||||
struct llama_grammar {
|
||||
const std::vector<std::vector<llama_grammar_element>> rules;
|
||||
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
||||
|
||||
// buffer for partially generated UTF-8 sequence from accepted tokens
|
||||
llama_partial_utf8 partial_utf8;
|
||||
};
|
||||
|
||||
struct llama_grammar_candidate {
|
||||
size_t index;
|
||||
const uint32_t * code_points;
|
||||
llama_partial_utf8 partial_utf8;
|
||||
};
|
||||
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
|
||||
struct llama_context * ctx
|
||||
);
|
||||
using llama_grammar_rule = std::vector< llama_grammar_element>;
|
||||
using llama_grammar_stack = std::vector<const llama_grammar_element *>;
|
||||
|
||||
using llama_grammar_rules = std::vector<llama_grammar_rule>;
|
||||
using llama_grammar_stacks = std::vector<llama_grammar_stack>;
|
||||
using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
|
||||
|
||||
const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar * grammar);
|
||||
llama_grammar_stacks & llama_grammar_get_stacks( struct llama_grammar * grammar);
|
||||
|
||||
void llama_grammar_accept(
|
||||
const std::vector<std::vector<llama_grammar_element>> & rules,
|
||||
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
||||
const uint32_t chr,
|
||||
std::vector<std::vector<const llama_grammar_element *>> & new_stacks);
|
||||
const llama_grammar_rules & rules,
|
||||
const llama_grammar_stacks & stacks,
|
||||
const uint32_t chr,
|
||||
llama_grammar_stacks & new_stacks);
|
||||
|
||||
std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
|
||||
const llama_grammar_rules & rules,
|
||||
const llama_grammar_stack & stack,
|
||||
const llama_grammar_candidates & candidates);
|
||||
|
||||
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
||||
const std::string & src,
|
||||
llama_partial_utf8 partial_start);
|
||||
llama_partial_utf8 partial_start);
|
||||
|
||||
// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
|
||||
// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -14,6 +14,9 @@ endif()
|
||||
add_library(llama
|
||||
../include/llama.h
|
||||
llama.cpp
|
||||
llama-vocab.cpp
|
||||
llama-grammar.cpp
|
||||
llama-sampling.cpp
|
||||
unicode.h
|
||||
unicode.cpp
|
||||
unicode-data.cpp
|
||||
|
||||
539
src/llama-grammar.cpp
Normal file
539
src/llama-grammar.cpp
Normal file
@@ -0,0 +1,539 @@
|
||||
#include "llama-grammar.h"
|
||||
|
||||
#include "llama-vocab.h"
|
||||
#include "llama-sampling.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
|
||||
// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
|
||||
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
||||
const std::string & src,
|
||||
llama_partial_utf8 partial_start) {
|
||||
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
|
||||
const char * pos = src.c_str();
|
||||
std::vector<uint32_t> code_points;
|
||||
|
||||
// common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
|
||||
code_points.reserve(src.size() + 1);
|
||||
uint32_t value = partial_start.value;
|
||||
int n_remain = partial_start.n_remain;
|
||||
|
||||
// continue previous decode, if applicable
|
||||
while (*pos != 0 && n_remain > 0) {
|
||||
uint8_t next_byte = static_cast<uint8_t>(*pos);
|
||||
if ((next_byte >> 6) != 2) {
|
||||
// invalid sequence, abort
|
||||
code_points.push_back(0);
|
||||
return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
|
||||
}
|
||||
value = (value << 6) + (next_byte & 0x3F);
|
||||
++pos;
|
||||
--n_remain;
|
||||
}
|
||||
|
||||
if (partial_start.n_remain > 0 && n_remain == 0) {
|
||||
code_points.push_back(value);
|
||||
}
|
||||
|
||||
// decode any subsequent utf-8 sequences, which may end in an incomplete one
|
||||
while (*pos != 0) {
|
||||
uint8_t first_byte = static_cast<uint8_t>(*pos);
|
||||
uint8_t highbits = first_byte >> 4;
|
||||
n_remain = lookup[highbits] - 1;
|
||||
|
||||
if (n_remain < 0) {
|
||||
// invalid sequence, abort
|
||||
code_points.clear();
|
||||
code_points.push_back(0);
|
||||
return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
|
||||
}
|
||||
|
||||
uint8_t mask = (1 << (7 - n_remain)) - 1;
|
||||
value = first_byte & mask;
|
||||
|
||||
++pos;
|
||||
while (*pos != 0 && n_remain > 0) {
|
||||
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
|
||||
++pos;
|
||||
--n_remain;
|
||||
}
|
||||
if (n_remain == 0) {
|
||||
code_points.push_back(value);
|
||||
}
|
||||
}
|
||||
code_points.push_back(0);
|
||||
|
||||
return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
|
||||
}
|
||||
|
||||
const llama_grammar_rules & llama_grammar_get_rules(const struct llama_grammar * grammar) {
|
||||
return grammar->rules;
|
||||
}
|
||||
|
||||
llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar) {
|
||||
return grammar->stacks;
|
||||
}
|
||||
|
||||
// returns true iff pos points to the end of one of the definitions of a rule
|
||||
static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
|
||||
switch (pos->type) {
|
||||
case LLAMA_GRETYPE_END: return true; // NOLINT
|
||||
case LLAMA_GRETYPE_ALT: return true; // NOLINT
|
||||
default: return false;
|
||||
}
|
||||
}
|
||||
|
||||
// returns true iff chr satisfies the char range at pos (regular or inverse range)
|
||||
// asserts that pos is pointing to a char range element
|
||||
static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
|
||||
const llama_grammar_element * pos,
|
||||
const uint32_t chr) {
|
||||
|
||||
bool found = false;
|
||||
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
|
||||
|
||||
GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
|
||||
|
||||
do {
|
||||
if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
|
||||
// inclusive range, e.g. [a-z]
|
||||
found = found || (pos->value <= chr && chr <= pos[1].value);
|
||||
pos += 2;
|
||||
} else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
|
||||
// Any character matches "."
|
||||
found = true;
|
||||
pos += 1;
|
||||
} else {
|
||||
// exact char match, e.g. [a] or "a"
|
||||
found = found || pos->value == chr;
|
||||
pos += 1;
|
||||
}
|
||||
} while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
|
||||
|
||||
return std::make_pair(found == is_positive_char, pos);
|
||||
}
|
||||
|
||||
// returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
|
||||
// range at pos (regular or inverse range)
|
||||
// asserts that pos is pointing to a char range element
|
||||
static bool llama_grammar_match_partial_char(
|
||||
const llama_grammar_element * pos,
|
||||
const llama_partial_utf8 partial_utf8) {
|
||||
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
|
||||
GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
|
||||
|
||||
uint32_t partial_value = partial_utf8.value;
|
||||
int n_remain = partial_utf8.n_remain;
|
||||
|
||||
// invalid sequence or 7-bit char split across 2 bytes (overlong)
|
||||
if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// range of possible code points this partial UTF-8 sequence could complete to
|
||||
uint32_t low = partial_value << (n_remain * 6);
|
||||
uint32_t high = low | ((1 << (n_remain * 6)) - 1);
|
||||
|
||||
if (low == 0) {
|
||||
if (n_remain == 2) {
|
||||
low = 1 << 11;
|
||||
} else if (n_remain == 3) {
|
||||
low = 1 << 16;
|
||||
}
|
||||
}
|
||||
|
||||
do {
|
||||
if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
|
||||
// inclusive range, e.g. [a-z]
|
||||
if (pos->value <= high && low <= pos[1].value) {
|
||||
return is_positive_char;
|
||||
}
|
||||
pos += 2;
|
||||
} else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
|
||||
// Any character matches "."
|
||||
return true;
|
||||
} else {
|
||||
// exact char match, e.g. [a] or "a"
|
||||
if (low <= pos->value && pos->value <= high) {
|
||||
return is_positive_char;
|
||||
}
|
||||
pos += 1;
|
||||
}
|
||||
} while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
|
||||
|
||||
return !is_positive_char;
|
||||
}
|
||||
|
||||
// transforms a grammar pushdown stack into N possible stacks, all ending
|
||||
// at a character range (terminal element)
|
||||
static void llama_grammar_advance_stack(
|
||||
const llama_grammar_rules & rules,
|
||||
const llama_grammar_stack & stack,
|
||||
llama_grammar_stacks & new_stacks) {
|
||||
if (stack.empty()) {
|
||||
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
|
||||
new_stacks.emplace_back(stack);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
const llama_grammar_element * pos = stack.back();
|
||||
|
||||
switch (pos->type) {
|
||||
case LLAMA_GRETYPE_RULE_REF: {
|
||||
const size_t rule_id = static_cast<size_t>(pos->value);
|
||||
const llama_grammar_element * subpos = rules[rule_id].data();
|
||||
do {
|
||||
// init new stack without the top (pos)
|
||||
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
|
||||
if (!llama_grammar_is_end_of_sequence(pos + 1)) {
|
||||
// if this rule ref is followed by another element, add that to stack
|
||||
new_stack.push_back(pos + 1);
|
||||
}
|
||||
if (!llama_grammar_is_end_of_sequence(subpos)) {
|
||||
// if alternate is nonempty, add to stack
|
||||
new_stack.push_back(subpos);
|
||||
}
|
||||
llama_grammar_advance_stack(rules, new_stack, new_stacks);
|
||||
while (!llama_grammar_is_end_of_sequence(subpos)) {
|
||||
// scan to end of alternate def
|
||||
subpos++;
|
||||
}
|
||||
if (subpos->type == LLAMA_GRETYPE_ALT) {
|
||||
// there's another alternate def of this rule to process
|
||||
subpos++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} while (true);
|
||||
break;
|
||||
}
|
||||
case LLAMA_GRETYPE_CHAR:
|
||||
case LLAMA_GRETYPE_CHAR_NOT:
|
||||
case LLAMA_GRETYPE_CHAR_ANY:
|
||||
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
|
||||
// only add the stack if it's not a duplicate of one we already have
|
||||
new_stacks.emplace_back(stack);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
|
||||
// (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
|
||||
// those
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
// takes a set of possible pushdown stacks on a grammar, which are required to
|
||||
// be positioned at a character range (see `llama_grammar_advance_stack`), and
|
||||
// produces the N possible stacks if the given char is accepted at those
|
||||
// positions
|
||||
void llama_grammar_accept(
|
||||
const llama_grammar_rules & rules,
|
||||
const llama_grammar_stacks & stacks,
|
||||
const uint32_t chr,
|
||||
llama_grammar_stacks & new_stacks) {
|
||||
new_stacks.clear();
|
||||
|
||||
for (const auto & stack : stacks) {
|
||||
if (stack.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto match = llama_grammar_match_char(stack.back(), chr);
|
||||
if (match.first) {
|
||||
const llama_grammar_element * pos = match.second;
|
||||
|
||||
// update top of stack to next element, if any
|
||||
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
|
||||
if (!llama_grammar_is_end_of_sequence(pos)) {
|
||||
new_stack.push_back(pos);
|
||||
}
|
||||
llama_grammar_advance_stack(rules, new_stack, new_stacks);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static llama_grammar_candidates llama_grammar_reject_candidates(
|
||||
const llama_grammar_rules & rules,
|
||||
const llama_grammar_stacks & stacks,
|
||||
const llama_grammar_candidates & candidates) {
|
||||
GGML_ASSERT(!stacks.empty()); // REVIEW
|
||||
|
||||
if (candidates.empty()) {
|
||||
return {};
|
||||
}
|
||||
|
||||
auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
|
||||
|
||||
for (size_t i = 1, size = stacks.size(); i < size; ++i) {
|
||||
rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
|
||||
}
|
||||
return rejects;
|
||||
}
|
||||
|
||||
llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
|
||||
const llama_grammar_rules & rules,
|
||||
const llama_grammar_stack & stack,
|
||||
const llama_grammar_candidates & candidates) {
|
||||
|
||||
llama_grammar_candidates rejects;
|
||||
rejects.reserve(candidates.size());
|
||||
|
||||
if (stack.empty()) {
|
||||
for (const auto & tok : candidates) {
|
||||
if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
|
||||
rejects.push_back(tok);
|
||||
}
|
||||
}
|
||||
return rejects;
|
||||
}
|
||||
|
||||
const llama_grammar_element * stack_pos = stack.back();
|
||||
|
||||
llama_grammar_candidates next_candidates;
|
||||
next_candidates.reserve(candidates.size());
|
||||
|
||||
for (const auto & tok : candidates) {
|
||||
if (*tok.code_points == 0) {
|
||||
// reached end of full codepoints in token, reject iff it ended in a partial sequence
|
||||
// that cannot satisfy this position in grammar
|
||||
if (tok.partial_utf8.n_remain != 0 &&
|
||||
!llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
|
||||
rejects.push_back(tok);
|
||||
}
|
||||
} else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
|
||||
next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
|
||||
} else {
|
||||
rejects.push_back(tok);
|
||||
}
|
||||
}
|
||||
|
||||
const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
|
||||
|
||||
// update top of stack to next element, if any
|
||||
llama_grammar_stack stack_after(stack.begin(), stack.end() - 1);
|
||||
if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
|
||||
stack_after.push_back(stack_pos_after);
|
||||
}
|
||||
llama_grammar_stacks next_stacks;
|
||||
llama_grammar_advance_stack(rules, stack_after, next_stacks);
|
||||
|
||||
auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
|
||||
for (const auto & tok : next_rejects) {
|
||||
rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
|
||||
}
|
||||
|
||||
return rejects;
|
||||
}
|
||||
|
||||
static bool llama_grammar_detect_left_recursion(
|
||||
const llama_grammar_rules & rules,
|
||||
size_t rule_index,
|
||||
std::vector<bool> * rules_visited,
|
||||
std::vector<bool> * rules_in_progress,
|
||||
std::vector<bool> * rules_may_be_empty) {
|
||||
if ((*rules_in_progress)[rule_index]) {
|
||||
return true;
|
||||
}
|
||||
|
||||
(*rules_in_progress)[rule_index] = true;
|
||||
|
||||
const llama_grammar_rule & rule = rules[rule_index];
|
||||
|
||||
// First check if the rule might produce the empty string. This could be done combined with the second
|
||||
// step but it's more readable as two steps.
|
||||
bool at_rule_start = true;
|
||||
for (size_t i = 0; i < rule.size(); i++) {
|
||||
if (llama_grammar_is_end_of_sequence(&rule[i])) {
|
||||
if (at_rule_start) {
|
||||
(*rules_may_be_empty)[rule_index] = true;
|
||||
break;
|
||||
}
|
||||
at_rule_start = true;
|
||||
} else {
|
||||
at_rule_start = false;
|
||||
}
|
||||
}
|
||||
|
||||
// Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
|
||||
// be empty)
|
||||
bool recurse_into_nonterminal = true;
|
||||
for (size_t i = 0; i < rule.size(); i++) {
|
||||
if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
|
||||
if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
|
||||
return true;
|
||||
}
|
||||
if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
|
||||
recurse_into_nonterminal = false;
|
||||
}
|
||||
} else if (llama_grammar_is_end_of_sequence(&rule[i])) {
|
||||
recurse_into_nonterminal = true;
|
||||
} else {
|
||||
recurse_into_nonterminal = false;
|
||||
}
|
||||
}
|
||||
|
||||
(*rules_in_progress)[rule_index] = false;
|
||||
(*rules_visited)[rule_index] = true;
|
||||
return false;
|
||||
}
|
||||
|
||||
//
|
||||
// grammar - external
|
||||
//
|
||||
|
||||
struct llama_grammar * llama_grammar_init_impl(
|
||||
const llama_grammar_element ** rules,
|
||||
size_t n_rules,
|
||||
size_t start_rule_index) {
|
||||
const llama_grammar_element * pos;
|
||||
|
||||
// copy rule definitions into vectors
|
||||
llama_grammar_rules vec_rules(n_rules);
|
||||
for (size_t i = 0; i < n_rules; i++) {
|
||||
for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
|
||||
vec_rules[i].push_back(*pos);
|
||||
}
|
||||
vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
|
||||
}
|
||||
|
||||
// Check for left recursion
|
||||
std::vector<bool> rules_visited(n_rules);
|
||||
std::vector<bool> rules_in_progress(n_rules);
|
||||
std::vector<bool> rules_may_be_empty(n_rules);
|
||||
for (size_t i = 0; i < n_rules; i++) {
|
||||
if (rules_visited[i]) {
|
||||
continue;
|
||||
}
|
||||
if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
|
||||
LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// loop over alternates of start rule to build initial stacks
|
||||
llama_grammar_stacks stacks;
|
||||
pos = vec_rules[start_rule_index].data();
|
||||
do {
|
||||
llama_grammar_stack stack;
|
||||
if (!llama_grammar_is_end_of_sequence(pos)) {
|
||||
// if alternate is nonempty, add to stack
|
||||
stack.push_back(pos);
|
||||
}
|
||||
llama_grammar_advance_stack(vec_rules, stack, stacks);
|
||||
while (!llama_grammar_is_end_of_sequence(pos)) {
|
||||
// scan to end of alternate def
|
||||
pos++;
|
||||
}
|
||||
if (pos->type == LLAMA_GRETYPE_ALT) {
|
||||
// there's another alternate def of this rule to process
|
||||
pos++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} while (true);
|
||||
|
||||
// Important: vec_rules has to be moved here, not copied, because stacks contains
|
||||
// pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
|
||||
// then the pointers would be invalidated when the local vec_rules goes out of scope.
|
||||
return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
|
||||
}
|
||||
|
||||
void llama_grammar_free_impl(struct llama_grammar * grammar) {
|
||||
delete grammar;
|
||||
}
|
||||
|
||||
struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar * grammar) {
|
||||
llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
|
||||
|
||||
// redirect elements in stacks to point to new rules
|
||||
for (size_t is = 0; is < result->stacks.size(); is++) {
|
||||
for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
|
||||
for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
|
||||
for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
|
||||
if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
|
||||
result->stacks[is][ie] = &result->rules[ir0][ir1];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void llama_grammar_sample_impl(const struct llama_grammar * grammar, const struct llama_vocab * vocab, const struct llama_sampling * smpl, llama_token_data_array * candidates) {
|
||||
GGML_ASSERT(grammar);
|
||||
GGML_ASSERT(vocab);
|
||||
|
||||
int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
bool allow_eog = false;
|
||||
for (const auto & stack : grammar->stacks) {
|
||||
if (stack.empty()) {
|
||||
allow_eog = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
|
||||
candidates_decoded.reserve(candidates->size);
|
||||
|
||||
llama_grammar_candidates candidates_grammar;
|
||||
candidates_grammar.reserve(candidates->size);
|
||||
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
const llama_token id = candidates->data[i].id;
|
||||
const std::string & piece = vocab->cache_token_to_piece.at(id);
|
||||
|
||||
if (llama_token_is_eog_impl(*vocab, id)) {
|
||||
if (!allow_eog) {
|
||||
candidates->data[i].logit = -INFINITY;
|
||||
}
|
||||
} else if (piece.empty() || piece[0] == 0) {
|
||||
candidates->data[i].logit = -INFINITY;
|
||||
} else {
|
||||
candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
|
||||
candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
|
||||
}
|
||||
}
|
||||
|
||||
const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
|
||||
for (const auto & reject : rejects) {
|
||||
candidates->data[reject.index].logit = -INFINITY;
|
||||
}
|
||||
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
|
||||
void llama_grammar_accept_token_impl(struct llama_grammar * grammar, const struct llama_vocab * vocab, const struct llama_sampling * smpl, llama_token token) {
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
if (llama_token_is_eog_impl(*vocab, token)) {
|
||||
for (const auto & stack : grammar->stacks) {
|
||||
if (stack.empty()) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
const std::string & piece = vocab->cache_token_to_piece.at(token);
|
||||
|
||||
// Note terminating 0 in decoded string
|
||||
const auto decoded = decode_utf8(piece, grammar->partial_utf8);
|
||||
const auto & code_points = decoded.first;
|
||||
|
||||
llama_grammar_stacks tmp_new_stacks;
|
||||
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
||||
llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
|
||||
grammar->stacks = tmp_new_stacks;
|
||||
}
|
||||
|
||||
grammar->partial_utf8 = decoded.second;
|
||||
GGML_ASSERT(!grammar->stacks.empty());
|
||||
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
41
src/llama-grammar.h
Normal file
41
src/llama-grammar.h
Normal file
@@ -0,0 +1,41 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama-impl.h"
|
||||
|
||||
struct llama_vocab;
|
||||
struct llama_sampling;
|
||||
|
||||
struct llama_grammar {
|
||||
const llama_grammar_rules rules;
|
||||
llama_grammar_stacks stacks;
|
||||
|
||||
// buffer for partially generated UTF-8 sequence from accepted tokens
|
||||
llama_partial_utf8 partial_utf8;
|
||||
};
|
||||
|
||||
struct llama_grammar * llama_get_grammar(struct llama_context * ctx);
|
||||
|
||||
//
|
||||
// internal API
|
||||
//
|
||||
|
||||
struct llama_grammar * llama_grammar_init_impl(
|
||||
const llama_grammar_element ** rules,
|
||||
size_t n_rules,
|
||||
size_t start_rule_index);
|
||||
|
||||
void llama_grammar_free_impl(struct llama_grammar * grammar);
|
||||
|
||||
struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar * grammar);
|
||||
|
||||
void llama_grammar_sample_impl(
|
||||
const struct llama_grammar * grammar,
|
||||
const struct llama_vocab * vocab,
|
||||
const struct llama_sampling * smpl,
|
||||
llama_token_data_array * candidates);
|
||||
|
||||
void llama_grammar_accept_token_impl(
|
||||
struct llama_grammar * grammar,
|
||||
const struct llama_vocab * vocab,
|
||||
const struct llama_sampling * smpl,
|
||||
llama_token token);
|
||||
26
src/llama-impl.h
Normal file
26
src/llama-impl.h
Normal file
@@ -0,0 +1,26 @@
|
||||
#pragma once
|
||||
|
||||
#define LLAMA_API_INTERNAL
|
||||
#include "llama.h"
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __MINGW32__
|
||||
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
#else
|
||||
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
#endif
|
||||
#else
|
||||
#define LLAMA_ATTRIBUTE_FORMAT(...)
|
||||
#endif
|
||||
|
||||
//
|
||||
// logging
|
||||
//
|
||||
|
||||
LLAMA_ATTRIBUTE_FORMAT(2, 3)
|
||||
void llama_log_internal (ggml_log_level level, const char * format, ...);
|
||||
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
|
||||
|
||||
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
||||
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
||||
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||
635
src/llama-sampling.cpp
Normal file
635
src/llama-sampling.cpp
Normal file
@@ -0,0 +1,635 @@
|
||||
#include "llama-sampling.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <cfloat>
|
||||
#include <numeric>
|
||||
#include <unordered_map>
|
||||
|
||||
static void llama_log_softmax(float * array, size_t size) {
|
||||
float max_l = *std::max_element(array, array + size);
|
||||
float sum = 0.f;
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
float p = expf(array[i] - max_l);
|
||||
sum += p;
|
||||
array[i] = p;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
array[i] = logf(array[i] / sum);
|
||||
}
|
||||
}
|
||||
|
||||
void llama_set_rng_seed_impl(struct llama_sampling * smpl, uint32_t seed) {
|
||||
if (seed == LLAMA_DEFAULT_SEED) {
|
||||
seed = time(NULL);
|
||||
}
|
||||
|
||||
smpl->rng.seed(seed);
|
||||
}
|
||||
|
||||
void llama_sample_softmax_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) {
|
||||
GGML_ASSERT(candidates->size > 0);
|
||||
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
// Sort the logits in descending order
|
||||
if (!candidates->sorted) {
|
||||
std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
|
||||
return a.logit > b.logit;
|
||||
});
|
||||
candidates->sorted = true;
|
||||
}
|
||||
|
||||
float max_l = candidates->data[0].logit;
|
||||
float cum_sum = 0.0f;
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
float p = expf(candidates->data[i].logit - max_l);
|
||||
candidates->data[i].p = p;
|
||||
cum_sum += p;
|
||||
}
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
candidates->data[i].p /= cum_sum;
|
||||
}
|
||||
|
||||
if (smpl) {
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
}
|
||||
|
||||
void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
|
||||
// TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
|
||||
// if (k >= (int32_t)candidates->size) {
|
||||
// return;
|
||||
// }
|
||||
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
if (k <= 0) {
|
||||
k = candidates->size;
|
||||
}
|
||||
|
||||
k = std::max(k, (int) min_keep);
|
||||
k = std::min(k, (int) candidates->size);
|
||||
|
||||
// Sort scores in descending order
|
||||
if (!candidates->sorted) {
|
||||
auto comp = [](const llama_token_data & a, const llama_token_data & b) {
|
||||
return a.logit > b.logit;
|
||||
};
|
||||
if (k <= 128) {
|
||||
std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
|
||||
} else {
|
||||
constexpr int nbuckets = 128;
|
||||
constexpr float bucket_low = -10.0f;
|
||||
constexpr float bucket_high = 10.0f;
|
||||
constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
|
||||
constexpr float bucker_inter = -bucket_low * bucket_scale;
|
||||
|
||||
std::vector<int> bucket_idx(candidates->size);
|
||||
std::vector<int> histo(nbuckets, 0);
|
||||
|
||||
for (int i = 0; i < (int)candidates->size; ++i) {
|
||||
const float val = candidates->data[i].logit;
|
||||
int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
|
||||
ib = std::max(0, std::min(nbuckets-1, ib));
|
||||
bucket_idx[i] = ib;
|
||||
++histo[ib];
|
||||
}
|
||||
int nhave = 0;
|
||||
int ib = nbuckets - 1;
|
||||
for ( ; ib >= 0; --ib) {
|
||||
nhave += histo[ib];
|
||||
if (nhave >= k) break;
|
||||
}
|
||||
std::vector<llama_token_data> tmp_tokens(nhave);
|
||||
auto ptr = tmp_tokens.data();
|
||||
std::vector<llama_token_data*> bucket_ptrs;
|
||||
bucket_ptrs.reserve(nbuckets - ib);
|
||||
for (int j = nbuckets - 1; j >= ib; --j) {
|
||||
bucket_ptrs.push_back(ptr);
|
||||
ptr += histo[j];
|
||||
}
|
||||
for (int i = 0; i < (int)candidates->size; ++i) {
|
||||
int j = bucket_idx[i];
|
||||
if (j >= ib) {
|
||||
*bucket_ptrs[nbuckets-1-j]++ = candidates->data[i];
|
||||
}
|
||||
}
|
||||
|
||||
ptr = tmp_tokens.data();
|
||||
int ndone = 0;
|
||||
for (int j = nbuckets-1; j > ib; --j) {
|
||||
std::sort(ptr, ptr + histo[j], comp);
|
||||
ptr += histo[j];
|
||||
ndone += histo[j];
|
||||
}
|
||||
std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
|
||||
|
||||
std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data));
|
||||
|
||||
}
|
||||
candidates->sorted = true;
|
||||
}
|
||||
candidates->size = k;
|
||||
|
||||
if (smpl) {
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
}
|
||||
|
||||
void llama_sample_top_p_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
|
||||
if (p >= 1.0f) {
|
||||
return;
|
||||
}
|
||||
|
||||
llama_sample_softmax_impl(smpl, candidates);
|
||||
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
// Compute the cumulative probabilities
|
||||
float cum_sum = 0.0f;
|
||||
size_t last_idx = candidates->size;
|
||||
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
cum_sum += candidates->data[i].p;
|
||||
|
||||
// Check if the running sum is at least p or if we have kept at least min_keep tokens
|
||||
// we set the last index to i+1 to indicate that the current iterate should be included in the set
|
||||
if (cum_sum >= p && i + 1 >= min_keep) {
|
||||
last_idx = i + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Resize the output vector to keep only the top-p tokens
|
||||
candidates->size = last_idx;
|
||||
|
||||
if (smpl) {
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
}
|
||||
|
||||
void llama_sample_min_p_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
|
||||
if (p <= 0.0f || !candidates->size) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
bool min_p_applied = false;
|
||||
|
||||
// if the candidates aren't sorted, try the unsorted implementation first
|
||||
if (!candidates->sorted) {
|
||||
std::vector<llama_token_data> filtered_tokens;
|
||||
|
||||
float max_logit = -FLT_MAX;
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
max_logit = std::max(max_logit, candidates->data[i].logit);
|
||||
}
|
||||
const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
|
||||
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
if (candidates->data[i].logit >= min_logit) {
|
||||
filtered_tokens.push_back(candidates->data[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// if we have enough values the operation was a success
|
||||
if (filtered_tokens.size() >= min_keep) {
|
||||
memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
|
||||
candidates->size = filtered_tokens.size();
|
||||
min_p_applied = true;
|
||||
}
|
||||
}
|
||||
|
||||
// if the candidates are sorted or the unsorted implementation failed, use this implementation
|
||||
if (!min_p_applied) {
|
||||
// Sort the logits in descending order
|
||||
if (!candidates->sorted) {
|
||||
std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
|
||||
return a.logit > b.logit;
|
||||
});
|
||||
candidates->sorted = true;
|
||||
}
|
||||
|
||||
const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
|
||||
size_t i = 1; // first token always matches
|
||||
|
||||
for (; i < candidates->size; ++i) {
|
||||
if (candidates->data[i].logit < min_logit && i >= min_keep) {
|
||||
break; // prob too small
|
||||
}
|
||||
}
|
||||
|
||||
// Resize the output vector to keep only the matching tokens
|
||||
candidates->size = i;
|
||||
}
|
||||
|
||||
if (smpl) {
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
}
|
||||
|
||||
void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float z, size_t min_keep) {
|
||||
if (z >= 1.0f || candidates->size <= 2) {
|
||||
return;
|
||||
}
|
||||
|
||||
llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
// Compute the first and second derivatives
|
||||
std::vector<float> first_derivatives(candidates->size - 1);
|
||||
std::vector<float> second_derivatives(candidates->size - 2);
|
||||
|
||||
for (size_t i = 0; i < first_derivatives.size(); ++i) {
|
||||
first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
|
||||
}
|
||||
for (size_t i = 0; i < second_derivatives.size(); ++i) {
|
||||
second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
|
||||
}
|
||||
|
||||
// Calculate absolute value of second derivatives
|
||||
for (size_t i = 0; i < second_derivatives.size(); ++i) {
|
||||
second_derivatives[i] = std::abs(second_derivatives[i]);
|
||||
}
|
||||
|
||||
// Normalize the second derivatives
|
||||
{
|
||||
const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
|
||||
|
||||
if (second_derivatives_sum > 1e-6f) {
|
||||
for (float & value : second_derivatives) {
|
||||
value /= second_derivatives_sum;
|
||||
}
|
||||
} else {
|
||||
for (float & value : second_derivatives) {
|
||||
value = 1.0f / second_derivatives.size();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float cum_sum = 0.0f;
|
||||
size_t last_idx = candidates->size;
|
||||
for (size_t i = 0; i < second_derivatives.size(); ++i) {
|
||||
cum_sum += second_derivatives[i];
|
||||
|
||||
// Check if the running sum is greater than z or if we have kept at least min_keep tokens
|
||||
if (cum_sum > z && i >= min_keep) {
|
||||
last_idx = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Resize the output vector to keep only the tokens above the tail location
|
||||
candidates->size = last_idx;
|
||||
|
||||
if (smpl) {
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
}
|
||||
|
||||
void llama_sample_typical_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
|
||||
// Reference implementation:
|
||||
// https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
|
||||
if (p >= 1.0f) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Compute the softmax of logits and calculate entropy
|
||||
llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
|
||||
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
float entropy = 0.0f;
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
entropy += -candidates->data[i].p * logf(candidates->data[i].p);
|
||||
}
|
||||
|
||||
// Compute the absolute difference between negative log probability and entropy for each candidate
|
||||
std::vector<float> shifted_scores;
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
|
||||
shifted_scores.push_back(shifted_score);
|
||||
}
|
||||
|
||||
// Sort tokens based on the shifted_scores and their corresponding indices
|
||||
std::vector<size_t> indices(candidates->size);
|
||||
std::iota(indices.begin(), indices.end(), 0);
|
||||
|
||||
std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
|
||||
return shifted_scores[a] < shifted_scores[b];
|
||||
});
|
||||
|
||||
// Compute the cumulative probabilities
|
||||
float cum_sum = 0.0f;
|
||||
size_t last_idx = indices.size();
|
||||
|
||||
for (size_t i = 0; i < indices.size(); ++i) {
|
||||
size_t idx = indices[i];
|
||||
cum_sum += candidates->data[idx].p;
|
||||
|
||||
// Check if the running sum is greater than typical or if we have kept at least min_keep tokens
|
||||
if (cum_sum > p && i >= min_keep - 1) {
|
||||
last_idx = i + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Resize the output vector to keep only the locally typical tokens
|
||||
std::vector<llama_token_data> new_candidates;
|
||||
for (size_t i = 0; i < last_idx; ++i) {
|
||||
size_t idx = indices[i];
|
||||
new_candidates.push_back(candidates->data[idx]);
|
||||
}
|
||||
|
||||
// Replace the data in candidates with the new_candidates data
|
||||
std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
|
||||
candidates->size = new_candidates.size();
|
||||
candidates->sorted = false;
|
||||
|
||||
if (smpl) {
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
}
|
||||
|
||||
void llama_sample_entropy_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val) {
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
// no need to do anything if there is only one (or zero) candidates
|
||||
if(candidates->size <= 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Calculate maximum possible entropy
|
||||
float max_entropy = -logf(1.0f / candidates->size);
|
||||
|
||||
llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
|
||||
|
||||
// Calculate entropy of the softmax probabilities
|
||||
float entropy = 0.0f;
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
float prob = candidates->data[i].p;
|
||||
if (prob > 0.0f) { // Ensure no log(0)
|
||||
entropy -= prob * logf(prob);
|
||||
}
|
||||
}
|
||||
|
||||
// Normalize the entropy (max_entropy cannot be 0 here because we checked candidates->size != 1 above)
|
||||
float normalized_entropy = entropy / max_entropy;
|
||||
|
||||
// Map the normalized entropy to the desired temperature range using the power function
|
||||
float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
|
||||
|
||||
#ifdef DEBUG
|
||||
LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
|
||||
LLAMA_LOG_INFO("Entropy: %f\n", entropy);
|
||||
LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
|
||||
LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
|
||||
LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
|
||||
LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
|
||||
#endif
|
||||
|
||||
// Apply the dynamically calculated temperature scaling
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
candidates->data[i].logit /= dyn_temp;
|
||||
}
|
||||
|
||||
// Re-compute softmax probabilities after scaling logits with dynamic temperature
|
||||
double max_l_double = candidates->data[0].logit;
|
||||
double cum_sum_double = 0.0;
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
double p = exp(candidates->data[i].logit - max_l_double);
|
||||
candidates->data[i].p = p; // Store the scaled probability
|
||||
cum_sum_double += p;
|
||||
}
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
candidates->data[i].p /= cum_sum_double; // Re-normalize the probabilities
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
// Print the updated top 25 probabilities after temperature scaling
|
||||
LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
|
||||
for (size_t i = 0; i < 25 && i < candidates->size; ++i) {
|
||||
LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates->data[i].p * 100.0f);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (smpl) {
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
}
|
||||
|
||||
void llama_sample_temp_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float temp) {
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
candidates->data[i].logit /= temp;
|
||||
}
|
||||
|
||||
if (smpl) {
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
}
|
||||
|
||||
void llama_sample_repetition_penalties_impl(
|
||||
struct llama_sampling * smpl,
|
||||
llama_token_data_array * candidates,
|
||||
const llama_token * last_tokens,
|
||||
size_t penalty_last_n,
|
||||
float penalty_repeat,
|
||||
float penalty_freq,
|
||||
float penalty_present) {
|
||||
if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
// Create a frequency map to count occurrences of each token in last_tokens
|
||||
std::unordered_map<llama_token, int> token_count;
|
||||
for (size_t i = 0; i < penalty_last_n; ++i) {
|
||||
token_count[last_tokens[i]]++;
|
||||
}
|
||||
|
||||
// Apply frequency and presence penalties to the candidates
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
const auto token_iter = token_count.find(candidates->data[i].id);
|
||||
if (token_iter == token_count.end()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const int count = token_iter->second;
|
||||
|
||||
// The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
|
||||
// This is common fix for this problem, which is to multiply by the penalty instead of dividing.
|
||||
if (candidates->data[i].logit <= 0) {
|
||||
candidates->data[i].logit *= penalty_repeat;
|
||||
} else {
|
||||
candidates->data[i].logit /= penalty_repeat;
|
||||
}
|
||||
|
||||
candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
|
||||
}
|
||||
|
||||
candidates->sorted = false;
|
||||
|
||||
if (smpl) {
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
}
|
||||
|
||||
void llama_sample_apply_guidance_impl(
|
||||
struct llama_sampling * smpl,
|
||||
float * logits,
|
||||
float * logits_guidance,
|
||||
float scale) {
|
||||
GGML_ASSERT(smpl);
|
||||
|
||||
const auto t_start_sample_us = ggml_time_us();
|
||||
const auto n_vocab = smpl->n_vocab;
|
||||
|
||||
llama_log_softmax(logits, n_vocab);
|
||||
llama_log_softmax(logits_guidance, n_vocab);
|
||||
|
||||
for (int i = 0; i < n_vocab; ++i) {
|
||||
auto & l = logits[i];
|
||||
const auto & g = logits_guidance[i];
|
||||
|
||||
l = scale * (l - g) + g;
|
||||
}
|
||||
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
|
||||
llama_token llama_sample_token_mirostat_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
|
||||
GGML_ASSERT(smpl);
|
||||
|
||||
const int32_t n_vocab = float(smpl->n_vocab);
|
||||
|
||||
int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
|
||||
|
||||
// Estimate s_hat using the most probable m tokens
|
||||
float s_hat = 0.0;
|
||||
float sum_ti_bi = 0.0;
|
||||
float sum_ti_sq = 0.0;
|
||||
for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
|
||||
float t_i = logf(float(i + 2) / float(i + 1));
|
||||
float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
|
||||
sum_ti_bi += t_i * b_i;
|
||||
sum_ti_sq += t_i * t_i;
|
||||
}
|
||||
s_hat = sum_ti_bi / sum_ti_sq;
|
||||
|
||||
// Compute k from the estimated s_hat and target surprise value
|
||||
float epsilon_hat = s_hat - 1;
|
||||
float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(n_vocab, -epsilon_hat)), 1 / s_hat);
|
||||
|
||||
// Sample the next word X using top-k sampling
|
||||
llama_sample_top_k_impl((struct llama_sampling *) nullptr, candidates, int(k), 1);
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
llama_token X = llama_sample_token_impl(smpl, candidates);
|
||||
t_start_sample_us = ggml_time_us();
|
||||
|
||||
// Compute error as the difference between observed surprise and target surprise value
|
||||
size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
|
||||
return candidate.id == X;
|
||||
}));
|
||||
float observed_surprise = -log2f(candidates->data[X_idx].p);
|
||||
float e = observed_surprise - tau;
|
||||
|
||||
// Update mu using the learning rate and error
|
||||
*mu = *mu - eta * e;
|
||||
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
return X;
|
||||
}
|
||||
|
||||
llama_token llama_sample_token_mirostat_v2_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, float * mu) {
|
||||
int64_t t_start_sample_us;
|
||||
t_start_sample_us = ggml_time_us();
|
||||
|
||||
llama_sample_softmax_impl(smpl, candidates);
|
||||
|
||||
// Truncate the words with surprise values greater than mu
|
||||
candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
|
||||
return -log2f(candidate.p) > *mu;
|
||||
}));
|
||||
|
||||
if (candidates->size == 0) {
|
||||
candidates->size = 1;
|
||||
}
|
||||
|
||||
if (smpl) {
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
|
||||
// Normalize the probabilities of the remaining words
|
||||
llama_sample_softmax_impl(smpl, candidates);
|
||||
|
||||
// Sample the next word X from the remaining words
|
||||
llama_token X = llama_sample_token_impl(smpl, candidates);
|
||||
t_start_sample_us = ggml_time_us();
|
||||
|
||||
// Compute error as the difference between observed surprise and target surprise value
|
||||
size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
|
||||
return candidate.id == X;
|
||||
}));
|
||||
float observed_surprise = -log2f(candidates->data[X_idx].p);
|
||||
float e = observed_surprise - tau;
|
||||
|
||||
// Update mu using the learning rate and error
|
||||
*mu = *mu - eta * e;
|
||||
|
||||
if (smpl) {
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
return X;
|
||||
}
|
||||
|
||||
llama_token llama_sample_token_greedy_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) {
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
// Find max element
|
||||
auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
|
||||
return a.logit < b.logit;
|
||||
});
|
||||
|
||||
llama_token result = max_iter->id;
|
||||
if (smpl) {
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
smpl->n_sample++;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
llama_token llama_sample_token_with_rng_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, std::mt19937 & rng) {
|
||||
GGML_ASSERT(smpl);
|
||||
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
|
||||
|
||||
std::vector<float> probs;
|
||||
probs.reserve(candidates->size);
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
probs.push_back(candidates->data[i].p);
|
||||
}
|
||||
|
||||
std::discrete_distribution<> dist(probs.begin(), probs.end());
|
||||
int idx = dist(rng);
|
||||
|
||||
llama_token result = candidates->data[idx].id;
|
||||
|
||||
smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
smpl->n_sample++;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
llama_token llama_sample_token_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) {
|
||||
return llama_sample_token_with_rng_impl(smpl, candidates, smpl->rng);
|
||||
}
|
||||
56
src/llama-sampling.h
Normal file
56
src/llama-sampling.h
Normal file
@@ -0,0 +1,56 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama-impl.h"
|
||||
|
||||
struct llama_sampling {
|
||||
llama_sampling(int32_t n_vocab) : n_vocab(n_vocab) {}
|
||||
|
||||
std::mt19937 rng;
|
||||
|
||||
int32_t n_vocab = 0;
|
||||
|
||||
mutable int64_t t_sample_us = 0;
|
||||
mutable int32_t n_sample = 0;
|
||||
|
||||
void reset_timings() const {
|
||||
t_sample_us = 0;
|
||||
n_sample = 0;
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// internal API
|
||||
//
|
||||
|
||||
void llama_set_rng_seed_impl(struct llama_sampling * smpl, uint32_t seed);
|
||||
|
||||
void llama_sample_softmax_impl (struct llama_sampling * smpl, llama_token_data_array * candidates);
|
||||
void llama_sample_top_k_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep);
|
||||
void llama_sample_top_p_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
|
||||
void llama_sample_min_p_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
|
||||
void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float z, size_t min_keep);
|
||||
void llama_sample_typical_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
|
||||
void llama_sample_entropy_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val);
|
||||
void llama_sample_temp_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float temp);
|
||||
|
||||
void llama_sample_repetition_penalties_impl(
|
||||
struct llama_sampling * smpl,
|
||||
llama_token_data_array * candidates,
|
||||
const llama_token * last_tokens,
|
||||
size_t penalty_last_n,
|
||||
float penalty_repeat,
|
||||
float penalty_freq,
|
||||
float penalty_present);
|
||||
|
||||
void llama_sample_apply_guidance_impl(
|
||||
struct llama_sampling * smpl,
|
||||
float * logits,
|
||||
float * logits_guidance,
|
||||
float scale);
|
||||
|
||||
llama_token llama_sample_token_mirostat_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu);
|
||||
llama_token llama_sample_token_mirostat_v2_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, float * mu);
|
||||
llama_token llama_sample_token_greedy_impl (struct llama_sampling * smpl, llama_token_data_array * candidates);
|
||||
llama_token llama_sample_token_with_rng_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, std::mt19937 & rng);
|
||||
llama_token llama_sample_token_impl (struct llama_sampling * smpl, llama_token_data_array * candidates);
|
||||
|
||||
1721
src/llama-vocab.cpp
Normal file
1721
src/llama-vocab.cpp
Normal file
File diff suppressed because it is too large
Load Diff
130
src/llama-vocab.h
Normal file
130
src/llama-vocab.h
Normal file
@@ -0,0 +1,130 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama-impl.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <map>
|
||||
|
||||
struct llama_vocab {
|
||||
using id = llama_token;
|
||||
using token = std::string;
|
||||
using tattr = llama_token_attr;
|
||||
|
||||
struct token_data {
|
||||
token text;
|
||||
float score;
|
||||
tattr attr;
|
||||
};
|
||||
|
||||
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
||||
enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
|
||||
int max_token_len = 0; // used for optimizing longest token search
|
||||
|
||||
std::unordered_map<token, id> token_to_id;
|
||||
std::vector<token_data> id_to_token;
|
||||
|
||||
std::vector<id> cache_special_tokens;
|
||||
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
|
||||
|
||||
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
|
||||
|
||||
// default LLaMA special tokens
|
||||
id special_bos_id = 1;
|
||||
id special_eos_id = 2;
|
||||
id special_unk_id = 0;
|
||||
id special_sep_id = -1;
|
||||
id special_pad_id = -1;
|
||||
id special_cls_id = -1;
|
||||
id special_mask_id = -1;
|
||||
|
||||
id linefeed_id = 13;
|
||||
id special_prefix_id = -1;
|
||||
id special_suffix_id = -1;
|
||||
id special_middle_id = -1;
|
||||
id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
|
||||
|
||||
// tokenizer flags
|
||||
bool tokenizer_add_space_prefix = false;
|
||||
bool tokenizer_add_bos = false;
|
||||
bool tokenizer_add_eos = false;
|
||||
bool tokenizer_ignore_merges = false;
|
||||
bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
|
||||
bool tokenizer_remove_extra_whitespaces = false;
|
||||
bool tokenizer_escape_whitespaces = true;
|
||||
bool tokenizer_treat_whitespace_as_suffix = false;
|
||||
|
||||
std::vector<char> precompiled_charsmap;
|
||||
|
||||
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
|
||||
};
|
||||
|
||||
const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx);
|
||||
|
||||
//
|
||||
// internal API
|
||||
//
|
||||
|
||||
// TODO: rename to llama_tokenize_impl
|
||||
// TODO: This should probably be in llama.h
|
||||
std::vector<llama_vocab::id> llama_tokenize_internal(
|
||||
const llama_vocab & vocab,
|
||||
std::string raw_text,
|
||||
bool add_special,
|
||||
bool parse_special = false);
|
||||
|
||||
llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
|
||||
|
||||
const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
|
||||
|
||||
float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);
|
||||
|
||||
llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);
|
||||
|
||||
bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);
|
||||
|
||||
bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
|
||||
|
||||
llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
|
||||
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
|
||||
|
||||
int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab);
|
||||
int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab);
|
||||
|
||||
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
|
||||
llama_token llama_token_eot_impl (const struct llama_vocab & vocab);
|
||||
|
||||
int32_t llama_tokenize_impl(
|
||||
const struct llama_vocab & vocab,
|
||||
const char * text,
|
||||
int32_t text_len,
|
||||
llama_token * tokens,
|
||||
int32_t n_tokens_max,
|
||||
bool add_special,
|
||||
bool parse_special);
|
||||
|
||||
// does not write null-terminator to buf
|
||||
int32_t llama_token_to_piece_impl(
|
||||
const struct llama_vocab & vocab,
|
||||
llama_token token,
|
||||
char * buf,
|
||||
int32_t length,
|
||||
int32_t lstrip,
|
||||
bool special);
|
||||
|
||||
int32_t llama_detokenize_impl(
|
||||
const struct llama_vocab & vocab,
|
||||
const llama_token * tokens,
|
||||
int32_t n_tokens,
|
||||
char * text,
|
||||
int32_t text_len_max,
|
||||
bool remove_special,
|
||||
bool unparse_special);
|
||||
3248
src/llama.cpp
3248
src/llama.cpp
File diff suppressed because it is too large
Load Diff
@@ -19,6 +19,12 @@
|
||||
#include <locale>
|
||||
#include <codecvt>
|
||||
|
||||
size_t unicode_len_utf8(char src) {
|
||||
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
|
||||
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
|
||||
return lookup[highbits];
|
||||
}
|
||||
|
||||
static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
|
||||
std::string result;
|
||||
for (size_t i = 0; i < cps.size(); ++i) {
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// TODO: prefix all symbols with "llama_"
|
||||
|
||||
struct codepoint_flags {
|
||||
enum {
|
||||
UNDEFINED = 0x0001,
|
||||
@@ -46,6 +48,7 @@ struct codepoint_flags {
|
||||
}
|
||||
};
|
||||
|
||||
size_t unicode_len_utf8(char src);
|
||||
|
||||
std::string unicode_cpt_to_utf8(uint32_t cp);
|
||||
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
|
||||
|
||||
@@ -70,21 +70,19 @@ add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
|
||||
target_link_libraries(test-tokenizer-0 PRIVATE common)
|
||||
install(TARGETS test-tokenizer-0 RUNTIME)
|
||||
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
|
||||
# TODO: enable when fixed
|
||||
# https://github.com/ggerganov/llama.cpp/pull/7036
|
||||
#llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
|
||||
#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
|
||||
#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
||||
|
||||
# build test-tokenizer-1-bpe target once and add many tests
|
||||
add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
|
||||
@@ -92,16 +90,14 @@ target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
|
||||
install(TARGETS test-tokenizer-1-bpe RUNTIME)
|
||||
|
||||
# TODO: disabled due to slowness
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-stablelm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf)
|
||||
|
||||
# build test-tokenizer-1-spm target once and add many tests
|
||||
add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
@@ -133,13 +132,31 @@ int main(void) {
|
||||
);
|
||||
formatted_chat.resize(res);
|
||||
std::string output(formatted_chat.data(), formatted_chat.size());
|
||||
std::cout << output << "\n-------------------------\n";
|
||||
printf("%s\n", output.c_str());
|
||||
printf("-------------------------\n");
|
||||
assert(output == expected);
|
||||
}
|
||||
|
||||
// test llama_chat_format_single
|
||||
std::cout << "\n\n=== llama_chat_format_single ===\n\n";
|
||||
|
||||
// test llama_chat_format_single for system message
|
||||
printf("\n\n=== llama_chat_format_single (system message) ===\n\n");
|
||||
std::vector<llama_chat_msg> chat2;
|
||||
llama_chat_msg sys_msg{"system", "You are a helpful assistant"};
|
||||
|
||||
auto fmt_sys = [&](std::string tmpl) {
|
||||
auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
|
||||
printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str());
|
||||
printf("-------------------------\n");
|
||||
return output;
|
||||
};
|
||||
assert(fmt_sys("chatml") == "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n");
|
||||
assert(fmt_sys("llama2") == "[INST] You are a helpful assistant\n");
|
||||
assert(fmt_sys("gemma") == ""); // for gemma, system message is merged with user message
|
||||
assert(fmt_sys("llama3") == "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>");
|
||||
|
||||
|
||||
// test llama_chat_format_single for user message
|
||||
printf("\n\n=== llama_chat_format_single (user message) ===\n\n");
|
||||
chat2.push_back({"system", "You are a helpful assistant"});
|
||||
chat2.push_back({"user", "Hello"});
|
||||
chat2.push_back({"assistant", "I am assistant"});
|
||||
@@ -147,12 +164,13 @@ int main(void) {
|
||||
|
||||
auto fmt_single = [&](std::string tmpl) {
|
||||
auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
|
||||
std::cout << "fmt_single(" << tmpl << ")\n" << output << "\n-------------------------\n";
|
||||
printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str());
|
||||
printf("-------------------------\n");
|
||||
return output;
|
||||
};
|
||||
assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");
|
||||
assert(fmt_single("llama2") == "[INST] How are you [/INST]");
|
||||
assert(fmt_single("gemma") == "\n<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
|
||||
assert(fmt_single("gemma") == "\n<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
|
||||
assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -44,21 +44,26 @@ static bool test_build_grammar_fails(const std::string & grammar_str) {
|
||||
return grammar_fails;
|
||||
}
|
||||
|
||||
static bool match_string(const std::string & input, llama_grammar* grammar) {
|
||||
static bool match_string(const std::string & input, llama_grammar * grammar) {
|
||||
auto decoded = decode_utf8(input, {});
|
||||
|
||||
const auto & code_points = decoded.first;
|
||||
|
||||
const llama_grammar_rules & rules = llama_grammar_get_rules (grammar);
|
||||
llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
|
||||
|
||||
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
||||
auto prev_stacks = grammar->stacks;
|
||||
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
|
||||
if (grammar->stacks.empty()) {
|
||||
const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
|
||||
|
||||
llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
|
||||
|
||||
if (cur_stacks.empty()) {
|
||||
// no stacks means that the grammar failed to match at this point
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto & stack : grammar->stacks) {
|
||||
for (const auto & stack : cur_stacks) {
|
||||
if (stack.empty()) {
|
||||
// An empty stack means that the grammar has been completed
|
||||
return true;
|
||||
@@ -75,7 +80,9 @@ static void test(const std::string & test_desc, const std::string & grammar_str,
|
||||
auto grammar = build_grammar(grammar_str);
|
||||
|
||||
// Save the original grammar stacks so that we can reset after every new string we want to test
|
||||
auto original_stacks = grammar->stacks;
|
||||
const llama_grammar_stacks original_stacks = llama_grammar_get_stacks(grammar);
|
||||
|
||||
llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
|
||||
|
||||
fprintf(stderr, " 🔵 Valid strings:\n");
|
||||
|
||||
@@ -112,7 +119,7 @@ static void test(const std::string & test_desc, const std::string & grammar_str,
|
||||
assert(matched);
|
||||
|
||||
// Reset the grammar stacks
|
||||
grammar->stacks = original_stacks;
|
||||
cur_stacks = original_stacks;
|
||||
}
|
||||
|
||||
fprintf(stderr, " 🟠 Invalid strings:\n");
|
||||
@@ -132,7 +139,7 @@ static void test(const std::string & test_desc, const std::string & grammar_str,
|
||||
assert(!matched);
|
||||
|
||||
// Reset the grammar stacks
|
||||
grammar->stacks = original_stacks;
|
||||
cur_stacks = original_stacks;
|
||||
}
|
||||
|
||||
// Clean up allocated memory
|
||||
|
||||
@@ -2,10 +2,12 @@
|
||||
#undef NDEBUG
|
||||
#endif
|
||||
|
||||
#include "llama.cpp" // TODO: not great
|
||||
#define LLAMA_API_INTERNAL
|
||||
#include "llama.h"
|
||||
#include "grammar-parser.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <stdexcept>
|
||||
|
||||
int main()
|
||||
{
|
||||
@@ -112,10 +114,10 @@ int main()
|
||||
}
|
||||
}
|
||||
|
||||
llama_grammar *grammar = NULL;
|
||||
llama_grammar * grammar = NULL;
|
||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
||||
grammar = llama_grammar_init(
|
||||
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
||||
|
||||
grammar = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
||||
if (grammar == nullptr)
|
||||
{
|
||||
throw std::runtime_error("Failed to initialize llama_grammar");
|
||||
@@ -172,7 +174,7 @@ int main()
|
||||
}};
|
||||
|
||||
auto index = 0;
|
||||
for (auto stack : grammar->stacks)
|
||||
for (auto stack : llama_grammar_get_stacks(grammar))
|
||||
{
|
||||
// compare stack to expected_stack
|
||||
for (uint32_t i = 0; i < stack.size(); i++)
|
||||
@@ -374,13 +376,13 @@ int main()
|
||||
},
|
||||
};
|
||||
|
||||
std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[0], next_candidates);
|
||||
std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[0], next_candidates);
|
||||
|
||||
std::vector<std::vector<llama_grammar_candidate>> all_rejects;
|
||||
|
||||
for (std::size_t count = 0; count < grammar->stacks.size(); ++count)
|
||||
for (std::size_t count = 0; count < llama_grammar_get_stacks(grammar).size(); ++count)
|
||||
{
|
||||
rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[count], next_candidates);
|
||||
rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[count], next_candidates);
|
||||
all_rejects.push_back(rejects);
|
||||
}
|
||||
|
||||
@@ -401,6 +403,6 @@ int main()
|
||||
delete[] candidate.code_points;
|
||||
candidate.code_points = nullptr;
|
||||
}
|
||||
delete grammar;
|
||||
llama_grammar_free(grammar);
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user