mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-04-16 16:27:32 +03:00)

Compare commits: b2510...gg/hf-args (2 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 8c3d5b5a79 | |
| | 1b2f0a9ee8 | |
26 .github/workflows/build.yml (vendored)

@@ -15,10 +15,6 @@ on:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GGML_NLOOP: 3

@@ -139,9 +135,6 @@ jobs:

ubuntu-focal-make:
runs-on: ubuntu-20.04
env:
LLAMA_NODE_AVAILABLE: true
LLAMA_PYTHON_AVAILABLE: true

steps:
- name: Clone

@@ -154,14 +147,6 @@ jobs:
sudo apt-get update
sudo apt-get install build-essential gcc-8

- uses: actions/setup-node@v4
with:
node-version: "20"

- uses: actions/setup-python@v4
with:
python-version: "3.11"

- name: Build
id: make_build
env:

@@ -225,17 +210,6 @@ jobs:
cd build
ctest -L main --verbose --timeout 900

- name: Test llama2c conversion
id: llama2c_test
run: |
cd build
echo "Fetch tokenizer"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
echo "Fetch llama2c model"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

# ubuntu-latest-cmake-sanitizer:
#   runs-on: ubuntu-latest
#
4 .github/workflows/code-coverage.yml (vendored)

@@ -5,10 +5,6 @@ env:
GGML_NLOOP: 3
GGML_N_THREADS: 1

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
run:
runs-on: ubuntu-20.04

4 .github/workflows/docker.yml (vendored)

@@ -15,10 +15,6 @@ on:
branches:
- master

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
push_to_registry:
name: Push Docker image to Docker Hub

4 .github/workflows/editorconfig.yml (vendored)

@@ -14,10 +14,6 @@ on:
branches:
- master

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
editorconfig:
runs-on: ubuntu-latest

4 .github/workflows/nix-ci-aarch64.yml (vendored)

@@ -17,10 +17,6 @@ on:
types: [opened, synchronize, reopened]
paths: ['**/*.nix', 'flake.lock']

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
nix-build-aarch64:
runs-on: ubuntu-latest

4 .github/workflows/nix-ci.yml (vendored)

@@ -8,10 +8,6 @@ on:
pull_request:
types: [opened, synchronize, reopened]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
nix-eval:
strategy:
.github/workflows/python-check-requirements.yml (vendored)

@@ -16,10 +16,6 @@ on:
- 'requirements.txt'
- 'requirements/*.txt'

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
python-check-requirements:
runs-on: ubuntu-latest
4 .github/workflows/python-lint.yml (vendored)

@@ -2,10 +2,6 @@ name: flake8 Lint

on: [push, pull_request]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
flake8-lint:
runs-on: ubuntu-latest

4 .github/workflows/server.yml (vendored)

@@ -18,10 +18,6 @@ on:
schedule:
- cron: '0 0 * * *'

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
server:
runs-on: ubuntu-latest

4 .github/workflows/zig-build.yml (vendored)

@@ -6,10 +6,6 @@ on:
branches:
- master

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
build:
strategy:
3 .gitignore (vendored)

@@ -58,9 +58,6 @@ models-mnt
/llava-cli
/lookahead
/lookup
/lookup-create
/lookup-merge
/lookup-stats
/main
/metal
/passkey
CMakeLists.txt

@@ -99,7 +99,6 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"llama: max. batch size for using peer access")
option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)

@@ -388,9 +387,6 @@ if (LLAMA_CUBLAS)
endif()
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
if (LLAMA_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()

if (LLAMA_STATIC)
if (WIN32)

@@ -535,10 +531,6 @@ if (LLAMA_HIPBLAS)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()

if (LLAMA_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()

add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
22 Makefile

@@ -452,9 +452,9 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
else
MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
ifdef LLAMA_CUDA_NO_PEER_COPY
MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
endif # LLAMA_CUDA_NO_PEER_COPY
#ifdef LLAMA_CUDA_CUBLAS
# MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
#endif # LLAMA_CUDA_CUBLAS
ifdef LLAMA_CUDA_CCBIN
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif

@@ -535,9 +535,6 @@ endif # LLAMA_HIP_UMA
ifdef LLAMA_CUDA_FORCE_DMMV
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV
ifdef LLAMA_CUDA_NO_PEER_COPY
HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
endif # LLAMA_CUDA_NO_PEER_COPY
OBJS += ggml-cuda.o
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<

@@ -676,9 +673,6 @@ json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-t
train.o: common/train.cpp common/train.h
$(CXX) $(CXXFLAGS) -c $< -o $@

ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
$(CXX) $(CXXFLAGS) -c $< -o $@

libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

@@ -686,7 +680,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)

clean:
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
find examples pocs -type f -name "*.o" -delete

#

@@ -816,15 +810,9 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)

passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
README.md

@@ -22,7 +22,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
- Support loading sharded model, using `gguf-split` CLI https://github.com/ggerganov/llama.cpp/pull/6187

----
common/CMakeLists.txt

@@ -65,8 +65,6 @@ add_library(${TARGET} STATIC
json.hpp
train.h
train.cpp
ngram-cache.h
ngram-cache.cpp
)

if (BUILD_SHARED_LIBS)
common/common.cpp

@@ -963,22 +963,6 @@ static bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg,
}
return true;
}
if (arg == "-lcs" || arg == "--lookup-cache-static") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.lookup_cache_static = argv[i];
return true;
}
if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.lookup_cache_dynamic = argv[i];
return true;
}
if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
if (++i >= argc) {
invalid_param = true;

@@ -1236,11 +1220,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
throw std::invalid_argument("error: unknown argument: " + arg);
}
}

if (invalid_param) {
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
}

if (params.prompt_cache_all &&
(params.interactive || params.interactive_first ||
params.instruct)) {

@@ -1248,11 +1230,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}

// short-hand to avoid specifying --hf-file -> default it to --model
if (!params.hf_repo.empty() && params.hf_file.empty()) {
params.hf_file = params.model;
}

if (params.escape) {
process_escapes(params.prompt);
process_escapes(params.input_prefix);

@@ -1452,10 +1429,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" Hugging Face model file (default: unused)\n");
printf(" -ld LOGDIR, --logdir LOGDIR\n");
printf(" path under which to save YAML logs (no logging if unset)\n");
printf(" -lcs FNAME, --lookup-cache-static FNAME\n");
printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n");
printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n");
printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
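For context on the `--hf-repo` / `--hf-file` handling touched by this branch: when only a repo is given, `gpt_params_parse_ex` falls back to the `--model` value as the file to fetch. A minimal standalone sketch of that defaulting rule (the `hf_args` struct below is a reduced stand-in for `gpt_params`, invented here for illustration):

```cpp
#include <cassert>
#include <string>

// Reduced stand-in for the gpt_params fields involved (hypothetical, for illustration only).
struct hf_args {
    std::string model;    // --model
    std::string hf_repo;  // --hf-repo
    std::string hf_file;  // --hf-file
};

// Mirrors the short-hand above: a repo without an explicit file falls back to the --model value.
static void apply_hf_shorthand(hf_args & p) {
    if (!p.hf_repo.empty() && p.hf_file.empty()) {
        p.hf_file = p.model;
    }
}

int main() {
    hf_args p;
    p.model   = "ggml-model-q4_0.gguf";
    p.hf_repo = "user/some-gguf-repo";  // placeholder repo name, not taken from the diff
    apply_hf_shorthand(p);
    assert(p.hf_file == "ggml-model-q4_0.gguf");
    return 0;
}
```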
common/common.h

@@ -88,22 +88,20 @@ struct gpt_params {
// sampling parameters
struct llama_sampling_params sparams;

std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download
std::string hf_repo = ""; // HF repo
std::string hf_file = ""; // HF file
std::string prompt = "";
std::string prompt_file = ""; // store the external prompt file name
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download
std::string hf_repo = ""; // HF repo
std::string hf_file = ""; // HF file
std::string prompt = "";
std::string prompt_file = ""; // store the external prompt file name
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
std::string logits_file = ""; // file for saving *all* logits
std::string logdir = ""; // directory in which to save YAML log files
std::string logits_file = ""; // file for saving *all* logits

std::vector<llama_model_kv_override> kv_overrides;
common/json-schema-to-grammar.cpp

@@ -9,7 +9,7 @@
#include <unordered_set>
#include <vector>

using json = nlohmann::ordered_json;
using json = nlohmann::json;

const std::string SPACE_RULE = "\" \"?";

@@ -124,7 +124,7 @@ static std::string replacePattern(const std::string & input, const std::regex &
}

static std::string format_literal(const std::string & literal) {
std::string escaped = replacePattern(literal, GRAMMAR_LITERAL_ESCAPE_RE, [&](const std::smatch & match) {
std::string escaped = replacePattern(json(literal).dump(), GRAMMAR_LITERAL_ESCAPE_RE, [&](const std::smatch & match) {
char c = match.str()[0];
return GRAMMAR_LITERAL_ESCAPES.at(c);
});

@@ -137,7 +137,7 @@ private:
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
std::map<std::string, std::string> _rules;
std::unordered_map<std::string, json> _refs;
std::unordered_map<std::string, nlohmann::json> _refs;
std::unordered_set<std::string> _refs_being_resolved;
std::vector<std::string> _errors;
std::vector<std::string> _warnings;

@@ -413,7 +413,7 @@ private:
std::string prop_rule_name = visit(prop_schema, name + (name.empty() ? "" : "-") + prop_name);
prop_kv_rule_names[prop_name] = _add_rule(
name + (name.empty() ? "" : "-") + prop_name + "-kv",
format_literal(json(prop_name).dump()) + " space \":\" space " + prop_rule_name
format_literal(prop_name) + " space \":\" space " + prop_rule_name
);
if (required.find(prop_name) != required.end()) {
required_props.push_back(prop_name);

@@ -495,7 +495,7 @@ public:
_rules["space"] = SPACE_RULE;
}

void resolve_refs(json & schema, const std::string & url) {
void resolve_refs(nlohmann::json & schema, const std::string & url) {
/*
* Resolves all $ref fields in the given schema, fetching any remote schemas,
* replacing each $ref with absolute reference URL and populates _refs with the

@@ -557,7 +557,11 @@ public:
}

std::string _generate_constant_rule(const json & value) {
return format_literal(value.dump());
if (!value.is_string()) {
_errors.push_back("Only std::string constants are supported, got " + value.dump());
return "";
}
return format_literal(value.get<std::string>());
}

std::string visit(const json & schema, const std::string & name) {

common/json-schema-to-grammar.h

@@ -1,4 +1,4 @@
#pragma once
#include "json.hpp"

std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
std::string json_schema_to_grammar(const nlohmann::json& schema);
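Both sides of the `format_literal` / `_generate_constant_rule` hunks rely on the same idea: a raw string becomes a quoted grammar literal by serializing it with nlohmann's `dump()`, which adds the surrounding quotes and JSON escapes, before the grammar-specific escaping is applied. A small sketch of just that `dump()` step, separate from the `GRAMMAR_LITERAL_ESCAPE_RE` machinery in the file:

```cpp
#include <iostream>
#include <string>

#include "json.hpp"  // nlohmann::json, vendored in the repo as common/json.hpp

// Sketch: serialize a raw string with dump() to get a quoted, JSON-escaped literal,
// which is the form format_literal()/_generate_constant_rule() build grammar rules from.
static std::string quoted_literal(const std::string & s) {
    return nlohmann::json(s).dump();  // e.g. he"llo -> "he\"llo"
}

int main() {
    std::cout << quoted_literal("name") << "\n";           // "name"
    std::cout << quoted_literal("wei\"rd key") << "\n";    // "wei\"rd key"
    return 0;
}
```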
common/ngram-cache.cpp

@@ -1,280 +0,0 @@
|
||||
#include "ngram-cache.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <fstream>
|
||||
|
||||
void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
|
||||
std::vector<llama_token> & inp, int nnew, bool print_progress) {
|
||||
const int64_t t_start_ms = ggml_time_ms();
|
||||
const int64_t inp_size = inp.size();
|
||||
|
||||
const int64_t n_todo = inp_size * (ngram_max - ngram_min + 1);
|
||||
int64_t n_done = 0;
|
||||
|
||||
for (int64_t ngram_size = ngram_min; ngram_size <= ngram_max; ++ngram_size) {
|
||||
const int64_t i_start = std::max(inp_size - nnew, ngram_size);
|
||||
for (int64_t i = i_start; i < inp_size; ++i) {
|
||||
const int64_t ngram_start = i - ngram_size;
|
||||
llama_ngram ngram(&inp[ngram_start], ngram_size);
|
||||
const llama_token token = inp[i];
|
||||
|
||||
llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
|
||||
if (part_it == ngram_cache.end()) {
|
||||
llama_ngram_cache_part part;
|
||||
part.emplace(token, 1);
|
||||
ngram_cache.emplace(ngram, part);
|
||||
} else {
|
||||
llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
|
||||
if (token_count_it == part_it->second.end()) {
|
||||
part_it->second.emplace(token, 1);
|
||||
} else {
|
||||
token_count_it->second++;
|
||||
}
|
||||
}
|
||||
++n_done;
|
||||
|
||||
if (print_progress && n_done % 10000000 == 0) {
|
||||
const int64_t t_now_ms = ggml_time_ms();
|
||||
const int64_t eta_ms = (inp_size*(ngram_max-ngram_min+1) - n_done) * (t_now_ms - t_start_ms) / n_done;
|
||||
const int64_t eta_min = eta_ms / (60*1000);
|
||||
const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
|
||||
|
||||
fprintf(stderr, "%s: %" PRId64 "/%" PRId64 " done, ETA: %02" PRId64 ":%02" PRId64 "\n", __func__, n_done, n_todo, eta_min, eta_s);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to get a token from the combined, speculative sequence of inp and draft.
|
||||
static llama_token get_token(const std::vector<llama_token> & inp, const std::vector<llama_token> & draft, const size_t i) {
|
||||
return i < inp.size() ? inp[i] : draft[1 + i - inp.size()];
|
||||
}
|
||||
|
||||
// If sample size or percentage are below these thresholds the draft is aborted early:
|
||||
constexpr int draft_min_sample_size_lax[LLAMA_NGRAM_MAX] = { 2, 2, 1, 1};
|
||||
constexpr int draft_min_percent_lax[LLAMA_NGRAM_MAX] = {66, 50, 50, 50};
|
||||
constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2};
|
||||
constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
|
||||
|
||||
// Helper function that tries to draft a token from only the static ngram cache:
|
||||
static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
|
||||
llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
|
||||
if (part_static_it == nc_static.end()) {
|
||||
return -1;
|
||||
}
|
||||
const llama_ngram_cache_part part_static = part_static_it->second;
|
||||
|
||||
int max_count_static = 0;
|
||||
int sum_count_static = 0;
|
||||
llama_token max_token = -1;
|
||||
|
||||
for (std::pair<llama_token, int> token_count_static : part_static) {
|
||||
const llama_token token = token_count_static.first;
|
||||
const int32_t count_static = token_count_static.second;
|
||||
|
||||
if (count_static > max_count_static) {
|
||||
max_token = token;
|
||||
max_count_static = count_static;
|
||||
}
|
||||
sum_count_static += count_static;
|
||||
}
|
||||
|
||||
if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
|
||||
return -1;
|
||||
}
|
||||
if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
|
||||
return -1;
|
||||
}
|
||||
return max_token;
|
||||
}
|
||||
|
||||
// Try to draft a token from primary cache (context/dynamic), validate with static cache:
|
||||
static llama_token try_draft(
|
||||
llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
|
||||
const int * min_sample_size, const int * min_percent) {
|
||||
|
||||
llama_token drafted_token = -1;
|
||||
|
||||
for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
|
||||
const llama_ngram ngram_primary = ngrams_primary[i];
|
||||
|
||||
llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
|
||||
if (part_primary_it == nc_primary.end()) {
|
||||
continue;
|
||||
}
|
||||
const llama_ngram_cache_part part_primary = part_primary_it->second;
|
||||
|
||||
int max_count_primary = 0;
|
||||
int max_count_static = 0;
|
||||
int sum_count_primary = 0;
|
||||
llama_token max_token = -1;
|
||||
|
||||
for (std::pair<llama_token, int> token_count_primary : part_primary) {
|
||||
const llama_token token = token_count_primary.first;
|
||||
|
||||
llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
|
||||
|
||||
const int32_t count_primary = token_count_primary.second;
|
||||
const int32_t count_static = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
|
||||
|
||||
if (count_primary*count_static > max_count_primary*max_count_static) {
|
||||
max_token = token;
|
||||
max_count_primary = count_primary;
|
||||
max_count_static = count_static;
|
||||
}
|
||||
sum_count_primary += count_primary;
|
||||
}
|
||||
|
||||
if (sum_count_primary < min_sample_size[i]) {
|
||||
continue;
|
||||
}
|
||||
if (100*max_count_primary < min_percent[i]*sum_count_primary) {
|
||||
continue;;
|
||||
}
|
||||
drafted_token = max_token;
|
||||
}
|
||||
|
||||
return drafted_token;
|
||||
}
|
||||
|
||||
void llama_ngram_cache_draft(
|
||||
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
|
||||
llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
|
||||
) {
|
||||
GGML_ASSERT(draft.size() == 1);
|
||||
const int inp_size = inp.size();
|
||||
|
||||
if (inp_size < LLAMA_NGRAM_STATIC) {
|
||||
return;
|
||||
}
|
||||
|
||||
while ((int) draft.size()-1 < n_draft) {
|
||||
llama_token drafted_token = -1;
|
||||
|
||||
const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
|
||||
llama_ngram ngram_static;
|
||||
for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
|
||||
ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
|
||||
}
|
||||
llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
|
||||
llama_ngram_cache_part part_static;
|
||||
if (part_static_it != nc_static.end()) {
|
||||
part_static = part_static_it->second;
|
||||
}
|
||||
|
||||
// cd = context + dynamic
|
||||
std::vector<llama_ngram> ngrams_cd;
|
||||
for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
|
||||
const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
|
||||
llama_ngram ngram_cd;
|
||||
for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
|
||||
ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
|
||||
}
|
||||
ngrams_cd.push_back(ngram_cd);
|
||||
}
|
||||
if (drafted_token == -1) {
|
||||
drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
|
||||
}
|
||||
if (drafted_token == -1) {
|
||||
drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
|
||||
}
|
||||
if (drafted_token == -1) {
|
||||
drafted_token = try_draft(nc_static, ngram_static);
|
||||
}
|
||||
|
||||
if (drafted_token == -1) {
|
||||
break;
|
||||
}
|
||||
|
||||
LOG(" - draft candidate: token=%d\n", drafted_token);
|
||||
draft.push_back(drafted_token);
|
||||
}
|
||||
}
|
||||
|
||||
void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
|
||||
std::ofstream file_out(filename, std::ios::binary);
|
||||
for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) {
|
||||
const llama_ngram ngram = item.first;
|
||||
llama_ngram_cache_part token_counts = item.second;
|
||||
GGML_ASSERT(!token_counts.empty());
|
||||
const int32_t ntokens = token_counts.size();
|
||||
GGML_ASSERT(ntokens > 0);
|
||||
|
||||
file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(llama_ngram));
|
||||
file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
|
||||
for (std::pair<llama_token, int32_t> item2 : token_counts) {
|
||||
const llama_token token = item2.first;
|
||||
const int32_t count = item2.second;
|
||||
GGML_ASSERT(count > 0);
|
||||
|
||||
file_out.write(reinterpret_cast<const char *>(&token), sizeof(llama_token));
|
||||
file_out.write(reinterpret_cast<const char *>(&count), sizeof(int32_t));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
|
||||
std::ifstream hashmap_file(filename, std::ios::binary);
|
||||
if (!hashmap_file) {
|
||||
throw std::ifstream::failure("Unable to open file " + filename);
|
||||
}
|
||||
llama_ngram_cache ngram_cache;
|
||||
|
||||
llama_ngram ngram;
|
||||
int32_t ntokens;
|
||||
llama_token token;
|
||||
int32_t count;
|
||||
|
||||
char * ngramc = reinterpret_cast<char*>(&ngram);
|
||||
char * ntokensc = reinterpret_cast<char*>(&ntokens);
|
||||
char * tokenc = reinterpret_cast<char*>(&token);
|
||||
char * countc = reinterpret_cast<char*>(&count);
|
||||
while(hashmap_file.read(ngramc, sizeof(llama_ngram))) {
|
||||
GGML_ASSERT(!hashmap_file.eof());
|
||||
GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
|
||||
GGML_ASSERT(ntokens > 0);
|
||||
llama_ngram_cache_part token_counts;
|
||||
|
||||
for (int i = 0; i < ntokens; ++i) {
|
||||
GGML_ASSERT(!hashmap_file.eof());
|
||||
GGML_ASSERT(hashmap_file.read(tokenc, sizeof(llama_token)));
|
||||
GGML_ASSERT(!hashmap_file.eof());
|
||||
GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t)));
|
||||
GGML_ASSERT(count > 0);
|
||||
token_counts.emplace(token, count);
|
||||
}
|
||||
|
||||
ngram_cache.emplace(ngram, token_counts);
|
||||
}
|
||||
GGML_ASSERT(hashmap_file.eof());
|
||||
|
||||
return ngram_cache;
|
||||
}
|
||||
|
||||
void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
|
||||
for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) {
|
||||
const llama_ngram ngram = ngram_part.first;
|
||||
llama_ngram_cache_part part = ngram_part.second;
|
||||
|
||||
llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
|
||||
if (part_merged_it == ngram_cache_target.end()) {
|
||||
ngram_cache_target.emplace(ngram, part);
|
||||
continue;
|
||||
}
|
||||
|
||||
for (std::pair<llama_token, int32_t> token_count : part) {
|
||||
const llama_token token = token_count.first;
|
||||
const int32_t count = token_count.second;
|
||||
GGML_ASSERT(count > 0);
|
||||
|
||||
llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
|
||||
if (token_count_merged_it == part_merged_it->second.end()) {
|
||||
part_merged_it->second.emplace(token, count);
|
||||
continue;
|
||||
}
|
||||
|
||||
token_count_merged_it->second += count;
|
||||
}
|
||||
}
|
||||
}
|
||||
common/ngram-cache.h

@@ -1,94 +0,0 @@
#pragma once

#include "llama.h"

#include <unordered_map>
#include <string>
#include <vector>

#define LLAMA_NGRAM_MIN 1
#define LLAMA_NGRAM_MAX 4
#define LLAMA_NGRAM_STATIC 2

// Data structures to map n-grams to empirical token probabilities:

struct llama_ngram {
    llama_token tokens[LLAMA_NGRAM_MAX];

    llama_ngram() {
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
            tokens[i] = -1;
        }
    }

    llama_ngram(const llama_token * input, const int ngram_size) {
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
            tokens[i] = i < ngram_size ? input[i] : -1;
        }
    }

    bool operator==(const llama_ngram & other) const {
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
            if (tokens[i] != other.tokens[i]) {
                return false;
            }
        }
        return true;
    }
};

struct llama_ngram_hash_function {
    size_t operator()(const llama_ngram & ngram) const {
        size_t hash = 0;
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
            hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
        }
        return hash;
    }
};

// token -> number of times token has been seen
typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;

// n-gram -> empirical distribution of following tokens
typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;


// Update an ngram cache with tokens.
// ngram_cache: the cache to modify.
// ngram_min/ngram_max: the min/max size of the ngrams to extract from inp_data.
// inp_data: the token sequence with which to update ngram_cache.
// nnew: how many new tokens have been appended to inp_data since the last call to this function.
// print_progress: whether to print progress to stderr.
//
// In order to get correct results inp_data can ONLY BE APPENDED TO.
// Changes in the middle need a complete rebuild.
void llama_ngram_cache_update(
    llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);

// Try to draft tokens from ngram caches.
// inp: the tokens generated so far.
// draft: the token sequence to draft. Expected to initially contain the previously sampled token.
// n_draft: maximum number of tokens to add to draft.
// ngram_min/gram_max: the min/max size of the ngrams in nc_context and nc_dynamic.
// nc_context: ngram cache based on current context.
// nc_dynamic: ngram cache based on previous user generations.
// nc_static: ngram cache generated from a large text corpus, used for validation.
void llama_ngram_cache_draft(
    std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);

// Save an ngram cache to a file.
// ngram_cache: the ngram cache to save.
// filename: the path under which to save the ngram cache.
void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);

// Load an ngram cache saved with llama_ngram_cache_save.
// filename: the path from which to load the ngram cache.
// returns: an ngram cache containing the information saved to filename.
llama_ngram_cache llama_ngram_cache_load(std::string & filename);

// Merge two ngram caches.
// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
// ngram_cache_add: the ngram cache to add to ngram_cache_target.
void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);
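The ngram-cache declarations above read as a small pipeline: build or load caches, extend them as tokens arrive, then draft speculative tokens that are validated against the static cache. A hedged usage sketch of that flow (token ids and file names are made up; the real examples/lookup tools add sampling, batching and error handling around this):

```cpp
#include <string>
#include <vector>

#include "ngram-cache.h"  // the header shown above (common/ngram-cache.h)

int main() {
    // Tokens generated so far (dummy ids, for illustration only).
    std::vector<llama_token> inp = {1, 15, 7, 15, 7, 42};

    // Cache built from the current context.
    llama_ngram_cache nc_context;
    llama_ngram_cache_update(nc_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
                             inp, /*nnew =*/ (int) inp.size(), /*print_progress =*/ false);

    // Optional persistent caches; the path is a placeholder.
    std::string dyn_path = "lookup-dynamic.bin";
    llama_ngram_cache nc_dynamic;  // could be filled via llama_ngram_cache_load(dyn_path)
    llama_ngram_cache nc_static;   // corpus-level cache used for validation

    // The draft starts with the last sampled token and is extended in place.
    std::vector<llama_token> draft = {inp.back()};
    llama_ngram_cache_draft(inp, draft, /*n_draft =*/ 8, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
                            nc_context, nc_dynamic, nc_static);

    // Fold what was learned back into the dynamic cache and persist it.
    llama_ngram_cache_merge(nc_dynamic, nc_context);
    llama_ngram_cache_save(nc_dynamic, dyn_path);
    return 0;
}
```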
examples/convert-llama2c-to-ggml/README.md

@@ -21,8 +21,6 @@ An example command using a model from [karpathy/tinyllamas](https://huggingface.

`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`

Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).

Now you can use the model with a command like:

`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp

@@ -1,7 +1,6 @@
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
@@ -79,101 +78,111 @@ typedef struct {
|
||||
|
||||
struct TransformerWeights {
|
||||
// token embedding table
|
||||
std::vector<float> token_embedding_table; // (vocab_size, dim)
|
||||
float* token_embedding_table; // (vocab_size, dim)
|
||||
// weights for rmsnorms
|
||||
std::vector<float> rms_att_weight; // (layer, dim) rmsnorm weights
|
||||
std::vector<float> rms_ffn_weight; // (layer, dim)
|
||||
float* rms_att_weight; // (layer, dim) rmsnorm weights
|
||||
float* rms_ffn_weight; // (layer, dim)
|
||||
// weights for matmuls
|
||||
std::vector<float> wq; // (layer, dim, dim)
|
||||
std::vector<float> wk; // (layer, dim, dim)
|
||||
std::vector<float> wv; // (layer, dim, dim)
|
||||
std::vector<float> wo; // (layer, dim, dim)
|
||||
float* wq; // (layer, dim, dim)
|
||||
float* wk; // (layer, dim, dim)
|
||||
float* wv; // (layer, dim, dim)
|
||||
float* wo; // (layer, dim, dim)
|
||||
// weights for ffn
|
||||
std::vector<float> w1; // (layer, hidden_dim, dim)
|
||||
std::vector<float> w2; // (layer, dim, hidden_dim)
|
||||
std::vector<float> w3; // (layer, hidden_dim, dim)
|
||||
float* w1; // (layer, hidden_dim, dim)
|
||||
float* w2; // (layer, dim, hidden_dim)
|
||||
float* w3; // (layer, hidden_dim, dim)
|
||||
// final rmsnorm
|
||||
std::vector<float> rms_final_weight; // (dim,)
|
||||
float* rms_final_weight; // (dim,)
|
||||
// freq_cis for RoPE relatively positional embeddings
|
||||
// std::vector<float> freq_cis_real; // (seq_len, dim/2)
|
||||
// std::vector<float> freq_cis_imag; // (seq_len, dim/2)
|
||||
// float* freq_cis_real; // (seq_len, dim/2)
|
||||
// float* freq_cis_imag; // (seq_len, dim/2)
|
||||
// (optional) classifier weights for the logits, on the last layer
|
||||
std::vector<float> wcls;
|
||||
float* wcls;
|
||||
|
||||
~TransformerWeights() {
|
||||
delete[] token_embedding_table;
|
||||
delete[] rms_att_weight;
|
||||
delete[] rms_ffn_weight;
|
||||
delete[] wq;
|
||||
delete[] wk;
|
||||
delete[] wv;
|
||||
delete[] wo;
|
||||
delete[] w1;
|
||||
delete[] w2;
|
||||
delete[] w3;
|
||||
delete[] rms_final_weight;
|
||||
delete[] wcls;
|
||||
}
|
||||
};
|
||||
|
||||
static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_weights) {
|
||||
const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
|
||||
try {
|
||||
w->token_embedding_table.resize(p->vocab_size * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
|
||||
// we calloc instead of malloc to keep valgrind happy
|
||||
w->token_embedding_table = new float[p->vocab_size * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
|
||||
w->rms_att_weight.resize(p->n_layers * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
|
||||
w->rms_att_weight = new float[p->n_layers * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
|
||||
|
||||
w->rms_ffn_weight.resize(p->n_layers * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
|
||||
w->rms_ffn_weight = new float[p->n_layers * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
|
||||
|
||||
w->wq.resize(p->n_layers * p->dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
w->wq = new float[p->n_layers * p->dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
|
||||
w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||
w->wk = new float[p->n_layers * p->dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
|
||||
w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||
w->wv = new float[p->n_layers * p->dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
|
||||
w->wo.resize(p->n_layers * p->dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
w->wo = new float[p->n_layers * p->dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
|
||||
w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
w->w1 = new float[p->n_layers * p->hidden_dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
|
||||
w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
w->w2 = new float[p->n_layers * p->hidden_dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
|
||||
w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
w->w3 = new float[p->n_layers * p->hidden_dim * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
|
||||
w->rms_final_weight.resize(p->dim);
|
||||
LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
|
||||
w->rms_final_weight = new float[p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
|
||||
|
||||
if (shared_weights) {
|
||||
w->wcls = {};
|
||||
} else {
|
||||
w->wcls.resize(p->vocab_size * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
}
|
||||
}
|
||||
catch (std::length_error &) {
|
||||
die("Invalid configuration. Failed to allocate memory for weights");
|
||||
if (shared_weights) {
|
||||
w->wcls = NULL;
|
||||
} else {
|
||||
w->wcls = new float[p->vocab_size * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
}
|
||||
}
|
||||
|
||||
static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FILE * f, bool shared_weights) {
|
||||
if (fread(w->token_embedding_table.data(), sizeof(float), w->token_embedding_table.size(), f) != w->token_embedding_table.size()) return 1;
|
||||
if (fread(w->rms_att_weight.data(), sizeof(float), w->rms_att_weight.size(), f) != w->rms_att_weight.size()) return 1;
|
||||
if (fread(w->wq.data(), sizeof(float), w->wq.size(), f) != w->wq.size()) return 1;
|
||||
if (fread(w->wk.data(), sizeof(float), w->wk.size(), f) != w->wk.size()) return 1;
|
||||
if (fread(w->wv.data(), sizeof(float), w->wv.size(), f) != w->wv.size()) return 1;
|
||||
if (fread(w->wo.data(), sizeof(float), w->wo.size(), f) != w->wo.size()) return 1;
|
||||
if (fread(w->rms_ffn_weight.data(), sizeof(float), w->rms_ffn_weight.size(), f) != w->rms_ffn_weight.size()) return 1;
|
||||
if (fread(w->w1.data(), sizeof(float), w->w1.size(), f) != w->w1.size()) return 1;
|
||||
if (fread(w->w2.data(), sizeof(float), w->w2.size(), f) != w->w2.size()) return 1;
|
||||
if (fread(w->w3.data(), sizeof(float), w->w3.size(), f) != w->w3.size()) return 1;
|
||||
if (fread(w->rms_final_weight.data(), sizeof(float), w->rms_final_weight.size(), f) != w->rms_final_weight.size()) return 1;
|
||||
static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
|
||||
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
|
||||
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
|
||||
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
|
||||
if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
|
||||
if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
|
||||
if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
|
||||
if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
|
||||
if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
|
||||
if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
|
||||
if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
|
||||
if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
|
||||
|
||||
// Skip freq_cis_real & freq_cis_imag
|
||||
int head_size = p->dim / p->n_heads;
|
||||
fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
|
||||
|
||||
if (!shared_weights && fread(w->wcls.data(), sizeof(float), w->wcls.size(), f) != w->wcls.size()) return 1;
|
||||
if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
|
||||
|
||||
// Check we didn't forget to read anything
|
||||
auto curr = ftell(f);
|
||||
fseek(f, 0, SEEK_END);
|
||||
auto end = ftell(f);
|
||||
if (curr != end) {
|
||||
LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
|
||||
printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -181,20 +190,20 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
|
||||
}
|
||||
|
||||
static void print_sample_weights(TransformerWeights *w){
|
||||
LOG("----- Quick print of first of the weight vales of all the variables\n");
|
||||
LOG("%f\n", w->token_embedding_table[0]);
|
||||
LOG("%f\n", w->rms_att_weight[0]);
|
||||
LOG("%f\n", w->rms_ffn_weight[0]);
|
||||
printf("----- Quick print of first of the weight vales of all the variables\n");
|
||||
printf("%f\n", w->token_embedding_table[0]);
|
||||
printf("%f\n", w->rms_att_weight[0]);
|
||||
printf("%f\n", w->rms_ffn_weight[0]);
|
||||
|
||||
LOG("%f\n", w->wq[0]);
|
||||
LOG("%f\n", w->wk[0]);
|
||||
LOG("%f\n", w->wv[0]);
|
||||
LOG("%f\n", w->wo[0]);
|
||||
LOG("%f\n", w->w1[0]);
|
||||
LOG("%f\n", w->w2[0]);
|
||||
LOG("%f\n", w->w3[0]);
|
||||
LOG("%f\n", w->rms_att_weight[0]);
|
||||
if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
|
||||
printf("%f\n", w->wq[0]);
|
||||
printf("%f\n", w->wk[0]);
|
||||
printf("%f\n", w->wv[0]);
|
||||
printf("%f\n", w->wo[0]);
|
||||
printf("%f\n", w->w1[0]);
|
||||
printf("%f\n", w->w2[0]);
|
||||
printf("%f\n", w->w3[0]);
|
||||
printf("%f\n", w->rms_att_weight[0]);
|
||||
if (w->wcls) printf("%f\n", w->wcls[0]);
|
||||
}
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@@ -216,16 +225,14 @@ struct llama_vocab {
|
||||
};
|
||||
|
||||
struct my_llama_hparams {
|
||||
uint32_t n_vocab = 32000;
|
||||
uint32_t n_ctx = 512; // this is provided as user input?
|
||||
uint32_t n_embd = 4096;
|
||||
uint32_t n_ff = 11008;
|
||||
uint32_t n_mult = 4;
|
||||
uint32_t n_head = 32;
|
||||
uint32_t n_head_kv = 32;
|
||||
uint32_t n_layer = 32;
|
||||
uint32_t n_rot = 64;
|
||||
|
||||
uint32_t n_vocab = 32000;
|
||||
uint32_t n_ctx = 512; // this is provided as user input?
|
||||
uint32_t n_embd = 4096;
|
||||
uint32_t n_ff = 11008;
|
||||
uint32_t n_mult = 4;
|
||||
uint32_t n_head = 32;
|
||||
uint32_t n_layer = 32;
|
||||
uint32_t n_rot = 64;
|
||||
bool operator!=(const my_llama_hparams& other) const {
|
||||
return memcmp(this, &other, sizeof(my_llama_hparams));
|
||||
}
|
||||
@@ -318,30 +325,14 @@ struct train_params {
|
||||
};
|
||||
|
||||
static void print_params(struct my_llama_hparams * params) {
|
||||
LOG("%s: n_vocab: %u\n", __func__, params->n_vocab);
|
||||
LOG("%s: n_ctx: %u\n", __func__, params->n_ctx);
|
||||
LOG("%s: n_embd: %u\n", __func__, params->n_embd);
|
||||
LOG("%s: n_mult: %u\n", __func__, params->n_mult);
|
||||
LOG("%s: n_head: %u\n", __func__, params->n_head);
|
||||
LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
|
||||
LOG("%s: n_ff: %u\n", __func__, params->n_ff);
|
||||
LOG("%s: n_layer: %u\n", __func__, params->n_layer);
|
||||
LOG("%s: n_rot: %u\n", __func__, params->n_rot);
|
||||
}
|
||||
|
||||
static void print_tensor_info(const struct ggml_context * ctx) {
|
||||
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
LOG("%s: Allocating ", __func__);
|
||||
int64_t total = 1;
|
||||
int i = 0;
|
||||
for (; i < ggml_n_dims(t); ++i) {
|
||||
if (i > 0) LOG("x ");
|
||||
LOG("[%" PRId64 "] ", t->ne[i]);
|
||||
total *= t->ne[i];
|
||||
}
|
||||
if (i > 1) LOG("= [%" PRId64 "] ", total);
|
||||
LOG("float space for %s\n", ggml_get_name(t));
|
||||
}
|
||||
printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
|
||||
printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
|
||||
printf("%s: n_embd: %u\n", __func__, params->n_embd);
|
||||
printf("%s: n_mult: %u\n", __func__, params->n_mult);
|
||||
printf("%s: n_head: %u\n", __func__, params->n_head);
|
||||
printf("%s: n_ff: %u\n", __func__, params->n_ff);
|
||||
printf("%s: n_layer: %u\n", __func__, params->n_layer);
|
||||
printf("%s: n_rot: %u\n", __func__, params->n_rot);
|
||||
}
|
||||
|
||||
static void init_model(struct my_llama_model * model) {
|
||||
@@ -351,8 +342,6 @@ static void init_model(struct my_llama_model * model) {
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
const uint32_t n_vocab = hparams.n_vocab;
|
||||
|
||||
const uint32_t n_multiqueries = hparams.n_head_kv <= 0 || hparams.n_head_kv >= hparams.n_head ? 1 : hparams.n_head / hparams.n_head_kv;
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
struct ggml_context * ctx = model->ctx;
|
||||
|
||||
@@ -361,8 +350,25 @@ static void init_model(struct my_llama_model * model) {
|
||||
model->train_tokens = 0;
|
||||
|
||||
model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
|
||||
printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
|
||||
|
||||
model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd);
|
||||
|
||||
model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
|
||||
|
||||
// printing the per-layer allocations here so we dont print in the for loop.
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
|
||||
|
||||
printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer);
|
||||
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
|
||||
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
|
||||
|
||||
ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
|
||||
ggml_set_name(model->norm, "norm.weight");
|
||||
@@ -377,8 +383,8 @@ static void init_model(struct my_llama_model * model) {
|
||||
layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
|
||||
layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);

layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
@@ -400,8 +406,6 @@ static void init_model(struct my_llama_model * model) {
ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
}

print_tensor_info(ctx);
}

static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
@@ -417,9 +421,9 @@ static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
static void print_row(struct ggml_tensor * probs, int i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
LOG(" %f", p);
printf(" %f", p);
}
LOG("\n");
printf("\n");
}

static void print_matrix(struct ggml_tensor * probs) {
@@ -427,12 +431,33 @@ static void print_matrix(struct ggml_tensor * probs) {
for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
LOG(" %.2f", p);
printf(" %.2f", p);
}
LOG("\n");
printf("\n");
}
}

#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
va_list ap, ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX);
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}

struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
@@ -524,9 +549,8 @@ static std::string llama_escape_whitespaces(const std::string & text) {
return out.str();
}

static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
if (is_ggml_file(filename)) {
LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
struct ggml_context * ctx_data = NULL;

struct gguf_init_params params = {
@@ -554,9 +578,6 @@ static void load_vocab(const char * filename, const Config * config, struct llam
const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);

const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
if (n_vocab != static_cast<uint32_t>(config->vocab_size)) {
die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size);
}

vocab->id_to_token.resize(n_vocab);

@@ -574,7 +595,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam
gguf_free(ctx);
} else {
// assume llama2.c vocabulary
LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
llama_file file(filename, "rb");
if (!file.fp) {
die_fmt("%s: %s", strerror(errno), filename);
@@ -617,15 +638,38 @@ static void load_vocab(const char * filename, const Config * config, struct llam
}

static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
int size = 1;
for (int dim = 0; dim < ggml_n_dims(gg_weights); ++dim) {
size *= gg_weights->ne[dim];
}
for (int ct = 0; ct < size; ++ct) {
int64_t i0 = 0; int64_t i1 = 0;
int64_t i2 = 0; int64_t i3 = 0;
ggml_unravel_index(gg_weights, ct, &i0, &i1, &i2, &i3);
ggml_set_f32_nd(gg_weights, i0, i1, i2, i3, karpathy_weights[ct]);
int ct;
switch (ggml_n_dims(gg_weights)) {
case 1:
ct = 0;
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
*ptr = karpathy_weights[ct];
ct++;
}
break;
case 2:
ct = 0;
for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
*ptr = karpathy_weights[ct];
ct++;
}
}
break;
case 3:
ct = 0;
for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
*ptr = karpathy_weights[ct];
ct++;
}
}
}
break;
}
}

@@ -635,18 +679,16 @@ static void save_as_llama_model(
// convert AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings
// float* -> struct ggml_tensor
convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table.data());
convert_weights_ak_to_gg(model->output, !w->wcls.empty() ? w->wcls.data() : w->token_embedding_table.data());
convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);

convert_weights_ak_to_gg(model->norm, w->rms_final_weight.data());
convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
//print_row(model->norm, 0);

// for rms-att-weight
int row_length = model->hparams.n_embd;
int n_ff = model->hparams.n_ff;

const uint32_t n_multiqueries = model->hparams.n_head_kv <= 0 || model->hparams.n_head_kv >= model->hparams.n_head ? 1 : model->hparams.n_head / model->hparams.n_head_kv;

for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
auto & layer = model->layers[i];
// 1d
@@ -655,10 +697,9 @@ static void save_as_llama_model(

// from 3d matrix layer x dim x dim to 2d matrix dim x dim
convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]);
// from 3d matrix layer x dim x dim to 2d matrix dim x dim / n_multiqueries
convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length/n_multiqueries]);
convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length/n_multiqueries]);

convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
@@ -695,8 +736,8 @@ static void save_as_llama_model(
gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head_kv);
// n_head_kv is optional, default to n_head
// gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
@@ -748,12 +789,12 @@ static void save_as_llama_model(

static struct train_params get_default_train_params() {
struct train_params params;
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
params.fn_llama2c_output_model = "ak_llama_model.bin";
params.fn_train_data = "shakespeare.txt";
params.fn_checkpoint_in = "checkpoint.bin";
params.fn_checkpoint_out = "checkpoint.bin";
params.fn_model_out = "ggml-checkpoint-f32.bin";
params.fn_train_data = "shakespeare.txt";
params.fn_checkpoint_in = "checkpoint.bin";
params.fn_checkpoint_out = "checkpoint.bin";
params.fn_model_out = "ggml-checkpoint-f32.bin";

params.seed = -1;

@@ -788,8 +829,8 @@ static struct train_params get_default_train_params() {
params.adam_alpha = 1e-3f;
params.adam_decay = 1e-3f;

params.mem_model_gb = 2;
params.mem_compute_gb = 24;
params.mem_model_gb = 2;
params.mem_compute_gb = 24;
params.mem_compute0_gb = 8;
params.mem_compute1_gb = 2;

@@ -875,30 +916,19 @@ int main(int argc, char ** argv) {
if (!params_parse(argc, argv, &params)) {
return 1;
}
log_set_target(stdout);
Config config;
TransformerWeights weights = {};
{
LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
FILE *file = fopen(params.fn_llama2c_model, "r");
if (!file) {
LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
return 1;
}
FILE *file = fopen(params.fn_llama2c_model, "rb");
if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
// read in the config header
if (fread(&config, sizeof(Config), 1, file) != 1) {
LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
return 1;
}
if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
auto shared_weights = config.vocab_size > 0;
config.vocab_size = abs(config.vocab_size);

// read in the Transformer weights
alloc_weights(&weights, &config, shared_weights);
if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
return 1;
}
malloc_weights(&weights, &config, shared_weights);
if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
fclose(file);
}

@@ -906,18 +936,15 @@ int main(int argc, char ** argv) {
load_vocab(params.fn_vocab_model, &config, &vocab);

struct my_llama_model model;
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_ff = config.hidden_dim;
model.hparams.n_mult = 32;//params.n_mult;
model.hparams.n_head = config.n_heads; //params.n_head;
model.hparams.n_head_kv = config.n_kv_heads;
model.hparams.n_layer = config.n_layers; //params.n_layer;
model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);

model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_ff = config.hidden_dim;
model.hparams.n_mult = 32;//params.n_mult;
model.hparams.n_head = config.n_heads; //params.n_head;
model.hparams.n_layer = config.n_layers; //params.n_layer;
model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
print_params(&model.hparams);

struct ggml_init_params lcparams;
lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
lcparams.mem_buffer = NULL;
@@ -929,7 +956,7 @@ int main(int argc, char ** argv) {
model.name = basename(params.fn_llama2c_model);
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);

LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);

ggml_free(model.ctx);
return 0;

@@ -1,34 +1,31 @@
#include "llama.h"
#include "ggml.h"
#include "common.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <ios>
#include <string>
#include <vector>

#include <stdio.h>
#include <fcntl.h>
#include <string.h>
#include <climits>
#include <stdexcept>

#if defined(_WIN32)
#include <windows.h>
#ifndef PATH_MAX
#define PATH_MAX MAX_PATH
#endif
#include <io.h>
#endif

enum split_operation : uint8_t {
SPLIT_OP_SPLIT,
SPLIT_OP_MERGE,
};

static const char * const LLM_KV_SPLIT_NO = "split.no";
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "general.split";
static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "general.split_count";

static const int SPLIT_FILENAME_MAX = 256;

static const char * const SPLIT_FILENAME_FORMAT = "%s-%05d-of-%05d.gguf";

struct split_params {
split_operation operation = SPLIT_OP_SPLIT;
@@ -119,13 +116,13 @@ static bool split_params_parse(int argc, const char ** argv, split_params & para
try {
if (!split_params_parse_ex(argc, argv, params)) {
split_print_usage(argv[0]);
exit(EXIT_FAILURE);
exit(1);
}
}
catch (const std::invalid_argument & ex) {
fprintf(stderr, "%s\n", ex.what());
split_print_usage(argv[0]);
exit(EXIT_FAILURE);
exit(1);
}
return result;
}
@@ -137,6 +134,12 @@ static void zeros(std::ofstream & file, size_t n) {
}
}

static std::string split_file_name(const std::string & path, int i_split, int n_split) {
char f_split[SPLIT_FILENAME_MAX] = {0};
snprintf(f_split, sizeof(f_split), SPLIT_FILENAME_FORMAT, path.c_str(), i_split + 1, n_split);
return std::string(f_split);
}

struct split_strategy {
const split_params params;
std::ifstream & f_input;
@@ -177,9 +180,8 @@ struct split_strategy {
if (i_split == 0) {
gguf_set_kv(ctx_out, ctx_gguf);
}
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, n_split);
gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split);
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split);

// populate the original tensors, so we get an initial metadata
for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
@@ -187,11 +189,10 @@ struct split_strategy {
gguf_add_tensor(ctx_out, meta);
}

char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);
auto split_name = split_file_name(params.output, i_split, n_split);

fprintf(stderr, "%s: %s ...", __func__, split_path);
fout = std::ofstream(split_path, std::ios::binary);
fprintf(stderr, "%s: %s ...", __func__, split_name.c_str());
fout = std::ofstream(split_name, std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors

auto meta_size = gguf_get_meta_size(ctx_out);
@@ -249,23 +250,19 @@ static void gguf_split(const split_params & split_params) {
std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
if (!f_input.is_open()) {
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str());
exit(EXIT_FAILURE);
exit(1);
}

auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params);
if (!ctx_gguf) {
fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
exit(EXIT_FAILURE);
exit(1);
}

split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);

char first_split_path[PATH_MAX] = {0};
llama_split_path(first_split_path, sizeof(first_split_path),
split_params.output.c_str(), strategy.i_split, strategy.n_split);
fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n",
__func__, split_params.input.c_str(),
first_split_path,
split_file_name(split_params.output, strategy.i_split, strategy.n_split).c_str(),
split_params.n_split_tensors);

strategy.split_start();
@@ -301,9 +298,7 @@ static void gguf_merge(const split_params & split_params) {
std::vector<ggml_context *> ctx_metas;
std::vector<gguf_context *> ctx_ggufs;

char split_path[PATH_MAX] = {0};
strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1);
char split_prefix[PATH_MAX] = {0};
std::string split_prefix;

// First pass to find KV and tensors metadata
for (int i_split = 0; i_split < n_split; i_split++) {
@@ -314,66 +309,89 @@ static void gguf_merge(const split_params & split_params) {
/*.ctx = */ &ctx_meta,
};

auto split_name = split_params.input;
if (i_split > 0) {
llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
split_name = split_file_name(split_prefix, i_split, n_split);
}
fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path);
fprintf(stderr, "%s: reading metadata %s ...", __func__, split_name.c_str());

auto * ctx_gguf = gguf_init_from_file(split_path, params);
auto * ctx_gguf = gguf_init_from_file(split_name.c_str(), params);
if (!ctx_gguf) {
fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
exit(EXIT_FAILURE);
exit(1);
}
ctx_ggufs.push_back(ctx_gguf);
ctx_metas.push_back(ctx_meta);

if (i_split == 0) {
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT);
if (key_n_split < 0) {
fprintf(stderr,
"\n%s: input file does not contain %s metadata\n",
__func__,
LLM_KV_SPLIT_COUNT);
LLM_KV_GENERAL_SPLIT_N_SPLIT);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
exit(1);
}

n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
n_split = gguf_get_val_u8(ctx_gguf, key_n_split);
if (n_split < 1) {
fprintf(stderr,
"\n%s: input file does not contain a valid split count %d\n",
__func__,
n_split);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
}

// Verify the file naming and extract split_prefix
if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) {
fprintf(stderr, "\n%s: unexpected input file name: %s"
" i_split=%d"
" n_split=%d\n", __func__,
split_path, i_split, n_split);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
exit(1);
}

// Do not trigger merge if we try to merge again the output
gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0);
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0);

// Set metadata from the first split
gguf_set_kv(ctx_out, ctx_gguf);
}

// Verify the file naming
{
int i_split_file = 0;
int n_split_file = 0;
const char * i_split_format = "-00000-of-00000.gguf";

if (split_name.size() < strlen(i_split_format)) {
fprintf(stderr, "\n%s: unexpected input file name: %s\n", __func__, split_params.input.c_str());
for (auto * _ctx_gguf : ctx_ggufs) {
gguf_free(_ctx_gguf);
}
gguf_free(ctx_out);
fout.close();
exit(1);
}

split_prefix = split_name.substr(0, split_name.size() - strlen(i_split_format));

const char * split_name_c_str = split_name.c_str();
int n_part = sscanf(&split_name_c_str[0] + split_prefix.size(), "-%d-of-%d", &i_split_file, &n_split_file);

if (n_part != 2 || i_split_file - 1 != i_split || n_split_file != n_split) {
fprintf(stderr, "\n%s: unexpected input file name: %s"
" i_split=%d i_split_file=%d"
" n_split=%d n_split_file=%d\n", __func__,
split_params.input.c_str(),
i_split, i_split_file,
n_split, n_split_file);
for (auto * _ctx_gguf : ctx_ggufs) {
gguf_free(_ctx_gguf);
}
gguf_free(ctx_out);
fout.close();
exit(1);
}
}

auto n_tensors = gguf_get_n_tensors(ctx_gguf);
for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
@@ -393,19 +411,18 @@ static void gguf_merge(const split_params & split_params) {

// Write tensors data
for (int i_split = 0; i_split < n_split; i_split++) {
llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
std::ifstream f_input(split_path, std::ios::binary);
auto split_name = split_file_name(split_prefix, i_split, n_split);
std::ifstream f_input(split_name.c_str(), std::ios::binary);
if (!f_input.is_open()) {
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path);
for (uint32_t i = 0; i < ctx_ggufs.size(); i++) {
gguf_free(ctx_ggufs[i]);
ggml_free(ctx_metas[i]);
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_name.c_str());
for (auto * _ctx_gguf : ctx_ggufs) {
gguf_free(_ctx_gguf);
}
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
exit(1);
}
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_name.c_str());

auto * ctx_gguf = ctx_ggufs[i_split];
auto * ctx_meta = ctx_metas[i_split];
@@ -464,8 +481,8 @@ int main(int argc, const char ** argv) {
break;
case SPLIT_OP_MERGE: gguf_merge(params);
break;
default: split_print_usage(argv[0]);
exit(EXIT_FAILURE);
default:split_print_usage(argv[0]);
exit(1);
}

return 0;

@@ -61,7 +61,7 @@ class SchemaConverter:

def _format_literal(self, literal):
escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal
lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), json.dumps(literal)
)
return f'"{escaped}"'

@@ -308,7 +308,8 @@ class SchemaConverter:
return ref_name

def _generate_constant_rule(self, value):
return self._format_literal(json.dumps(value))
assert isinstance(value, str), f'Only string constants are supported, got {value}'
return self._format_literal(value)

def visit(self, schema, name):
schema_type = schema.get('type')
@@ -427,7 +428,7 @@ class SchemaConverter:
prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
prop_kv_rule_names[prop_name] = self._add_rule(
f'{name}{"-" if name else ""}{prop_name}-kv',
fr'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}'
fr'{self._format_literal(prop_name)} space ":" space {prop_rule_name}'
)
required_props = [k for k in sorted_props if k in required]
optional_props = [k for k in sorted_props if k not in required]

@@ -3,21 +3,3 @@ add_executable(${TARGET} lookup.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

set(TARGET lookup-create)
add_executable(${TARGET} lookup-create.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

set(TARGET lookup-merge)
add_executable(${TARGET} lookup-merge.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

set(TARGET lookup-stats)
add_executable(${TARGET} lookup-stats.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

@@ -1,43 +0,0 @@
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "ngram-cache.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
int main(int argc, char ** argv){
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
return 1;
|
||||
}
|
||||
// init llama.cpp
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
llama_model * model = NULL;
|
||||
llama_context * ctx = NULL;
|
||||
|
||||
// load the model
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
GGML_ASSERT(model != nullptr);
|
||||
|
||||
// tokenize the prompt
|
||||
const bool add_bos = llama_should_add_bos_token(model);
|
||||
|
||||
std::vector<llama_token> inp;
|
||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
||||
fprintf(stderr, "%s: tokenization done\n", __func__);
|
||||
|
||||
|
||||
llama_ngram_cache ngram_cache;
|
||||
llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
|
||||
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
|
||||
|
||||
llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
|
||||
}
|
||||
@@ -1,47 +0,0 @@
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "ngram-cache.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
static void print_usage() {
|
||||
fprintf(stderr, "Merges multiple lookup cache files into a single one.\n");
|
||||
fprintf(stderr, "Usage: lookup-merge [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv){
|
||||
if (argc < 3) {
|
||||
print_usage();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
std::vector<std::string> args;
|
||||
args.resize(argc-1);
|
||||
for (int i = 0; i < argc-1; ++i) {
|
||||
args[i] = argv[i+1];
|
||||
if (args[i] == "-h" || args[i] == "--help") {
|
||||
print_usage();
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
|
||||
llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]);
|
||||
|
||||
for (size_t i = 1; i < args.size()-1; ++i) {
|
||||
fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
|
||||
llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]);
|
||||
|
||||
llama_ngram_cache_merge(ngram_cache_merged, ngram_cache);
|
||||
}
|
||||
|
||||
fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str());
|
||||
llama_ngram_cache_save(ngram_cache_merged, args.back());
|
||||
}
|
||||
@@ -1,163 +0,0 @@
|
||||
#include "ggml.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "log.h"
|
||||
#include "ngram-cache.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
int main(int argc, char ** argv){
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int n_draft = params.n_draft;
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
llama_model * model = NULL;
|
||||
llama_context * ctx = NULL;
|
||||
|
||||
// load the model
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
llama_set_rng_seed(ctx, params.seed);
|
||||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
||||
|
||||
// tokenize the prompt
|
||||
const bool add_bos = llama_should_add_bos_token(model);
|
||||
LOG("add_bos tgt: %d\n", add_bos);
|
||||
|
||||
std::vector<llama_token> inp;
|
||||
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
||||
|
||||
llama_ngram_cache ngram_cache_context;
|
||||
llama_ngram_cache ngram_cache_dynamic;
|
||||
llama_ngram_cache ngram_cache_static;
|
||||
int64_t t_draft_flat_us = 0;
|
||||
int64_t t_draft_us = 0;
|
||||
|
||||
{
|
||||
const int64_t t_start_draft_us = ggml_time_us();
|
||||
|
||||
if (!params.lookup_cache_static.empty()) {
|
||||
try {
|
||||
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
|
||||
} catch (std::ifstream::failure const &) {
|
||||
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if (!params.lookup_cache_dynamic.empty()) {
|
||||
try {
|
||||
ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
|
||||
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
|
||||
}
|
||||
|
||||
t_draft_flat_us += ggml_time_us() - t_start_draft_us;
|
||||
}
|
||||
|
||||
const int n_input = inp.size();
|
||||
const int n_ctx = params.n_ctx;
|
||||
|
||||
int n_drafted = 0;
|
||||
int n_accept = 0;
|
||||
|
||||
const int64_t t_start_ms = ggml_time_ms();
|
||||
|
||||
// Iterate over input tokens in chunks of size n_ctx.
|
||||
// Each chunk is treated as if a sequential generation but with pre-determined tokens to ensure reproducibility.
|
||||
for (int i_start = 0; i_start + n_ctx < n_input; i_start += n_ctx) {
|
||||
const std::vector<llama_token> inp_slice(inp.begin() + i_start, inp.begin() + i_start + n_ctx);
|
||||
std::vector<llama_token> pseudo_output;
|
||||
pseudo_output.push_back(inp_slice[0]);
|
||||
|
||||
while ((int) pseudo_output.size() < n_ctx) {
|
||||
// Simulate drafting and decoding from draft:
|
||||
std::vector<llama_token> draft;
|
||||
draft.push_back(pseudo_output.back());
|
||||
|
||||
{
|
||||
const int64_t t_start_draft_us = ggml_time_us();
|
||||
llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
|
||||
t_draft_us += ggml_time_us() - t_start_draft_us;
|
||||
}
|
||||
|
||||
n_drafted += draft.size() - 1;
|
||||
|
||||
for (size_t j = 1; j < draft.size() && (int) pseudo_output.size() < n_ctx; ++j) {
|
||||
const llama_token ground_truth = inp_slice[pseudo_output.size()];
|
||||
const llama_token drafted = draft[j];
|
||||
|
||||
if (ground_truth != drafted) {
|
||||
break;
|
||||
}
|
||||
|
||||
++n_accept;
|
||||
pseudo_output.push_back(ground_truth);
|
||||
|
||||
{
|
||||
const int64_t t_start_draft_us = ggml_time_us();
|
||||
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
|
||||
t_draft_us += ggml_time_us() - t_start_draft_us;
|
||||
}
|
||||
}
|
||||
|
||||
// After each simulated batch decoding simulate the sampling of a single token:
|
||||
if ((int) pseudo_output.size() < n_ctx) {
|
||||
pseudo_output.push_back(inp_slice[pseudo_output.size()]);
|
||||
{
|
||||
const int64_t t_start_draft_us = ggml_time_us();
|
||||
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
|
||||
t_draft_us += ggml_time_us() - t_start_draft_us;
|
||||
}
|
||||
}
|
||||
|
||||
draft.erase(draft.begin());
|
||||
|
||||
}
|
||||
if (i_start > 0 && i_start / 100000 != (i_start - n_ctx) / 100000) {
|
||||
const int64_t t_now_ms = ggml_time_ms();
|
||||
const int64_t eta_ms = (n_input - i_start) * (t_now_ms - t_start_ms) / i_start;
|
||||
const int64_t eta_min = eta_ms / (60*1000);
|
||||
const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
|
||||
|
||||
LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
|
||||
}
|
||||
|
||||
// After each chunk, update the dynamic ngram cache with the context ngram cache:
|
||||
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
|
||||
ngram_cache_context.clear();
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("n_draft = %d\n", n_draft);
|
||||
LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx);
|
||||
LOG_TEE("n_drafted = %d\n", n_drafted);
|
||||
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
|
||||
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
|
||||
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
|
||||
LOG_TEE("n_accept = %d\n", n_accept);
|
||||
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,15 +1,12 @@
#include "common.h"
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "ngram-cache.h"

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <string>
#include <vector>
#include <unordered_map>

int main(int argc, char ** argv){
gpt_params params;
@@ -18,7 +15,11 @@ int main(int argc, char ** argv){
return 1;
}

// max. number of additional tokens to draft if match is found
// max/min n-grams size to search for in prompt
const int ngram_max = 4;
const int ngram_min = 1;

// length of the candidate / draft sequence, if match is found
const int n_draft = params.n_draft;

const bool dump_kv_cache = params.dump_kv_cache;
@@ -38,8 +39,6 @@ int main(int argc, char ** argv){

// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
llama_set_rng_seed(ctx, params.seed);
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

// tokenize the prompt
const bool add_bos = llama_should_add_bos_token(model);
@@ -48,35 +47,6 @@ int main(int argc, char ** argv){
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);

llama_ngram_cache ngram_cache_context;
llama_ngram_cache ngram_cache_dynamic;
llama_ngram_cache ngram_cache_static;
int64_t t_draft_flat_us = 0;
int64_t t_draft_us = 0;

{
// Fill up context ngram cache with tokens from user input:
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);

if (!params.lookup_cache_static.empty()) {
try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) {
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1);
}
}

if (!params.lookup_cache_dynamic.empty()) {
try {
ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
}

t_draft_flat_us += ggml_time_us() - t_start_draft_us;
}

const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;

@@ -106,6 +76,8 @@ int main(int argc, char ** argv){
int n_drafted = 0;
int n_accept = 0;

int64_t t_draft_us = 0;

int n_past = inp.size();

bool has_eos = false;
@@ -157,12 +129,6 @@ int main(int argc, char ** argv){
++n_past;
++i_dft;
inp.push_back(id);
{
// Update context ngram cache with the newly accepted token:
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}

if (params.use_color) {
// color accepted draft token
@@ -183,12 +149,6 @@ int main(int argc, char ** argv){
draft.clear();
draft.push_back(id);
inp.push_back(id);
{
// Update context ngram cache with the newly accepted token:
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
break;
}

@@ -203,19 +163,44 @@ int main(int argc, char ** argv){
llama_batch_clear(batch_tgt);
llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);

// Draft already contains a single token sampled from the model:
GGML_ASSERT(draft.size() == 1);
GGML_ASSERT(draft[0] == inp.back());
// generate n_pred tokens through prompt lookup
auto prompt_lookup = [&]() -> void {
const int inp_size = inp.size();
for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
const llama_token * ngram = &inp[inp_size - ngram_size];

for (int i = 0; i <= (int) inp_size - (ngram_size * 2); ++i) {
bool match = true;
for (int j = 0; j < ngram_size; ++j) {
if (inp[i + j] != ngram[j]) {
match = false;
break;
}
}

if (match) {
const int startIdx = i + ngram_size;
const int endIdx = startIdx + n_draft;
if (endIdx < inp_size) {
for (int j = startIdx; j < endIdx; ++j) {
LOG(" - draft candidate %d: %d\n", j, inp[j]);
draft.push_back(inp[j]);
llama_batch_add(batch_tgt, inp[j], n_past + (j - startIdx) + 1, { 0 }, true);
++n_drafted;
}
return;
}
}
}
}
return;
};

const int64_t t_start_draft_us = ggml_time_us();

llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);

for (size_t i = 1; i < draft.size(); ++i) {
llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
}
prompt_lookup();

t_draft_us += ggml_time_us() - t_start_draft_us;
n_drafted += draft.size() - 1;

llama_decode(ctx, batch_tgt);
++n_past;
@@ -225,24 +210,19 @@ int main(int argc, char ** argv){

auto t_dec_end = ggml_time_us();

// Update dynamic ngram cache with context ngram cache and save it to disk:
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);

LOG_TEE("\n\n");

LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));

LOG_TEE("\n");
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict);
LOG_TEE("n_drafted = %d\n", n_drafted);
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict);
LOG_TEE("n_drafted = %d\n", n_drafted);
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
LOG_TEE("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_TEE("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);

LOG_TEE("\ntarget:\n");
llama_print_timings(ctx);

@@ -189,18 +189,6 @@ static void prepare_imatrix(const std::string& imatrix_file,
}
}

static ggml_type parse_ggml_type(const char * arg) {
ggml_type result = GGML_TYPE_COUNT;
for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
auto type = ggml_type(j);
const auto * name = ggml_type_name(type);
if (name && strcmp(arg, name) == 0) {
result = type; break;
}
}
return result;
}

int main(int argc, char ** argv) {
if (argc < 3) {
usage(argv[0]);
@@ -215,18 +203,6 @@ int main(int argc, char ** argv) {
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
params.quantize_output_tensor = false;
} else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
if (arg_idx < argc-1) {
params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
if (arg_idx < argc-1) {
params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
params.allow_requantize = true;
} else if (strcmp(argv[arg_idx], "--pure") == 0) {

File diff suppressed because it is too large
@@ -48,7 +48,7 @@ export class SchemaConverter {
}

_formatLiteral(literal) {
const escaped = literal.replace(
const escaped = JSON.stringify(literal).replace(
GRAMMAR_LITERAL_ESCAPE_RE,
m => GRAMMAR_LITERAL_ESCAPES[m]
);
@@ -327,7 +327,10 @@ export class SchemaConverter {
}

_generateConstantRule(value) {
return this._formatLiteral(JSON.stringify(value));
if (typeof value !== 'string') {
throw new Error('Only string constants are supported, got ' + JSON.stringify(value));
}
return this._formatLiteral(value);
}

visit(schema, name) {
@@ -343,6 +346,9 @@ export class SchemaConverter {
} else if (Array.isArray(schemaType)) {
return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({ type: t }))));
} else if ('const' in schema) {
if (typeof schema.const !== 'string') {
throw new Error('Only string constants are supported, got ' + JSON.stringify(schema.const));
}
return this._addRule(ruleName, this._generateConstantRule(schema.const));
} else if ('enum' in schema) {
const rule = schema.enum.map(v => this._generateConstantRule(v)).join(' | ');
@@ -451,7 +457,7 @@ export class SchemaConverter {
const propRuleName = this.visit(propSchema, `${name ?? ''}${name ? '-' : ''}${propName}`);
propKvRuleNames[propName] = this._addRule(
`${name ?? ''}${name ? '-' : ''}${propName}-kv`,
`${this._formatLiteral(JSON.stringify(propName))} space ":" space ${propRuleName}`
`${this._formatLiteral(propName)} space ":" space ${propRuleName}`
);
}
const requiredProps = sortedProps.filter(k => required.has(k));

@@ -30,7 +30,7 @@
#include <signal.h>
#include <memory>

using json = nlohmann::ordered_json;
using json = nlohmann::json;

bool server_verbose = false;
bool server_log_json = true;

@@ -12,7 +12,7 @@

#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"

using json = nlohmann::ordered_json;
using json = nlohmann::json;

// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type {
@@ -95,8 +95,8 @@ static inline void server_log(const char *level, const char *function, int line,

const std::string str = ss.str();
printf("%.*s\n", (int)str.size(), str.data());
fflush(stdout);
}
fflush(stdout);
}

//

28
ggml-cuda.cu
@@ -771,11 +771,7 @@ GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t
if (src_ctx->device == dst_ctx->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
} else {
#ifdef GGML_CUDA_NO_PEER_COPY
return false;
#else
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
#endif
}
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
return true;
@@ -11326,23 +11322,19 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);

// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
} else {
#ifdef GGML_CUDA_NO_PEER_COPY
return false;
#else
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
#endif
}

// record event on src stream
if (!cuda_ctx_src->copy_event) {
ggml_cuda_set_device(cuda_ctx_src->device);
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
}

// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
} else {
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
}

// record event on src stream
CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));

// wait on dst stream for the copy to complete
@@ -11538,9 +11530,6 @@ GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const
}

static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
#ifdef GGML_CUDA_NO_PEER_COPY
return nullptr;
#else
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

ggml_cuda_set_device(cuda_ctx->device);
@@ -11552,7 +11541,6 @@ static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend)
/* .backend = */ backend,
/* .context = */ event,
};
#endif
}

static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {

529
llama.cpp
@@ -52,9 +52,6 @@
#define NOMINMAX
#endif
#include <windows.h>
#ifndef PATH_MAX
#define PATH_MAX MAX_PATH
#endif
#include <io.h>
#endif

@@ -293,10 +290,6 @@ enum llm_kv {
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
LLM_KV_ROPE_SCALING_FINETUNED,

LLM_KV_SPLIT_NO,
LLM_KV_SPLIT_COUNT,
LLM_KV_SPLIT_TENSORS_COUNT,

LLM_KV_SSM_INNER_SIZE,
LLM_KV_SSM_CONV_KERNEL,
LLM_KV_SSM_STATE_SIZE,
@@ -362,10 +355,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },

{ LLM_KV_SPLIT_NO, "split.no" },
{ LLM_KV_SPLIT_COUNT, "split.count" },
{ LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },

{ LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
{ LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
{ LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
@@ -1110,7 +1099,6 @@ struct llama_file {
}
}
};
using llama_files = std::vector<std::unique_ptr<llama_file>>;

struct llama_mmap {
void * addr;
@@ -1311,7 +1299,6 @@ struct llama_mmap {
}
#endif
};
using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;

// Represents some region of memory being locked using mlock or VirtualLock;
// will automatically unlock on destruction.
@@ -1461,7 +1448,6 @@ struct llama_mlock {
static void raw_unlock(const void * addr, size_t len) {}
#endif
};
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
std::vector<char> result(8, 0);
@@ -2037,12 +2023,12 @@ struct llama_model {
// the model memory buffers for the tensor data
std::vector<ggml_backend_buffer_t> bufs;

// model memory mapped files
llama_mmaps mappings;
// model memory mapped file
std::unique_ptr<llama_mmap> mapping;

// objects representing data potentially being locked in memory
llama_mlocks mlock_bufs;
llama_mlocks mlock_mmaps;
std::vector<std::unique_ptr<llama_mlock>> mlock_bufs;
llama_mlock mlock_mmap;

// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
@@ -2806,8 +2792,6 @@ namespace GGUFMeta {
};
}

using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;

struct llama_model_loader {
int n_kv = 0;
int n_tensors = 0;
@@ -2818,133 +2802,54 @@ struct llama_model_loader {

bool use_mmap = false;

llama_files files;
llama_file file;
llama_ftype ftype;
llama_fver fver;

llama_mmaps mappings;

// Holds information on a model weights
struct llama_tensor_weights {
uint16_t idx; // source file index
size_t offs; // tensor data offset in the original file

ggml_tensor * tensor;

llama_tensor_weights(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
}
};
std::vector<llama_tensor_weights> weights;

std::unique_ptr<llama_mmap> mapping;
std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;

struct gguf_context * meta = NULL;
std::vector<ggml_context *> contexts;
struct gguf_context * ctx_gguf = NULL;
struct ggml_context * ctx_meta = NULL;

std::string arch_name;
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
int trace = 0;
if (getenv("LLAMA_TRACE")) {
trace = atoi(getenv("LLAMA_TRACE"));
}

struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx_meta,
};

if (param_overrides_p != nullptr) {
for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
kv_overrides.insert({std::string(p->key), *p});
}
}

struct ggml_context * ctx = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
};

meta = gguf_init_from_file(fname.c_str(), params);
if (!meta) {
ctx_gguf = gguf_init_from_file(fname.c_str(), params);
if (!ctx_gguf) {
throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
}

get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));

// Save tensors data offset of the main file.
// For subsidiary files, `meta` tensor data offset must not be used,
// so we build a unified tensors index for weights.
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
weights.emplace_back(llama_tensor_weights(0, cur->name, meta, cur));
}
files.emplace_back(new llama_file(fname.c_str(), "rb"));
contexts.emplace_back(ctx);
n_kv = gguf_get_n_kv(ctx_gguf);
n_tensors = gguf_get_n_tensors(ctx_gguf);

uint16_t n_split = 0;
get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
fver = (enum llama_fver ) gguf_get_version(ctx_gguf);

// Load additional GGML contexts
if (n_split > 1) {
uint16_t idx = 0;
get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
if (idx != 0) {
throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
}

char split_prefix[PATH_MAX] = {0};
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) {
throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
}

if (trace > 0) {
LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
}

char split_path[PATH_MAX] = {0};
for (idx = 1; idx < n_split; idx++) {
llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);

struct gguf_init_params split_params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
if (!ctx_gguf) {
throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
}

// Save tensors data offset info of the shard.
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
weights.emplace_back(llama_tensor_weights(idx, cur->name, ctx_gguf, cur));
}
files.emplace_back(new llama_file(split_path, "rb"));
contexts.emplace_back(ctx);

gguf_free(ctx_gguf);
}

get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);

// sanity check
{
const int n_tensors_loaded = (int) weights.size();
if (n_tensors != n_tensors_loaded) {
throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
}
}

LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split);
}

n_kv = gguf_get_n_kv(meta);
n_tensors = weights.size();

fver = (enum llama_fver) gguf_get_version(meta);

for (auto & w : weights) {
n_elements += ggml_nelements(w.tensor);
n_bytes += ggml_nbytes(w.tensor);
for (int i = 0; i < n_tensors; i++) {
const char * name = gguf_get_tensor_name(ctx_gguf, i);
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
n_elements += ggml_nelements(t);
n_bytes += ggml_nbytes(t);
}

LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -2959,8 +2864,7 @@ struct llama_model_loader {
enum ggml_type type_max = GGML_TYPE_F32;

for (int i = 0; i < n_tensors; i++) {
const ggml_tensor * tensor = weights.at(i).tensor;
enum ggml_type type = tensor->type;
enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);

n_type[type]++;

@@ -2970,8 +2874,8 @@ struct llama_model_loader {
}

if (trace > 0) {
const uint16_t sid = weights.at(i).idx;
LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
}
}

@@ -3007,23 +2911,22 @@ struct llama_model_loader {
ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);

{
const int kid = gguf_find_key(meta, "general.file_type");
const int kid = gguf_find_key(ctx_gguf, "general.file_type");
if (kid >= 0) {
ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
}
}

LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);

for (int i = 0; i < n_kv; i++) {
const char * name = gguf_get_key(meta, i);
const enum gguf_type type = gguf_get_kv_type(meta, i);
const char * name = gguf_get_key(ctx_gguf, i);
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
: gguf_type_name(type);

std::string value = gguf_kv_to_str(meta, i);
std::string value = gguf_kv_to_str(ctx_gguf, i);
const size_t MAX_VALUE_LEN = 40;
if (value.size() > MAX_VALUE_LEN) {
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
@@ -3052,18 +2955,18 @@ struct llama_model_loader {
}

~llama_model_loader() {
if (meta) {
gguf_free(meta);
if (ctx_gguf) {
gguf_free(ctx_gguf);
}
for (auto * ctx : contexts) {
ggml_free(ctx);
if (ctx_meta) {
ggml_free(ctx_meta);
}
}

template<typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
get_arr_n(const std::string & key, T & result, const bool required = true) {
const int kid = gguf_find_key(meta, key.c_str());
const int kid = gguf_find_key(ctx_gguf, key.c_str());

if (kid < 0) {
if (required) {
@@ -3073,7 +2976,7 @@ struct llama_model_loader {
}

struct GGUFMeta::ArrayInfo arr_info =
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);

result = arr_info.length;
@@ -3093,7 +2996,7 @@ struct llama_model_loader {
const struct llama_model_kv_override * override =
it != kv_overrides.end() ? &it->second : nullptr;

const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);

if (required && !found) {
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
@@ -3116,33 +3019,20 @@ struct llama_model_loader {
}

const char * get_tensor_name(int i) const {
return weights.at(i).tensor->name;
}

const llama_tensor_weights & get_weights(const char * name) const {
for (const auto & weight : weights) {
if (strcmp(name, weight.tensor->name) == 0) {
return weight;
}
}
throw std::runtime_error(format("tensor %s not found", name));
return gguf_get_tensor_name(ctx_gguf, i);
}

struct ggml_tensor * get_tensor_meta(const char * name) const {
try {
return get_weights(name).tensor;
} catch (const std::runtime_error & e) {
return NULL;
}
return ggml_get_tensor(ctx_meta, name);
}

struct ggml_tensor * get_tensor_meta(int i) const {
return get_tensor_meta(get_tensor_name(i));
}

struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
ggml_set_name(tensor, ggml_get_name(cur));
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) {
|
||||
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
|
||||
ggml_set_name(tensor, ggml_get_name(meta));
|
||||
|
||||
n_created++;
|
||||
|
||||
@@ -3150,7 +3040,7 @@ struct llama_model_loader {
|
||||
}
|
||||
|
||||
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
|
||||
const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
|
||||
|
||||
if (cur == NULL) {
|
||||
if (!required) {
|
||||
@@ -3185,79 +3075,76 @@ struct llama_model_loader {
}
}

void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) {
size_t file_offset(const char * name) const {
const int idx = gguf_find_tensor(ctx_gguf, name);

if (idx < 0) {
throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name));
}

return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
}

void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
// prefetch the whole file - all the data is needed anyway
if (use_mmap) {
mappings.reserve(files.size());
mmaps_used.reserve(files.size());
for (const auto & file : files) {
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
mmaps_used.emplace_back(std::make_pair(mapping->size, 0));
if (mlock_mmaps) {
std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
mlock_mmap->init(mapping->addr);
mlock_mmaps->emplace_back(std::move(mlock_mmap));
}
mappings.emplace_back(std::move(mapping));
}
mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
}

// compute the total size of all tensors for progress reporting
for (auto & w : weights) {
size_data += ggml_nbytes(w.tensor);
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
size_data += ggml_nbytes(cur);
}

if (use_mmap && mapping) {
if (lmlock) {
lmlock->init(mapping->addr);
}
mmap_used_first = mapping->size;
}
}

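file_offset() above reduces to one piece of arithmetic: the absolute position of a tensor's bytes is the offset of the GGUF data section plus the per-tensor offset inside that section. A hedged sketch of reading one tensor straight from disk with plain stdio, assuming the gguf/ggml contexts were opened with no_alloc = true as in the loader (the function name and error handling are illustrative):

    #include "ggml.h"

    #include <cstdio>
    #include <cstdint>
    #include <vector>

    // sketch: read the raw bytes of one tensor using the same offset arithmetic as file_offset()
    static std::vector<uint8_t> read_tensor_bytes(FILE * f, struct gguf_context * ctx_gguf, struct ggml_context * ctx_meta, const char * name) {
        const int idx = gguf_find_tensor(ctx_gguf, name);
        if (idx < 0) {
            return {};
        }

        // absolute file position = start of the data section + offset of this tensor within it
        const size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);

        struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
        if (!t) {
            return {};
        }
        std::vector<uint8_t> buf(ggml_nbytes(t));

        fseek(f, (long) offs, SEEK_SET);
        if (fread(buf.data(), 1, buf.size(), f) != buf.size()) {
            buf.clear();
        }
        return buf;
    }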
void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
GGML_ASSERT(!mappings.empty());
const auto & mapping = mappings.at(idx);
void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
GGML_ASSERT(mapping);

*first = mapping->size;
*last  = 0;
*addr = mapping->addr;
for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
const auto & w = get_weights(ggml_get_name(tensor));
if (w.idx != idx) {
continue;
}
*first = std::min(*first, w.offs);
*last = std::max(*last, w.offs + ggml_nbytes(tensor));
const size_t offs = file_offset(ggml_get_name(tensor));
*first = std::min(*first, offs);
*last  = std::max(*last, offs + ggml_nbytes(tensor));
}
}

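get_mapping_range() computes the [first, last) byte window of a mapping that is actually touched by the tensors of a context; later in this diff that window is handed to ggml_backend_cpu_buffer_from_ptr() so that only the used part of the mmap is exposed as a backend buffer. A small sketch of that wrapping step, with placeholder names standing in for llama_mmap and the loader:

    #include "ggml-backend.h"

    // sketch: expose only the used [first, last) window of a whole-file mapping as a CPU backend buffer
    static ggml_backend_buffer_t buffer_over_mapping_range(void * mapping_addr, size_t first, size_t last) {
        if (first >= last) {
            return nullptr; // nothing in this mapping is used by the context
        }
        return ggml_backend_cpu_buffer_from_ptr((char *) mapping_addr + first, last - first);
    }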
// for backwards compatibility, does not support ggml-backend
void load_data_for(struct ggml_tensor * cur) const {
const auto & w = get_weights(ggml_get_name(cur));
const size_t offs = file_offset(ggml_get_name(cur));

if (use_mmap) {
const auto & mapping = mappings.at(w.idx);
if (use_mmap && mapping) {
if (cur->data == nullptr) {
cur->data = (uint8_t *)mapping->addr + w.offs;
cur->data = (uint8_t *)mapping->addr + offs;
} else {
memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
}
} else {
GGML_ASSERT(cur->data != nullptr);
GGML_ASSERT(w.idx < files.size());
const auto & file = files.at(w.idx);
file->seek(w.offs, SEEK_SET);
file->read_raw(cur->data, ggml_nbytes(cur));
file.seek(offs, SEEK_SET);
file.read_raw(cur->data, ggml_nbytes(cur));
}
}

size_t size_done = 0;
|
||||
size_t size_data = 0;
|
||||
std::vector<std::pair<size_t, size_t>> mmaps_used;
|
||||
size_t mmap_used_first = -1;
|
||||
size_t mmap_used_last = 0;
|
||||
|
||||
// Returns false if cancelled by progress_callback
|
||||
bool load_all_data(
|
||||
struct ggml_context * ctx,
|
||||
llama_buf_map & bufs_mmap,
|
||||
llama_mlocks * lmlocks,
|
||||
llama_progress_callback progress_callback,
|
||||
void * progress_callback_user_data) {
|
||||
GGML_ASSERT(size_data != 0 && "call init_mappings() first");
|
||||
bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) {
|
||||
GGML_ASSERT(size_data != 0 && "call init_mapping() first");
|
||||
|
||||
std::vector<no_init<uint8_t>> read_buf;
|
||||
|
||||
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
if (progress_callback) {
|
||||
if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
|
||||
@@ -3265,57 +3152,41 @@ struct llama_model_loader {
|
||||
}
|
||||
}
|
||||
|
||||
const auto & w = get_weights(ggml_get_name(cur));
|
||||
size_t n_size = ggml_nbytes(cur);
|
||||
const size_t offs = file_offset(ggml_get_name(cur));
|
||||
|
||||
if (use_mmap) {
|
||||
const auto & mapping = mappings.at(w.idx);
|
||||
ggml_backend_buffer_t buf_mmap = nullptr;
|
||||
if (bufs_mmap.count(w.idx)) {
|
||||
buf_mmap = bufs_mmap.at(w.idx);
|
||||
}
|
||||
GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
|
||||
if (use_mmap && mapping) {
|
||||
if (buf_mmap && cur->data == nullptr) {
|
||||
ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + w.offs);
|
||||
if (lmlocks) {
|
||||
const auto & lmlock = lmlocks->at(w.idx);
|
||||
lmlock->grow_to(w.offs + ggml_nbytes(cur));
|
||||
ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
|
||||
if (lmlock) {
|
||||
lmlock->grow_to(offs + ggml_nbytes(cur));
|
||||
}
|
||||
|
||||
auto & mmap_used = mmaps_used[w.idx];
|
||||
mmap_used.first = std::min(mmap_used.first, w.offs);
|
||||
mmap_used.second = std::max(mmap_used.second, w.offs + n_size);
|
||||
mmap_used_first = std::min(mmap_used_first, offs);
|
||||
mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur));
|
||||
} else {
|
||||
ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + w.offs, 0, n_size);
|
||||
ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(w.idx < files.size());
|
||||
const auto & file = files.at(w.idx);
|
||||
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
||||
file->seek(w.offs, SEEK_SET);
|
||||
file->read_raw(cur->data, ggml_nbytes(cur));
|
||||
file.seek(offs, SEEK_SET);
|
||||
file.read_raw(cur->data, ggml_nbytes(cur));
|
||||
} else {
|
||||
read_buf.resize(ggml_nbytes(cur));
|
||||
file->seek(w.offs, SEEK_SET);
|
||||
file->read_raw(read_buf.data(), ggml_nbytes(cur));
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
|
||||
file.seek(offs, SEEK_SET);
|
||||
file.read_raw(read_buf.data(), ggml_nbytes(cur));
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
|
||||
}
|
||||
}
|
||||
|
||||
size_done += n_size;
|
||||
size_done += ggml_nbytes(cur);
|
||||
}
|
||||
|
||||
// check if this is the last call and do final cleanup
|
||||
if (size_done >= size_data) {
|
||||
// unmap offloaded tensors and metadata
|
||||
if (use_mmap) {
|
||||
for (uint32_t idx = 0; idx < mappings.size(); idx++) {
|
||||
const auto & mmap_used = mmaps_used.at(idx);
|
||||
auto & mapping = mappings.at(idx);
|
||||
mapping->unmap_fragment(0, mmap_used.first);
|
||||
if (mmap_used.second != 0) {
|
||||
mapping->unmap_fragment(mmap_used.second, mapping->size);
|
||||
}
|
||||
if (use_mmap && mapping) {
|
||||
mapping->unmap_fragment(0, mmap_used_first);
|
||||
if (mmap_used_last != 0) {
|
||||
mapping->unmap_fragment(mmap_used_last, mapping->size);
|
||||
}
|
||||
}
|
||||
if (progress_callback) {
|
||||
@@ -3448,7 +3319,7 @@ static void llm_load_hparams(
|
||||
llama_model_loader & ml,
|
||||
llama_model & model) {
|
||||
auto & hparams = model.hparams;
|
||||
const gguf_context * ctx = ml.meta;
|
||||
const gguf_context * ctx = ml.ctx_gguf;
|
||||
|
||||
// get metadata as string
|
||||
for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
|
||||
@@ -3838,7 +3709,7 @@ static void llm_load_vocab(
|
||||
llama_model & model) {
|
||||
auto & vocab = model.vocab;
|
||||
|
||||
struct gguf_context * ctx = ml.meta;
|
||||
struct gguf_context * ctx = ml.ctx_gguf;
|
||||
|
||||
const auto kv = LLM_KV(model.arch);
|
||||
|
||||
@@ -4448,8 +4319,10 @@ static bool llm_load_tensors(
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
||||
|
||||
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
|
||||
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);
|
||||
if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
|
||||
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
|
||||
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
|
||||
}
|
||||
|
||||
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||
@@ -5151,97 +5024,56 @@ static bool llm_load_tensors(
|
||||
|
||||
ml.done_getting_tensors();
|
||||
|
||||
ml.init_mappings(true, &model.mlock_mmaps);
|
||||
model.mappings.reserve(ml.mappings.size());
|
||||
ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr);
|
||||
|
||||
// create the backend buffers
|
||||
std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
|
||||
ctx_bufs.reserve(ctx_map.size());
|
||||
|
||||
// Ensure we have enough capacity for the maximum backend buffer we will potentially create
|
||||
size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
|
||||
model.bufs.reserve(n_max_backend_buffer);
|
||||
std::vector<std::pair<ggml_context *, ggml_backend_buffer_t>> ctx_bufs;
|
||||
|
||||
for (auto & it : ctx_map) {
|
||||
ggml_backend_buffer_type_t buft = it.first;
|
||||
ggml_context * ctx = it.second;
|
||||
|
||||
llama_buf_map bufs;
|
||||
bufs.reserve(n_max_backend_buffer);
|
||||
ggml_context * ctx = it.second;
|
||||
ggml_backend_buffer_t buf = nullptr;
|
||||
|
||||
// only the mmap region containing the tensors in the model is mapped to the backend buffer
|
||||
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
|
||||
// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
|
||||
if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
|
||||
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
||||
void * addr = nullptr;
|
||||
size_t first, last;
|
||||
ml.get_mapping_range(&first, &last, &addr, idx, ctx);
|
||||
if (first >= last) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
|
||||
if (buf == nullptr) {
|
||||
throw std::runtime_error("unable to allocate backend CPU buffer");
|
||||
}
|
||||
model.bufs.push_back(buf);
|
||||
bufs.emplace(idx, buf);
|
||||
size_t first, last;
|
||||
ml.get_mapping_range(&first, &last, ctx);
|
||||
buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
if (n_layer >= n_gpu_layers) {
|
||||
ggml_backend_cuda_register_host_buffer(
|
||||
if (n_layer >= n_gpu_layers) {
|
||||
ggml_backend_cuda_register_host_buffer(
|
||||
ggml_backend_buffer_get_base(buf),
|
||||
ggml_backend_buffer_get_size(buf));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#ifdef GGML_USE_METAL
|
||||
else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
|
||||
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
||||
const size_t max_size = ggml_get_max_tensor_size(ctx);
|
||||
void * addr = nullptr;
|
||||
size_t first, last;
|
||||
ml.get_mapping_range(&first, &last, &addr, idx, ctx);
|
||||
if (first >= last) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
|
||||
if (buf == nullptr) {
|
||||
throw std::runtime_error("unable to allocate backend metal buffer");
|
||||
}
|
||||
model.bufs.push_back(buf);
|
||||
bufs.emplace(idx, buf);
|
||||
}
|
||||
const size_t max_size = ggml_get_max_tensor_size(ctx);
|
||||
size_t first, last;
|
||||
ml.get_mapping_range(&first, &last, ctx);
|
||||
buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size);
|
||||
}
|
||||
#endif
|
||||
else {
|
||||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
||||
if (buf == nullptr) {
|
||||
throw std::runtime_error("unable to allocate backend buffer");
|
||||
}
|
||||
model.bufs.push_back(buf);
|
||||
if (use_mlock && ggml_backend_buffer_is_host(buf)) {
|
||||
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
||||
if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
|
||||
model.mlock_bufs.emplace_back(new llama_mlock);
|
||||
auto & mlock_buf = model.mlock_bufs.back();
|
||||
mlock_buf->init (ggml_backend_buffer_get_base(buf));
|
||||
mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
|
||||
}
|
||||
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
||||
bufs.emplace(idx, buf);
|
||||
}
|
||||
}
|
||||
|
||||
if (bufs.empty()) {
|
||||
if (buf == nullptr) {
|
||||
throw std::runtime_error("failed to allocate buffer");
|
||||
}
|
||||
|
||||
for (auto & buf : bufs) {
|
||||
// indicate that this buffer contains weights
|
||||
// this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
|
||||
ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
||||
}
|
||||
|
||||
ctx_bufs.emplace_back(ctx, bufs);
|
||||
// indicate that this buffer contains weights
|
||||
// this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
|
||||
ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
||||
model.bufs.push_back(buf);
|
||||
ctx_bufs.emplace_back(ctx, buf);
|
||||
}
|
||||
|
||||
if (llama_supports_gpu_offload()) {
|
||||
@@ -5273,15 +5105,13 @@ static bool llm_load_tensors(
|
||||
// load tensor data
|
||||
for (auto & it : ctx_bufs) {
|
||||
ggml_context * ctx = it.first;
|
||||
auto & bufs = it.second;
|
||||
if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
|
||||
ggml_backend_buffer_t buf = it.second;
|
||||
if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
for (auto & mapping : ml.mappings) {
|
||||
model.mappings.emplace_back(std::move(mapping));
|
||||
}
|
||||
model.mapping = std::move(ml.mapping);
|
||||
|
||||
// loading time will be recalculate after the first eval, so
|
||||
// we take page faults deferred by mmap() into consideration
|
||||
@@ -12141,34 +11971,27 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||
// for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
|
||||
// with the quantization of the output tensor
|
||||
if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
|
||||
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
|
||||
new_type = qs.params->output_tensor_type;
|
||||
} else {
|
||||
int nx = tensor->ne[0];
|
||||
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
||||
new_type = GGML_TYPE_Q8_0;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
||||
new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
else if (new_type != GGML_TYPE_Q8_0) {
|
||||
new_type = GGML_TYPE_Q6_K;
|
||||
}
|
||||
int nx = tensor->ne[0];
|
||||
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
||||
new_type = GGML_TYPE_Q8_0;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
||||
new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
else if (new_type != GGML_TYPE_Q8_0) {
|
||||
new_type = GGML_TYPE_Q6_K;
|
||||
}
|
||||
} else if (name == "token_embd.weight") {
|
||||
if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
|
||||
new_type = qs.params->token_embedding_type;
|
||||
} else {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
|
||||
new_type = GGML_TYPE_Q2_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
||||
new_type = GGML_TYPE_IQ3_S;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
||||
new_type = GGML_TYPE_IQ3_S;
|
||||
}
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
|
||||
new_type = GGML_TYPE_Q2_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
||||
new_type = GGML_TYPE_IQ3_S;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
||||
new_type = GGML_TYPE_IQ3_S;
|
||||
}
|
||||
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
||||
@@ -12204,7 +12027,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
||||
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
|
||||
}
|
||||
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
||||
@@ -12479,7 +12308,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
#endif
|
||||
|
||||
llama_model_loader ml(fname_inp, use_mmap, NULL);
|
||||
ml.init_mappings(false); // no prefetching?
|
||||
ml.init_mapping(false); // no prefetching?
|
||||
|
||||
llama_model model;
|
||||
llm_load_arch(ml, model);
|
||||
@@ -12503,12 +12332,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
struct gguf_context * ctx_out = gguf_init_empty();
|
||||
|
||||
// copy the KV pairs from the input file
|
||||
gguf_set_kv (ctx_out, ml.meta);
|
||||
gguf_set_kv (ctx_out, ml.ctx_gguf);
|
||||
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
|
||||
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
|
||||
|
||||
for (int i = 0; i < ml.n_tensors; ++i) {
|
||||
const struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
||||
struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
||||
|
||||
const std::string name = ggml_get_name(meta);
|
||||
|
||||
@@ -12548,7 +12377,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
|
||||
// populate the original tensors so we get an initial meta data
|
||||
for (int i = 0; i < ml.n_tensors; ++i) {
|
||||
const struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
||||
struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
||||
gguf_add_tensor(ctx_out, meta);
|
||||
}
|
||||
|
||||
@@ -12753,7 +12582,7 @@ static int llama_apply_lora_from_file_internal(
|
||||
if (path_base_model) {
|
||||
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
||||
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
|
||||
ml->init_mappings(/*prefetch*/ false); // no prefetching
|
||||
ml->init_mapping(/*prefetch*/ false); // no prefetching
|
||||
}
|
||||
|
||||
struct tensor_meta {
|
||||
@@ -12874,7 +12703,7 @@ static int llama_apply_lora_from_file_internal(
|
||||
|
||||
ggml_tensor * base_t;
|
||||
if (ml) {
|
||||
if (!ml->get_tensor_meta(base_name.c_str())) {
|
||||
if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
|
||||
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
||||
return 1;
|
||||
}
|
||||
@@ -13058,8 +12887,6 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
||||
struct llama_model_quantize_params result = {
|
||||
/*.nthread =*/ 0,
|
||||
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
|
||||
/*.output_tensor_type =*/ GGML_TYPE_COUNT,
|
||||
/*.token_embedding_type =*/ GGML_TYPE_COUNT,
|
||||
/*.allow_requantize =*/ false,
|
||||
/*.quantize_output_tensor =*/ true,
|
||||
/*.only_copy =*/ false,
|
||||
@@ -14824,30 +14651,6 @@ LLAMA_API int32_t llama_chat_apply_template(
|
||||
return res;
|
||||
}

LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
return strlen(split_path);
}
return 0;
}

int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
std::string str_split_path(split_path);
char postfix[32];
snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
std::string str_postfix(postfix);

// check if dest ends with postfix
int size_prefix = str_split_path.size() - str_postfix.size();
if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
snprintf(dest, std::min((size_t) size_prefix, maxlen), "%s", split_path);
return size_prefix;
}

return 0;
}

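A short usage sketch for the two helpers above. Split indices are zero-based at the API level and printed one-based by the format string, so the first shard ends in "-00001-of-000NN.gguf"; the buffer sizes and model path below are illustrative only:

    #include "llama.h"

    #include <cstdio>

    int main() {
        const int n_split = 4;
        char split_path[512];

        for (int i = 0; i < n_split; i++) {
            llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", i, n_split);
            printf("shard %d -> %s\n", i, split_path);
        }

        // recover the common prefix from a shard name; only succeeds if split_no/split_count match the name
        char split_prefix[512];
        if (llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, n_split - 1, n_split) > 0) {
            printf("prefix   -> %s\n", split_prefix);
        }

        return 0;
    }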
struct llama_timings llama_get_timings(struct llama_context * ctx) {
|
||||
struct llama_timings result = {
|
||||
/*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
|
||||
|
||||
26
llama.h
@@ -275,15 +275,13 @@ extern "C" {

// model quantization parameters
typedef struct llama_model_quantize_params {
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // itoken embeddings tensor type
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type
void * imatrix; // pointer to importance matrix data
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type
void * imatrix; // pointer to importance matrix data
} llama_model_quantize_params;

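For the variant of the struct that carries the per-tensor overrides (output_tensor_type / token_embedding_type), here is a hedged sketch of driving a quantization run; the file names and chosen types are examples only, not taken from this change:

    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_quantize_params params = llama_model_quantize_default_params();

        params.nthread              = 8;
        params.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        params.output_tensor_type   = GGML_TYPE_Q6_K; // override just output.weight
        params.token_embedding_type = GGML_TYPE_Q8_0; // override just token_embd.weight

        // returns 0 on success
        const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params);

        llama_backend_free();
        return (int) rc;
    }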
// grammar types
@@ -962,16 +960,6 @@ extern "C" {
int32_t n_past,
int32_t n_predict);

/// @details Build a split GGUF final path for this chunk.
///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
//  Returns the split_path length.
LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);

/// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
//  Returns the split_prefix length.
LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);

// Performance information
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);


@@ -1,10 +0,0 @@
#!/bin/bash

wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip

echo "Usage:"
echo ""
echo " ./perplexity -m model.gguf -f wiki.test.raw [other params]"
echo ""

exit 0
@@ -90,7 +90,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
|
||||
test({
|
||||
FAILURE,
|
||||
"invalid type",
|
||||
"invalid type type",
|
||||
R"""({
|
||||
"type": 123
|
||||
})""",
|
||||
@@ -193,27 +193,21 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
});
|
||||
|
||||
test({
|
||||
SUCCESS,
|
||||
FAILURE,
|
||||
"non-string const",
|
||||
R"""({
|
||||
"const": 123
|
||||
})""",
|
||||
R"""(
|
||||
root ::= "123"
|
||||
space ::= " "?
|
||||
)"""
|
||||
""
|
||||
});
|
||||
|
||||
test({
|
||||
SUCCESS,
|
||||
FAILURE,
|
||||
"non-string enum",
|
||||
R"""({
|
||||
"enum": ["red", "amber", "green", null, 42, ["foo"]]
|
||||
"enum": [123]
|
||||
})""",
|
||||
R"""(
|
||||
root ::= "\"red\"" | "\"amber\"" | "\"green\"" | "null" | "42" | "[\"foo\"]"
|
||||
space ::= " "?
|
||||
)"""
|
||||
""
|
||||
});
|
||||
|
||||
test({
|
||||
@@ -384,18 +378,20 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
|
||||
test({
|
||||
SUCCESS,
|
||||
"required props in original order",
|
||||
"required props",
|
||||
R"""({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"b": {"type": "string"},
|
||||
"c": {"type": "string"},
|
||||
"a": {"type": "string"}
|
||||
"a": {
|
||||
"type": "string"
|
||||
},
|
||||
"b": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"a",
|
||||
"b",
|
||||
"c"
|
||||
"b"
|
||||
],
|
||||
"additionalProperties": false,
|
||||
"definitions": {}
|
||||
@@ -403,8 +399,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
R"""(
|
||||
a-kv ::= "\"a\"" space ":" space string
|
||||
b-kv ::= "\"b\"" space ":" space string
|
||||
c-kv ::= "\"c\"" space ":" space string
|
||||
root ::= "{" space b-kv "," space c-kv "," space a-kv "}" space
|
||||
root ::= "{" space a-kv "," space b-kv "}" space
|
||||
space ::= " "?
|
||||
string ::= "\"" (
|
||||
[^"\\] |
|
||||
@@ -463,13 +458,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
|
||||
test({
|
||||
SUCCESS,
|
||||
"required + optional props each in original order",
|
||||
"required + optional props",
|
||||
R"""({
|
||||
"properties": {
|
||||
"b": {"type": "string"},
|
||||
"a": {"type": "string"},
|
||||
"d": {"type": "string"},
|
||||
"c": {"type": "string"}
|
||||
"b": {"type": "string"},
|
||||
"c": {"type": "string"},
|
||||
"d": {"type": "string"}
|
||||
},
|
||||
"required": ["a", "b"],
|
||||
"additionalProperties": false
|
||||
@@ -478,14 +473,14 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
a-kv ::= "\"a\"" space ":" space string
|
||||
b-kv ::= "\"b\"" space ":" space string
|
||||
c-kv ::= "\"c\"" space ":" space string
|
||||
c-rest ::= ( "," space d-kv )?
|
||||
d-kv ::= "\"d\"" space ":" space string
|
||||
d-rest ::= ( "," space c-kv )?
|
||||
root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
|
||||
root ::= "{" space a-kv "," space b-kv ( "," space ( c-kv c-rest | d-kv ) )? "}" space
|
||||
space ::= " "?
|
||||
string ::= "\"" (
|
||||
[^"\\] |
|
||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
|
||||
)* "\"" space
|
||||
)* "\"" space
|
||||
)"""
|
||||
});
|
||||
|
||||
@@ -653,16 +648,16 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"$ref": "#/definitions/MyType",
|
||||
"definitions": {
|
||||
"MyType": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"a": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"a"
|
||||
],
|
||||
"additionalProperties": false
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"a": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"a"
|
||||
],
|
||||
"additionalProperties": false
|
||||
}
|
||||
}
|
||||
})""",
|
||||
@@ -688,10 +683,10 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
],
|
||||
"definitions": {
|
||||
"foo": {
|
||||
"properties": {"a": {"type": "number"}}
|
||||
"properties": {"a": {"type": "number"}}
|
||||
},
|
||||
"bar": {
|
||||
"properties": {"b": {"type": "number"}}
|
||||
"properties": {"b": {"type": "number"}}
|
||||
}
|
||||
},
|
||||
"type": "object"
|
||||
@@ -725,16 +720,16 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
],
|
||||
"definitions": {
|
||||
"foo": {
|
||||
"properties": {"a": {"type": "number"}}
|
||||
"properties": {"a": {"type": "number"}}
|
||||
},
|
||||
"bar": {
|
||||
"properties": {"b": {"type": "number"}}
|
||||
"properties": {"b": {"type": "number"}}
|
||||
},
|
||||
"bam": {
|
||||
"properties": {"c": {"type": "number"}}
|
||||
"properties": {"c": {"type": "number"}}
|
||||
},
|
||||
"baz": {
|
||||
"properties": {"d": {"type": "number"}}
|
||||
"properties": {"d": {"type": "number"}}
|
||||
}
|
||||
},
|
||||
"type": "object"
|
||||
@@ -762,15 +757,15 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"properties": {
|
||||
"number": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"root": {
|
||||
"type": "number"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"root"
|
||||
],
|
||||
"additionalProperties": false
|
||||
"properties": {
|
||||
"root": {
|
||||
"type": "number"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"root"
|
||||
],
|
||||
"additionalProperties": false
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
@@ -799,40 +794,27 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
}
|
||||
|
||||
int main() {
|
||||
fprintf(stderr, "LLAMA_NODE_AVAILABLE = %s\n", getenv("LLAMA_NODE_AVAILABLE") ? "true" : "false");
|
||||
fprintf(stderr, "LLAMA_PYTHON_AVAILABLE = %s\n", getenv("LLAMA_PYTHON_AVAILABLE") ? "true" : "false");
|
||||
|
||||
test_all("C++", [](const TestCase & tc) {
|
||||
try {
|
||||
tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema)));
|
||||
tc.verify(json_schema_to_grammar(nlohmann::json::parse(tc.schema)));
|
||||
tc.verify_status(SUCCESS);
|
||||
} catch (const std::runtime_error & ex) {
|
||||
fprintf(stderr, "Error: %s\n", ex.what());
|
||||
tc.verify_status(FAILURE);
|
||||
}
|
||||
});
|
||||
|
||||
if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python --version") == 0)) {
|
||||
test_all("Python", [](const TestCase & tc) {
|
||||
write("test-json-schema-input.tmp", tc.schema);
|
||||
tc.verify_status(std::system(
|
||||
"python ./examples/json-schema-to-grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
|
||||
tc.verify(read("test-grammar-output.tmp"));
|
||||
});
|
||||
} else {
|
||||
fprintf(stderr, "\033[33mWARNING: Python not found, skipping Python JSON schema -> grammar tests.\n\033[0m");
|
||||
}
|
||||
|
||||
if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) {
|
||||
test_all("JavaScript", [](const TestCase & tc) {
|
||||
write("test-json-schema-input.tmp", tc.schema);
|
||||
tc.verify_status(std::system(
|
||||
"node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
|
||||
tc.verify(read("test-grammar-output.tmp"));
|
||||
});
|
||||
} else {
|
||||
fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m");
|
||||
}
|
||||
//test_all("Python", [](const TestCase & tc) {
|
||||
// write("test-json-schema-input.tmp", tc.schema);
|
||||
// tc.verify_status(std::system(
|
||||
// "python ./examples/json-schema-to-grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
|
||||
// tc.verify(read("test-grammar-output.tmp"));
|
||||
//});
|
||||
//test_all("JavaScript", [](const TestCase & tc) {
|
||||
// write("test-json-schema-input.tmp", tc.schema);
|
||||
// tc.verify_status(std::system(
|
||||
// "node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
|
||||
// tc.verify(read("test-grammar-output.tmp"));
|
||||
//});
|
||||
|
||||
test_all("Check Expectations Validity", [](const TestCase & tc) {
|
||||
if (tc.expected_status == SUCCESS) {
|
||||
|