mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-26 14:23:22 +02:00
Compare commits
30 Commits
master-234
...
master-672
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
672dda10e4 | ||
|
|
27ab66e437 | ||
|
|
6e7cca4047 | ||
|
|
a6803cab94 | ||
|
|
7dabc66f3c | ||
|
|
7cdd30bf1f | ||
|
|
e8035f141e | ||
|
|
7513b7b0a1 | ||
|
|
de8342423d | ||
|
|
c48c525f87 | ||
|
|
206e01de11 | ||
|
|
4304bd3cde | ||
|
|
229aab351c | ||
|
|
697966680b | ||
|
|
27ad57a69b | ||
|
|
32c5411631 | ||
|
|
ff5d58faec | ||
|
|
b782422a3e | ||
|
|
1cbf561466 | ||
|
|
975221e954 | ||
|
|
4523d10d0c | ||
|
|
680e6f9177 | ||
|
|
4e7464ef88 | ||
|
|
2b5eb72e10 | ||
|
|
f7d278faf3 | ||
|
|
20d7740a9b | ||
|
|
5bf2a27718 | ||
|
|
c9c74b4e3f | ||
|
|
3ec7e596b2 | ||
|
|
917831c63a |
@@ -10,13 +10,13 @@ shift
|
||||
# Join the remaining arguments into a single string
|
||||
arg2="$@"
|
||||
|
||||
if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
|
||||
python3 ./convert.py $arg2
|
||||
elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
|
||||
./quantize $arg2
|
||||
elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
|
||||
./main $arg2
|
||||
elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
|
||||
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
|
||||
python3 ./convert.py "$arg2"
|
||||
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
|
||||
./quantize "$arg2"
|
||||
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
|
||||
./main "$arg2"
|
||||
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
|
||||
echo "Converting PTH to GGML..."
|
||||
for i in `ls $1/$2/ggml-model-f16.bin*`; do
|
||||
if [ -f "${i/f16/q4_0}" ]; then
|
||||
@@ -26,6 +26,8 @@ elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
|
||||
./quantize "$i" "${i/f16/q4_0}" q4_0
|
||||
fi
|
||||
done
|
||||
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
|
||||
./server "$arg2"
|
||||
else
|
||||
echo "Unknown command: $arg1"
|
||||
echo "Available commands: "
|
||||
@@ -37,4 +39,6 @@ else
|
||||
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
|
||||
echo " --all-in-one (-a): Execute --convert & --quantize"
|
||||
echo " ex: \"/models/\" 7B"
|
||||
echo " --server (-s): Run a model on the server"
|
||||
echo " ex: -m /models/7B/ggml-model-q4_0.bin -c 2048 -ngl 43 -mg 1 --port 8080"
|
||||
fi
|
||||
|
||||
@@ -272,7 +272,7 @@ if (LLAMA_CUBLAS)
|
||||
|
||||
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||
if (LLAMA_CUDA_DMMV_F16)
|
||||
set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
|
||||
endif()
|
||||
|
||||
20
Makefile
20
Makefile
@@ -151,14 +151,11 @@ ifdef LLAMA_MPI
|
||||
CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
|
||||
CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
|
||||
OBJS += ggml-mpi.o
|
||||
|
||||
ggml-mpi.o: ggml-mpi.c ggml-mpi.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif # LLAMA_MPI
|
||||
|
||||
ifdef LLAMA_OPENBLAS
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
|
||||
LDFLAGS += -lopenblas
|
||||
CFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags openblas)
|
||||
LDFLAGS += $(shell pkg-config --libs openblas)
|
||||
endif # LLAMA_OPENBLAS
|
||||
|
||||
ifdef LLAMA_BLIS
|
||||
@@ -226,9 +223,6 @@ ifdef LLAMA_METAL
|
||||
CXXFLAGS += -DGGML_USE_METAL
|
||||
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
|
||||
OBJS += ggml-metal.o
|
||||
|
||||
ggml-metal.o: ggml-metal.m ggml-metal.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif # LLAMA_METAL
|
||||
|
||||
ifneq ($(filter aarch64%,$(UNAME_M)),)
|
||||
@@ -253,6 +247,16 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
|
||||
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
|
||||
endif
|
||||
|
||||
ifdef LLAMA_METAL
|
||||
ggml-metal.o: ggml-metal.m ggml-metal.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif # LLAMA_METAL
|
||||
|
||||
ifdef LLAMA_MPI
|
||||
ggml-mpi.o: ggml-mpi.c ggml-mpi.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif # LLAMA_MPI
|
||||
|
||||
ifdef LLAMA_NO_K_QUANTS
|
||||
k_quants.o: k_quants.c k_quants.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
@@ -239,7 +239,7 @@ In order to build llama.cpp you have three different options.
|
||||
- Using `Zig`:
|
||||
|
||||
```bash
|
||||
zig build -Drelease-fast
|
||||
zig build -Doptimize=ReleaseFast
|
||||
```
|
||||
|
||||
### Metal Build
|
||||
@@ -640,7 +640,7 @@ Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files t
|
||||
|
||||
```bash
|
||||
# run the verification script
|
||||
python3 .\scripts\verify-checksum-models.py
|
||||
./scripts/verify-checksum-models.py
|
||||
```
|
||||
|
||||
- On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory:
|
||||
|
||||
32
build.zig
32
build.zig
@@ -1,9 +1,19 @@
|
||||
const std = @import("std");
|
||||
const commit_hash = @embedFile(".git/refs/heads/master");
|
||||
|
||||
// Zig Version: 0.11.0-dev.3379+629f0d23b
|
||||
// Zig Version: 0.11.0-dev.3986+e05c242cd
|
||||
pub fn build(b: *std.build.Builder) void {
|
||||
const target = b.standardTargetOptions(.{});
|
||||
const optimize = b.standardOptimizeOption(.{});
|
||||
|
||||
const config_header = b.addConfigHeader(
|
||||
.{ .style = .blank, .include_path = "build-info.h" },
|
||||
.{
|
||||
.BUILD_NUMBER = 0,
|
||||
.BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
|
||||
},
|
||||
);
|
||||
|
||||
const lib = b.addStaticLibrary(.{
|
||||
.name = "llama",
|
||||
.target = target,
|
||||
@@ -13,24 +23,21 @@ pub fn build(b: *std.build.Builder) void {
|
||||
lib.linkLibCpp();
|
||||
lib.addIncludePath(".");
|
||||
lib.addIncludePath("./examples");
|
||||
lib.addCSourceFiles(&.{
|
||||
"ggml.c",
|
||||
}, &.{"-std=c11"});
|
||||
lib.addCSourceFiles(&.{
|
||||
"llama.cpp",
|
||||
}, &.{"-std=c++11"});
|
||||
lib.addConfigHeader(config_header);
|
||||
lib.addCSourceFiles(&.{"ggml.c"}, &.{"-std=c11"});
|
||||
lib.addCSourceFiles(&.{"llama.cpp"}, &.{"-std=c++11"});
|
||||
b.installArtifact(lib);
|
||||
|
||||
const examples = .{
|
||||
"main",
|
||||
"baby-llama",
|
||||
"embedding",
|
||||
// "metal",
|
||||
"metal",
|
||||
"perplexity",
|
||||
"quantize",
|
||||
"quantize-stats",
|
||||
"save-load-state",
|
||||
// "server",
|
||||
"server",
|
||||
"simple",
|
||||
"train-text-from-scratch",
|
||||
};
|
||||
@@ -43,16 +50,19 @@ pub fn build(b: *std.build.Builder) void {
|
||||
});
|
||||
exe.addIncludePath(".");
|
||||
exe.addIncludePath("./examples");
|
||||
exe.addConfigHeader(config_header);
|
||||
exe.addCSourceFiles(&.{
|
||||
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}),
|
||||
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{ example_name, example_name }),
|
||||
"examples/common.cpp",
|
||||
}, &.{"-std=c++11"});
|
||||
exe.linkLibrary(lib);
|
||||
b.installArtifact(exe);
|
||||
|
||||
const run_cmd = b.addRunArtifact(exe);
|
||||
run_cmd.step.dependOn(b.getInstallStep());
|
||||
if (b.args) |args| run_cmd.addArgs(args);
|
||||
const run_step = b.step("run_" ++ example_name, "Run the app");
|
||||
|
||||
const run_step = b.step("run-" ++ example_name, "Run the app");
|
||||
run_step.dependOn(&run_cmd.step);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -168,6 +168,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.n_ctx = std::stoi(argv[i]);
|
||||
} else if (arg == "--rope-freq-base") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rope_freq_base = std::stof(argv[i]);
|
||||
} else if (arg == "--rope-freq-scale") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rope_freq_scale = std::stof(argv[i]);
|
||||
} else if (arg == "--memory-f32") {
|
||||
params.memory_f16 = false;
|
||||
} else if (arg == "--top-p") {
|
||||
@@ -236,6 +248,24 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.mirostat_tau = std::stof(argv[i]);
|
||||
} else if (arg == "--cfg-negative-prompt") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.cfg_negative_prompt = argv[i];
|
||||
} else if (arg == "--cfg-scale") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.cfg_scale = std::stof(argv[i]);
|
||||
} else if (arg == "--cfg-smooth-factor") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.cfg_smooth_factor = std::stof(argv[i]);
|
||||
} else if (arg == "-b" || arg == "--batch-size") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -267,6 +297,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.lora_adapter = argv[i];
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--lora-base") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -469,7 +500,13 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stderr, " modifies the likelihood of token appearing in the completion,\n");
|
||||
fprintf(stderr, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
|
||||
fprintf(stderr, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
|
||||
fprintf(stderr, " --cfg-negative-prompt PROMPT \n");
|
||||
fprintf(stderr, " negative prompt to use for guidance. (default: empty)\n");
|
||||
fprintf(stderr, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
|
||||
fprintf(stderr, " --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor);
|
||||
fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
|
||||
fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
|
||||
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
|
||||
fprintf(stderr, " --no-penalize-nl do not penalize newline token\n");
|
||||
fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
@@ -498,7 +535,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stderr, " --mtest compute maximum memory usage\n");
|
||||
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
|
||||
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
|
||||
fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
|
||||
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
fprintf(stderr, " -m FNAME, --model FNAME\n");
|
||||
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
|
||||
@@ -535,7 +572,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
|
||||
return res;
|
||||
}
|
||||
|
||||
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
|
||||
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.n_ctx = params.n_ctx;
|
||||
@@ -550,6 +587,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
||||
lparams.use_mlock = params.use_mlock;
|
||||
lparams.logits_all = params.perplexity;
|
||||
lparams.embedding = params.embedding;
|
||||
lparams.rope_freq_base = params.rope_freq_base;
|
||||
lparams.rope_freq_scale = params.rope_freq_scale;
|
||||
|
||||
return lparams;
|
||||
}
|
||||
|
||||
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
|
||||
auto lparams = llama_context_params_from_gpt_params(params);
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
|
||||
if (model == NULL) {
|
||||
|
||||
@@ -32,6 +32,8 @@ struct gpt_params {
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
float rope_freq_base = 10000.0f; // RoPE base frequency
|
||||
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
|
||||
|
||||
// sampling parameters
|
||||
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
||||
@@ -48,6 +50,12 @@ struct gpt_params {
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
|
||||
// Classifier-Free Guidance
|
||||
// https://arxiv.org/abs/2306.17806
|
||||
std::string cfg_negative_prompt; // string to help guidance
|
||||
float cfg_scale = 1.f; // How strong is guidance
|
||||
float cfg_smooth_factor = 1.f; // Smooth factor between old and new logits
|
||||
|
||||
std::string model = "models/7B/ggml-model.bin"; // model path
|
||||
std::string model_alias = "unknown"; // model alias
|
||||
std::string prompt = "";
|
||||
@@ -99,6 +107,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
|
||||
//
|
||||
|
||||
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
|
||||
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
|
||||
|
||||
//
|
||||
// Console utils
|
||||
|
||||
@@ -17,7 +17,7 @@ make
|
||||
import torch
|
||||
|
||||
bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
|
||||
pth_path = "./examples/embd_input/llava_projection.pth"
|
||||
pth_path = "./examples/embd-input/llava_projection.pth"
|
||||
|
||||
dic = torch.load(bin_path)
|
||||
used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
|
||||
|
||||
@@ -59,7 +59,7 @@ if __name__=="__main__":
|
||||
# Also here can use pytorch_model-00003-of-00003.bin directly.
|
||||
a.load_projection(os.path.join(
|
||||
os.path.dirname(__file__) ,
|
||||
"llava_projetion.pth"))
|
||||
"llava_projection.pth"))
|
||||
respose = a.chat_with_image(
|
||||
Image.open("./media/llama1-logo.png").convert('RGB'),
|
||||
"what is the text in the picture?")
|
||||
|
||||
@@ -293,5 +293,5 @@ These options provide extra functionality and customization when running the LLa
|
||||
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
|
||||
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
|
||||
- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
||||
|
||||
@@ -84,9 +84,17 @@ int main(int argc, char ** argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.rope_freq_base != 10000.0) {
|
||||
fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
|
||||
}
|
||||
|
||||
if (params.rope_freq_scale != 1.0) {
|
||||
fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
|
||||
}
|
||||
|
||||
if (params.n_ctx > 2048) {
|
||||
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
|
||||
"expect poor results\n", __func__, params.n_ctx);
|
||||
fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified);"
|
||||
" you are on your own\n", __func__, params.n_ctx);
|
||||
} else if (params.n_ctx < 8) {
|
||||
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||
params.n_ctx = 8;
|
||||
@@ -109,10 +117,16 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
llama_context * ctx_guidance = NULL;
|
||||
g_ctx = &ctx;
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
if (params.cfg_scale > 1.f) {
|
||||
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
||||
ctx_guidance = llama_new_context_with_model(model, lparams);
|
||||
}
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||
return 1;
|
||||
@@ -183,15 +197,28 @@ int main(int argc, char ** argv) {
|
||||
// tokenize the prompt
|
||||
std::vector<llama_token> embd_inp;
|
||||
|
||||
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
|
||||
// Add a space in front of the first character to match OG llama tokenizer behavior
|
||||
params.prompt.insert(0, 1, ' ');
|
||||
// Add a space in front of the first character to match OG llama tokenizer behavior
|
||||
params.prompt.insert(0, 1, ' ');
|
||||
|
||||
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
|
||||
embd_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||
} else {
|
||||
embd_inp = session_tokens;
|
||||
}
|
||||
|
||||
// Tokenize negative prompt
|
||||
std::vector<llama_token> guidance_inp;
|
||||
int guidance_offset = 0;
|
||||
int original_prompt_len = 0;
|
||||
if (ctx_guidance) {
|
||||
params.cfg_negative_prompt.insert(0, 1, ' ');
|
||||
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, true);
|
||||
|
||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||
original_prompt_len = original_inp.size();
|
||||
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
|
||||
}
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||
@@ -258,6 +285,16 @@ int main(int argc, char ** argv) {
|
||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
|
||||
}
|
||||
|
||||
if (ctx_guidance) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
|
||||
fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
||||
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
||||
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
|
||||
}
|
||||
}
|
||||
|
||||
if (params.n_keep > 0) {
|
||||
fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
|
||||
for (int i = 0; i < params.n_keep; i++) {
|
||||
@@ -334,11 +371,13 @@ int main(int argc, char ** argv) {
|
||||
int n_remain = params.n_predict;
|
||||
int n_consumed = 0;
|
||||
int n_session_consumed = 0;
|
||||
int n_past_guidance = 0;
|
||||
|
||||
// the first thing we will do is to output the prompt, so set color accordingly
|
||||
console_set_color(con_st, CONSOLE_COLOR_PROMPT);
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
std::vector<llama_token> embd_guidance;
|
||||
|
||||
// do one empty run to warm up the model
|
||||
{
|
||||
@@ -367,11 +406,12 @@ int main(int argc, char ** argv) {
|
||||
// if we run out of context:
|
||||
// - take the n_keep first tokens from the original prompt (via n_past)
|
||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||
if (n_past + (int) embd.size() > n_ctx) {
|
||||
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
|
||||
const int n_left = n_past - params.n_keep;
|
||||
|
||||
// always keep the first token - BOS
|
||||
n_past = std::max(1, params.n_keep);
|
||||
n_past_guidance = std::max(1, params.n_keep + guidance_offset);
|
||||
|
||||
// insert n_left/2 tokens at the start of embd from last_n_tokens
|
||||
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
|
||||
@@ -412,6 +452,48 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// evaluate tokens in batches
|
||||
// embd is typically prepared beforehand to fit within a batch, but not always
|
||||
|
||||
if (ctx_guidance) {
|
||||
int input_size = 0;
|
||||
llama_token* input_buf = NULL;
|
||||
|
||||
if (n_past_guidance < (int) guidance_inp.size()) {
|
||||
// Guidance context should have the same data with these modifications:
|
||||
//
|
||||
// * Replace the initial prompt
|
||||
// * Shift everything by guidance_offset
|
||||
embd_guidance = guidance_inp;
|
||||
if (embd.begin() + original_prompt_len < embd.end()) {
|
||||
embd_guidance.insert(
|
||||
embd_guidance.end(),
|
||||
embd.begin() + original_prompt_len,
|
||||
embd.end()
|
||||
);
|
||||
}
|
||||
|
||||
input_buf = embd_guidance.data();
|
||||
input_size = embd_guidance.size();
|
||||
//fprintf(stderr, "\n---------------------\n");
|
||||
//for (int i = 0; i < (int) embd_guidance.size(); i++) {
|
||||
//fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i]));
|
||||
//}
|
||||
//fprintf(stderr, "\n---------------------\n");
|
||||
} else {
|
||||
input_buf = embd.data();
|
||||
input_size = embd.size();
|
||||
}
|
||||
|
||||
for (int i = 0; i < input_size; i += params.n_batch) {
|
||||
int n_eval = std::min(input_size - i, params.n_batch);
|
||||
if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
n_past_guidance += n_eval;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
|
||||
int n_eval = (int) embd.size() - i;
|
||||
if (n_eval > params.n_batch) {
|
||||
@@ -431,6 +513,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
embd.clear();
|
||||
embd_guidance.clear();
|
||||
|
||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||
// out of user input, sample next token
|
||||
@@ -473,6 +556,10 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
if (ctx_guidance) {
|
||||
llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale, params.cfg_smooth_factor);
|
||||
}
|
||||
|
||||
// Apply penalties
|
||||
float nl_logit = logits[llama_token_nl()];
|
||||
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
|
||||
@@ -668,6 +755,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
if (ctx_guidance) { llama_free(ctx_guidance); }
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ Command line options:
|
||||
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
|
||||
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
|
||||
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
||||
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
|
||||
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
|
||||
@@ -66,6 +66,7 @@ Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the
|
||||
```sh
|
||||
curl --request POST \
|
||||
--url http://localhost:8080/completion \
|
||||
--header "Content-Type: application/json" \
|
||||
--data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
|
||||
```
|
||||
|
||||
|
||||
@@ -32,6 +32,7 @@ tokenize() {
|
||||
--silent \
|
||||
--request POST \
|
||||
--url "${API_URL}/tokenize" \
|
||||
--header "Content-Type: application/json" \
|
||||
--data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
|
||||
| jq '.tokens[]'
|
||||
}
|
||||
@@ -64,6 +65,7 @@ chat_completion() {
|
||||
--no-buffer \
|
||||
--request POST \
|
||||
--url "${API_URL}/completion" \
|
||||
--header "Content-Type: application/json" \
|
||||
--data-raw "${DATA}")
|
||||
|
||||
printf "\n"
|
||||
|
||||
@@ -608,6 +608,8 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
fprintf(stderr, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
|
||||
fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
|
||||
fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
@@ -632,7 +634,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
|
||||
fprintf(stderr, " -a ALIAS, --alias ALIAS\n");
|
||||
fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n");
|
||||
fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
|
||||
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
|
||||
fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port);
|
||||
@@ -722,6 +724,22 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
}
|
||||
params.n_ctx = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "--rope-freq-base")
|
||||
{
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rope_freq_base = std::stof(argv[i]);
|
||||
}
|
||||
else if (arg == "--rope-freq-scale")
|
||||
{
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rope_freq_scale = std::stof(argv[i]);
|
||||
}
|
||||
else if (arg == "--memory-f32" || arg == "--memory_f32")
|
||||
{
|
||||
params.memory_f16 = false;
|
||||
@@ -820,6 +838,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
break;
|
||||
}
|
||||
params.lora_adapter = argv[i];
|
||||
params.use_mmap = false;
|
||||
}
|
||||
else if (arg == "--lora-base")
|
||||
{
|
||||
|
||||
@@ -1354,17 +1354,9 @@ struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) {
|
||||
}
|
||||
}
|
||||
|
||||
if (t->src0) {
|
||||
expand(g, t->src0);
|
||||
}
|
||||
|
||||
if (t->src1) {
|
||||
expand(g, t->src1);
|
||||
}
|
||||
|
||||
for (int i = 0; i < GGML_MAX_OPT; ++i) {
|
||||
if (t->opt[i]) {
|
||||
expand(g, t->opt[i]);
|
||||
for (int i = 0; i < GGML_MAX_SRC; ++i) {
|
||||
if (t->src[i]) {
|
||||
expand(g, t->src[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -43,6 +43,8 @@
|
||||
"-DLLAMA_METAL=ON"
|
||||
]);
|
||||
installPhase = ''
|
||||
runHook preInstall
|
||||
|
||||
mkdir -p $out/bin
|
||||
mv bin/* $out/bin/
|
||||
mv $out/bin/main $out/bin/llama
|
||||
@@ -51,6 +53,8 @@
|
||||
echo "#!${llama-python}/bin/python" > $out/bin/convert.py
|
||||
cat ${./convert.py} >> $out/bin/convert.py
|
||||
chmod +x $out/bin/convert.py
|
||||
|
||||
runHook postInstall
|
||||
'';
|
||||
meta.mainProgram = "llama";
|
||||
};
|
||||
|
||||
634
ggml-cuda.cu
634
ggml-cuda.cu
File diff suppressed because it is too large
Load Diff
52
ggml-metal.m
52
ggml-metal.m
@@ -393,8 +393,8 @@ void ggml_metal_graph_compute(
|
||||
for (int i = node_start; i < node_end; ++i) {
|
||||
metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
|
||||
|
||||
struct ggml_tensor * src0 = gf->nodes[i]->src0;
|
||||
struct ggml_tensor * src1 = gf->nodes[i]->src1;
|
||||
struct ggml_tensor * src0 = gf->nodes[i]->src[0];
|
||||
struct ggml_tensor * src1 = gf->nodes[i]->src[1];
|
||||
struct ggml_tensor * dst = gf->nodes[i];
|
||||
|
||||
const int64_t ne00 = src0 ? src0->ne[0] : 0;
|
||||
@@ -740,8 +740,7 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
|
||||
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
|
||||
[encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q2_K ||
|
||||
src0t == GGML_TYPE_Q3_K ||
|
||||
@@ -882,28 +881,35 @@ void ggml_metal_graph_compute(
|
||||
|
||||
const int n_past = ((int32_t *)(src1->data))[0];
|
||||
|
||||
float freq_base;
|
||||
float freq_scale;
|
||||
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_rope];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
|
||||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&n_past length:sizeof( int) atIndex:18];
|
||||
[encoder setBytes:&n_dims length:sizeof( int) atIndex:19];
|
||||
[encoder setBytes:&mode length:sizeof( int) atIndex:20];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
|
||||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&n_past length:sizeof( int) atIndex:18];
|
||||
[encoder setBytes:&n_dims length:sizeof( int) atIndex:19];
|
||||
[encoder setBytes:&mode length:sizeof( int) atIndex:20];
|
||||
[encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
|
||||
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
|
||||
243
ggml-metal.metal
243
ggml-metal.metal
@@ -365,6 +365,10 @@ kernel void kernel_rms_norm(
|
||||
}
|
||||
}
|
||||
|
||||
// putting them in the kernel cause a significant performance penalty
|
||||
#define N_DST 4 // each SIMD group works on 4 rows
|
||||
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
|
||||
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
|
||||
kernel void kernel_mul_mat_q4_0_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
@@ -372,64 +376,83 @@ kernel void kernel_mul_mat_q4_0_f32(
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne0,
|
||||
threadgroup float * sum [[threadgroup(0)]],
|
||||
constant int64_t & ne01[[buffer(4)]],
|
||||
uint2 tgpig[[threadgroup_position_in_grid]],
|
||||
uint2 tpitg[[thread_position_in_threadgroup]],
|
||||
uint2 tptg[[threads_per_threadgroup]]) {
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
const int nb = ne00/QK4_0;
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t r1 = tgpig.y;
|
||||
|
||||
device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb;
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
device const block_q4_0 * x = (device const block_q4_0 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
|
||||
device const float * y = (device const float *) src1 + r1*ne10;
|
||||
block_q4_0 qb_curr, qb_next;
|
||||
float4 y_curr[8]; // src1 vector cache
|
||||
float sumf[N_DST]={0.f}, all_sum;
|
||||
thread float * yl=(thread float *)y_curr;
|
||||
|
||||
const int nth = tptg.x*tptg.y;
|
||||
const int ith = tptg.y*tpitg.x + tpitg.y;
|
||||
|
||||
const int ix = tpitg.y/4; // 0 or 1
|
||||
const int iy = tpitg.y - 4*ix; // 0...3
|
||||
|
||||
const int first = 4 * iy;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
|
||||
|
||||
const float d = (float)x[i].d;
|
||||
|
||||
device const uint8_t * xl = x[i].qs + first;
|
||||
device const float * yl = y + i * QK4_0 + first;
|
||||
|
||||
float2 acc = {0.0f, 0.0f};
|
||||
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
|
||||
acc[0] += yl[j] * (xl[j] & 0xF) + yl[j+16] * (xl[j] >> 4);
|
||||
acc[1] += yl[j] + yl[j+16];
|
||||
// bootstrap
|
||||
qb_curr = x[tiisg];
|
||||
// each thread in a SIMD group deals with 1 block.
|
||||
for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
|
||||
|
||||
float sumy = 0;
|
||||
for (int i = 0; i < QK4_0 / 4; i++) {
|
||||
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i));
|
||||
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
|
||||
}
|
||||
sumy *= (-8.f);
|
||||
|
||||
sumf += d * (acc[0] - 8.f*acc[1]);
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
// prefetch next x block
|
||||
qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH];
|
||||
|
||||
// calculate
|
||||
float d = qb_curr.d;
|
||||
float acc = sumy;
|
||||
for (int i = 0; i < 16; i++) {
|
||||
acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
|
||||
}
|
||||
sumf[row] += d * acc;
|
||||
qb_curr = qb_next;
|
||||
}
|
||||
}
|
||||
|
||||
sum[ith] = sumf;
|
||||
if (nb % N_SIMDWIDTH == 0) {
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
|
||||
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
//
|
||||
// Accumulate the sum from all threads in the threadgroup
|
||||
//
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith%4 == 0) {
|
||||
sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith%16 == 0) {
|
||||
sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith == 0) {
|
||||
for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
|
||||
dst[r1*ne0 + r0] = sum[0];
|
||||
float sumy = 0;
|
||||
for (int i = 0; i < QK4_0 / 4; i++) {
|
||||
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i));
|
||||
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
|
||||
}
|
||||
sumy *= (-8.f);
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
// prefetch next x block
|
||||
qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH];
|
||||
|
||||
// calculate
|
||||
float d = qb_curr.d;
|
||||
float acc = sumy;
|
||||
for (int i = 0; i < 16; i++) {
|
||||
acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
|
||||
}
|
||||
if (tiisg < nb % N_SIMDWIDTH) {
|
||||
sumf[row] += d * acc;
|
||||
}
|
||||
qb_curr = qb_next;
|
||||
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
|
||||
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -440,65 +463,83 @@ kernel void kernel_mul_mat_q4_1_f32(
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne0,
|
||||
threadgroup float * sum [[threadgroup(0)]],
|
||||
constant int64_t & ne01[[buffer(4)]],
|
||||
uint2 tgpig[[threadgroup_position_in_grid]],
|
||||
uint2 tpitg[[thread_position_in_threadgroup]],
|
||||
uint2 tptg[[threads_per_threadgroup]]) {
|
||||
const int nb = ne00/QK4_1;
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t r1 = tgpig.y;
|
||||
|
||||
device const block_q4_1 * x = (device const block_q4_1 *) src0 + r0*nb;
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
const int nb = ne00/QK4_0;
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
device const block_q4_1 * x = (device const block_q4_1 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
|
||||
device const float * y = (device const float *) src1 + r1*ne10;
|
||||
block_q4_1 qb_curr, qb_next;
|
||||
float4 y_curr[8]; // src1 vector cache
|
||||
float sumf[N_DST]={0.f}, all_sum;
|
||||
thread float * yl=(thread float *)y_curr;
|
||||
|
||||
const uint nth = tptg.x*tptg.y;
|
||||
const uint ith = tptg.y*tpitg.x + tpitg.y;
|
||||
|
||||
const int ix = tpitg.y/4; // 0 or 1
|
||||
const int iy = tpitg.y - 4*ix; // 0...3
|
||||
|
||||
const int first = 4 * iy;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
|
||||
|
||||
const float d = (float)x[i].d;
|
||||
const float m = (float)x[i].m;
|
||||
|
||||
device const uint8_t * xl = x[i].qs + first;
|
||||
device const float * yl = y + i * QK4_1 + first;
|
||||
|
||||
float2 acc = {0.0f, 0.0f};
|
||||
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
|
||||
acc[0] += yl[j+ 0] * (d * (xl[j] & 0xF) + m);
|
||||
acc[1] += yl[j+16] * (d * (xl[j] >> 4) + m);
|
||||
// bootstrap
|
||||
qb_curr = x[tiisg];
|
||||
// each thread in a SIMD group deals with 1 block.
|
||||
for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
|
||||
|
||||
float sumy = 0;
|
||||
for (int i = 0; i < QK4_0 / 4; i++) {
|
||||
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i));
|
||||
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
|
||||
}
|
||||
|
||||
sumf += acc[0] + acc[1];
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
// prefetch next x block
|
||||
qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH];
|
||||
|
||||
// calculate
|
||||
const float d = qb_curr.d;
|
||||
const float m = qb_curr.m;
|
||||
float acc = 0.f;
|
||||
for (int i = 0; i < 16; i++) {
|
||||
acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
|
||||
}
|
||||
sumf[row] += d * acc + m * sumy;
|
||||
qb_curr = qb_next;
|
||||
}
|
||||
}
|
||||
|
||||
sum[ith] = sumf;
|
||||
if (nb % N_SIMDWIDTH == 0) {
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
|
||||
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
//
|
||||
// Accumulate the sum from all threads in the threadgroup
|
||||
//
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith%4 == 0) {
|
||||
sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith%16 == 0) {
|
||||
sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith == 0) {
|
||||
for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
|
||||
dst[r1*ne0 + r0] = sum[0];
|
||||
float sumy = 0;
|
||||
for (int i = 0; i < QK4_0 / 4; i++) {
|
||||
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i));
|
||||
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
// prefetch next x block
|
||||
qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH];
|
||||
|
||||
// calculate
|
||||
const float d = qb_curr.d;
|
||||
const float m = qb_curr.m;
|
||||
float acc = 0.f;
|
||||
for (int i = 0; i < 16; i++) {
|
||||
acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
|
||||
}
|
||||
if (tiisg < nb % N_SIMDWIDTH) {
|
||||
sumf[row] += d * acc + m * sumy;
|
||||
}
|
||||
qb_curr = qb_next;
|
||||
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
|
||||
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -615,17 +656,19 @@ kernel void kernel_rope(
|
||||
constant int & n_past,
|
||||
constant int & n_dims,
|
||||
constant int & mode,
|
||||
constant float & freq_base,
|
||||
constant float & freq_scale,
|
||||
uint3 tpig[[thread_position_in_grid]]) {
|
||||
const int64_t i3 = tpig[2];
|
||||
const int64_t i2 = tpig[1];
|
||||
const int64_t i1 = tpig[0];
|
||||
|
||||
const bool is_neox = mode & 2;
|
||||
const float theta_scale = pow(10000.0, -2.0f/n_dims);
|
||||
const float theta_scale = pow(freq_base, -2.0f/n_dims);
|
||||
|
||||
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
|
||||
|
||||
float theta = (float)p;
|
||||
float theta = freq_scale * (float)p;
|
||||
|
||||
if (!is_neox) {
|
||||
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
||||
|
||||
@@ -175,11 +175,11 @@ void ggml_mpi_graph_compute_pre(
|
||||
// attach the input data to all nodes that need it
|
||||
// TODO: not great - should be able to do this without modifying the compute graph (see next TODO below)
|
||||
for (int i = idx_l0; i < idx_l1; i++) {
|
||||
if (gf->nodes[i]->src0 == gf->nodes[idx_l0]) {
|
||||
gf->nodes[i]->src0 = inp0;
|
||||
if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) {
|
||||
gf->nodes[i]->src[0] = inp0;
|
||||
}
|
||||
if (gf->nodes[i]->src1 == gf->nodes[idx_l0]) {
|
||||
gf->nodes[i]->src1 = inp0;
|
||||
if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) {
|
||||
gf->nodes[i]->src[1] = inp0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
59
ggml.h
59
ggml.h
@@ -132,10 +132,10 @@
|
||||
// {
|
||||
// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
|
||||
//
|
||||
// // a[1, 2] = 1.0f;
|
||||
// // a[2, 1] = 1.0f;
|
||||
// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
|
||||
//
|
||||
// // a[2, 0] = 2.0f;
|
||||
// // a[0, 2] = 2.0f;
|
||||
// *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
|
||||
//
|
||||
// ...
|
||||
@@ -197,12 +197,17 @@
|
||||
#define GGML_MAX_NODES 4096
|
||||
#define GGML_MAX_PARAMS 256
|
||||
#define GGML_MAX_CONTEXTS 64
|
||||
#define GGML_MAX_OPT 4
|
||||
#define GGML_MAX_SRC 6
|
||||
#define GGML_MAX_NAME 48
|
||||
#define GGML_DEFAULT_N_THREADS 4
|
||||
|
||||
|
||||
#define GGML_EXIT_SUCCESS 0
|
||||
#define GGML_EXIT_ABORTED 1
|
||||
|
||||
#define GGML_UNUSED(x) (void)(x)
|
||||
|
||||
|
||||
#define GGML_ASSERT(x) \
|
||||
do { \
|
||||
if (!(x)) { \
|
||||
@@ -363,6 +368,8 @@ extern "C" {
|
||||
GGML_OP_CLAMP,
|
||||
GGML_OP_CONV_1D,
|
||||
GGML_OP_CONV_2D,
|
||||
GGML_OP_POOL_1D,
|
||||
GGML_OP_POOL_2D,
|
||||
|
||||
GGML_OP_FLASH_ATTN,
|
||||
GGML_OP_FLASH_FF,
|
||||
@@ -414,9 +421,7 @@ extern "C" {
|
||||
bool is_param;
|
||||
|
||||
struct ggml_tensor * grad;
|
||||
struct ggml_tensor * src0;
|
||||
struct ggml_tensor * src1;
|
||||
struct ggml_tensor * opt[GGML_MAX_OPT];
|
||||
struct ggml_tensor * src[GGML_MAX_SRC];
|
||||
|
||||
// performance
|
||||
int perf_runs;
|
||||
@@ -444,6 +449,10 @@ extern "C" {
|
||||
|
||||
// the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
|
||||
int n_tasks[GGML_MAX_NODES];
|
||||
|
||||
// abort ggml_graph_compute when true
|
||||
bool (*abort_callback)(void * data);
|
||||
void * abort_callback_data;
|
||||
};
|
||||
|
||||
// computation graph
|
||||
@@ -1112,6 +1121,17 @@ extern "C" {
|
||||
int mode,
|
||||
int n_ctx);
|
||||
|
||||
// custom RoPE, in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
int n_dims,
|
||||
int mode,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
int n_ctx);
|
||||
|
||||
// rotary position embedding backward, i.e compute dx from dy
|
||||
// a - dy
|
||||
GGML_API struct ggml_tensor * ggml_rope_back(
|
||||
@@ -1166,6 +1186,31 @@ extern "C" {
|
||||
int s,
|
||||
int d);
|
||||
|
||||
enum ggml_op_pool {
|
||||
GGML_OP_POOL_MAX,
|
||||
GGML_OP_POOL_AVG,
|
||||
GGML_OP_POOL_COUNT,
|
||||
};
|
||||
|
||||
GGML_API struct ggml_tensor* ggml_pool_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
enum ggml_op_pool op,
|
||||
int k0, // kernel size
|
||||
int s0, // stride
|
||||
int p0); // padding
|
||||
|
||||
GGML_API struct ggml_tensor* ggml_pool_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
enum ggml_op_pool op,
|
||||
int k0,
|
||||
int k1,
|
||||
int s0,
|
||||
int s1,
|
||||
int p0,
|
||||
int p1);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_flash_attn(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * q,
|
||||
@@ -1305,7 +1350,7 @@ extern "C" {
|
||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
||||
GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||
GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
||||
|
||||
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
||||
|
||||
@@ -15,6 +15,14 @@
|
||||
#define K_SCALE_SIZE 12
|
||||
#endif
|
||||
|
||||
#ifndef static_assert
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
||||
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
||||
#else
|
||||
#define static_assert(cond, msg) struct global_scope_noop_trick
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//
|
||||
// Super-block quantization structures
|
||||
//
|
||||
|
||||
@@ -175,13 +175,13 @@ struct llama_mmap {
|
||||
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
|
||||
size = file->size;
|
||||
int fd = fileno(file->fp);
|
||||
int flags = MAP_PRIVATE;
|
||||
int flags = MAP_SHARED;
|
||||
// prefetch/readahead impairs performance on NUMA systems
|
||||
if (numa) { prefetch = 0; }
|
||||
#ifdef __linux__
|
||||
if (prefetch) { flags |= MAP_POPULATE; }
|
||||
#endif
|
||||
addr = mmap(NULL, file->size, PROT_READ | PROT_WRITE, flags, fd, 0);
|
||||
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
||||
if (addr == MAP_FAILED) {
|
||||
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
|
||||
}
|
||||
@@ -223,7 +223,7 @@ struct llama_mmap {
|
||||
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
|
||||
}
|
||||
|
||||
addr = MapViewOfFile(hMapping, FILE_MAP_COPY, 0, 0, 0);
|
||||
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
||||
error = GetLastError();
|
||||
CloseHandle(hMapping);
|
||||
|
||||
|
||||
204
llama.cpp
204
llama.cpp
@@ -101,14 +101,15 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
|
||||
// memory sizes
|
||||
//
|
||||
|
||||
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
||||
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_3B, 256ull * MB },
|
||||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
{ MODEL_65B, 1024ull * MB },
|
||||
/* empirical scaling, still a guess */
|
||||
{ MODEL_3B, ((size_t) n_ctx / 16ull + 128ull) * MB },
|
||||
{ MODEL_7B, ((size_t) n_ctx / 16ull + 256ull) * MB },
|
||||
{ MODEL_13B, ((size_t) n_ctx / 12ull + 256ull) * MB },
|
||||
{ MODEL_30B, ((size_t) n_ctx / 10ull + 256ull) * MB },
|
||||
{ MODEL_65B, ((size_t) n_ctx / 8ull + 512ull) * MB },
|
||||
};
|
||||
return k_sizes;
|
||||
}
|
||||
@@ -140,14 +141,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
||||
|
||||
// this is mostly needed for temporary mul_mat buffers to dequantize the data
|
||||
// not actually needed if BLAS is disabled
|
||||
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
||||
static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_3B, 512ull * MB },
|
||||
{ MODEL_7B, 768ull * MB },
|
||||
{ MODEL_13B, 1024ull * MB },
|
||||
{ MODEL_30B, 1280ull * MB },
|
||||
{ MODEL_65B, 1536ull * MB },
|
||||
{ MODEL_3B, ((size_t) n_ctx / 256ull + 512ull) * MB },
|
||||
{ MODEL_7B, ((size_t) n_ctx / 256ull + 768ull) * MB },
|
||||
{ MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
|
||||
{ MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
|
||||
{ MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
|
||||
};
|
||||
return k_sizes;
|
||||
}
|
||||
@@ -189,6 +190,10 @@ struct llama_hparams {
|
||||
uint32_t n_head = 32;
|
||||
uint32_t n_layer = 32;
|
||||
uint32_t n_rot = 64;
|
||||
|
||||
float rope_freq_base = 10000.0f;
|
||||
float rope_freq_scale = 1.0f;
|
||||
|
||||
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
|
||||
|
||||
bool operator!=(const llama_hparams & other) const {
|
||||
@@ -303,7 +308,7 @@ struct llama_model {
|
||||
};
|
||||
|
||||
struct llama_context {
|
||||
llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
|
||||
llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
|
||||
#ifdef GGML_USE_METAL
|
||||
~llama_context() {
|
||||
if (ctx_metal) {
|
||||
@@ -324,7 +329,6 @@ struct llama_context {
|
||||
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
||||
|
||||
const llama_model & model;
|
||||
const llama_vocab & vocab;
|
||||
|
||||
bool model_owner = false;
|
||||
|
||||
@@ -648,7 +652,7 @@ struct llama_model_loader {
|
||||
*ctx_size_p = *mmapped_size_p = 0;
|
||||
for (const llama_load_tensor & lt : tensors_map.tensors) {
|
||||
*ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
|
||||
*(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
|
||||
*(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -844,6 +848,8 @@ struct llama_context_params llama_context_default_params() {
|
||||
/*.gpu_layers =*/ 0,
|
||||
/*.main_gpu =*/ 0,
|
||||
/*.tensor_split =*/ {0},
|
||||
/*.rope_freq_base =*/ 10000.0f,
|
||||
/*.rope_freq_scale =*/ 1.0f,
|
||||
/*.progress_callback =*/ nullptr,
|
||||
/*.progress_callback_user_data =*/ nullptr,
|
||||
/*.low_vram =*/ false,
|
||||
@@ -967,6 +973,8 @@ static void llama_model_load_internal(
|
||||
int n_gpu_layers,
|
||||
int main_gpu,
|
||||
const float * tensor_split,
|
||||
float rope_freq_base,
|
||||
float rope_freq_scale,
|
||||
bool low_vram,
|
||||
ggml_type memory_type,
|
||||
bool use_mmap,
|
||||
@@ -1001,22 +1009,27 @@ static void llama_model_load_internal(
|
||||
}
|
||||
|
||||
hparams.n_ctx = n_ctx;
|
||||
|
||||
hparams.rope_freq_base = rope_freq_base;
|
||||
hparams.rope_freq_scale = rope_freq_scale;
|
||||
}
|
||||
|
||||
const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
|
||||
|
||||
{
|
||||
fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
|
||||
fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
|
||||
fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
|
||||
fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
|
||||
fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
|
||||
fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
|
||||
fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
|
||||
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
|
||||
fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
|
||||
fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
|
||||
fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
|
||||
fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
|
||||
fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
|
||||
fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
|
||||
fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
|
||||
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
|
||||
fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
|
||||
fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
|
||||
fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
|
||||
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
|
||||
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
||||
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
|
||||
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
||||
}
|
||||
|
||||
if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
|
||||
@@ -1165,9 +1178,9 @@ static void llama_model_load_internal(
|
||||
const size_t mem_required =
|
||||
ctx_size +
|
||||
mmapped_size - vram_weights + // weights in VRAM not in memory
|
||||
MEM_REQ_SCRATCH0().at(model.type) +
|
||||
MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
|
||||
MEM_REQ_SCRATCH1().at(model.type) +
|
||||
MEM_REQ_EVAL().at (model.type);
|
||||
MEM_REQ_EVAL(hparams.n_ctx).at(model.type);
|
||||
|
||||
// this is the memory required by one llama_state
|
||||
const size_t mem_required_state =
|
||||
@@ -1271,6 +1284,8 @@ static bool llama_model_load(
|
||||
int n_gpu_layers,
|
||||
int main_gpu,
|
||||
float * tensor_split,
|
||||
float rope_freq_base,
|
||||
float rope_freq_scale,
|
||||
bool low_vram,
|
||||
ggml_type memory_type,
|
||||
bool use_mmap,
|
||||
@@ -1279,7 +1294,7 @@ static bool llama_model_load(
|
||||
llama_progress_callback progress_callback,
|
||||
void *progress_callback_user_data) {
|
||||
try {
|
||||
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
|
||||
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
|
||||
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
|
||||
return true;
|
||||
} catch (const std::exception & err) {
|
||||
@@ -1331,6 +1346,9 @@ static bool llama_eval_internal(
|
||||
const int n_rot = hparams.n_embd/hparams.n_head;
|
||||
const int n_gpu_layers = model.n_gpu_layers;
|
||||
|
||||
const float freq_base = hparams.rope_freq_base;
|
||||
const float freq_scale = hparams.rope_freq_scale;
|
||||
|
||||
auto & mem_per_token = lctx.mem_per_token;
|
||||
auto & buf_compute = lctx.buf_compute;
|
||||
|
||||
@@ -1428,11 +1446,11 @@ static bool llama_eval_internal(
|
||||
offload_func_kq(tmpq);
|
||||
ggml_set_name(tmpq, "tmpq");
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
|
||||
struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
|
||||
offload_func_kq(Kcur);
|
||||
ggml_set_name(Kcur, "Kcur");
|
||||
|
||||
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
|
||||
struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
|
||||
offload_func_kq(Qcur);
|
||||
ggml_set_name(Qcur, "Qcur");
|
||||
|
||||
@@ -2167,6 +2185,62 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
|
||||
}
|
||||
}
|
||||
|
||||
static void llama_log_softmax(float * array, size_t size) {
|
||||
float max_l = *std::max_element(array, array + size);
|
||||
float sum = 0.f;
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
float p = expf(array[i] - max_l);
|
||||
sum += p;
|
||||
array[i] = p;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
array[i] = logf(array[i] / sum);
|
||||
}
|
||||
}
|
||||
|
||||
void llama_sample_classifier_free_guidance(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
struct llama_context * guidance_ctx,
|
||||
float scale,
|
||||
float smooth_factor) {
|
||||
int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
|
||||
|
||||
assert(ctx);
|
||||
auto n_vocab = llama_n_vocab(ctx);
|
||||
assert(n_vocab == (int)candidates->size);
|
||||
assert(!candidates->sorted);
|
||||
|
||||
std::vector<float> logits_base;
|
||||
logits_base.reserve(candidates->size);
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
logits_base.push_back(candidates->data[i].logit);
|
||||
}
|
||||
llama_log_softmax(logits_base.data(), candidates->size);
|
||||
|
||||
float* logits_guidance = llama_get_logits(guidance_ctx);
|
||||
llama_log_softmax(logits_guidance, n_vocab);
|
||||
|
||||
for (int i = 0; i < n_vocab; ++i) {
|
||||
float logit_guidance = logits_guidance[i];
|
||||
float logit_base = logits_base[i];
|
||||
logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance;
|
||||
}
|
||||
|
||||
llama_log_softmax(logits_guidance, n_vocab);
|
||||
|
||||
for (int i = 0; i < n_vocab; ++i) {
|
||||
float logit_base = logits_base[i];
|
||||
float logit_guidance = logits_guidance[i];
|
||||
|
||||
candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
|
||||
}
|
||||
|
||||
if (ctx) {
|
||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
}
|
||||
|
||||
llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
|
||||
assert(ctx);
|
||||
@@ -2619,8 +2693,9 @@ struct llama_model * llama_load_model_from_file(
|
||||
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
||||
|
||||
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
|
||||
params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
|
||||
params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
|
||||
params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
|
||||
memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
|
||||
params.progress_callback_user_data)) {
|
||||
delete model;
|
||||
fprintf(stderr, "%s: failed to load model\n", __func__);
|
||||
return nullptr;
|
||||
@@ -2641,7 +2716,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
llama_context * ctx = new llama_context(*model, model->vocab);
|
||||
llama_context * ctx = new llama_context(*model);
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
@@ -2695,9 +2770,9 @@ struct llama_context * llama_new_context_with_model(
|
||||
ctx->embedding.resize(hparams.n_embd);
|
||||
}
|
||||
|
||||
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
|
||||
ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));
|
||||
|
||||
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
|
||||
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
|
||||
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
|
||||
}
|
||||
|
||||
@@ -3479,13 +3554,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int llama_tokenize(
|
||||
struct llama_context * ctx,
|
||||
int llama_tokenize_with_model(
|
||||
const struct llama_model * model,
|
||||
const char * text,
|
||||
llama_token * tokens,
|
||||
int n_max_tokens,
|
||||
bool add_bos) {
|
||||
auto res = llama_tokenize(ctx->vocab, text, add_bos);
|
||||
auto res = llama_tokenize(model->vocab, text, add_bos);
|
||||
|
||||
if (n_max_tokens < (int) res.size()) {
|
||||
fprintf(stderr, "%s: too many tokens\n", __func__);
|
||||
@@ -3499,8 +3574,29 @@ int llama_tokenize(
|
||||
return res.size();
|
||||
}
|
||||
|
||||
int llama_tokenize(
|
||||
struct llama_context * ctx,
|
||||
const char * text,
|
||||
llama_token * tokens,
|
||||
int n_max_tokens,
|
||||
bool add_bos) {
|
||||
return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
|
||||
}
|
||||
|
||||
int llama_n_vocab_from_model(const struct llama_model * model) {
|
||||
return model->vocab.id_to_token.size();
|
||||
}
|
||||
|
||||
int llama_n_ctx_from_model(const struct llama_model * model) {
|
||||
return model->hparams.n_ctx;
|
||||
}
|
||||
|
||||
int llama_n_embd_from_model(const struct llama_model * model) {
|
||||
return model->hparams.n_embd;
|
||||
}
|
||||
|
||||
int llama_n_vocab(const struct llama_context * ctx) {
|
||||
return ctx->vocab.id_to_token.size();
|
||||
return ctx->model.vocab.id_to_token.size();
|
||||
}
|
||||
|
||||
int llama_n_ctx(const struct llama_context * ctx) {
|
||||
@@ -3511,17 +3607,25 @@ int llama_n_embd(const struct llama_context * ctx) {
|
||||
return ctx->model.hparams.n_embd;
|
||||
}
|
||||
|
||||
int llama_get_vocab_from_model(
|
||||
const struct llama_model * model,
|
||||
const char * * strings,
|
||||
float * scores,
|
||||
int capacity) {
|
||||
int n = std::min(capacity, (int) model->vocab.id_to_token.size());
|
||||
for (int i = 0; i<n; ++i) {
|
||||
strings[i] = model->vocab.id_to_token[i].tok.c_str();
|
||||
scores[i] = model->vocab.id_to_token[i].score;
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
int llama_get_vocab(
|
||||
const struct llama_context * ctx,
|
||||
const char * * strings,
|
||||
float * scores,
|
||||
int capacity) {
|
||||
int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
|
||||
for (int i = 0; i<n; ++i) {
|
||||
strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
|
||||
scores[i] = ctx->vocab.id_to_token[i].score;
|
||||
}
|
||||
return n;
|
||||
return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
|
||||
}
|
||||
|
||||
float * llama_get_logits(struct llama_context * ctx) {
|
||||
@@ -3532,12 +3636,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
|
||||
return ctx->embedding.data();
|
||||
}
|
||||
|
||||
const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
|
||||
if (token >= llama_n_vocab(ctx)) {
|
||||
const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
|
||||
if (token >= llama_n_vocab_from_model(model)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return ctx->vocab.id_to_token[token].tok.c_str();
|
||||
return model->vocab.id_to_token[token].tok.c_str();
|
||||
}
|
||||
|
||||
const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
|
||||
return llama_token_to_str_with_model(&ctx->model, token);
|
||||
}
|
||||
|
||||
llama_token llama_token_bos() {
|
||||
|
||||
42
llama.h
42
llama.h
@@ -89,6 +89,11 @@ extern "C" {
|
||||
int32_t n_gpu_layers; // number of layers to store in VRAM
|
||||
int32_t main_gpu; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
|
||||
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
||||
float rope_freq_base; // RoPE base frequency
|
||||
float rope_freq_scale; // RoPE frequency scaling factor
|
||||
|
||||
// called with a progress value between 0 and 1, pass NULL to disable
|
||||
llama_progress_callback progress_callback;
|
||||
// context pointer passed to the progress callback
|
||||
@@ -270,10 +275,21 @@ extern "C" {
|
||||
int n_max_tokens,
|
||||
bool add_bos);
|
||||
|
||||
LLAMA_API int llama_tokenize_with_model(
|
||||
const struct llama_model * model,
|
||||
const char * text,
|
||||
llama_token * tokens,
|
||||
int n_max_tokens,
|
||||
bool add_bos);
|
||||
|
||||
LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
|
||||
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
|
||||
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
|
||||
LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
|
||||
LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
|
||||
|
||||
// Get the vocabulary as output parameters.
|
||||
// Returns number of results.
|
||||
LLAMA_API int llama_get_vocab(
|
||||
@@ -282,6 +298,12 @@ extern "C" {
|
||||
float * scores,
|
||||
int capacity);
|
||||
|
||||
LLAMA_API int llama_get_vocab_from_model(
|
||||
const struct llama_model * model,
|
||||
const char * * strings,
|
||||
float * scores,
|
||||
int capacity);
|
||||
|
||||
// Token logits obtained from the last call to llama_eval()
|
||||
// The logits for the last token are stored in the last row
|
||||
// Can be mutated in order to change the probabilities of the next token
|
||||
@@ -294,7 +316,13 @@ extern "C" {
|
||||
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
||||
|
||||
// Token Id -> String. Uses the vocabulary in the provided context
|
||||
LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
|
||||
LLAMA_API const char * llama_token_to_str(
|
||||
const struct llama_context * ctx,
|
||||
llama_token token);
|
||||
|
||||
LLAMA_API const char * llama_token_to_str_with_model(
|
||||
const struct llama_model * model,
|
||||
llama_token token);
|
||||
|
||||
// Special tokens
|
||||
LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
|
||||
@@ -309,6 +337,18 @@ extern "C" {
|
||||
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
|
||||
LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
|
||||
|
||||
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
|
||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
|
||||
/// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
|
||||
/// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
|
||||
/// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
|
||||
LLAMA_API void llama_sample_classifier_free_guidance(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
struct llama_context * guidance_ctx,
|
||||
float scale,
|
||||
float smooth_factor);
|
||||
|
||||
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
|
||||
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
|
||||
|
||||
|
||||
2
scripts/verify-checksum-models.py
Normal file → Executable file
2
scripts/verify-checksum-models.py
Normal file → Executable file
@@ -1,3 +1,5 @@
|
||||
#!/bin/env python3
|
||||
|
||||
import os
|
||||
import hashlib
|
||||
|
||||
|
||||
@@ -10,7 +10,9 @@
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic ignored "-Wdouble-promotion"
|
||||
#endif
|
||||
|
||||
#define MAX_NARGS 3
|
||||
|
||||
|
||||
@@ -7,7 +7,9 @@
|
||||
|
||||
#define MAX_NARGS 2
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic ignored "-Wdouble-promotion"
|
||||
#endif
|
||||
|
||||
//
|
||||
// logging
|
||||
|
||||
Reference in New Issue
Block a user