mirror of https://github.com/ggerganov/llama.cpp.git
synced 2026-02-26 14:23:22 +02:00

Compare commits: master-574 ... master-7f9 (13 commits)
Commits in this range (SHA1):
7f9753fa12
cfa0750bc9
9d23589d63
0be54f75a6
181e8d9755
d9779021bd
d38e451578
eaa6ca5a61
aa777abbb7
c824d2e368
b853d45601
9225baef71
a84ab1da8d
.gitignore (vendored, 3 changed lines)

@@ -1,5 +1,6 @@
*.o
*.a
*.so
.DS_Store
.build/
.cache/

@@ -39,8 +40,8 @@ models/*
/vdot
/server
/Pipfile
/embd-input-test
/libllama.so

build-info.h
arm_neon.h
compile_commands.json
Makefile (11 changed lines)

@@ -1,5 +1,5 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple libembdinput.so embd-input-test

ifdef LLAMA_BUILD_SERVER
    BUILD_TARGETS += server

@@ -272,7 +272,7 @@ libllama.so: llama.o ggml.o $(OBJS)
    $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

clean:
    rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch build-info.h
    rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch embd-input-test build-info.h

#
# Examples

@@ -305,6 +305,13 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
    $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)

libembdinput.so: examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
    $(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)

embd-input-test: libembdinput.so examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
    $(CXX) $(CXXFLAGS) $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput

train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
    $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
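For context, the two new Makefile targets above can be built directly from the repository root. The following commands are an illustrative sketch (target names taken from the Makefile hunk, everything else assumed):

```
# build the embedding-input shared library and its test binary
make libembdinput.so embd-input-test

# the server binary is still opt-in via the existing flag
LLAMA_BUILD_SERVER=1 make
```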
@@ -93,6 +93,7 @@ as the main playground for developing new features for the [ggml](https://github
- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)

**UI:**

@@ -687,6 +688,8 @@ GGML_OPENCL_DEVICE=0
export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
```

(Note: some Android devices, like the Zenfone 8, need the following command instead - "export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH". Source: https://www.reddit.com/r/termux/comments/kc3ynp/opencl_working_in_termux_more_in_comments/ )

For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.

Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.
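A sketch of the kind of helper script suggested above; the model path and prompt are placeholders rather than values from the original text:

```
#!/bin/sh
# rerun the OpenCL setup and launch main (illustrative only)
export GGML_OPENCL_DEVICE=0
export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH   # /system/vendor/lib64 on e.g. the Zenfone 8
./main -m ./models/your-model.bin -p "your prompt here"
```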
@@ -113,6 +113,10 @@ with open(output_path, "wb") as fout:

    write_file_header(fout, params)
    for k, v in model.items():
        if k.endswith(".default.weight"):
            k = k.replace(".default.weight", ".weight")
        if k in ["llama_proj.weight", "llama_proj.bias"]:
            continue
        if k.endswith("lora_A.weight"):
            if v.dtype != torch.float16 and v.dtype != torch.float32:
                v = v.float()

@@ -120,7 +124,7 @@ with open(output_path, "wb") as fout:
        else:
            v = v.float()

        t = v.numpy()
        t = v.detach().numpy()
        tname = translate_tensor_name(k)
        print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
        write_tensor_header(fout, tname, t.shape, t.dtype)
@@ -39,6 +39,7 @@ else()
    add_subdirectory(baby-llama)
    add_subdirectory(train-text-from-scratch)
    add_subdirectory(simple)
    add_subdirectory(embd-input)
    if (LLAMA_METAL)
        add_subdirectory(metal)
    endif()
@@ -566,8 +566,8 @@ struct ggml_tensor * forward(
    // wk shape [n_embd, n_embd, 1, 1]
    // Qcur shape [n_embd/n_head, n_head, N, 1]
    // Kcur shape [n_embd/n_head, n_head, N, 1]
    struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
    struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
    struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
    struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);

    // store key and value to memory
    {

@@ -823,8 +823,8 @@ struct ggml_tensor * forward_batch(
    // wk shape [n_embd, n_embd, 1, 1]
    // Qcur shape [n_embd/n_head, n_head, N, n_batch]
    // Kcur shape [n_embd/n_head, n_head, N, n_batch]
    struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
    struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
    struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
    struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
    assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
    assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);

@@ -1116,7 +1116,7 @@ struct ggml_tensor * forward_lora(
                model->layers[il].wqb,
                cur)),
            n_embd/n_head, n_head, N),
        n_past, n_rot, 0);
        n_past, n_rot, 0, 0);
    struct ggml_tensor * Kcur = ggml_rope(ctx0,
        ggml_reshape_3d(ctx0,
            ggml_mul_mat(ctx0,

@@ -1125,7 +1125,7 @@ struct ggml_tensor * forward_lora(
                model->layers[il].wkb,
                cur)),
            n_embd/n_head, n_head, N),
        n_past, n_rot, 0);
        n_past, n_rot, 0, 0);

    // store key and value to memory
    {
@@ -343,6 +343,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        params.use_mmap = false;
    } else if (arg == "--mtest") {
        params.mem_test = true;
    } else if (arg == "--numa") {
        params.numa = true;
    } else if (arg == "--export") {
        params.export_cgraph = true;
    } else if (arg == "--verbose-prompt") {

@@ -414,13 +416,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        exit(1);
    }

#ifdef GGML_USE_CUBLAS
    if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
        fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
        exit(1);
    }
#endif // GGML_USE_CUBLAS

    if (escape_prompt) {
        process_escapes(params.prompt);
    }

@@ -488,6 +483,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    if (llama_mmap_supported()) {
        fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
    }
    fprintf(stderr, " --numa attempt optimizations that help on some NUMA systems\n");
    fprintf(stderr, " if run without this previously, it is recommended to drop the system page cache before using this\n");
    fprintf(stderr, " see https://github.com/ggerganov/llama.cpp/issues/1437\n");
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
    fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
    fprintf(stderr, " number of layers to store in VRAM\n");

@@ -76,6 +76,7 @@ struct gpt_params {
    bool use_mmap = true; // use mmap for faster loads
    bool use_mlock = false; // use mlock to keep model in memory
    bool mem_test = false; // compute maximum memory usage
    bool numa = false; // attempt optimizations that help on some NUMA systems
    bool export_cgraph = false; // export the computation graph
    bool verbose_prompt = false; // print prompt tokens before generation
};
examples/embd-input/.gitignore (new file, vendored, 4 lines)

@@ -0,0 +1,4 @@
PandaGPT
MiniGPT-4
*.pth
examples/embd-input/CMakeLists.txt (new file, 15 lines)

@@ -0,0 +1,15 @@
set(TARGET embdinput)
add_library(${TARGET} embd-input-lib.cpp embd-input.h)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
    add_dependencies(${TARGET} BUILD_INFO)
endif()

set(TARGET embd-input-test)
add_executable(${TARGET} embd-input-test.cpp)
target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
    add_dependencies(${TARGET} BUILD_INFO)
endif()
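If the project is configured with CMake rather than the Makefile, a minimal (assumed, not part of the diff) way to drive these two targets would be:

```
# target names come from the CMakeLists.txt above; the build directory is an assumption
cmake -B build
cmake --build build --target embdinput embd-input-test
```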
examples/embd-input/README.md (new file, 63 lines)

@@ -0,0 +1,63 @@
### Examples for direct input embedding

## Requirement
Build `libembdinput.so` by running the following command in the main dir (../../):
```
make
```

## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py)

1. Obtain the LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/).
2. Convert it to ggml format.
3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin).

```
import torch

bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
pth_path = "./examples/embd_input/llava_projection.pth"

dic = torch.load(bin_path)
used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
torch.save({k: dic[k] for k in used_key}, pth_path)
```
4. Check the path of the LLaVA model and `llava_projection.pth` in `llava.py`.


## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)

1. Obtain the PandaGPT lora model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
The `adapter_config.json` is:
```
{
  "peft_type": "LORA",
  "fan_in_fan_out": false,
  "bias": null,
  "modules_to_save": null,
  "r": 32,
  "lora_alpha": 32,
  "lora_dropout": 0.1,
  "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
}
```
2. Prepare the `vicuna` v0 model.
3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
4. Clone the PandaGPT source:
```
git clone https://github.com/yxuansu/PandaGPT
```
5. Install the requirements of PandaGPT.
6. Check the paths of the PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py.

## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)

1. Obtain the MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
2. Clone the MiniGPT-4 source:
```
git clone https://github.com/Vision-CAIR/MiniGPT-4/
```
3. Install the requirements of MiniGPT-4.
4. Prepare the `vicuna` v0 model.
5. Check the paths of the MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`.
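Once the projection weights and model paths are prepared as described above, each example is started as an ordinary Python script from the repository root (so that `./libembdinput.so` is found). A hypothetical invocation, not shown in the original README, would be:

```
python3 examples/embd-input/llava.py
python3 examples/embd-input/panda_gpt.py
python3 examples/embd-input/minigpt4.py
```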
examples/embd-input/embd-input-lib.cpp (new file, 220 lines)

@@ -0,0 +1,220 @@
// Defines sigaction on msys:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include "embd-input.h"

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

static llama_context ** g_ctx;

extern "C" {

struct MyModel* create_mymodel(int argc, char ** argv) {
    gpt_params params;

    if (gpt_params_parse(argc, argv, params) == false) {
        return nullptr;
    }

    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

    if (params.seed < 0) {
        params.seed = time(NULL);
    }
    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);

    llama_init_backend(params.numa);

    llama_model * model;
    llama_context * ctx;

    g_ctx = &ctx;

    // load the model and apply lora adapter, if any
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return nullptr;
    }

    // print system information
    {
        fprintf(stderr, "\n");
        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
            params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
    }
    struct MyModel * ret = new MyModel();
    ret->ctx = ctx;
    ret->params = params;
    ret->n_past = 0;
    // printf("ctx: %d\n", ret->ctx);
    return ret;
}

void free_mymodel(struct MyModel * mymodel) {
    llama_context * ctx = mymodel->ctx;
    llama_print_timings(ctx);
    llama_free(ctx);
    delete mymodel;
}


bool eval_float(void * model, float * input, int N){
    MyModel * mymodel = (MyModel*)model;
    llama_context * ctx = mymodel->ctx;
    gpt_params params = mymodel->params;
    int n_emb = llama_n_embd(ctx);
    int n_past = mymodel->n_past;
    int n_batch = N; // params.n_batch;

    for (int i = 0; i < (int) N; i += n_batch) {
        int n_eval = (int) N - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        if (llama_eval_embd(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
        }
        n_past += n_eval;
    }
    mymodel->n_past = n_past;
    return true;
}

bool eval_tokens(void * model, std::vector<llama_token> tokens) {
    MyModel * mymodel = (MyModel* )model;
    llama_context * ctx;
    ctx = mymodel->ctx;
    gpt_params params = mymodel->params;
    int n_past = mymodel->n_past;
    for (int i = 0; i < (int) tokens.size(); i += params.n_batch) {
        int n_eval = (int) tokens.size() - i;
        if (n_eval > params.n_batch) {
            n_eval = params.n_batch;
        }
        if (llama_eval(ctx, &tokens[i], n_eval, n_past, params.n_threads)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
        }
        n_past += n_eval;
    }
    mymodel->n_past = n_past;
    return true;
}

bool eval_id(struct MyModel* mymodel, int id) {
    std::vector<llama_token> tokens;
    tokens.push_back(id);
    return eval_tokens(mymodel, tokens);
}

bool eval_string(struct MyModel * mymodel,const char* str){
    llama_context * ctx = mymodel->ctx;
    std::string str2 = str;
    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, str2, true);
    eval_tokens(mymodel, embd_inp);
    return true;
}

llama_token sampling_id(struct MyModel* mymodel) {
    llama_context* ctx = mymodel->ctx;
    gpt_params params = mymodel->params;
    // int n_ctx = llama_n_ctx(ctx);

    // out of user input, sample next token
    const float temp = params.temp;
    const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
    const float top_p = params.top_p;
    const float tfs_z = params.tfs_z;
    const float typical_p = params.typical_p;
    // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
    // const float repeat_penalty = params.repeat_penalty;
    // const float alpha_presence = params.presence_penalty;
    // const float alpha_frequency = params.frequency_penalty;
    const int mirostat = params.mirostat;
    const float mirostat_tau = params.mirostat_tau;
    const float mirostat_eta = params.mirostat_eta;
    // const bool penalize_nl = params.penalize_nl;

    llama_token id = 0;
    {
        auto logits = llama_get_logits(ctx);
        auto n_vocab = llama_n_vocab(ctx);

        // Apply params.logit_bias map
        for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
            logits[it->first] += it->second;
        }

        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        // TODO: Apply penalties
        // float nl_logit = logits[llama_token_nl()];
        // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
        // llama_sample_repetition_penalty(ctx, &candidates_p,
        //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
        //     last_n_repeat, repeat_penalty);
        // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
        //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
        //     last_n_repeat, alpha_frequency, alpha_presence);
        // if (!penalize_nl) {
        //     logits[llama_token_nl()] = nl_logit;
        // }

        if (temp <= 0) {
            // Greedy sampling
            id = llama_sample_token_greedy(ctx, &candidates_p);
        } else {
            if (mirostat == 1) {
                static float mirostat_mu = 2.0f * mirostat_tau;
                const int mirostat_m = 100;
                llama_sample_temperature(ctx, &candidates_p, temp);
                id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
            } else if (mirostat == 2) {
                static float mirostat_mu = 2.0f * mirostat_tau;
                llama_sample_temperature(ctx, &candidates_p, temp);
                id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
            } else {
                // Temperature sampling
                llama_sample_top_k(ctx, &candidates_p, top_k, 1);
                llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
                llama_sample_typical(ctx, &candidates_p, typical_p, 1);
                llama_sample_top_p(ctx, &candidates_p, top_p, 1);
                llama_sample_temperature(ctx, &candidates_p, temp);
                id = llama_sample_token(ctx, &candidates_p);
            }
        }
    }

    return id;
}

const char * sampling(struct MyModel * mymodel) {
    llama_context * ctx = mymodel->ctx;
    int id = sampling_id(mymodel);
    std::string ret;
    if (id == llama_token_eos()) ret = "</s>";
    else ret = llama_token_to_str(ctx, id);
    eval_id(mymodel, id);
    return ret.c_str();
}

}
examples/embd-input/embd-input-test.cpp (new file, 35 lines)

@@ -0,0 +1,35 @@
#include "embd-input.h"
#include <stdlib.h>
#include <random>
#include <string.h>

int main(int argc, char** argv) {

    auto mymodel = create_mymodel(argc, argv);
    int N = 10;
    int max_tgt_len = 500;
    int n_embd = llama_n_embd(mymodel->ctx);

    // add random float embd to test evaluation
    float * data = new float[N*n_embd];
    std::default_random_engine e;
    std::uniform_real_distribution<float> u(0,1);
    for (int i=0;i<N*n_embd;i++) {
        data[i] = u(e);
    }

    eval_string(mymodel, "user: what is the color of the flag of UN?");
    eval_float(mymodel, data, N);
    eval_string(mymodel, "assistant:");
    eval_string(mymodel, mymodel->params.prompt.c_str());
    const char* tmp;
    for (int i=0; i<max_tgt_len; i++) {
        tmp = sampling(mymodel);
        if (strcmp(tmp, "</s>")==0) break;
        printf("%s", tmp);
        fflush(stdout);
    }
    printf("\n");
    free_mymodel(mymodel);
    return 0;
}
examples/embd-input/embd-input.h (new file, 30 lines)

@@ -0,0 +1,30 @@
#ifndef _EMBD_INPUT_H_
#define _EMBD_INPUT_H_ 1

#include "common.h"
#include "llama.h"
#include "build-info.h"


extern "C" {

typedef struct MyModel {
    llama_context* ctx;
    gpt_params params;
    int n_past = 0;
} MyModel;


struct MyModel* create_mymodel(int argc, char ** argv);

bool eval_float(void* model, float* input, int N);
bool eval_tokens(void* model, std::vector<llama_token> tokens);
bool eval_id(struct MyModel* mymodel, int id);
bool eval_string(struct MyModel* mymodel, const char* str);
const char* sampling(struct MyModel* mymodel);
llama_token sampling_id(struct MyModel* mymodel);
void free_mymodel(struct MyModel* mymodel);

}

#endif
examples/embd-input/embd_input.py (new file, 71 lines)

@@ -0,0 +1,71 @@
import ctypes
from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
import numpy as np
import os

libc = cdll.LoadLibrary("./libembdinput.so")
libc.sampling.restype=c_char_p
libc.create_mymodel.restype=c_void_p
libc.eval_string.argtypes=[c_void_p, c_char_p]
libc.sampling.argtypes=[c_void_p]
libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int]


class MyModel:
    def __init__(self, args):
        argc = len(args)
        c_str = [c_char_p(i.encode()) for i in args]
        args_c = (c_char_p * argc)(*c_str)
        self.model = c_void_p(libc.create_mymodel(argc, args_c))
        self.max_tgt_len = 512
        self.print_string_eval = True

    def __del__(self):
        libc.free_mymodel(self.model)

    def eval_float(self, x):
        libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1])

    def eval_string(self, x):
        libc.eval_string(self.model, x.encode()) # c_char_p(x.encode()))
        if self.print_string_eval:
            print(x)

    def eval_token(self, x):
        libc.eval_id(self.model, x)

    def sampling(self):
        s = libc.sampling(self.model)
        return s

    def stream_generate(self, end="</s>"):
        ret = b""
        end = end.encode()
        for _ in range(self.max_tgt_len):
            tmp = self.sampling()
            ret += tmp
            yield tmp
            if ret.endswith(end):
                break

    def generate_with_print(self, end="</s>"):
        ret = b""
        for i in self.stream_generate(end=end):
            ret += i
            print(i.decode(errors="replace"), end="", flush=True)
        print("")
        return ret.decode(errors="replace")


    def generate(self, end="</s>"):
        text = b"".join(self.stream_generate(end=end))
        return text.decode(errors="replace")

if __name__ == "__main__":
    model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"])
    model.eval_string("""user: what is the color of the flag of UN?""")
    x = np.random.random((5120,10))# , dtype=np.float32)
    model.eval_float(x)
    model.eval_string("""assistant:""")
    for i in model.generate():
        print(i.decode(errors="replace"), end="", flush=True)
examples/embd-input/llava.py (new file, 70 lines)

@@ -0,0 +1,70 @@
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch
from transformers import CLIPVisionModel, CLIPImageProcessor
from PIL import Image

# model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1'
vision_tower = "openai/clip-vit-large-patch14"
select_hidden_state_layer = -2
# (vision_config.image_size // vision_config.patch_size) ** 2
image_token_len = (224//14)**2

class Llava:
    def __init__(self, args):
        self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower)
        self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower)
        self.mm_projector = nn.Linear(1024, 5120)
        self.model = MyModel(["main", *args])

    def load_projection(self, path):
        state = torch.load(path)
        self.mm_projector.load_state_dict({
            "weight": state["model.mm_projector.weight"],
            "bias": state["model.mm_projector.bias"]})

    def chat(self, question):
        self.model.eval_string("user: ")
        self.model.eval_string(question)
        self.model.eval_string("\nassistant: ")
        return self.model.generate_with_print()

    def chat_with_image(self, image, question):
        with torch.no_grad():
            embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
            image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True)
            select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
            image_feature = select_hidden_state[:, 1:]
            embd_image = self.mm_projector(image_feature)
            embd_image = embd_image.cpu().numpy()[0]
        self.model.eval_string("user: ")
        self.model.eval_token(32003-2) # im_start
        self.model.eval_float(embd_image.T)
        for i in range(image_token_len-embd_image.shape[0]):
            self.model.eval_token(32003-3) # im_patch
        self.model.eval_token(32003-1) # im_end
        self.model.eval_string(question)
        self.model.eval_string("\nassistant: ")
        return self.model.generate_with_print()


if __name__=="__main__":
    # model form liuhaotian/LLaVA-13b-delta-v1-1
    a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"])
    # Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin.
    # Also here can use pytorch_model-00003-of-00003.bin directly.
    a.load_projection(os.path.join(
        os.path.dirname(__file__) ,
        "llava_projetion.pth"))
    respose = a.chat_with_image(
        Image.open("./media/llama1-logo.png").convert('RGB'),
        "what is the text in the picture?")
    respose
    a.chat("what is the color of it?")
examples/embd-input/minigpt4.py (new file, 128 lines)

@@ -0,0 +1,128 @@
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch
from PIL import Image

minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4")
sys.path.insert(0, minigpt4_path)
from minigpt4.models.blip2 import Blip2Base
from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor


class MiniGPT4(Blip2Base):
    """
    MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4
    """
    def __init__(self,
        args,
        vit_model="eva_clip_g",
        q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth",
        img_size=224,
        drop_path_rate=0,
        use_grad_checkpoint=False,
        vit_precision="fp32",
        freeze_vit=True,
        freeze_qformer=True,
        num_query_token=32,
        llama_model="",
        prompt_path="",
        prompt_template="",
        max_txt_len=32,
        end_sym='\n',
        low_resource=False,  # use 8 bit and put vit in cpu
        device_8bit=0
    ):
        super().__init__()
        self.img_size = img_size
        self.low_resource = low_resource
        self.preprocessor = Blip2ImageEvalProcessor(img_size)

        print('Loading VIT')
        self.visual_encoder, self.ln_vision = self.init_vision_encoder(
            vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
        )
        print('Loading VIT Done')
        print('Loading Q-Former')
        self.Qformer, self.query_tokens = self.init_Qformer(
            num_query_token, self.visual_encoder.num_features
        )
        self.Qformer.cls = None
        self.Qformer.bert.embeddings.word_embeddings = None
        self.Qformer.bert.embeddings.position_embeddings = None
        for layer in self.Qformer.bert.encoder.layer:
            layer.output = None
            layer.intermediate = None
        self.load_from_pretrained(url_or_filename=q_former_model)
        print('Loading Q-Former Done')
        self.llama_proj = nn.Linear(
            self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size
        )
        self.max_txt_len = max_txt_len
        self.end_sym = end_sym
        self.model = MyModel(["main", *args])
        # system promt
        self.model.eval_string("Give the following image: <Img>ImageContent</Img>. "
            "You will be able to see the image once I provide it to you. Please answer my questions."
            "###")

    def encode_img(self, image):
        image = self.preprocessor(image)
        image = image.unsqueeze(0)
        device = image.device
        if self.low_resource:
            self.vit_to_cpu()
            image = image.to("cpu")

        with self.maybe_autocast():
            image_embeds = self.ln_vision(self.visual_encoder(image)).to(device)
            image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device)

            query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
            query_output = self.Qformer.bert(
                query_embeds=query_tokens,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=True,
            )

            inputs_llama = self.llama_proj(query_output.last_hidden_state)
            # atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device)
        return inputs_llama

    def load_projection(self, path):
        state = torch.load(path)["model"]
        self.llama_proj.load_state_dict({
            "weight": state["llama_proj.weight"],
            "bias": state["llama_proj.bias"]})

    def chat(self, question):
        self.model.eval_string("Human: ")
        self.model.eval_string(question)
        self.model.eval_string("\n### Assistant:")
        return self.model.generate_with_print(end="###")

    def chat_with_image(self, image, question):
        with torch.no_grad():
            embd_image = self.encode_img(image)
        embd_image = embd_image.cpu().numpy()[0]
        self.model.eval_string("Human: <Img>")
        self.model.eval_float(embd_image.T)
        self.model.eval_string("</Img> ")
        self.model.eval_string(question)
        self.model.eval_string("\n### Assistant:")
        return self.model.generate_with_print(end="###")


if __name__=="__main__":
    a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"])
    a.load_projection(os.path.join(
        os.path.dirname(__file__) ,
        "pretrained_minigpt4.pth"))
    respose = a.chat_with_image(
        Image.open("./media/llama1-logo.png").convert('RGB'),
        "what is the text in the picture?")
    a.chat("what is the color of it?")
examples/embd-input/panda_gpt.py (new file, 98 lines)

@@ -0,0 +1,98 @@
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch

# use PandaGPT path
panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT")
imagebind_ckpt_path = "./models/panda_gpt/"

sys.path.insert(0, os.path.join(panda_gpt_path,"code","model"))
from ImageBind.models import imagebind_model
from ImageBind import data

ModalityType = imagebind_model.ModalityType
max_tgt_len = 400

class PandaGPT:
    def __init__(self, args):
        self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path)
        self.visual_encoder.eval()
        self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120)
        self.max_tgt_len = max_tgt_len
        self.model = MyModel(["main", *args])
        self.generated_text = ""
        self.device = "cpu"

    def load_projection(self, path):
        state = torch.load(path, map_location="cpu")
        self.llama_proj.load_state_dict({
            "weight": state["llama_proj.weight"],
            "bias": state["llama_proj.bias"]})

    def eval_inputs(self, inputs):
        self.model.eval_string("<Img>")
        embds = self.extract_multimoal_feature(inputs)
        for i in embds:
            self.model.eval_float(i.T)
        self.model.eval_string("</Img> ")

    def chat(self, question):
        return self.chat_with_image(None, question)

    def chat_with_image(self, inputs, question):
        if self.generated_text == "":
            self.model.eval_string("###")
        self.model.eval_string(" Human: ")
        if inputs:
            self.eval_inputs(inputs)
        self.model.eval_string(question)
        self.model.eval_string("\n### Assistant:")
        ret = self.model.generate_with_print(end="###")
        self.generated_text += ret
        return ret

    def extract_multimoal_feature(self, inputs):
        features = []
        for key in ["image", "audio", "video", "thermal"]:
            if key + "_paths" in inputs:
                embeds = self.encode_data(key, inputs[key+"_paths"])
                features.append(embeds)
        return features

    def encode_data(self, data_type, data_paths):

        type_map = {
            "image": ModalityType.VISION,
            "audio": ModalityType.AUDIO,
            "video": ModalityType.VISION,
            "thermal": ModalityType.THERMAL,
        }
        load_map = {
            "image": data.load_and_transform_vision_data,
            "audio": data.load_and_transform_audio_data,
            "video": data.load_and_transform_video_data,
            "thermal": data.load_and_transform_thermal_data
        }

        load_function = load_map[data_type]
        key = type_map[data_type]

        inputs = {key: load_function(data_paths, self.device)}
        with torch.no_grad():
            embeddings = self.visual_encoder(inputs)
            embeds = embeddings[key]
        embeds = self.llama_proj(embeds).cpu().numpy()
        return embeds


if __name__=="__main__":
    a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"])
    a.load_projection("./models/panda_gpt/adapter_model.bin")
    a.chat_with_image(
        {"image_paths": ["./media/llama1-logo.png"]},
        "what is the text in the picture? 'llama' or 'lambda'?")
    a.chat("what is the color of it?")
@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
        params.prompt = gpt_random_prompt(rng);
    }

    llama_init_backend();
    llama_init_backend(params.numa);

    llama_model * model;
    llama_context * ctx;
@@ -262,6 +262,10 @@ These options help improve the performance and memory usage of the LLaMA models.

- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all.

### NUMA support

- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop\_caches' as root.

### Memory Float 32

- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
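As an illustrative combination of the flags described above (the model path is a placeholder, not from the original text), one could drop the page cache and then run with NUMA optimizations and mmap disabled:

```
# as root, on Linux: drop the page cache so the --numa optimizations can take effect
echo 3 > /proc/sys/vm/drop_caches

./main -m ./models/7B/ggml-model.bin --numa --no-mmap -p "..."
```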
@@ -105,7 +105,7 @@ int main(int argc, char ** argv) {
        params.prompt = gpt_random_prompt(rng);
    }

    llama_init_backend();
    llama_init_backend(params.numa);

    llama_model * model;
    llama_context * ctx;

@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
        params.prompt = gpt_random_prompt(rng);
    }

    llama_init_backend();
    llama_init_backend(params.numa);

    llama_model * model;
    llama_context * ctx;

@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
        usage(argv[0]);
    }

    llama_init_backend();
    llama_init_backend(false);

    // parse command line arguments
    const std::string fname_inp = argv[arg_idx];

@@ -789,7 +789,7 @@ int main(int argc, char ** argv) {
        params.model_alias = params.model;
    }

    llama_init_backend();
    llama_init_backend(params.numa);

    LOG_INFO("build info", {
        { "build", BUILD_NUMBER },

@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
    // Init LLM :
    //---------------------------------

    llama_init_backend();
    llama_init_backend(params.numa);

    llama_model * model;
    llama_context * ctx;
@@ -294,20 +294,9 @@ void init_model(struct my_llama_model * model) {

        ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());

        // 'layers.10.feed_forward.w1.weight' has length of 32.
        // ggml_tensor->name only has 32 characters, but we need one more for the '\0' terminator.
        // ggml_set_name will set the last character to '\0', so we can only store 'layers.10.feed_forward.w1.weigh'.
        // when saving llama compatible model the tensors names will miss a character.
        // ggml_set_name(layer.w1, (layers_i + ".feed_forward.w1.weight").c_str());
        // ggml_set_name(layer.w2, (layers_i + ".feed_forward.w2.weight").c_str());
        // ggml_set_name(layer.w3, (layers_i + ".feed_forward.w3.weight").c_str());

        strncpy(layer.w1->name, (layers_i + ".feed_forward.w1.weight").c_str(), sizeof(layer.w1->name));
        strncpy(layer.w2->name, (layers_i + ".feed_forward.w2.weight").c_str(), sizeof(layer.w2->name));
        strncpy(layer.w3->name, (layers_i + ".feed_forward.w3.weight").c_str(), sizeof(layer.w3->name));
        layer.w1->padding[0] = 0;
        layer.w2->padding[0] = 0;
        layer.w3->padding[0] = 0;
        ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
        ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
        ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
    }
}

@@ -454,8 +443,8 @@ struct ggml_tensor * forward(
    // wk shape [n_embd, n_embd, 1, 1]
    // Qcur shape [n_embd/n_head, n_head, N, 1]
    // Kcur shape [n_embd/n_head, n_head, N, 1]
    struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
    struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
    struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
    struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);

    // store key and value to memory
    {

@@ -711,8 +700,8 @@ struct ggml_tensor * forward_batch(
    // wk shape [n_embd, n_embd, 1, 1]
    // Qcur shape [n_embd/n_head, n_head, N, n_batch]
    // Kcur shape [n_embd/n_head, n_head, N, n_batch]
    struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
    struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
    struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
    struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
    assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
    assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);

@@ -996,8 +985,8 @@ struct ggml_tensor * forward_batch_wo_cache(
    // wk shape [n_embd, n_embd, 1, 1]
    // Qcur shape [n_embd/n_head, n_head, N, n_batch]
    // Kcur shape [n_embd/n_head, n_head, N, n_batch]
    struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
    struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
    struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
    struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
    assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
    assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);

@@ -1218,8 +1207,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
    // compute Q and K and RoPE them
    // wq shape [n_embd, n_embd, 1, 1]
    // wk shape [n_embd, n_embd, 1, 1]
    struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
    struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
    struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
    struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
    assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
    assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);

@@ -1618,10 +1607,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
    use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch);
    use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch);
    use_buf(-1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch);
    use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
    use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, 0)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
    use_buf(-1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch);
    use_buf(-1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch);
    use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
    use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, 0)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
    use_buf(-1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd);
    use_buf(-1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head);
    use_buf(-1); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch);

@@ -2368,7 +2357,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
        file->write_u32(0);
        file->write_u32(0);
        file->write_u32(GGML_TYPE_F32);
        file->seek(0-file->tell() & 31, SEEK_CUR);
        file->seek((0-file->tell()) & 31, SEEK_CUR);
        return;
    }
    const char * name = ggml_get_name(tensor);

@@ -2383,7 +2372,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
    file->write_u32(tensor->type);
    file->write_raw(ne, sizeof(ne[0]) * nd);
    file->write_raw(name, name_len);
    file->seek(0-file->tell() & 31, SEEK_CUR);
    file->seek((0-file->tell()) & 31, SEEK_CUR);
    file->write_raw(tensor->data, ggml_nbytes(tensor));
}

@@ -2404,7 +2393,7 @@ void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
    std::string name = file->read_string(name_len);
    GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0);

    file->seek(0-file->tell() & 31, SEEK_CUR);
    file->seek((0-file->tell()) & 31, SEEK_CUR);
    file->read_raw(tensor->data, ggml_nbytes(tensor));
}
53
ggml-cuda.cu
53
ggml-cuda.cu
@@ -223,6 +223,15 @@ static __global__ void add_f32(const float * x, const float * y, float * dst, co
|
||||
dst[i] = x[i] + y[i];
|
||||
}
|
||||
|
||||
static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
dst[i] = __hadd(x[i], __float2half(y[i]));
|
||||
}
|
||||
|
||||
static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
@@ -1459,6 +1468,11 @@ static void add_f32_cuda(const float * x, const float * y, float * dst, const in
|
||||
add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
|
||||
}
|
||||
|
||||
static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
|
||||
add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
|
||||
}
|
||||
|
||||
static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
|
||||
const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
|
||||
mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
|
||||
@@ -1941,7 +1955,7 @@ inline void ggml_cuda_op_add(
|
||||
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
|
||||
cudaStream_t & cudaStream_main){
|
||||
|
||||
GGML_ASSERT(src0_ddf_i != nullptr);
|
||||
GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
|
||||
GGML_ASSERT(src1_ddf_i != nullptr);
|
||||
GGML_ASSERT(dst_ddf_i != nullptr);
|
||||
|
||||
@@ -1949,7 +1963,13 @@ inline void ggml_cuda_op_add(
|
||||
const int64_t i01_diff = i01_high - i01_low;
|
||||
|
||||
// compute
|
||||
add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
||||
add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
(void) src1;
|
||||
@@ -2547,8 +2567,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
||||
}
|
||||
|
||||
void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true);
|
||||
// ggml_cuda_add permits f16 dst even though this could in theory cause problems with the pointer arithmetic in ggml_cuda_op.
|
||||
// Due to flatten_rows == true this does in practice not make a difference however.
|
||||
// Better solution would be nice but right now that would require disproportionate changes.
|
||||
GGML_ASSERT(
|
||||
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
|
||||
src1->type == GGML_TYPE_F32 &&
(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16));
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, false, true);
}

void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2801,7 +2827,7 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
delete extra;
}

void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
if (scratch && g_scratch_size == 0) {
return;
}
@@ -2810,11 +2836,11 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
const ggml_op src0_op = tensor->src0->op;
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
ggml_cuda_assign_buffers_impl(tensor->src0, scratch);
ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
}
}
if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
ggml_cuda_assign_buffers_impl(tensor->src1, scratch);
ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
}

tensor->backend = GGML_BACKEND_GPU;
@@ -2822,11 +2848,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
memset(extra, 0, sizeof(*extra));

const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
tensor->op == GGML_OP_VIEW;
tensor->op == GGML_OP_VIEW ||
force_inplace;
const size_t size = ggml_nbytes(tensor);

CUDA_CHECK(cudaSetDevice(g_main_device));
if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
size_t offset = 0;
@@ -2865,11 +2892,15 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
}

void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
ggml_cuda_assign_buffers_impl(tensor, true);
ggml_cuda_assign_buffers_impl(tensor, true, false);
}

void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
ggml_cuda_assign_buffers_impl(tensor, false);
ggml_cuda_assign_buffers_impl(tensor, false, false);
}

void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
ggml_cuda_assign_buffers_impl(tensor, false, true);
}

void ggml_cuda_set_main_device(int main_device) {

@@ -29,6 +29,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
void ggml_cuda_free_data(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
void ggml_cuda_set_main_device(int main_device);
void ggml_cuda_set_scratch_size(size_t scratch_size);
void ggml_cuda_free_scratch(void);
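The three public wrappers above differ only in the (scratch, force_inplace) pair they forward. A minimal sketch of when each one applies, assuming a CUBLAS build and placeholder tensors (cur, k, dest_t, delta, ctx0 are illustrative, not taken from this diff); the real call sites are the offload_func hooks in llama.cpp:

// intermediate activation: result lives in the shared scratch buffer
ggml_cuda_assign_buffers(cur);

// persistent tensor: gets its own allocation, bypassing the scratch buffer
ggml_cuda_assign_buffers_no_scratch(k);

// result that must alias src0's device buffer, e.g. an in-place add that
// really has to update the destination weights (used by the LoRA path):
struct ggml_tensor * r = ggml_add_inplace(ctx0, dest_t, delta);
ggml_cuda_assign_buffers_force_inplace(r);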
599
ggml.c
@@ -91,6 +91,11 @@ static int sched_yield (void) {
#include <stdatomic.h>

typedef void* thread_ret_t;

#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#endif

// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -119,6 +124,30 @@ typedef void* thread_ret_t;
#define GGML_SOFT_MAX_UNROLL 4
#define GGML_VEC_DOT_UNROLL 2

//
// logging
//

#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif

#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif

#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif

#define GGML_PRINT(...) printf(__VA_ARGS__)

#ifdef GGML_USE_ACCELERATE
// uncomment to use vDSP for soft max computation
// note: not sure if it is actually faster
@@ -459,7 +488,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
}
}


//
// timing
//
@@ -522,6 +550,7 @@ int64_t ggml_cycles_per_ms(void) {
#define ggml_perf_cycles_per_ms() 0
#endif


//
// cache line
//
@@ -3843,12 +3872,31 @@ struct ggml_context_container {
struct ggml_context context;
};

//
// NUMA support
//

#define GGML_NUMA_MAX_NODES 8
#define GGML_NUMA_MAX_CPUS 512

struct ggml_numa_node {
uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
uint32_t n_cpus;
};

struct ggml_numa_nodes {
struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
uint32_t n_nodes;
uint32_t total_cpus; // hardware threads on system
};

//
// ggml state
//

struct ggml_state {
struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
struct ggml_numa_nodes numa;
};

// global state
@@ -3873,6 +3921,75 @@ inline static void ggml_critical_section_end(void) {
atomic_fetch_sub(&g_state_barrier, 1);
}

void ggml_numa_init(void) {
if (g_state.numa.n_nodes > 0) {
fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");

return;
}

#ifdef __linux__
struct stat st;
char path[256];
int rv;

// enumerate nodes
while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
if (stat(path, &st) != 0) { break; }
++g_state.numa.n_nodes;
}

// enumerate CPUs
while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
if (stat(path, &st) != 0) { break; }
++g_state.numa.total_cpus;
}

GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);

if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
g_state.numa.n_nodes = 0;
return;
}

for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
struct ggml_numa_node * node = &g_state.numa.nodes[n];
GGML_PRINT_DEBUG("CPUs on node %u:", n);
node->n_cpus = 0;
for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
if (stat(path, &st) == 0) {
node->cpus[node->n_cpus++] = c;
GGML_PRINT_DEBUG(" %u", c);
}
}
GGML_PRINT_DEBUG("\n");
}

if (ggml_is_numa()) {
FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
if (fptr != NULL) {
char buf[42];
if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
}
fclose(fptr);
}
}
#else
// TODO
#endif
}

bool ggml_is_numa(void) {
return g_state.numa.n_nodes > 1;
}

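A short usage sketch for the two new entry points above, assuming a Linux host: ggml_numa_init() scans sysfs once at startup, and ggml_is_numa() afterwards reports whether more than one node was found.

#include "ggml.h"
#include <stdio.h>

int main(void) {
    ggml_numa_init(); // enumerates /sys/devices/system/node/node* once
    if (ggml_is_numa()) {
        printf("NUMA system: per-node thread affinity will be applied\n");
    } else {
        printf("single node: affinity handling is a no-op\n");
    }
    return 0;
}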
////////////////////////////////////////////////////////////////////////////////

void ggml_print_object(const struct ggml_object * obj) {
@@ -4129,6 +4246,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {

g_state = (struct ggml_state) {
/*.contexts =*/ { { 0 } },
/*.numa =*/ {
.n_nodes = 0,
.total_cpus = 0,
},
};

for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
@@ -6657,6 +6778,7 @@ struct ggml_tensor * ggml_rope_impl(
int n_past,
int n_dims,
int mode,
int n_ctx,
bool inplace) {
GGML_ASSERT(n_past >= 0);
bool is_node = false;
@@ -6669,11 +6791,12 @@ struct ggml_tensor * ggml_rope_impl(

ggml_scratch_save(ctx);

struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);

((int32_t *) b->data)[0] = n_past;
((int32_t *) b->data)[1] = n_dims;
((int32_t *) b->data)[2] = mode;
((int32_t *) b->data)[3] = n_ctx;

ggml_scratch_load(ctx);

@@ -6690,8 +6813,9 @@ struct ggml_tensor * ggml_rope(
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false);
int mode,
int n_ctx) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
}

struct ggml_tensor * ggml_rope_inplace(
@@ -6699,8 +6823,9 @@ struct ggml_tensor * ggml_rope_inplace(
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true);
int mode,
int n_ctx) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
}

// ggml_rope_back
@@ -12319,7 +12444,7 @@ static void ggml_compute_forward_rope_f32(
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_nelements(src1) == 3);
GGML_ASSERT(ggml_nelements(src1) == 4);

if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
@@ -12328,6 +12453,7 @@ static void ggml_compute_forward_rope_f32(
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];

assert(n_past >= 0);

@@ -12372,6 +12498,7 @@ static void ggml_compute_forward_rope_f32(
const float theta_scale = powf(10000.0, -2.0f/n_dims);

const bool is_neox = mode & 2;
const bool is_glm = mode & 4;

for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -12382,7 +12509,32 @@ static void ggml_compute_forward_rope_f32(

float theta = (float)p;

if (!is_neox) {
if (is_glm) {
theta = MIN(p, n_ctx - 2);
float block_theta = MAX(p - (n_ctx - 2), 0);
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta);
const float cos_block_theta = cosf(block_theta);
const float sin_block_theta = sinf(block_theta);

theta *= theta_scale;
block_theta *= theta_scale;

const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

const float x0 = src[0];
const float x1 = src[n_dims/2];
const float x2 = src[n_dims];
const float x3 = src[n_dims/2*3];

dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
}
} else if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta);
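For reference, each iteration of the GLM branch above applies two ordinary 2-D rotations: one by theta, which is capped at n_ctx - 2, and one by block_theta, the position overflow beyond that cap:

\begin{pmatrix} x_0' \\ x_1' \end{pmatrix} =
\begin{pmatrix} \cos\theta & -\sin\theta \\ \sin\theta & \cos\theta \end{pmatrix}
\begin{pmatrix} x_0 \\ x_1 \end{pmatrix},
\qquad
\begin{pmatrix} x_2' \\ x_3' \end{pmatrix} =
\begin{pmatrix} \cos\theta_b & -\sin\theta_b \\ \sin\theta_b & \cos\theta_b \end{pmatrix}
\begin{pmatrix} x_2 \\ x_3 \end{pmatrix}

with both angles multiplied by theta_scale after every pair.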
@@ -12432,7 +12584,7 @@ static void ggml_compute_forward_rope_f16(
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_nelements(src1) == 3);
GGML_ASSERT(ggml_nelements(src1) == 4);

if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
@@ -12441,6 +12593,7 @@ static void ggml_compute_forward_rope_f16(
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];

assert(n_past >= 0);

@@ -12485,6 +12638,7 @@ static void ggml_compute_forward_rope_f16(
const float theta_scale = powf(10000.0, -2.0f/n_dims);

const bool is_neox = mode & 2;
const bool is_glm = mode & 4;

for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -12495,7 +12649,32 @@ static void ggml_compute_forward_rope_f16(

float theta = (float)p;

if (!is_neox) {
if (is_glm) {
theta = MIN(p, n_ctx - 2);
float block_theta = MAX(p - (n_ctx - 2), 0);
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta);
const float cos_block_theta = cosf(block_theta);
const float sin_block_theta = sinf(block_theta);

theta *= theta_scale;
block_theta *= theta_scale;

const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

const float x0 = GGML_FP16_TO_FP32(src[0]);
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);

dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
}
} else if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta);
@@ -13387,8 +13566,7 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
const int nk1 = ne01;

// size of the convolution row - the kernel size unrolled across all channels
// round-up so it is more suitable for SIMD
const int ew0 = ggml_up32(nk0*nk1*ne02);
const int ew0 = nk0*nk1*ne02;

GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
@@ -16069,17 +16247,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
if (src0->grad) {
assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 3);
assert(ggml_nelements(src1) == 4);
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];
src0->grad = ggml_add_impl(ctx,
src0->grad,
ggml_rope(ctx,
tensor->grad,
n_past,
n_dims,
mode),
mode,
n_ctx),
inplace);
}
if (src1->grad) {
@@ -16504,68 +16684,173 @@ typedef pthread_t ggml_thread_t;

#endif

// Android's libc implementation "bionic" does not support setting affinity
#if defined(__linux__) && !defined(__BIONIC__)
void set_numa_thread_affinity(int thread_n, int n_threads) {
if (!ggml_is_numa()) {
return;
}

// run thread on node_num thread_n / (threads per node)
const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);

cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
CPU_ZERO_S(setsize, cpus);
for (size_t i = 0; i < node->n_cpus; ++i) {
CPU_SET_S(node->cpus[i], setsize, cpus);
}

int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
if (rv) {
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
strerror(rv));
}

CPU_FREE(cpus);
}

void clear_numa_thread_affinity(void) {
if (!ggml_is_numa()) {
return;
}

size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);

cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
CPU_ZERO_S(setsize, cpus);
for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
CPU_SET_S(i, setsize, cpus);
}

int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
if (rv) {
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
strerror(rv));
}

CPU_FREE(cpus);
}
#else
// TODO: Windows etc.
// (the linux implementation may also work on BSD, someone should test)
void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
void clear_numa_thread_affinity(void) {}
#endif

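The node_num formula above packs ceil(n_threads / n_nodes) consecutive threads onto each node. A self-contained worked example, assuming 8 threads on a 2-node machine:

#include <stdio.h>

int main(void) {
    const int n_threads = 8, n_nodes = 2;
    // threads per node = (8 + 2 - 1) / 2 = 4, so threads 0-3 -> node 0, 4-7 -> node 1
    for (int thread_n = 0; thread_n < n_threads; ++thread_n) {
        const int node_num = thread_n / ((n_threads + n_nodes - 1) / n_nodes);
        printf("thread %d -> node %d\n", thread_n, node_num);
    }
    return 0;
}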
struct ggml_compute_state_shared {
ggml_lock_t spin;
struct ggml_cgraph * cgraph;

int64_t perf_node_start_cycles;
int64_t perf_node_start_time_us;

int n_threads;

// synchronization primitives
atomic_int n_ready;
atomic_bool has_work;
atomic_bool stop; // stop all threads
atomic_int n_active; // num active threads
atomic_int node_n; // active graph node
};

struct ggml_compute_state {
ggml_thread_t thrd;

struct ggml_compute_params params;
struct ggml_tensor * node;

int ith;
struct ggml_compute_state_shared * shared;
};

static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;

node->perf_runs++;
node->perf_cycles += cycles_cur;
node->perf_time_us += time_us_cur;
}

static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
struct ggml_cgraph * cgraph = state->shared->cgraph;

const int n_threads = state->shared->n_threads;
set_numa_thread_affinity(state->ith, n_threads);

int node_n = -1;

while (true) {
if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) {
atomic_store(&state->shared->has_work, false);
} else {
while (atomic_load(&state->shared->has_work)) {
if (atomic_load(&state->shared->stop)) {
return 0;
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
// all other threads are finished and spinning
// do finalize and init here so we don't have synchronize again
struct ggml_compute_params params = {
/*.type =*/ GGML_TASK_FINALIZE,
/*.ith =*/ 0,
/*.nth =*/ 0,
/*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
/*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
};

if (node_n != -1) {
/* FINALIZE */
struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
params.nth = node->n_tasks;
ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
}

// distribute new work or execute it direct if 1T
while (++node_n < cgraph->n_nodes) {
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);

struct ggml_tensor * node = cgraph->nodes[node_n];

state->shared->perf_node_start_cycles = ggml_perf_cycles();
state->shared->perf_node_start_time_us = ggml_perf_time_us();

/* INIT */
params.type = GGML_TASK_INIT;
params.nth = node->n_tasks;
ggml_compute_forward(&params, node);

if (node->n_tasks == 1) {
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
// they do something more efficient than spinning (?)
params.type = GGML_TASK_COMPUTE;
ggml_compute_forward(&params, node);

params.type = GGML_TASK_FINALIZE;
ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
} else {
break;
}
ggml_lock_lock (&state->shared->spin);
ggml_lock_unlock(&state->shared->spin);
}
}

atomic_fetch_sub(&state->shared->n_ready, 1);

// wait for work
while (!atomic_load(&state->shared->has_work)) {
if (atomic_load(&state->shared->stop)) {
return 0;
}
ggml_lock_lock (&state->shared->spin);
ggml_lock_unlock(&state->shared->spin);
atomic_store(&state->shared->n_active, n_threads);
atomic_store(&state->shared->node_n, node_n);
} else {
// wait for other threads to finish
const int last = node_n;
do {
sched_yield();
node_n = atomic_load(&state->shared->node_n);
} while (node_n == last);
}

// check if we should stop
if (atomic_load(&state->shared->stop)) {
break;
}
if (node_n >= cgraph->n_nodes) break;

if (state->node) {
if (state->params.ith < state->params.nth) {
ggml_compute_forward(&state->params, state->node);
}
/* COMPUTE */
struct ggml_tensor * node = cgraph->nodes[node_n];

state->node = NULL;
} else {
break;
struct ggml_compute_params params = {
/*.type =*/ GGML_TASK_COMPUTE,
/*.ith =*/ state->ith,
/*.nth =*/ node->n_tasks,
/*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
/*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
};

if (state->ith < node->n_tasks) {
ggml_compute_forward(&params, node);
}
}

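The new scheduler replaces the has_work/n_ready handshake with a single countdown: every worker decrements n_active when its slice of a node is done, and the last one through runs FINALIZE for the old node and INIT for the next, then republishes node_n. A stripped-down sketch of that pattern, detached from ggml (N_THREADS and the serial work are placeholders):

#include <sched.h>
#include <stdatomic.h>

#define N_THREADS 4

static atomic_int n_active = N_THREADS; // threads still working on the current node
static atomic_int node_n   = -1;        // index of the node being computed

// every worker calls this after finishing its slice of node `last`
static int advance(int last) {
    if (atomic_fetch_sub(&n_active, 1) == 1) {
        // last thread out: serial FINALIZE/INIT would happen here,
        // then release the others by publishing the next node
        atomic_store(&n_active, N_THREADS);
        atomic_store(&node_n, last + 1);
    } else {
        // everyone else spins until a new node is published
        while (atomic_load(&node_n) == last) {
            sched_yield();
        }
    }
    return atomic_load(&node_n);
}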
@@ -16576,39 +16861,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
const int n_threads = cgraph->n_threads;

struct ggml_compute_state_shared state_shared = {
/*.spin =*/ GGML_LOCK_INITIALIZER,
/*.n_threads =*/ n_threads,
/*.n_ready =*/ 0,
/*.has_work =*/ false,
/*.stop =*/ false,
/*.cgraph =*/ cgraph,
/*.perf_node_start_cycles =*/ 0,
/*.perf_node_start_time_us =*/ 0,
/*.n_threads =*/ n_threads,
/*.n_active =*/ n_threads,
/*.node_n =*/ -1,
};
struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;

// create thread pool
if (n_threads > 1) {
ggml_lock_init(&state_shared.spin);

atomic_store(&state_shared.has_work, true);

for (int j = 0; j < n_threads - 1; j++) {
workers[j] = (struct ggml_compute_state) {
.thrd = 0,
.params = {
.type = GGML_TASK_COMPUTE,
.ith = j + 1,
.nth = n_threads,
.wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
.wdata = cgraph->work ? cgraph->work->data : NULL,
},
.node = NULL,
.shared = &state_shared,
};

int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
GGML_ASSERT(rc == 0);
UNUSED(rc);
}
}
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);

// initialize tasks + work buffer
{
@@ -16752,7 +17012,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
} break;
case GGML_OP_SCALE:
{
node->n_tasks = n_threads;
node->n_tasks = 1;
} break;
case GGML_OP_SET:
case GGML_OP_CONT:
@@ -16956,166 +17216,37 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
}
}

// create thread pool
if (n_threads > 1) {
for (int j = 1; j < n_threads; ++j) {
workers[j] = (struct ggml_compute_state) {
.thrd = 0,
.ith = j,
.shared = &state_shared,
};

const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
GGML_ASSERT(rc == 0);
}
}
workers[0].ith = 0;
workers[0].shared = &state_shared;

const int64_t perf_start_cycles = ggml_perf_cycles();
const int64_t perf_start_time_us = ggml_perf_time_us();

for (int i = 0; i < cgraph->n_nodes; i++) {
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes);
// this is a work thread too
ggml_graph_compute_thread(&workers[0]);

struct ggml_tensor * node = cgraph->nodes[i];

// TODO: this could be used to avoid unnecessary computations, but it needs to be improved
//if (node->grad == NULL && node->perf_runs > 0) {
// continue;
//}

const int64_t perf_node_start_cycles = ggml_perf_cycles();
const int64_t perf_node_start_time_us = ggml_perf_time_us();

// INIT
struct ggml_compute_params params = {
/*.type =*/ GGML_TASK_INIT,
/*.ith =*/ 0,
/*.nth =*/ node->n_tasks,
/*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
/*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
};

ggml_compute_forward(&params, node);

// COMPUTE
if (node->n_tasks > 1) {
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}

while (atomic_load(&state_shared.has_work)) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}

// launch thread pool
for (int j = 0; j < n_threads - 1; j++) {
workers[j].params = (struct ggml_compute_params) {
.type = GGML_TASK_COMPUTE,
.ith = j + 1,
.nth = node->n_tasks,
.wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
.wdata = cgraph->work ? cgraph->work->data : NULL,
};
workers[j].node = node;
}

atomic_fetch_sub(&state_shared.n_ready, 1);

while (atomic_load(&state_shared.n_ready) > 0) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}

atomic_store(&state_shared.has_work, true);
}

params.type = GGML_TASK_COMPUTE;
ggml_compute_forward(&params, node);

// wait for thread pool
if (node->n_tasks > 1) {
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}

while (atomic_load(&state_shared.has_work)) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}

atomic_fetch_sub(&state_shared.n_ready, 1);

while (atomic_load(&state_shared.n_ready) != 0) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
}

// FINALIZE
if (node->n_tasks > 1) {
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}

while (atomic_load(&state_shared.has_work)) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}

// launch thread pool
for (int j = 0; j < n_threads - 1; j++) {
workers[j].params = (struct ggml_compute_params) {
.type = GGML_TASK_FINALIZE,
.ith = j + 1,
.nth = node->n_tasks,
.wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
.wdata = cgraph->work ? cgraph->work->data : NULL,
};
workers[j].node = node;
}

atomic_fetch_sub(&state_shared.n_ready, 1);

while (atomic_load(&state_shared.n_ready) > 0) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}

atomic_store(&state_shared.has_work, true);
}

params.type = GGML_TASK_FINALIZE;
ggml_compute_forward(&params, node);

// wait for thread pool
if (node->n_tasks > 1) {
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}

while (atomic_load(&state_shared.has_work)) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}

atomic_fetch_sub(&state_shared.n_ready, 1);

while (atomic_load(&state_shared.n_ready) != 0) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
}

// performance stats (node)
{
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_node_start_cycles;
int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us;

node->perf_runs++;
node->perf_cycles += perf_cycles_cur;
node->perf_time_us += perf_time_us_cur;
}
}
// don't leave affinity set on the main thread
clear_numa_thread_affinity();

// join thread pool
if (n_threads > 1) {
atomic_store(&state_shared.stop, true);
atomic_store(&state_shared.has_work, true);

for (int j = 0; j < n_threads - 1; j++) {
int rc = ggml_thread_join(workers[j].thrd, NULL);
for (int j = 1; j < n_threads; j++) {
const int rc = ggml_thread_join(workers[j].thrd, NULL);
GGML_ASSERT(rc == 0);
UNUSED(rc);
}

ggml_lock_destroy(&state_shared.spin);
}

// performance stats (graph)

12
ggml.h
@@ -198,7 +198,7 @@
#define GGML_MAX_PARAMS 256
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_OPT 4
#define GGML_MAX_NAME 32
#define GGML_MAX_NAME 48
#define GGML_DEFAULT_N_THREADS 4

#define GGML_ASSERT(x) \
@@ -469,6 +469,9 @@ extern "C" {
GGML_API int64_t ggml_cycles(void);
GGML_API int64_t ggml_cycles_per_ms(void);

GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node

GGML_API void ggml_print_object (const struct ggml_object * obj);
GGML_API void ggml_print_objects(const struct ggml_context * ctx);

@@ -1033,13 +1036,15 @@ extern "C" {
// rotary position embedding
// if mode & 1 == 1, skip n_past elements
// if mode & 2 == 1, GPT-NeoX style
// if mode & 4 == 1, ChatGLM style
// TODO: avoid creating a new tensor every time
GGML_API struct ggml_tensor * ggml_rope(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode);
int mode,
int n_ctx);

// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_rope_inplace(
@@ -1047,7 +1052,8 @@ extern "C" {
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode);
int mode,
int n_ctx);

// rotary position embedding backward, i.e. compute dx from dy
// a - dy

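Existing callers simply pass 0 for the new n_ctx argument, which keeps the old behaviour; only the ChatGLM path sets mode bit 4 together with a real context length. A hedged example (ctx0, cur, n_past, n_rot and n_ctx are placeholders from a typical eval loop):

// LLaMA-style rope: mode 0, n_ctx unused
struct ggml_tensor * q = ggml_rope_inplace(ctx0, cur, n_past, n_rot, 0, 0);

// ChatGLM-style rope: mode bit 4 selects the GLM branch, and n_ctx feeds
// the theta / block_theta split shown in ggml.c
struct ggml_tensor * g = ggml_rope_inplace(ctx0, cur, n_past, n_rot, 4, n_ctx);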
@@ -1981,7 +1981,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;

// prepare low and high bits
const int bit = j << 2;
const int bit = j << 2;

const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3);
const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3);
const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2);

24
llama-util.h
@@ -172,12 +172,14 @@ struct llama_mmap {
#ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;

llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
size = file->size;
int fd = fileno(file->fp);
int flags = MAP_SHARED;
// prefetch/readahead impairs performance on NUMA systems
if (numa) { prefetch = 0; }
#ifdef __linux__
flags |= MAP_POPULATE;
if (prefetch) { flags |= MAP_POPULATE; }
#endif
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
if (addr == MAP_FAILED) {
@@ -191,6 +193,14 @@ struct llama_mmap {
strerror(errno));
}
}
if (numa) {
// advise the kernel not to use readahead
// (because the next page might not belong on the same node)
if (madvise(addr, file->size, MADV_RANDOM)) {
fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
strerror(errno));
}
}
}

~llama_mmap() {
@@ -199,7 +209,9 @@ struct llama_mmap {
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;

llama_mmap(struct llama_file * file, bool prefetch = true) {
llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
(void) numa;

size = file->size;

HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
@@ -244,8 +256,10 @@ struct llama_mmap {
#else
static constexpr bool SUPPORTED = false;

llama_mmap(struct llama_file *, bool prefetch = true) {
(void)prefetch;
llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
(void) prefetch;
(void) numa;

throw std::runtime_error(std::string("mmap not supported"));
}
#endif

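On NUMA systems the constructor above avoids two kinds of eager paging: it drops MAP_POPULATE so pages fault in on the node of the thread that first touches them, and it asks the kernel to skip readahead. A standalone sketch of the same two calls, assuming a Linux host, an open descriptor fd and a file length len:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

static void * map_for_numa(int fd, size_t len) {
    // no MAP_POPULATE: let each page fault in on the node that uses it
    void * addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
    if (addr == MAP_FAILED) {
        return NULL;
    }
    // readahead would pull neighbouring pages onto the wrong node
    if (madvise(addr, len, MADV_RANDOM)) {
        fprintf(stderr, "warning: madvise(MADV_RANDOM) failed: %s\n", strerror(errno));
    }
    return addr;
}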
120
llama.cpp
@@ -774,7 +774,7 @@ struct llama_model_loader {
}

if (use_mmap) {
mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
if (lmlock) {
lmlock->init(mapping->addr);
}
@@ -977,7 +977,7 @@ bool llama_mlock_supported() {
return llama_mlock::SUPPORTED;
}

void llama_init_backend() {
void llama_init_backend(bool numa) {
ggml_time_init();

// needed to initialize f16 tables
@@ -986,6 +986,10 @@ void llama_init_backend() {
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}

if (numa) {
ggml_numa_init();
}
}

int64_t llama_time_us() {
@@ -1365,22 +1369,26 @@ static bool llama_model_load(

// evaluate the transformer
//
// - lctx: llama context
// - tokens: new batch of tokens to process
// - n_past: the context size so far
// - n_threads: number of threads to use
// - cgraph_fname: filename of the exported computation graph
// - lctx: llama context
// - tokens: new batch of tokens to process
// - embd embeddings input
// - n_tokens number of tokens
// - n_past: the context size so far
// - n_threads: number of threads to use
//
static bool llama_eval_internal(
llama_context & lctx,
const llama_token * tokens,
const int n_tokens,
const int n_past,
const int n_threads,
llama_context & lctx,
const llama_token * tokens,
const float * embd,
const int n_tokens,
const int n_past,
const int n_threads,
const char * cgraph_fname) {

LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));

// enforce that the first token is BOS
if (n_past == 0 && tokens[0] != llama_token_bos()) {
if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
fprintf(stderr, "%s: first token must be BOS\n", __func__);
return false;
}
@@ -1420,12 +1428,18 @@ static bool llama_eval_internal(
ggml_cgraph gf = {};
gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
ggml_set_name(embd, "embd");
memcpy(embd->data, tokens, N*ggml_element_size(embd));

struct ggml_tensor * cur;
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
struct ggml_tensor * inpL;

if (tokens) {
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
ggml_set_name(embd, "embd");
memcpy(embd->data, tokens, N*ggml_element_size(embd));
inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
} else {
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
}

const int i_gpu_start = n_layer - n_gpu_layers;
(void) i_gpu_start;
@@ -1487,11 +1501,11 @@ static bool llama_eval_internal(
offload_func_kq(tmpq);
ggml_set_name(tmpq, "tmpq");

struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
offload_func_kq(Kcur);
ggml_set_name(Kcur, "Kcur");

struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
offload_func_kq(Qcur);
ggml_set_name(Qcur, "Qcur");

@@ -2650,6 +2664,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
}



//
// interface implementation
//
@@ -2899,7 +2915,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const

// maybe this should be in llama_model_loader
if (model_loader->use_mmap) {
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
}
}

@@ -2960,7 +2976,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
return false;
}
}
ggml_tensor* lora_tensor;
ggml_tensor * lora_tensor;
if (n_dims == 2) {
lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
}
@@ -2968,6 +2984,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
return 1;
}
ggml_set_name(lora_tensor, "lora_tensor");

// load tensor data
size_t offset = fin.tellg();
@@ -2983,6 +3000,21 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {

ggml_tensor * dest_t = model_tensors[base_name];

offload_func_t offload_func = llama_nop;
offload_func_t offload_func_force_inplace = llama_nop;

#ifdef GGML_USE_CUBLAS
if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
if (dest_t->type != GGML_TYPE_F16) {
throw std::runtime_error(format(
"%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
}
offload_func = ggml_cuda_assign_buffers;
offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
}
#endif // GGML_USE_CUBLAS

ggml_tensor * base_t;
if (model_loader) {
// load from base model
@@ -3010,7 +3042,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
}

ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
GGML_ASSERT(loraA->type == GGML_TYPE_F32);
ggml_set_name(loraA, "loraA");

ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
GGML_ASSERT(loraB->type == GGML_TYPE_F32);
ggml_set_name(loraB, "loraB");

if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
@@ -3020,19 +3057,32 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const

// w = w + BA*s
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
offload_func(BA);
ggml_set_name(BA, "BA");

if (scaling != 1.0f) {
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
ggml_set_name(scale_tensor, "scale_tensor");

BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
offload_func(BA);
ggml_set_name(BA, "BA_scaled");
}

ggml_tensor * r;
if (base_t == dest_t) {
r = ggml_add_inplace(lora_ctx, dest_t, BA);
offload_func_force_inplace(r);
ggml_set_name(r, "r_add_inplace");
}
else {
r = ggml_add(lora_ctx, base_t, BA);
offload_func(r);
ggml_set_name(r, "r_add");

r = ggml_cpy(lora_ctx, r, dest_t);
offload_func(r);
ggml_set_name(r, "r_cpy");
}

struct ggml_cgraph gf = ggml_build_forward(r);
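The `w = w + BA*s` comment above is the usual low-rank update. With rank-r factors loraA and loraB and the scale s read from the adapter (s = alpha / r by the LoRA convention, an assumption not spelled out in this hunk), the patched weight is

W' = W + s \cdot (B A)

The in-place branch applies the update directly to dest_t on its own buffer, while the copy branch computes against a separate base tensor first and then copies the result over dest_t.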
@@ -3417,7 +3467,29 @@ int llama_eval(
int n_tokens,
int n_past,
int n_threads) {
if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, nullptr)) {
if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
fprintf(stderr, "%s: failed to eval\n", __func__);
return 1;
}

// get a more accurate load time, upon first eval
// TODO: fix this
if (!ctx->has_evaluated_once) {
ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
ctx->has_evaluated_once = true;
}

return 0;
}


int llama_eval_embd(
struct llama_context * ctx,
const float * embd,
int n_tokens,
int n_past,
int n_threads) {
if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
fprintf(stderr, "%s: failed to eval\n", __func__);
return 1;
}
@@ -3438,7 +3510,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {

const std::vector<llama_token> tmp(n_batch, llama_token_bos());

if (!llama_eval_internal(*ctx, tmp.data(), tmp.size(), n_ctx, 1, fname)) {
if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
fprintf(stderr, "%s: failed to eval\n", __func__);
return 1;
}

11
llama.h
@@ -140,8 +140,9 @@ extern "C" {

// TODO: not great API - very likely to change
// Initialize the llama + ggml backend
// If numa is true, use NUMA optimizations
// Call once at the start of the program
LLAMA_API void llama_init_backend();
LLAMA_API void llama_init_backend(bool numa);

LLAMA_API int64_t llama_time_us();

@@ -225,6 +226,14 @@ extern "C" {
int n_past,
int n_threads);

// Same as llama_eval, but use float matrix input directly.
LLAMA_API int llama_eval_embd(
struct llama_context * ctx,
const float * embd,
int n_tokens,
int n_past,
int n_threads);

// Export a static computation graph for context of 511 and batch size of 1
// NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
// parameters here to keep things simple

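A short end-to-end sketch of the two additions in this header; ctx, n_embd, n_past and n_threads are placeholders for values obtained the usual way, and the input must supply n_tokens * n_embd floats in the model's hidden size:

llama_init_backend(true); // opt in to NUMA optimizations, once at startup

// ... load the model and create ctx as usual ...

std::vector<float> embd(n_embd, 0.0f); // one token worth of precomputed embeddings
if (llama_eval_embd(ctx, embd.data(), /*n_tokens=*/1, n_past, n_threads) != 0) {
    fprintf(stderr, "eval failed\n");
}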
@@ -21,6 +21,7 @@
#define QK 32
#define WARMUP 5
#define ITERATIONS 10
#define MAX_ITERATIONS 100000000

#define L1_SIZE 32*128
#define L2_SIZE 32*2048
@@ -36,9 +37,9 @@ struct quantize_perf_params {
bool op_dequantize_row_q = false;
bool op_quantize_row_q_dot = false;
bool op_vec_dot_q = false;
int64_t iterations = ITERATIONS;
};


#if defined(__x86_64__) || defined(__i386__)

#include <x86intrin.h>

@@ -75,7 +76,7 @@ void * align_with_offset(void * ptr, int offset) {
return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
}

void benchmark_function(size_t size, size_t q_size, std::function<size_t(void)> function) {
void benchmark_function(size_t size, size_t q_size, int64_t iterations, std::function<size_t(void)> function) {
int64_t min_time_us = INT64_MAX;
int64_t total_time_us = 0;
int64_t min_time_cycles = INT64_MAX;
@@ -86,7 +87,7 @@ void benchmark_function(size_t size, size_t q_size, std::function<size_t(void)>
}


for (int i = 0; i < ITERATIONS; i++) {
for (int i = 0; i < iterations; i++) {
const int64_t start_time = ggml_time_us();
const int64_t start_cycles = cpu_cycles();

@@ -102,9 +103,38 @@ void benchmark_function(size_t size, size_t q_size, std::function<size_t(void)>
}

printf(" min cycles/%d vals : %9.2f\n", QK, QK * min_time_cycles / (float) size);
printf(" avg cycles/%d vals : %9.2f\n", QK, QK * total_time_cycles / (float) (size * ITERATIONS));
printf(" float32 throughput : %9.2f GB/s\n", gigabytes_per_second(4 * size * ITERATIONS, total_time_us));
printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * ITERATIONS, total_time_us));
printf(" avg cycles/%d vals : %9.2f\n", QK, QK * total_time_cycles / (float) (size * iterations));
printf(" float32 throughput : %9.2f GB/s\n", gigabytes_per_second(4 * size * iterations, total_time_us));
printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * iterations, total_time_us));
}

void usage(char * argv[]) {
printf("Benchmark quantization specific functions on synthetic data\n");
printf("\n");
printf("usage: %s [options]\n", argv[0]);
printf("\n");
printf("options: (default)\n");
printf(" -h, --help show this help message and exit\n");
printf(" --size SIZE set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE);
printf(" -3 use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE);
printf(" -4 use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE);
printf(" --op OP set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
printf(" quantize_row_q_dot, vec_dot_q (all)\n");
printf(" --type TYPE set test type as");
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
ggml_type type = (ggml_type) i;
quantize_fns_t qfns = ggml_internal_get_quantize_fn(type);
if (ggml_type_name(type) != NULL) {
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
printf(" %s", ggml_type_name(type));
}
}
}
printf(" (all)\n");
printf(" --alignment-offset OFFSET\n");
printf(" set alignment offset as OFFSET (0)\n");
printf(" -i NUM, --iterations NUM\n");
printf(" set test iteration number (%d)\n", ITERATIONS);
}

int main(int argc, char * argv[]) {
@@ -178,6 +208,21 @@ int main(int argc, char * argv[]) {
break;
}
params.alignment_offset = alignment;
} else if ((arg == "-i") || (arg == "--iterations")) {
if (++i >= argc) {
invalid_param = true;
break;
}
int number = std::stoi(argv[i]);
if (number < 0 || number > MAX_ITERATIONS) {
fprintf(stderr, "error: iterations must be less than %d\n", MAX_ITERATIONS);
invalid_param = true;
break;
}
params.iterations = number;
} else if ((arg == "-h") || (arg == "--help")) {
usage(argv);
return 1;
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
return 1;
@@ -213,6 +258,8 @@ int main(int argc, char * argv[]) {
generate_data(0, largest, test_data1);
generate_data(1, largest, test_data2);

int64_t iterations = params.iterations;


// Initialize GGML, ensures float conversion tables are initialized
struct ggml_init_params ggml_params = {
@@ -225,7 +272,7 @@ int main(int argc, char * argv[]) {
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
ggml_type type = (ggml_type) i;
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
continue;
}

@@ -241,7 +288,7 @@ int main(int argc, char * argv[]) {
return test_q1[0];
};
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
benchmark_function(size, quantized_size, quantize_fn);
benchmark_function(size, quantized_size, iterations, quantize_fn);
}
printf("\n");
}
@@ -255,7 +302,7 @@ int main(int argc, char * argv[]) {
return test_q1[0];
};
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
benchmark_function(size, quantized_size, quantize_fn);
benchmark_function(size, quantized_size, iterations, quantize_fn);
}
printf("\n");
}
@@ -270,7 +317,7 @@ int main(int argc, char * argv[]) {
return test_out[0];
};
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
benchmark_function(size, quantized_size, quantize_fn);
benchmark_function(size, quantized_size, iterations, quantize_fn);
}
printf("\n");
}
@@ -284,7 +331,7 @@ int main(int argc, char * argv[]) {
return test_q1[0];
};
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
benchmark_function(size, quantized_size, quantize_fn);
benchmark_function(size, quantized_size, iterations, quantize_fn);
}
printf("\n");
}
@@ -301,7 +348,7 @@ int main(int argc, char * argv[]) {
return result;
};
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
benchmark_function(size, quantized_size, quantize_fn);
benchmark_function(size, quantized_size, iterations, quantize_fn);
}
printf("\n");
}

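With the new option a longer run can be requested on the command line, e.g. `--op vec_dot_q --iterations 1000000` passed to the quantize-perf test binary (the exact binary name depends on the build); the default stays at ITERATIONS = 10, and values above MAX_ITERATIONS are rejected by the argument parser shown above.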