Compare commits

..

9 Commits

Author SHA1 Message Date
Howard Su
0be54f75a6 baby-llama : fix build after ggml_rope change (#2016) 2023-06-27 08:07:13 +03:00
Georgi Gerganov
181e8d9755 llama : fix rope usage after ChatGLM change 2023-06-27 00:37:33 +03:00
Georgi Gerganov
d9779021bd ggml : add support for ChatGLM RoPE 2023-06-27 00:06:51 +03:00
Roman Parykin
d38e451578 readme : add Scala 3 bindings repo (#2010) 2023-06-26 22:47:59 +03:00
David Yang
eaa6ca5a61 ggml : increase max tensor name + clean up compiler warnings in train-text (#1988)
* Clean up compiler warnings in train-text

Some brackets to disambiguate order of operations

* Increase GGML_MAX_NAME

Avoiding strncpy danger in train-text-from-scratch and reducing potential future name length issues
2023-06-26 22:45:32 +03:00
Gustavo Rocha Dias
aa777abbb7 readme : LD_LIBRARY_PATH complement for some Android devices when building with CLBlast inside Termux (#2007)
* docs - Alternative way to build at Android, with CLBlast.

* doc - LD_LIBRARY_PATH complement for some Android devices when building with CLBlast inside Termux.

* doc- fix typo
2023-06-26 22:34:45 +03:00
Georgi Gerganov
c824d2e368 ggml : avoid conv 2d kernel round up 2023-06-26 21:03:59 +03:00
zrm
b853d45601 ggml : add NUMA support (#1556)
* detect NUMA systems and pin work threads to nodes (linux)

* disable mmap prefetch/readahead for NUMA systems

* avoid sending finalize op to thread pool if it does nothing

* silence robot

* fix args

* make --numa a param

* recommendation that n_nodes evenly divide n_threads did not warrant such aggressive enforcement

* lower synchronization overhead

* statically allocate

* move numa state to g_state

* add description for --numa

* ggml : minor style changes

* ggml : minor style + try fix sanitizer build

* llama : allow to initialize backend with NUMA support

* llama : avoid ggml include in llama-util.h

* ggml : style / formatting

* ggml : fix handling of ops with n_threads > n_tasks > 1

* server : utilize numa parameter

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-26 20:57:59 +03:00
Georgi Gerganov
9225baef71 k-quants : fix indentation 2023-06-26 20:10:52 +03:00
18 changed files with 446 additions and 288 deletions

View File

@@ -93,6 +93,7 @@ as the main playground for developing new features for the [ggml](https://github
- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
**UI:**
@@ -687,6 +688,8 @@ GGML_OPENCL_DEVICE=0
export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
```
(Note: some Android devices, like the Zenfone 8, need the following command instead - "export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH". Source: https://www.reddit.com/r/termux/comments/kc3ynp/opencl_working_in_termux_more_in_comments/ )
For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.

View File

@@ -566,8 +566,8 @@ struct ggml_tensor * forward(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, 1]
// Kcur shape [n_embd/n_head, n_head, N, 1]
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
// store key and value to memory
{
@@ -823,8 +823,8 @@ struct ggml_tensor * forward_batch(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, n_batch]
// Kcur shape [n_embd/n_head, n_head, N, n_batch]
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
@@ -1116,7 +1116,7 @@ struct ggml_tensor * forward_lora(
model->layers[il].wqb,
cur)),
n_embd/n_head, n_head, N),
n_past, n_rot, 0);
n_past, n_rot, 0, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0,
ggml_reshape_3d(ctx0,
ggml_mul_mat(ctx0,
@@ -1125,7 +1125,7 @@ struct ggml_tensor * forward_lora(
model->layers[il].wkb,
cur)),
n_embd/n_head, n_head, N),
n_past, n_rot, 0);
n_past, n_rot, 0, 0);
// store key and value to memory
{

View File

@@ -343,6 +343,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.use_mmap = false;
} else if (arg == "--mtest") {
params.mem_test = true;
} else if (arg == "--numa") {
params.numa = true;
} else if (arg == "--export") {
params.export_cgraph = true;
} else if (arg == "--verbose-prompt") {
@@ -488,6 +490,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
if (llama_mmap_supported()) {
fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
fprintf(stderr, " --numa attempt optimizations that help on some NUMA systems\n");
fprintf(stderr, " if run without this previously, it is recommended to drop the system page cache before using this\n");
fprintf(stderr, " see https://github.com/ggerganov/llama.cpp/issues/1437\n");
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
fprintf(stderr, " number of layers to store in VRAM\n");

View File

@@ -76,6 +76,7 @@ struct gpt_params {
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool mem_test = false; // compute maximum memory usage
bool numa = false; // attempt optimizations that help on some NUMA systems
bool export_cgraph = false; // export the computation graph
bool verbose_prompt = false; // print prompt tokens before generation
};

View File

@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
params.prompt = gpt_random_prompt(rng);
}
llama_init_backend();
llama_init_backend(params.numa);
llama_model * model;
llama_context * ctx;

View File

@@ -262,6 +262,10 @@ These options help improve the performance and memory usage of the LLaMA models.
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all.
### NUMA support
- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop\_caches' as root.
### Memory Float 32
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.

View File

@@ -105,7 +105,7 @@ int main(int argc, char ** argv) {
params.prompt = gpt_random_prompt(rng);
}
llama_init_backend();
llama_init_backend(params.numa);
llama_model * model;
llama_context * ctx;

View File

@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
params.prompt = gpt_random_prompt(rng);
}
llama_init_backend();
llama_init_backend(params.numa);
llama_model * model;
llama_context * ctx;

View File

@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
usage(argv[0]);
}
llama_init_backend();
llama_init_backend(false);
// parse command line arguments
const std::string fname_inp = argv[arg_idx];

View File

@@ -789,7 +789,7 @@ int main(int argc, char ** argv) {
params.model_alias = params.model;
}
llama_init_backend();
llama_init_backend(params.numa);
LOG_INFO("build info", {
{ "build", BUILD_NUMBER },

View File

@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
// Init LLM :
//---------------------------------
llama_init_backend();
llama_init_backend(params.numa);
llama_model * model;
llama_context * ctx;

View File

@@ -294,20 +294,9 @@ void init_model(struct my_llama_model * model) {
ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());
// 'layers.10.feed_forward.w1.weight' has length of 32.
// ggml_tensor->name only has 32 characters, but we need one more for the '\0' terminator.
// ggml_set_name will set the last character to '\0', so we can only store 'layers.10.feed_forward.w1.weigh'.
// when saving llama compatible model the tensors names will miss a character.
// ggml_set_name(layer.w1, (layers_i + ".feed_forward.w1.weight").c_str());
// ggml_set_name(layer.w2, (layers_i + ".feed_forward.w2.weight").c_str());
// ggml_set_name(layer.w3, (layers_i + ".feed_forward.w3.weight").c_str());
strncpy(layer.w1->name, (layers_i + ".feed_forward.w1.weight").c_str(), sizeof(layer.w1->name));
strncpy(layer.w2->name, (layers_i + ".feed_forward.w2.weight").c_str(), sizeof(layer.w2->name));
strncpy(layer.w3->name, (layers_i + ".feed_forward.w3.weight").c_str(), sizeof(layer.w3->name));
layer.w1->padding[0] = 0;
layer.w2->padding[0] = 0;
layer.w3->padding[0] = 0;
ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
}
}
@@ -454,8 +443,8 @@ struct ggml_tensor * forward(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, 1]
// Kcur shape [n_embd/n_head, n_head, N, 1]
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
// store key and value to memory
{
@@ -711,8 +700,8 @@ struct ggml_tensor * forward_batch(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, n_batch]
// Kcur shape [n_embd/n_head, n_head, N, n_batch]
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
@@ -996,8 +985,8 @@ struct ggml_tensor * forward_batch_wo_cache(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, n_batch]
// Kcur shape [n_embd/n_head, n_head, N, n_batch]
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
@@ -1218,8 +1207,8 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
// compute Q and K and RoPE them
// wq shape [n_embd, n_embd, 1, 1]
// wk shape [n_embd, n_embd, 1, 1]
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
@@ -1618,10 +1607,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch);
use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, 0)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
use_buf(-1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch);
use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, 0)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
use_buf(-1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd);
use_buf(-1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head);
use_buf(-1); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch);
@@ -2368,7 +2357,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
file->write_u32(0);
file->write_u32(0);
file->write_u32(GGML_TYPE_F32);
file->seek(0-file->tell() & 31, SEEK_CUR);
file->seek((0-file->tell()) & 31, SEEK_CUR);
return;
}
const char * name = ggml_get_name(tensor);
@@ -2383,7 +2372,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
file->write_u32(tensor->type);
file->write_raw(ne, sizeof(ne[0]) * nd);
file->write_raw(name, name_len);
file->seek(0-file->tell() & 31, SEEK_CUR);
file->seek((0-file->tell()) & 31, SEEK_CUR);
file->write_raw(tensor->data, ggml_nbytes(tensor));
}
@@ -2404,7 +2393,7 @@ void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
std::string name = file->read_string(name_len);
GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0);
file->seek(0-file->tell() & 31, SEEK_CUR);
file->seek((0-file->tell()) & 31, SEEK_CUR);
file->read_raw(tensor->data, ggml_nbytes(tensor));
}

598
ggml.c
View File

@@ -91,6 +91,11 @@ static int sched_yield (void) {
#include <stdatomic.h>
typedef void* thread_ret_t;
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#endif
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -119,6 +124,30 @@ typedef void* thread_ret_t;
#define GGML_SOFT_MAX_UNROLL 4
#define GGML_VEC_DOT_UNROLL 2
//
// logging
//
#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif
#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif
#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif
#define GGML_PRINT(...) printf(__VA_ARGS__)
#ifdef GGML_USE_ACCELERATE
// uncomment to use vDSP for soft max computation
// note: not sure if it is actually faster
@@ -459,7 +488,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
}
}
//
// timing
//
@@ -522,6 +550,7 @@ int64_t ggml_cycles_per_ms(void) {
#define ggml_perf_cycles_per_ms() 0
#endif
//
// cache line
//
@@ -3843,12 +3872,31 @@ struct ggml_context_container {
struct ggml_context context;
};
//
// NUMA support
//
#define GGML_NUMA_MAX_NODES 8
#define GGML_NUMA_MAX_CPUS 512
struct ggml_numa_node {
uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
uint32_t n_cpus;
};
struct ggml_numa_nodes {
struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
uint32_t n_nodes;
uint32_t total_cpus; // hardware threads on system
};
//
// ggml state
//
struct ggml_state {
struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
struct ggml_numa_nodes numa;
};
// global state
@@ -3873,6 +3921,75 @@ inline static void ggml_critical_section_end(void) {
atomic_fetch_sub(&g_state_barrier, 1);
}
void ggml_numa_init(void) {
if (g_state.numa.n_nodes > 0) {
fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
return;
}
#ifdef __linux__
struct stat st;
char path[256];
int rv;
// enumerate nodes
while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
if (stat(path, &st) != 0) { break; }
++g_state.numa.n_nodes;
}
// enumerate CPUs
while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
if (stat(path, &st) != 0) { break; }
++g_state.numa.total_cpus;
}
GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
g_state.numa.n_nodes = 0;
return;
}
for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
struct ggml_numa_node * node = &g_state.numa.nodes[n];
GGML_PRINT_DEBUG("CPUs on node %u:", n);
node->n_cpus = 0;
for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
if (stat(path, &st) == 0) {
node->cpus[node->n_cpus++] = c;
GGML_PRINT_DEBUG(" %u", c);
}
}
GGML_PRINT_DEBUG("\n");
}
if (ggml_is_numa()) {
FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
if (fptr != NULL) {
char buf[42];
if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
}
fclose(fptr);
}
}
#else
// TODO
#endif
}
bool ggml_is_numa(void) {
return g_state.numa.n_nodes > 1;
}
////////////////////////////////////////////////////////////////////////////////
void ggml_print_object(const struct ggml_object * obj) {
@@ -4129,6 +4246,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
g_state = (struct ggml_state) {
/*.contexts =*/ { { 0 } },
/*.numa =*/ {
.n_nodes = 0,
.total_cpus = 0,
},
};
for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
@@ -6657,6 +6778,7 @@ struct ggml_tensor * ggml_rope_impl(
int n_past,
int n_dims,
int mode,
int n_ctx,
bool inplace) {
GGML_ASSERT(n_past >= 0);
bool is_node = false;
@@ -6669,11 +6791,12 @@ struct ggml_tensor * ggml_rope_impl(
ggml_scratch_save(ctx);
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
((int32_t *) b->data)[0] = n_past;
((int32_t *) b->data)[1] = n_dims;
((int32_t *) b->data)[2] = mode;
((int32_t *) b->data)[3] = n_ctx;
ggml_scratch_load(ctx);
@@ -6690,8 +6813,9 @@ struct ggml_tensor * ggml_rope(
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false);
int mode,
int n_ctx) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
}
struct ggml_tensor * ggml_rope_inplace(
@@ -6699,8 +6823,9 @@ struct ggml_tensor * ggml_rope_inplace(
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true);
int mode,
int n_ctx) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
}
// ggml_rope_back
@@ -12319,7 +12444,7 @@ static void ggml_compute_forward_rope_f32(
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_nelements(src1) == 3);
GGML_ASSERT(ggml_nelements(src1) == 4);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
@@ -12328,6 +12453,7 @@ static void ggml_compute_forward_rope_f32(
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];
assert(n_past >= 0);
@@ -12372,6 +12498,7 @@ static void ggml_compute_forward_rope_f32(
const float theta_scale = powf(10000.0, -2.0f/n_dims);
const bool is_neox = mode & 2;
const bool is_glm = mode & 4;
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -12382,7 +12509,32 @@ static void ggml_compute_forward_rope_f32(
float theta = (float)p;
if (!is_neox) {
if (is_glm) {
theta = MIN(p, n_ctx - 2);
float block_theta = MAX(p - (n_ctx - 2), 0);
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta);
const float cos_block_theta = cosf(block_theta);
const float sin_block_theta = sinf(block_theta);
theta *= theta_scale;
block_theta *= theta_scale;
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = src[0];
const float x1 = src[n_dims/2];
const float x2 = src[n_dims];
const float x3 = src[n_dims/2*3];
dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
}
} else if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta);
@@ -12432,7 +12584,7 @@ static void ggml_compute_forward_rope_f16(
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_nelements(src1) == 3);
GGML_ASSERT(ggml_nelements(src1) == 4);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
@@ -12441,6 +12593,7 @@ static void ggml_compute_forward_rope_f16(
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];
assert(n_past >= 0);
@@ -12485,6 +12638,7 @@ static void ggml_compute_forward_rope_f16(
const float theta_scale = powf(10000.0, -2.0f/n_dims);
const bool is_neox = mode & 2;
const bool is_glm = mode & 4;
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -12495,7 +12649,32 @@ static void ggml_compute_forward_rope_f16(
float theta = (float)p;
if (!is_neox) {
if (is_glm) {
theta = MIN(p, n_ctx - 2);
float block_theta = MAX(p - (n_ctx - 2), 0);
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta);
const float cos_block_theta = cosf(block_theta);
const float sin_block_theta = sinf(block_theta);
theta *= theta_scale;
block_theta *= theta_scale;
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = GGML_FP16_TO_FP32(src[0]);
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
}
} if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta);
@@ -13387,8 +13566,7 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
const int nk1 = ne01;
// size of the convolution row - the kernel size unrolled across all channels
// round-up so it is more suitable for SIMD
const int ew0 = ggml_up32(nk0*nk1*ne02);
const int ew0 = nk0*nk1*ne02;
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
@@ -16069,17 +16247,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
if (src0->grad) {
assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 3);
assert(ggml_nelements(src1) == 4);
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];
src0->grad = ggml_add_impl(ctx,
src0->grad,
ggml_rope(ctx,
tensor->grad,
n_past,
n_dims,
mode),
mode,
n_ctx),
inplace);
}
if (src1->grad) {
@@ -16504,68 +16684,172 @@ typedef pthread_t ggml_thread_t;
#endif
#ifdef __linux__
void set_numa_thread_affinity(int thread_n, int n_threads) {
if (!ggml_is_numa()) {
return;
}
// run thread on node_num thread_n / (threads per node)
const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
CPU_ZERO_S(setsize, cpus);
for (size_t i = 0; i < node->n_cpus; ++i) {
CPU_SET_S(node->cpus[i], setsize, cpus);
}
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
if (rv) {
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
strerror(rv));
}
CPU_FREE(cpus);
}
void clear_numa_thread_affinity(void) {
if (!ggml_is_numa()) {
return;
}
size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
CPU_ZERO_S(setsize, cpus);
for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
CPU_SET_S(i, setsize, cpus);
}
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
if (rv) {
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
strerror(rv));
}
CPU_FREE(cpus);
}
#else
// TODO: Windows etc.
// (the linux implementation may also work on BSD, someone should test)
void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
void clear_numa_thread_affinity(void) {}
#endif
struct ggml_compute_state_shared {
ggml_lock_t spin;
struct ggml_cgraph * cgraph;
int64_t perf_node_start_cycles;
int64_t perf_node_start_time_us;
int n_threads;
// synchronization primitives
atomic_int n_ready;
atomic_bool has_work;
atomic_bool stop; // stop all threads
atomic_int n_active; // num active threads
atomic_int node_n; // active graph node
};
struct ggml_compute_state {
ggml_thread_t thrd;
struct ggml_compute_params params;
struct ggml_tensor * node;
int ith;
struct ggml_compute_state_shared * shared;
};
static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
node->perf_runs++;
node->perf_cycles += cycles_cur;
node->perf_time_us += time_us_cur;
}
static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
struct ggml_cgraph * cgraph = state->shared->cgraph;
const int n_threads = state->shared->n_threads;
set_numa_thread_affinity(state->ith, n_threads);
int node_n = -1;
while (true) {
if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) {
atomic_store(&state->shared->has_work, false);
} else {
while (atomic_load(&state->shared->has_work)) {
if (atomic_load(&state->shared->stop)) {
return 0;
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
// all other threads are finished and spinning
// do finalize and init here so we don't have synchronize again
struct ggml_compute_params params = {
/*.type =*/ GGML_TASK_FINALIZE,
/*.ith =*/ 0,
/*.nth =*/ 0,
/*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
/*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
};
if (node_n != -1) {
/* FINALIZE */
struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
params.nth = node->n_tasks;
ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
}
// distribute new work or execute it direct if 1T
while (++node_n < cgraph->n_nodes) {
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
struct ggml_tensor * node = cgraph->nodes[node_n];
state->shared->perf_node_start_cycles = ggml_perf_cycles();
state->shared->perf_node_start_time_us = ggml_perf_time_us();
/* INIT */
params.type = GGML_TASK_INIT;
params.nth = node->n_tasks;
ggml_compute_forward(&params, node);
if (node->n_tasks == 1) {
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
// they do something more efficient than spinning (?)
params.type = GGML_TASK_COMPUTE;
ggml_compute_forward(&params, node);
params.type = GGML_TASK_FINALIZE;
ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
} else {
break;
}
ggml_lock_lock (&state->shared->spin);
ggml_lock_unlock(&state->shared->spin);
}
}
atomic_fetch_sub(&state->shared->n_ready, 1);
// wait for work
while (!atomic_load(&state->shared->has_work)) {
if (atomic_load(&state->shared->stop)) {
return 0;
}
ggml_lock_lock (&state->shared->spin);
ggml_lock_unlock(&state->shared->spin);
atomic_store(&state->shared->n_active, n_threads);
atomic_store(&state->shared->node_n, node_n);
} else {
// wait for other threads to finish
const int last = node_n;
do {
sched_yield();
node_n = atomic_load(&state->shared->node_n);
} while (node_n == last);
}
// check if we should stop
if (atomic_load(&state->shared->stop)) {
break;
}
if (node_n >= cgraph->n_nodes) break;
if (state->node) {
if (state->params.ith < state->params.nth) {
ggml_compute_forward(&state->params, state->node);
}
/* COMPUTE */
struct ggml_tensor * node = cgraph->nodes[node_n];
state->node = NULL;
} else {
break;
struct ggml_compute_params params = {
/*.type =*/ GGML_TASK_COMPUTE,
/*.ith =*/ state->ith,
/*.nth =*/ node->n_tasks,
/*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
/*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
};
if (state->ith < node->n_tasks) {
ggml_compute_forward(&params, node);
}
}
@@ -16576,39 +16860,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
const int n_threads = cgraph->n_threads;
struct ggml_compute_state_shared state_shared = {
/*.spin =*/ GGML_LOCK_INITIALIZER,
/*.n_threads =*/ n_threads,
/*.n_ready =*/ 0,
/*.has_work =*/ false,
/*.stop =*/ false,
/*.cgraph =*/ cgraph,
/*.perf_node_start_cycles =*/ 0,
/*.perf_node_start_time_us =*/ 0,
/*.n_threads =*/ n_threads,
/*.n_active =*/ n_threads,
/*.node_n =*/ -1,
};
struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;
// create thread pool
if (n_threads > 1) {
ggml_lock_init(&state_shared.spin);
atomic_store(&state_shared.has_work, true);
for (int j = 0; j < n_threads - 1; j++) {
workers[j] = (struct ggml_compute_state) {
.thrd = 0,
.params = {
.type = GGML_TASK_COMPUTE,
.ith = j + 1,
.nth = n_threads,
.wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
.wdata = cgraph->work ? cgraph->work->data : NULL,
},
.node = NULL,
.shared = &state_shared,
};
int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
GGML_ASSERT(rc == 0);
UNUSED(rc);
}
}
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
// initialize tasks + work buffer
{
@@ -16752,7 +17011,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
} break;
case GGML_OP_SCALE:
{
node->n_tasks = n_threads;
node->n_tasks = 1;
} break;
case GGML_OP_SET:
case GGML_OP_CONT:
@@ -16956,166 +17215,37 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
}
}
// create thread pool
if (n_threads > 1) {
for (int j = 1; j < n_threads; ++j) {
workers[j] = (struct ggml_compute_state) {
.thrd = 0,
.ith = j,
.shared = &state_shared,
};
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
GGML_ASSERT(rc == 0);
}
}
workers[0].ith = 0;
workers[0].shared = &state_shared;
const int64_t perf_start_cycles = ggml_perf_cycles();
const int64_t perf_start_time_us = ggml_perf_time_us();
for (int i = 0; i < cgraph->n_nodes; i++) {
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes);
// this is a work thread too
ggml_graph_compute_thread(&workers[0]);
struct ggml_tensor * node = cgraph->nodes[i];
// TODO: this could be used to avoid unnecessary computations, but it needs to be improved
//if (node->grad == NULL && node->perf_runs > 0) {
// continue;
//}
const int64_t perf_node_start_cycles = ggml_perf_cycles();
const int64_t perf_node_start_time_us = ggml_perf_time_us();
// INIT
struct ggml_compute_params params = {
/*.type =*/ GGML_TASK_INIT,
/*.ith =*/ 0,
/*.nth =*/ node->n_tasks,
/*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
/*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
};
ggml_compute_forward(&params, node);
// COMPUTE
if (node->n_tasks > 1) {
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}
while (atomic_load(&state_shared.has_work)) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
// launch thread pool
for (int j = 0; j < n_threads - 1; j++) {
workers[j].params = (struct ggml_compute_params) {
.type = GGML_TASK_COMPUTE,
.ith = j + 1,
.nth = node->n_tasks,
.wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
.wdata = cgraph->work ? cgraph->work->data : NULL,
};
workers[j].node = node;
}
atomic_fetch_sub(&state_shared.n_ready, 1);
while (atomic_load(&state_shared.n_ready) > 0) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
atomic_store(&state_shared.has_work, true);
}
params.type = GGML_TASK_COMPUTE;
ggml_compute_forward(&params, node);
// wait for thread pool
if (node->n_tasks > 1) {
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}
while (atomic_load(&state_shared.has_work)) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
atomic_fetch_sub(&state_shared.n_ready, 1);
while (atomic_load(&state_shared.n_ready) != 0) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
}
// FINALIZE
if (node->n_tasks > 1) {
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}
while (atomic_load(&state_shared.has_work)) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
// launch thread pool
for (int j = 0; j < n_threads - 1; j++) {
workers[j].params = (struct ggml_compute_params) {
.type = GGML_TASK_FINALIZE,
.ith = j + 1,
.nth = node->n_tasks,
.wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
.wdata = cgraph->work ? cgraph->work->data : NULL,
};
workers[j].node = node;
}
atomic_fetch_sub(&state_shared.n_ready, 1);
while (atomic_load(&state_shared.n_ready) > 0) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
atomic_store(&state_shared.has_work, true);
}
params.type = GGML_TASK_FINALIZE;
ggml_compute_forward(&params, node);
// wait for thread pool
if (node->n_tasks > 1) {
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}
while (atomic_load(&state_shared.has_work)) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
atomic_fetch_sub(&state_shared.n_ready, 1);
while (atomic_load(&state_shared.n_ready) != 0) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
}
// performance stats (node)
{
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_node_start_cycles;
int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us;
node->perf_runs++;
node->perf_cycles += perf_cycles_cur;
node->perf_time_us += perf_time_us_cur;
}
}
// don't leave affinity set on the main thread
clear_numa_thread_affinity();
// join thread pool
if (n_threads > 1) {
atomic_store(&state_shared.stop, true);
atomic_store(&state_shared.has_work, true);
for (int j = 0; j < n_threads - 1; j++) {
int rc = ggml_thread_join(workers[j].thrd, NULL);
for (int j = 1; j < n_threads; j++) {
const int rc = ggml_thread_join(workers[j].thrd, NULL);
GGML_ASSERT(rc == 0);
UNUSED(rc);
}
ggml_lock_destroy(&state_shared.spin);
}
// performance stats (graph)

12
ggml.h
View File

@@ -198,7 +198,7 @@
#define GGML_MAX_PARAMS 256
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_OPT 4
#define GGML_MAX_NAME 32
#define GGML_MAX_NAME 48
#define GGML_DEFAULT_N_THREADS 4
#define GGML_ASSERT(x) \
@@ -469,6 +469,9 @@ extern "C" {
GGML_API int64_t ggml_cycles(void);
GGML_API int64_t ggml_cycles_per_ms(void);
GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
GGML_API void ggml_print_object (const struct ggml_object * obj);
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
@@ -1033,13 +1036,15 @@ extern "C" {
// rotary position embedding
// if mode & 1 == 1, skip n_past elements
// if mode & 2 == 1, GPT-NeoX style
// if mode & 4 == 1, ChatGLM style
// TODO: avoid creating a new tensor every time
GGML_API struct ggml_tensor * ggml_rope(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode);
int mode,
int n_ctx);
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_rope_inplace(
@@ -1047,7 +1052,8 @@ extern "C" {
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode);
int mode,
int n_ctx);
// rotary position embedding backward, i.e compute dx from dy
// a - dy

View File

@@ -1981,7 +1981,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
// prepare low and high bits
const int bit = j << 2;
const int bit = j << 2;
const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3);
const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3);
const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2);

View File

@@ -172,12 +172,14 @@ struct llama_mmap {
#ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
size = file->size;
int fd = fileno(file->fp);
int flags = MAP_SHARED;
// prefetch/readahead impairs performance on NUMA systems
if (numa) { prefetch = 0; }
#ifdef __linux__
flags |= MAP_POPULATE;
if (prefetch) { flags |= MAP_POPULATE; }
#endif
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
if (addr == MAP_FAILED) {
@@ -191,6 +193,14 @@ struct llama_mmap {
strerror(errno));
}
}
if (numa) {
// advise the kernel not to use readahead
// (because the next page might not belong on the same node)
if (madvise(addr, file->size, MADV_RANDOM)) {
fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
strerror(errno));
}
}
}
~llama_mmap() {
@@ -199,7 +209,9 @@ struct llama_mmap {
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file, bool prefetch = true) {
llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
(void) numa;
size = file->size;
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
@@ -244,8 +256,10 @@ struct llama_mmap {
#else
static constexpr bool SUPPORTED = false;
llama_mmap(struct llama_file *, bool prefetch = true) {
(void)prefetch;
llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
(void) prefetch;
(void) numa;
throw std::runtime_error(std::string("mmap not supported"));
}
#endif

View File

@@ -774,7 +774,7 @@ struct llama_model_loader {
}
if (use_mmap) {
mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa()));
if (lmlock) {
lmlock->init(mapping->addr);
}
@@ -977,7 +977,7 @@ bool llama_mlock_supported() {
return llama_mlock::SUPPORTED;
}
void llama_init_backend() {
void llama_init_backend(bool numa) {
ggml_time_init();
// needed to initialize f16 tables
@@ -986,6 +986,10 @@ void llama_init_backend() {
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
if (numa) {
ggml_numa_init();
}
}
int64_t llama_time_us() {
@@ -1487,11 +1491,11 @@ static bool llama_eval_internal(
offload_func_kq(tmpq);
ggml_set_name(tmpq, "tmpq");
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
offload_func_kq(Kcur);
ggml_set_name(Kcur, "Kcur");
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
offload_func_kq(Qcur);
ggml_set_name(Qcur, "Qcur");
@@ -2899,7 +2903,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
// maybe this should in llama_model_loader
if (model_loader->use_mmap) {
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa()));
}
}

View File

@@ -140,8 +140,9 @@ extern "C" {
// TODO: not great API - very likely to change
// Initialize the llama + ggml backend
// If numa is true, use NUMA optimizations
// Call once at the start of the program
LLAMA_API void llama_init_backend();
LLAMA_API void llama_init_backend(bool numa);
LLAMA_API int64_t llama_time_us();