mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-16 16:27:32 +03:00
Compare commits
8 Commits
master-ee1
...
master-121
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1215ed7d5c | ||
|
|
2dbf518911 | ||
|
|
9d2382b3e4 | ||
|
|
a113689571 | ||
|
|
11f3ca06b8 | ||
|
|
9baf9ef304 | ||
|
|
8a88e5855c | ||
|
|
a9559bf77b |
@@ -67,7 +67,9 @@ endif()
|
||||
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
|
||||
option(LLAMA_BLAS "llama: use BLAS" OFF)
|
||||
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
|
||||
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
|
||||
option(LLAMA_CUBLAS "llama: use CUDA" OFF)
|
||||
option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
|
||||
set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA kernels")
|
||||
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
|
||||
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
|
||||
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
|
||||
@@ -251,6 +253,10 @@ if (LLAMA_CUBLAS)
|
||||
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
|
||||
|
||||
add_compile_definitions(GGML_USE_CUBLAS)
|
||||
if (LLAMA_CUDA_CUBLAS)
|
||||
add_compile_definitions(GGML_CUDA_CUBLAS)
|
||||
endif()
|
||||
add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
|
||||
if (LLAMA_CUDA_FORCE_DMMV)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
|
||||
endif()
|
||||
@@ -497,6 +503,8 @@ endif()
|
||||
add_library(ggml OBJECT
|
||||
ggml.c
|
||||
ggml.h
|
||||
ggml-alloc.c
|
||||
ggml-alloc.h
|
||||
${GGML_SOURCES_CUDA}
|
||||
${GGML_SOURCES_OPENCL}
|
||||
${GGML_SOURCES_METAL}
|
||||
|
||||
22
Makefile
22
Makefile
@@ -194,7 +194,7 @@ ifdef LLAMA_CUBLAS
|
||||
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
|
||||
OBJS += ggml-cuda.o
|
||||
NVCCFLAGS = --forward-unknown-to-host-compiler
|
||||
NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
|
||||
ifdef LLAMA_CUDA_NVCC
|
||||
NVCC = $(LLAMA_CUDA_NVCC)
|
||||
else
|
||||
@@ -220,14 +220,25 @@ else ifdef LLAMA_CUDA_DMMV_Y
|
||||
else
|
||||
NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
|
||||
endif # LLAMA_CUDA_MMV_Y
|
||||
ifdef LLAMA_CUDA_F16
|
||||
NVCCFLAGS += -DGGML_CUDA_F16
|
||||
endif # LLAMA_CUDA_F16
|
||||
ifdef LLAMA_CUDA_DMMV_F16
|
||||
NVCCFLAGS += -DGGML_CUDA_DMMV_F16
|
||||
NVCCFLAGS += -DGGML_CUDA_F16
|
||||
endif # LLAMA_CUDA_DMMV_F16
|
||||
ifdef LLAMA_CUDA_KQUANTS_ITER
|
||||
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
|
||||
else
|
||||
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
|
||||
endif
|
||||
ifdef LLAMA_CUDA_MMQ_Y
|
||||
NVCCFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y)
|
||||
else
|
||||
NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64
|
||||
endif # LLAMA_CUDA_MMQ_Y
|
||||
ifdef LLAMA_CUDA_CUBLAS
|
||||
NVCCFLAGS += -DGGML_CUDA_CUBLAS
|
||||
endif # LLAMA_CUDA_CUBLAS
|
||||
ifdef LLAMA_CUDA_CCBIN
|
||||
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
|
||||
endif
|
||||
@@ -318,7 +329,12 @@ $(info )
|
||||
ggml.o: ggml.c ggml.h ggml-cuda.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
|
||||
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
OBJS += ggml-alloc.o
|
||||
|
||||
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common.o: examples/common.cpp examples/common.h
|
||||
|
||||
@@ -402,10 +402,12 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
|
||||
| Option | Legal values | Default | Description |
|
||||
|-------------------------|------------------------|---------|-------------|
|
||||
| LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
|
||||
| LLAMA_CUDA_MMQ_Y | Positive integer >= 32 | 64 | Tile size in y direction when using the custom CUDA kernels for prompt processing. Higher values can be faster depending on the amount of shared memory available. Power of 2 heavily recommended. |
|
||||
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
|
||||
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
||||
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
|
||||
| LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
|
||||
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
|
||||
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
||||
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||
|
||||
- #### CLBlast
|
||||
|
||||
@@ -402,8 +402,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
params.antiprompt.push_back(argv[i]);
|
||||
} else if (arg == "--perplexity") {
|
||||
params.perplexity = true;
|
||||
} else if (arg == "--perplexity-lines") {
|
||||
params.perplexity_lines = true;
|
||||
} else if (arg == "--hellaswag") {
|
||||
params.hellaswag = true;
|
||||
} else if (arg == "--hellaswag-tasks") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.hellaswag_tasks = std::stoi(argv[i]);
|
||||
} else if (arg == "--ignore-eos") {
|
||||
params.logit_bias[llama_token_eos()] = -INFINITY;
|
||||
} else if (arg == "--no-penalize-nl") {
|
||||
@@ -559,8 +565,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
fprintf(stdout, " --temp N temperature (default: %.1f)\n", (double)params.temp);
|
||||
fprintf(stdout, " --perplexity compute perplexity over each ctx window of the prompt\n");
|
||||
fprintf(stdout, " --perplexity-lines compute perplexity over each line of the prompt\n");
|
||||
fprintf(stdout, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
||||
fprintf(stdout, " --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
|
||||
fprintf(stdout, " --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %d)\n", params.hellaswag_tasks);
|
||||
fprintf(stdout, " --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
||||
fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
|
||||
if (llama_mlock_supported()) {
|
||||
fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
|
||||
@@ -70,7 +70,10 @@ struct gpt_params {
|
||||
std::string lora_adapter = ""; // lora adapter path
|
||||
std::string lora_base = ""; // base model path for the lora adapter
|
||||
|
||||
bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
|
||||
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
|
||||
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
|
||||
|
||||
bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
|
||||
bool memory_f16 = true; // use f16 instead of f32 for memory kv
|
||||
bool random_prompt = false; // do not randomize prompt if none provided
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
@@ -86,7 +89,6 @@ struct gpt_params {
|
||||
bool instruct = false; // instruction mode (used for Alpaca models)
|
||||
bool penalize_nl = true; // consider newlines as a repeatable token
|
||||
bool perplexity = false; // compute perplexity over the prompt
|
||||
bool perplexity_lines = false; // compute perplexity over each line of the prompt
|
||||
bool use_mmap = true; // use mmap for faster loads
|
||||
bool use_mlock = false; // use mlock to keep model in memory
|
||||
bool mem_test = false; // compute maximum memory usage
|
||||
|
||||
@@ -121,8 +121,23 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
void perplexity_lines(llama_context * ctx, const gpt_params & params) {
|
||||
// Calculates perplexity over each line of the prompt
|
||||
void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
// Calculates hellaswag score (acc_norm) from prompt
|
||||
//
|
||||
// Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
|
||||
// All used data fields are preprocessed as in https://github.com/EleutherAI/lm-evaluation-harness/blob/df3da98c5405deafd519c2ddca52bb7c3fe36bef/lm_eval/tasks/hellaswag.py#L62-L68
|
||||
//
|
||||
// All 10042 tasks should be extracted to keep the results standardized like other implementations.
|
||||
//
|
||||
// Datafile layout:
|
||||
// ['??'] denotes json fields
|
||||
// 6 lines per task:
|
||||
// ['activity_label'] + ": " +['ctx'] - The first part of the query, the context
|
||||
// ['label'] - The index the best common sense ending aka gold ending
|
||||
// ['endings'][0] - Endings added to the first part of the query
|
||||
// ['endings'][1]
|
||||
// ['endings'][2]
|
||||
// ['endings'][3]
|
||||
|
||||
std::vector<std::string> prompt_lines;
|
||||
std::istringstream strstream(params.prompt);
|
||||
@@ -132,63 +147,149 @@ void perplexity_lines(llama_context * ctx, const gpt_params & params) {
|
||||
prompt_lines.push_back(line);
|
||||
}
|
||||
|
||||
if( prompt_lines.size() % 6 != 0) {
|
||||
fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
size_t hs_task_count = prompt_lines.size()/6;
|
||||
fprintf(stderr, "%s : loaded %lu tasks from prompt.\n", __func__, hs_task_count);
|
||||
|
||||
// This is needed as usual for LLaMA models
|
||||
bool prepend_bos = true;
|
||||
|
||||
// Number of tasks to use when computing the score
|
||||
if ( params.hellaswag_tasks < hs_task_count ) {
|
||||
hs_task_count = params.hellaswag_tasks;
|
||||
}
|
||||
|
||||
// The tasks should be randomized so the score stabilizes quickly.
|
||||
bool randomize_tasks = true;
|
||||
|
||||
// The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now
|
||||
std::mt19937 rng(1);
|
||||
|
||||
// Dataholder for hellaswag tasks
|
||||
struct hs_data_t {
|
||||
std::string context;
|
||||
size_t gold_ending_idx;
|
||||
std::string ending[4];
|
||||
size_t ending_logprob_count[4];
|
||||
double ending_logprob[4];
|
||||
};
|
||||
|
||||
fprintf(stderr, "%s : selecting %lu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
|
||||
|
||||
// Select and read data from prompt lines
|
||||
hs_data_t *hs_data = new hs_data_t[hs_task_count];
|
||||
for (size_t i=0; i < hs_task_count; i++) {
|
||||
size_t idx = i;
|
||||
|
||||
// Select a random example of those left in the prompt
|
||||
if (randomize_tasks) {
|
||||
std::uniform_int_distribution<size_t> dist(0, prompt_lines.size()/6-1 ) ;
|
||||
idx = dist(rng);
|
||||
}
|
||||
|
||||
hs_data[i].context = prompt_lines[idx*6];
|
||||
hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
|
||||
for (size_t j=0; j < 4; j++) {
|
||||
hs_data[i].ending[j] = " " + prompt_lines[idx*6+2+j];
|
||||
}
|
||||
|
||||
// Delete the selected random example from the prompt
|
||||
if (randomize_tasks) {
|
||||
prompt_lines.erase( std::next(prompt_lines.begin(),idx*6) , std::next(prompt_lines.begin(),idx*6+6) );
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
|
||||
printf("\ntask\tacc_norm\n");
|
||||
|
||||
double acc = 0.0f;
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
int counttotal = 0;
|
||||
size_t n_lines = prompt_lines.size();
|
||||
for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
|
||||
|
||||
double nll = 0.0;
|
||||
// Tokenize the context to count tokens
|
||||
std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos);
|
||||
size_t context_size = context_embd.size();
|
||||
|
||||
fprintf(stderr, "%s: calculating perplexity over %lu lines\n", __func__, n_lines);
|
||||
for (size_t ending_idx=0;ending_idx<4;ending_idx++) {
|
||||
|
||||
printf("\nLine\tPPL line\tPPL cumulative\n");
|
||||
// Tokenize the query
|
||||
std::vector<int> query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[ending_idx], prepend_bos);
|
||||
size_t query_size = query_embd.size();
|
||||
|
||||
for (size_t i = 0; i < n_lines; ++i) {
|
||||
// Stop if query wont fit the ctx window
|
||||
if (query_size > (size_t)params.n_ctx) {
|
||||
fprintf(stderr, "%s : number of tokens in query %lu > n_ctxl\n", __func__, query_size);
|
||||
return;
|
||||
}
|
||||
|
||||
// Tokenize and insert BOS at start
|
||||
std::vector<int> batch_embd = ::llama_tokenize(ctx, prompt_lines[i], true);
|
||||
// Speedup small evaluations by evaluating atleast 32 tokens
|
||||
if (query_size < 32) {
|
||||
query_embd.resize(32);
|
||||
}
|
||||
|
||||
size_t batch_size = batch_embd.size();
|
||||
// Evaluate the query
|
||||
if (llama_eval(ctx, query_embd.data(), query_embd.size(), 0, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
// Stop if line is too long
|
||||
if( batch_size > (size_t)params.n_ctx ) {
|
||||
fprintf(stderr, "%s : tokens in line %lu > n_ctxl\n", __func__, i);
|
||||
return;
|
||||
const auto query_logits = llama_get_logits(ctx);
|
||||
std::vector<float> logits;
|
||||
logits.insert(logits.end(), query_logits, query_logits + query_size * n_vocab);
|
||||
|
||||
hs_data[task_idx].ending_logprob_count[ending_idx] = 0;
|
||||
hs_data[task_idx].ending_logprob[ending_idx] = 0.0f;
|
||||
|
||||
// Calculate the logprobs over the ending
|
||||
for (size_t j = context_size-1; j < query_size - 1; j++) {
|
||||
// Calculate probability of next token, given the previous ones.
|
||||
const std::vector<float> tok_logits(
|
||||
logits.begin() + (j + 0) * n_vocab,
|
||||
logits.begin() + (j + 1) * n_vocab);
|
||||
|
||||
const float prob = softmax(tok_logits)[query_embd[ j + 1]];
|
||||
|
||||
hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob);
|
||||
hs_data[task_idx].ending_logprob_count[ending_idx]++;
|
||||
}
|
||||
|
||||
// Calculate the mean token logprob for acc_norm
|
||||
hs_data[task_idx].ending_logprob[ending_idx] /= hs_data[task_idx].ending_logprob_count[ending_idx];
|
||||
|
||||
|
||||
// printf("task %lu, ending %lu, whole_len %lu, context_len %lu, ending_logprob_count %lu, ending_logprob %.4f\n",
|
||||
// task_idx,ending_idx,whole_size,context_size, hs_data[task_idx].ending_logprob_count[ending_idx], hs_data[task_idx].ending_logprob[ending_idx] );
|
||||
}
|
||||
|
||||
if (llama_eval(ctx, batch_embd.data(), batch_size, 0, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return;
|
||||
// Find the ending with maximum logprob
|
||||
size_t ending_logprob_max_idx = -1;
|
||||
double ending_logprob_max_val = -INFINITY;
|
||||
for (size_t j=0; j < 4; j++) {
|
||||
if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) {
|
||||
ending_logprob_max_idx = j;
|
||||
ending_logprob_max_val = hs_data[task_idx].ending_logprob[j];
|
||||
}
|
||||
}
|
||||
|
||||
const auto batch_logits = llama_get_logits(ctx);
|
||||
std::vector<float> logits;
|
||||
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
|
||||
// printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_data[task_idx].gold_ending_idx);
|
||||
|
||||
double nllline = 0.0;
|
||||
int countline = 0;
|
||||
|
||||
// Perplexity over second half of the line
|
||||
for (size_t j = batch_size/2; j < batch_size - 1; ++j) {
|
||||
// Calculate probability of next token, given the previous ones.
|
||||
const std::vector<float> tok_logits(
|
||||
logits.begin() + (j + 0) * n_vocab,
|
||||
logits.begin() + (j + 1) * n_vocab);
|
||||
|
||||
const float prob = softmax(tok_logits)[batch_embd[ j + 1]];
|
||||
|
||||
nllline += -std::log(prob);
|
||||
++countline;
|
||||
// If the gold ending got the maximum logprobe add one accuracy point
|
||||
if (ending_logprob_max_idx == hs_data[task_idx].gold_ending_idx) {
|
||||
acc += 1.0;
|
||||
}
|
||||
|
||||
nll += nllline;
|
||||
counttotal += countline;
|
||||
|
||||
// perplexity is e^(average negative log-likelihood)
|
||||
printf("%lu\t%.8lf\t%.8lf\n", i + 1, std::exp(nllline/countline), std::exp(nll / counttotal) );
|
||||
// Print the accumulated accuracy mean x 100
|
||||
printf("%li\t%.8lf\n",task_idx+1, acc/double(task_idx+1)*100.0);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
delete [] hs_data;
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
@@ -240,8 +341,8 @@ int main(int argc, char ** argv) {
|
||||
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
|
||||
}
|
||||
|
||||
if (params.perplexity_lines) {
|
||||
perplexity_lines(ctx, params);
|
||||
if (params.hellaswag) {
|
||||
hellaswag_score(ctx, params);
|
||||
} else {
|
||||
perplexity(ctx, params);
|
||||
}
|
||||
|
||||
541
ggml-alloc.c
Normal file
541
ggml-alloc.c
Normal file
@@ -0,0 +1,541 @@
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml.h"
|
||||
#include <assert.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#define UNUSED(x) (void)(x)
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
//#define GGML_ALLOCATOR_DEBUG
|
||||
|
||||
//#define AT_PRINTF printf
|
||||
#define AT_PRINTF(...) ((void)0)
|
||||
|
||||
struct hash_node {
|
||||
struct ggml_tensor * t;
|
||||
int n_children;
|
||||
int n_views;
|
||||
};
|
||||
|
||||
static size_t hash(void * p) {
|
||||
return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
|
||||
}
|
||||
|
||||
static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) {
|
||||
size_t h = hash(t);
|
||||
|
||||
// linear probing
|
||||
size_t i = h;
|
||||
while (hash_table[i].t != NULL) {
|
||||
if (hash_table[i].t == t) {
|
||||
return &hash_table[i];
|
||||
}
|
||||
i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
|
||||
if (i == h) {
|
||||
// hash table is full
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
hash_table[i].t = t;
|
||||
return &hash_table[i];
|
||||
}
|
||||
|
||||
// TODO: GGML_PAD ?
|
||||
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
|
||||
assert(alignment && !(alignment & (alignment - 1))); // power of 2
|
||||
size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
|
||||
return offset + align;
|
||||
}
|
||||
|
||||
struct free_block {
|
||||
void * addr;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
#define MAX_FREE_BLOCKS 128
|
||||
|
||||
struct ggml_allocr {
|
||||
void * data;
|
||||
size_t size;
|
||||
size_t alignment;
|
||||
int n_free_blocks;
|
||||
struct free_block free_blocks[MAX_FREE_BLOCKS];
|
||||
struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
||||
size_t max_size;
|
||||
bool measure;
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
struct ggml_tensor * allocated_tensors[1024];
|
||||
#endif
|
||||
};
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
|
||||
for (int i = 0; i < 1024; i++) {
|
||||
if (alloc->allocated_tensors[i] == NULL) {
|
||||
alloc->allocated_tensors[i] = tensor;
|
||||
return;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(!"out of allocated_tensors");
|
||||
}
|
||||
static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
|
||||
for (int i = 0; i < 1024; i++) {
|
||||
if (alloc->allocated_tensors[i] == tensor ||
|
||||
(alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
|
||||
alloc->allocated_tensors[i] = NULL;
|
||||
return;
|
||||
}
|
||||
}
|
||||
printf("tried to free tensor %s not found\n", tensor->name);
|
||||
GGML_ASSERT(!"tensor not found");
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
return ggml_nbytes(tensor);
|
||||
|
||||
UNUSED(alloc);
|
||||
}
|
||||
|
||||
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
|
||||
size = aligned_offset(NULL, size, alloc->alignment);
|
||||
|
||||
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
|
||||
|
||||
size_t max_avail = 0;
|
||||
|
||||
// find the best fitting free block
|
||||
int best_fit_block = -1;
|
||||
size_t best_fit_size = SIZE_MAX;
|
||||
for (int i = 0; i < alloc->n_free_blocks; i++) {
|
||||
struct free_block * block = &alloc->free_blocks[i];
|
||||
max_avail = MAX(max_avail, block->size);
|
||||
if (block->size >= size && block->size <= best_fit_size) {
|
||||
best_fit_block = i;
|
||||
best_fit_size = block->size;
|
||||
}
|
||||
}
|
||||
|
||||
AT_PRINTF("block %d\n", best_fit_block);
|
||||
|
||||
if (best_fit_block == -1) {
|
||||
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
|
||||
__func__, size, max_avail);
|
||||
GGML_ASSERT(!"not enough space in the buffer");
|
||||
return;
|
||||
}
|
||||
struct free_block * block = &alloc->free_blocks[best_fit_block];
|
||||
void * addr = block->addr;
|
||||
block->addr = (char*)block->addr + size;
|
||||
block->size -= size;
|
||||
if (block->size == 0) {
|
||||
// remove block if empty
|
||||
alloc->n_free_blocks--;
|
||||
for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
|
||||
alloc->free_blocks[j] = alloc->free_blocks[j+1];
|
||||
}
|
||||
}
|
||||
|
||||
tensor->data = addr;
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
add_allocated_tensor(alloc, tensor);
|
||||
size_t cur_max = (char*)addr - (char*)alloc->data + size;
|
||||
if (cur_max > alloc->max_size) {
|
||||
printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
|
||||
for (int i = 0; i < 1024; i++) {
|
||||
if (alloc->allocated_tensors[i]) {
|
||||
printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size);
|
||||
}
|
||||
|
||||
// this is a very naive implementation, but for our case the number of free blocks should be very small
|
||||
static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
void * ptr = tensor->data;
|
||||
|
||||
if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
|
||||
// the tensor was not allocated in this buffer
|
||||
// this can happen because the graph allocator will try to free weights and other tensors from different buffers
|
||||
// the easiest way to deal with this is just to ignore it
|
||||
return;
|
||||
}
|
||||
|
||||
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
|
||||
size = aligned_offset(NULL, size, alloc->alignment);
|
||||
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
remove_allocated_tensor(alloc, tensor);
|
||||
#endif
|
||||
|
||||
// see if we can merge with an existing block
|
||||
for (int i = 0; i < alloc->n_free_blocks; i++) {
|
||||
struct free_block * block = &alloc->free_blocks[i];
|
||||
// check if ptr is at the end of the block
|
||||
if ((char*)block->addr + block->size == ptr) {
|
||||
block->size += size;
|
||||
// check if we can merge with the next block
|
||||
if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) {
|
||||
block->size += alloc->free_blocks[i+1].size;
|
||||
alloc->n_free_blocks--;
|
||||
for (int j = i+1; j < alloc->n_free_blocks; j++) {
|
||||
alloc->free_blocks[j] = alloc->free_blocks[j+1];
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
// check if ptr is at the beginning of the block
|
||||
if ((char*)ptr + size == block->addr) {
|
||||
block->addr = ptr;
|
||||
block->size += size;
|
||||
// check if we can merge with the previous block
|
||||
if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) {
|
||||
alloc->free_blocks[i-1].size += block->size;
|
||||
alloc->n_free_blocks--;
|
||||
for (int j = i; j < alloc->n_free_blocks; j++) {
|
||||
alloc->free_blocks[j] = alloc->free_blocks[j+1];
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
// otherwise, add a new block
|
||||
GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
|
||||
// insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
|
||||
int insert_pos = 0;
|
||||
while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) {
|
||||
insert_pos++;
|
||||
}
|
||||
// shift all blocks from insert_pos onward to make room for the new block
|
||||
for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
|
||||
alloc->free_blocks[i] = alloc->free_blocks[i-1];
|
||||
}
|
||||
// insert the new block
|
||||
alloc->free_blocks[insert_pos].addr = ptr;
|
||||
alloc->free_blocks[insert_pos].size = size;
|
||||
alloc->n_free_blocks++;
|
||||
}
|
||||
|
||||
void ggml_allocr_reset(struct ggml_allocr * alloc) {
|
||||
alloc->n_free_blocks = 1;
|
||||
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
|
||||
alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
|
||||
alloc->free_blocks[0].size = alloc->size - align_offset;
|
||||
}
|
||||
|
||||
struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
|
||||
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
|
||||
|
||||
*alloc = (struct ggml_allocr){
|
||||
/*.data = */ data,
|
||||
/*.size = */ size,
|
||||
/*.alignment = */ alignment,
|
||||
/*.n_free_blocks = */ 0,
|
||||
/*.free_blocks = */ {{0}},
|
||||
/*.hash_table = */ {{0}},
|
||||
/*.max_size = */ 0,
|
||||
/*.measure = */ false,
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
/*.allocated_tensors = */ = {0},
|
||||
#endif
|
||||
};
|
||||
|
||||
ggml_allocr_reset(alloc);
|
||||
|
||||
return alloc;
|
||||
}
|
||||
|
||||
// address and size of the buffer when measuring
|
||||
// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
|
||||
static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
|
||||
static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
|
||||
|
||||
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
||||
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
|
||||
|
||||
*alloc = (struct ggml_allocr){
|
||||
/*.data = */ MEASURE_BASE_ADDR,
|
||||
/*.size = */ MEASURE_MAX_SIZE,
|
||||
/*.alignment = */ alignment,
|
||||
/*.n_free_blocks = */ 0,
|
||||
/*.free_blocks = */ {{0}},
|
||||
/*.hash_table = */ {{0}},
|
||||
/*.max_size = */ 0,
|
||||
/*.measure = */ true,
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
/*.allocated_tensors = */ = {0},
|
||||
#endif
|
||||
};
|
||||
|
||||
ggml_allocr_reset(alloc);
|
||||
|
||||
return alloc;
|
||||
}
|
||||
|
||||
void ggml_allocr_free(struct ggml_allocr * alloc) {
|
||||
free(alloc);
|
||||
}
|
||||
|
||||
bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
|
||||
return alloc->measure;
|
||||
}
|
||||
|
||||
//////////// compute graph allocator
|
||||
|
||||
static bool ggml_is_view(struct ggml_tensor * t) {
|
||||
return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
|
||||
t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
|
||||
}
|
||||
|
||||
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
||||
if (a->type != b->type) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (a->ne[i] != b->ne[i]) {
|
||||
return false;
|
||||
}
|
||||
if (a->nb[i] != b->nb[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
|
||||
switch (t->op) {
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_VIEW:
|
||||
return t->src[0];
|
||||
case GGML_OP_CPY:
|
||||
return t->src[1];
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
|
||||
struct ggml_tensor * parent = t;
|
||||
do {
|
||||
parent = get_view_parent(parent);
|
||||
} while (ggml_is_view(parent));
|
||||
return parent;
|
||||
}
|
||||
|
||||
static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
switch (op) {
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_DIAG_MASK_ZERO:
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_ADD1:
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SQRT:
|
||||
case GGML_OP_LOG:
|
||||
case GGML_OP_UNARY:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_SET:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_CONT:
|
||||
return true;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
|
||||
struct hash_node * ht = alloc->hash_table;
|
||||
if (node->data == NULL) {
|
||||
if (ggml_is_view(node)) {
|
||||
size_t offset;
|
||||
switch(node->op) {
|
||||
case GGML_OP_VIEW:
|
||||
memcpy(&offset, node->op_params, sizeof(size_t));
|
||||
node->data = (char *) node->src[0]->data + offset;
|
||||
break;
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
node->data = node->src[0]->data;
|
||||
break;
|
||||
case GGML_OP_CPY:
|
||||
node->data = node->src[1]->data;
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(!"unknown view op");
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// see if we can reuse a parent's buffer (inplace)
|
||||
if (ggml_op_can_inplace(node->op)) {
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
struct ggml_tensor * parent = node->src[i];
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
}
|
||||
struct hash_node * p_hn = hash_get(ht, parent);
|
||||
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
|
||||
if (ggml_is_view(parent)) {
|
||||
struct ggml_tensor * view_src = get_view_source(parent);
|
||||
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
||||
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
|
||||
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
|
||||
// the parent's data that it will need later (same layout requirement). the problem is that then
|
||||
// we cannot free the tensor because the original address of the allocation is lost.
|
||||
// adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
|
||||
// for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
|
||||
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
|
||||
node->data = parent->data;
|
||||
return;
|
||||
}
|
||||
}
|
||||
else {
|
||||
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
|
||||
node->data = parent->data;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
ggml_allocr_alloc(alloc, node);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static size_t ggml_allocator_alloc_graph_tensors_n(
|
||||
struct ggml_allocr * alloc,
|
||||
struct ggml_cgraph ** graphs, int n_graphs,
|
||||
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
|
||||
|
||||
// reset hash table
|
||||
struct hash_node * ht = alloc->hash_table;
|
||||
memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE);
|
||||
|
||||
// count number of children and views
|
||||
for (int g = 0; g < n_graphs; g++) {
|
||||
struct ggml_cgraph * gf = graphs[g];
|
||||
for (int i = 0; i < gf->n_nodes; i++) {
|
||||
struct ggml_tensor * node = gf->nodes[i];
|
||||
|
||||
if (ggml_is_view(node)) {
|
||||
struct ggml_tensor * view_src = get_view_source(node);
|
||||
hash_get(ht, view_src)->n_views += 1;
|
||||
}
|
||||
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * parent = node->src[j];
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
}
|
||||
hash_get(ht, parent)->n_children += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// allocate tensors
|
||||
for (int g = 0; g < n_graphs; g++) {
|
||||
struct ggml_cgraph * gf = graphs[g];
|
||||
AT_PRINTF("####### graph %d/%d\n", g, n_graphs);
|
||||
// graph inputs are allocated first to ensure that they are not overwritten by each other
|
||||
if (inputs != NULL && inputs[g] != NULL) {
|
||||
for (int i = 0; inputs[g][i] != NULL; i++) {
|
||||
struct ggml_tensor * input = inputs[g][i];
|
||||
AT_PRINTF("input: %s\n", input->name);
|
||||
allocate_node(alloc, input);
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < gf->n_nodes; i++) {
|
||||
struct ggml_tensor * node = gf->nodes[i];
|
||||
|
||||
// allocate parents (leafs)
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * parent = node->src[j];
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
}
|
||||
allocate_node(alloc, parent);
|
||||
}
|
||||
|
||||
// allocate node
|
||||
allocate_node(alloc, node);
|
||||
|
||||
AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * parent = node->src[j];
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
}
|
||||
AT_PRINTF("%s", parent->name);
|
||||
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
|
||||
AT_PRINTF(", ");
|
||||
}
|
||||
}
|
||||
AT_PRINTF("\n");
|
||||
|
||||
// update parents
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * parent = node->src[j];
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
}
|
||||
struct hash_node * p_hn = hash_get(ht, parent);
|
||||
p_hn->n_children -= 1;
|
||||
|
||||
//AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
|
||||
|
||||
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
|
||||
if (ggml_is_view(parent)) {
|
||||
struct ggml_tensor * view_src = get_view_source(parent);
|
||||
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
||||
view_src_hn->n_views -= 1;
|
||||
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
|
||||
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
|
||||
ggml_allocator_free_tensor(alloc, view_src);
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (parent->data != node->data) {
|
||||
ggml_allocator_free_tensor(alloc, parent);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
AT_PRINTF("\n");
|
||||
}
|
||||
// free graph outputs here that wouldn't be freed otherwise because they have no children
|
||||
if (outputs != NULL && outputs[g] != NULL) {
|
||||
for (int i = 0; outputs[g][i] != NULL; i++) {
|
||||
struct ggml_tensor * output = outputs[g][i];
|
||||
AT_PRINTF("output: %s\n", output->name);
|
||||
ggml_allocator_free_tensor(alloc, output);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return alloc->max_size;
|
||||
}
|
||||
|
||||
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
|
||||
return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
|
||||
}
|
||||
22
ggml-alloc.h
Normal file
22
ggml-alloc.h
Normal file
@@ -0,0 +1,22 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
|
||||
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
|
||||
|
||||
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
|
||||
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
|
||||
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
|
||||
GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
1982
ggml-cuda.cu
1982
ggml-cuda.cu
File diff suppressed because it is too large
Load Diff
75
ggml.c
75
ggml.c
@@ -4557,10 +4557,12 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
|
||||
|
||||
static struct ggml_tensor * ggml_new_tensor_impl(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int n_dims,
|
||||
const int64_t* ne,
|
||||
void* data) {
|
||||
enum ggml_type type,
|
||||
int n_dims,
|
||||
const int64_t * ne,
|
||||
void * data) {
|
||||
|
||||
assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
|
||||
|
||||
size_t data_size = 0;
|
||||
|
||||
@@ -4648,22 +4650,22 @@ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int3
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int n_dims,
|
||||
const int64_t * ne) {
|
||||
enum ggml_type type,
|
||||
int n_dims,
|
||||
const int64_t * ne) {
|
||||
return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_1d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
enum ggml_type type,
|
||||
int64_t ne0) {
|
||||
return ggml_new_tensor(ctx, type, 1, &ne0);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_2d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
enum ggml_type type,
|
||||
int64_t ne0,
|
||||
int64_t ne1) {
|
||||
const int64_t ne[2] = { ne0, ne1 };
|
||||
@@ -4672,7 +4674,7 @@ struct ggml_tensor * ggml_new_tensor_2d(
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_3d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
enum ggml_type type,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2) {
|
||||
@@ -6238,6 +6240,27 @@ struct ggml_tensor * ggml_reshape_4d(
|
||||
|
||||
// ggml_view_1d
|
||||
|
||||
static struct ggml_tensor * ggml_view_tensor_offset(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_dims,
|
||||
const int64_t * ne,
|
||||
size_t offset) {
|
||||
// don't calculate an offset from an unallocated tensor
|
||||
void * data = NULL;
|
||||
if (a->data != NULL) {
|
||||
data = (char *) a->data + offset;
|
||||
}
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
|
||||
|
||||
ggml_format_name(result, "%s (view)", a->name);
|
||||
|
||||
ggml_set_op_params(result, &offset, sizeof(offset));
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_view_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
@@ -6250,10 +6273,7 @@ struct ggml_tensor * ggml_view_1d(
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
|
||||
ggml_format_name(result, "%s (view)", a->name);
|
||||
|
||||
ggml_set_op_params(result, &offset, sizeof(offset));
|
||||
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
|
||||
|
||||
result->op = GGML_OP_VIEW;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
@@ -6280,10 +6300,7 @@ struct ggml_tensor * ggml_view_2d(
|
||||
|
||||
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
|
||||
ggml_format_name(result, "%s (view)", a->name);
|
||||
|
||||
ggml_set_op_params(result, &offset, sizeof(offset));
|
||||
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
|
||||
|
||||
result->nb[1] = nb1;
|
||||
result->nb[2] = result->nb[1]*ne1;
|
||||
@@ -6316,10 +6333,7 @@ struct ggml_tensor * ggml_view_3d(
|
||||
|
||||
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
|
||||
ggml_format_name(result, "%s (view)", a->name);
|
||||
|
||||
ggml_set_op_params(result, &offset, sizeof(offset));
|
||||
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
|
||||
|
||||
result->nb[1] = nb1;
|
||||
result->nb[2] = nb2;
|
||||
@@ -6354,10 +6368,7 @@ struct ggml_tensor * ggml_view_4d(
|
||||
|
||||
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
|
||||
ggml_format_name(result, "%s (view)", a->name);
|
||||
|
||||
ggml_set_op_params(result, &offset, sizeof(offset));
|
||||
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
|
||||
|
||||
result->nb[1] = nb1;
|
||||
result->nb[2] = nb2;
|
||||
@@ -6741,6 +6752,18 @@ struct ggml_tensor * ggml_rope_inplace(
|
||||
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_rope_custom(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx,
|
||||
float freq_base,
|
||||
float freq_scale) {
|
||||
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_rope_custom_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
||||
13
ggml.h
13
ggml.h
@@ -1170,7 +1170,18 @@ extern "C" {
|
||||
int mode,
|
||||
int n_ctx);
|
||||
|
||||
// custom RoPE, in-place, returns view(a)
|
||||
// custom RoPE
|
||||
GGML_API struct ggml_tensor * ggml_rope_custom(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx,
|
||||
float freq_base,
|
||||
float freq_scale);
|
||||
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
||||
62
k_quants.c
62
k_quants.c
@@ -39,6 +39,8 @@
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
|
||||
|
||||
//
|
||||
// 2-6 bit quantization in super-blocks
|
||||
//
|
||||
@@ -1353,7 +1355,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
|
||||
const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
|
||||
const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
|
||||
const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
|
||||
const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
|
||||
|
||||
__m256i sumi = _mm256_setzero_si256();
|
||||
|
||||
@@ -1421,7 +1423,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
|
||||
|
||||
// sumf += -dmin * summs in 32bits*8
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(_mm256_set_m128i(summs_1, summs_0))), acc);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);
|
||||
|
||||
const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
|
||||
const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
|
||||
@@ -1493,7 +1495,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
}
|
||||
|
||||
// sumf += dall * isum - dmin * summs in 32bits
|
||||
__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
|
||||
__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
|
||||
}
|
||||
|
||||
@@ -1644,8 +1646,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
summs += dmin * smin;
|
||||
|
||||
const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
|
||||
const __m256i q2_0 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 2), q2bits), m3);
|
||||
const __m256i q2_1 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
|
||||
const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3);
|
||||
const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
|
||||
|
||||
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
|
||||
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
|
||||
@@ -1709,10 +1711,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
|
||||
const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
|
||||
|
||||
const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
|
||||
const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
|
||||
const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
|
||||
const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
|
||||
const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
|
||||
const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
|
||||
const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
|
||||
const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
|
||||
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
|
||||
@@ -1917,7 +1919,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
|
||||
const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
|
||||
const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
|
||||
const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
|
||||
const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
|
||||
|
||||
// high bit
|
||||
const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
|
||||
@@ -2128,7 +2130,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
}
|
||||
|
||||
// multiply with block scale and accumulate
|
||||
__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
|
||||
__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
|
||||
|
||||
}
|
||||
@@ -2303,13 +2305,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
aux16[0] = a & 0x0f0f;
|
||||
aux16[1] = (a >> 4) & 0x0f0f;
|
||||
|
||||
const __m256i scale_0 = _mm256_set_m128i(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
|
||||
const __m256i scale_1 = _mm256_set_m128i(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
|
||||
const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
|
||||
const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
|
||||
|
||||
memcpy(&aux64, x[i].hmask, 8);
|
||||
|
||||
const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
|
||||
__m256i q3h_0 = _mm256_set_m128i(_mm_srli_epi16(haux, 2), haux);
|
||||
__m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux);
|
||||
__m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
|
||||
q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
|
||||
q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
|
||||
@@ -2318,7 +2320,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
|
||||
|
||||
// prepare low and high bits
|
||||
const __m256i q3aux = _mm256_set_m128i(_mm_srli_epi16(q3bits, 2), q3bits);
|
||||
const __m256i q3aux = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits);
|
||||
const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
|
||||
const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);
|
||||
|
||||
@@ -2429,7 +2431,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
p16_0 = _mm_add_epi32(p16_0, p16_2);
|
||||
p16_1 = _mm_add_epi32(p16_1, p16_3);
|
||||
__m256i p16 = _mm256_set_m128i(p16_1, p16_0);
|
||||
__m256i p16 = MM256_SET_M128I(p16_1, p16_0);
|
||||
|
||||
// multiply with block scale and accumulate
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
|
||||
@@ -2620,7 +2622,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
|
||||
|
||||
const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
|
||||
const __m256i scales = _mm256_set_m128i(sc128, sc128);
|
||||
const __m256i scales = MM256_SET_M128I(sc128, sc128);
|
||||
|
||||
__m256i sumi = _mm256_setzero_si256();
|
||||
|
||||
@@ -2727,7 +2729,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
}
|
||||
|
||||
__m256 vd = _mm256_set1_ps(d);
|
||||
__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
|
||||
__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
|
||||
|
||||
}
|
||||
@@ -2968,11 +2970,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
|
||||
const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc);
|
||||
|
||||
const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
|
||||
const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc);
|
||||
|
||||
}
|
||||
|
||||
@@ -3160,7 +3162,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
summs += dmin * _mm_extract_epi32(hsum, 0);
|
||||
|
||||
const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
|
||||
const __m256i scales = _mm256_set_m128i(sc128, sc128);
|
||||
const __m256i scales = MM256_SET_M128I(sc128, sc128);
|
||||
|
||||
const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
|
||||
__m256i hmask = mone;
|
||||
@@ -3299,7 +3301,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
}
|
||||
|
||||
__m256 vd = _mm256_set1_ps(d);
|
||||
__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
|
||||
__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
|
||||
|
||||
}
|
||||
@@ -3462,13 +3464,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
|
||||
|
||||
const __m256i scale_l = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
|
||||
const __m256i scale_h = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
|
||||
const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
|
||||
const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
|
||||
|
||||
int64_t aux64;
|
||||
memcpy(&aux64, x[i].qh, 8);
|
||||
const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
|
||||
const __m256i haux256 = _mm256_set_m128i(_mm_srli_epi16(haux128, 2), haux128);
|
||||
const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128);
|
||||
|
||||
const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
|
||||
const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
|
||||
@@ -3543,7 +3545,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
|
||||
const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
|
||||
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc);
|
||||
|
||||
}
|
||||
|
||||
@@ -3925,7 +3927,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
}
|
||||
|
||||
__m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
|
||||
__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
|
||||
}
|
||||
|
||||
@@ -4083,8 +4085,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
|
||||
const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
|
||||
|
||||
const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
|
||||
const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
|
||||
const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
|
||||
const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
|
||||
|
||||
const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
|
||||
const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
|
||||
@@ -4177,7 +4179,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
|
||||
sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
|
||||
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc);
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
|
||||
251
llama.cpp
251
llama.cpp
@@ -56,8 +56,14 @@
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
|
||||
#include "ggml-alloc.h"
|
||||
#define LLAMA_USE_ALLOCATOR
|
||||
#else
|
||||
#define LLAMA_USE_SCRATCH
|
||||
#define LLAMA_MAX_SCRATCH_BUFFERS 16
|
||||
#endif
|
||||
|
||||
|
||||
// available llama models
|
||||
enum e_model {
|
||||
@@ -327,13 +333,22 @@ struct llama_model {
|
||||
|
||||
struct llama_context {
|
||||
llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
|
||||
#ifdef GGML_USE_METAL
|
||||
~llama_context() {
|
||||
if (model_owner) {
|
||||
delete &model;
|
||||
}
|
||||
#ifdef GGML_USE_METAL
|
||||
if (ctx_metal) {
|
||||
ggml_metal_free(ctx_metal);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef LLAMA_USE_ALLOCATOR
|
||||
if (alloc) {
|
||||
ggml_allocr_free(alloc);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
std::mt19937 rng;
|
||||
|
||||
bool has_evaluated_once = false;
|
||||
@@ -371,7 +386,17 @@ struct llama_context {
|
||||
// memory buffers used to evaluate the model
|
||||
// TODO: move in llama_state
|
||||
llama_ctx_buffer buf_compute;
|
||||
|
||||
#ifdef LLAMA_USE_ALLOCATOR
|
||||
llama_ctx_buffer buf_alloc;
|
||||
ggml_allocr * alloc = NULL;
|
||||
#endif
|
||||
|
||||
#ifdef LLAMA_USE_SCRATCH
|
||||
llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
|
||||
int buf_last = 0;
|
||||
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
ggml_metal_context * ctx_metal = NULL;
|
||||
@@ -381,9 +406,6 @@ struct llama_context {
|
||||
ggml_mpi_context * ctx_mpi = NULL;
|
||||
#endif
|
||||
|
||||
int buf_last = 0;
|
||||
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
|
||||
|
||||
void use_buf(struct ggml_context * ctx, int i) {
|
||||
#if defined(LLAMA_USE_SCRATCH)
|
||||
size_t last_size = 0;
|
||||
@@ -1230,12 +1252,16 @@ static void llama_model_load_internal(
|
||||
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
|
||||
|
||||
// this is the total memory required to run the inference
|
||||
const size_t mem_required =
|
||||
size_t mem_required =
|
||||
ctx_size +
|
||||
mmapped_size - vram_weights + // weights in VRAM not in memory
|
||||
mmapped_size - vram_weights; // weights in VRAM not in memory
|
||||
|
||||
#ifndef LLAMA_USE_ALLOCATOR
|
||||
mem_required +=
|
||||
MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
|
||||
MEM_REQ_SCRATCH1().at(model.type) +
|
||||
MEM_REQ_EVAL().at(model.type);
|
||||
#endif
|
||||
|
||||
// this is the memory required by one llama_state
|
||||
const size_t mem_required_state =
|
||||
@@ -1360,32 +1386,15 @@ static bool llama_model_load(
|
||||
}
|
||||
}
|
||||
|
||||
// evaluate the transformer
|
||||
//
|
||||
// - lctx: llama context
|
||||
// - tokens: new batch of tokens to process
|
||||
// - embd embeddings input
|
||||
// - n_tokens number of tokens
|
||||
// - n_past: the context size so far
|
||||
// - n_threads: number of threads to use
|
||||
//
|
||||
static bool llama_eval_internal(
|
||||
static struct ggml_cgraph * llama_build_graph(
|
||||
llama_context & lctx,
|
||||
const llama_token * tokens,
|
||||
const float * embd,
|
||||
int n_tokens,
|
||||
int n_past,
|
||||
int n_threads,
|
||||
const char * cgraph_fname) {
|
||||
int n_past) {
|
||||
|
||||
LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
|
||||
#endif
|
||||
|
||||
const int64_t t_start_us = ggml_time_us();
|
||||
|
||||
const int N = n_tokens;
|
||||
|
||||
const auto & model = lctx.model;
|
||||
@@ -1401,10 +1410,8 @@ static bool llama_eval_internal(
|
||||
const int64_t n_head = hparams.n_head;
|
||||
const int64_t n_head_kv = hparams.n_head_kv;
|
||||
const int64_t n_embd_head = hparams.n_embd_head();
|
||||
const int64_t n_vocab = hparams.n_vocab;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
||||
|
||||
|
||||
LLAMA_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
const float freq_base = hparams.rope_freq_base;
|
||||
@@ -1416,26 +1423,35 @@ static bool llama_eval_internal(
|
||||
auto & mem_per_token = lctx.mem_per_token;
|
||||
auto & buf_compute = lctx.buf_compute;
|
||||
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_compute.size,
|
||||
/*.mem_buffer =*/ buf_compute.addr,
|
||||
/*.no_alloc =*/ false,
|
||||
};
|
||||
|
||||
#ifdef LLAMA_USE_ALLOCATOR
|
||||
params.no_alloc = true;
|
||||
#endif
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
|
||||
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
|
||||
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
||||
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
||||
n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
if (tokens) {
|
||||
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
|
||||
#ifdef LLAMA_USE_ALLOCATOR
|
||||
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
||||
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
|
||||
}
|
||||
#else
|
||||
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
|
||||
#endif
|
||||
ggml_set_name(inp_tokens, "inp_tokens");
|
||||
|
||||
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
||||
@@ -1445,7 +1461,15 @@ static bool llama_eval_internal(
|
||||
#endif
|
||||
|
||||
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
|
||||
|
||||
#ifdef LLAMA_USE_ALLOCATOR
|
||||
ggml_allocr_alloc(lctx.alloc, inpL);
|
||||
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
|
||||
}
|
||||
#else
|
||||
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
|
||||
#endif
|
||||
}
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
@@ -1472,6 +1496,17 @@ static bool llama_eval_internal(
|
||||
}
|
||||
#endif // GGML_USE_CUBLAS
|
||||
|
||||
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
||||
#ifdef LLAMA_USE_ALLOCATOR
|
||||
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
||||
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
||||
}
|
||||
#else
|
||||
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
||||
#endif
|
||||
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_format_name(inpL, "layer_inp_%d", il);
|
||||
|
||||
@@ -1567,9 +1602,6 @@ static bool llama_eval_internal(
|
||||
ggml_set_name(KQ, "KQ");
|
||||
|
||||
// KQ_scaled = KQ / sqrt(n_embd_head)
|
||||
struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
|
||||
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
||||
|
||||
// KQ_scaled shape [n_past + N, N, n_head, 1]
|
||||
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
||||
offload_func_kq(KQ_scaled);
|
||||
@@ -1685,9 +1717,6 @@ static bool llama_eval_internal(
|
||||
|
||||
lctx.use_buf(ctx0, 0);
|
||||
|
||||
// used at the end to optionally extract the embeddings
|
||||
struct ggml_tensor * embeddings = NULL;
|
||||
|
||||
// norm
|
||||
{
|
||||
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
||||
@@ -1698,8 +1727,6 @@ static bool llama_eval_internal(
|
||||
cur = ggml_mul(ctx0, cur, model.norm);
|
||||
// offload_func_nr(cur); // TODO CPU + GPU mirrored backend
|
||||
ggml_set_name(cur, "result_norm");
|
||||
|
||||
embeddings = cur;
|
||||
}
|
||||
|
||||
// lm_head
|
||||
@@ -1711,12 +1738,88 @@ static bool llama_eval_internal(
|
||||
// logits -> probs
|
||||
//cur = ggml_soft_max_inplace(ctx0, cur);
|
||||
|
||||
// run the computation
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
|
||||
if (mem_per_token == 0) {
|
||||
mem_per_token = ggml_used_mem(ctx0)/N;
|
||||
}
|
||||
|
||||
#if 0
|
||||
printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
|
||||
ggml_used_mem(ctx0)/1024.0/1024.0,
|
||||
lctx.get_buf_max_mem(0)/1024.0/1024.0,
|
||||
lctx.get_buf_max_mem(1)/1024.0/1024.0,
|
||||
lctx.work_buffer.size()/1024.0/1024.0,
|
||||
n_past, N);
|
||||
#endif
|
||||
|
||||
ggml_free(ctx0);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
// evaluate the transformer
|
||||
//
|
||||
// - lctx: llama context
|
||||
// - tokens: new batch of tokens to process
|
||||
// - embd embeddings input
|
||||
// - n_tokens number of tokens
|
||||
// - n_past: the context size so far
|
||||
// - n_threads: number of threads to use
|
||||
//
|
||||
static bool llama_eval_internal(
|
||||
llama_context & lctx,
|
||||
const llama_token * tokens,
|
||||
const float * embd,
|
||||
int n_tokens,
|
||||
int n_past,
|
||||
int n_threads,
|
||||
const char * cgraph_fname) {
|
||||
|
||||
LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
|
||||
|
||||
const int64_t t_start_us = ggml_time_us();
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
|
||||
#endif
|
||||
|
||||
const int N = n_tokens;
|
||||
|
||||
const auto & model = lctx.model;
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const auto & kv_self = lctx.kv_self;
|
||||
|
||||
LLAMA_ASSERT(!!kv_self.ctx);
|
||||
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
const int64_t n_vocab = hparams.n_vocab;
|
||||
|
||||
#ifdef LLAMA_USE_ALLOCATOR
|
||||
ggml_allocr_reset(lctx.alloc);
|
||||
#endif
|
||||
|
||||
ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
|
||||
|
||||
#ifdef LLAMA_USE_ALLOCATOR
|
||||
ggml_allocr_alloc_graph(lctx.alloc, gf);
|
||||
#endif
|
||||
|
||||
// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
||||
|
||||
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
||||
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
||||
n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
|
||||
|
||||
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
||||
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
||||
|
||||
LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
|
||||
LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
||||
|
||||
#if GGML_USE_MPI
|
||||
const int64_t n_layer = hparams.n_layer;
|
||||
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
||||
#endif
|
||||
|
||||
@@ -1728,7 +1831,10 @@ static bool llama_eval_internal(
|
||||
//}
|
||||
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
|
||||
ggml_metal_graph_compute(lctx.ctx_metal, gf);
|
||||
ggml_metal_get_tensor (lctx.ctx_metal, cur);
|
||||
ggml_metal_get_tensor (lctx.ctx_metal, res);
|
||||
if (!lctx.embedding.empty()) {
|
||||
ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
|
||||
}
|
||||
} else {
|
||||
// IMPORTANT:
|
||||
// Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
|
||||
@@ -1759,8 +1865,6 @@ static bool llama_eval_internal(
|
||||
// update kv token count
|
||||
lctx.kv_self.n = n_past + N;
|
||||
|
||||
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
||||
|
||||
if (cgraph_fname) {
|
||||
ggml_graph_export(gf, cgraph_fname);
|
||||
}
|
||||
@@ -1798,21 +1902,6 @@ static bool llama_eval_internal(
|
||||
memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
|
||||
}
|
||||
|
||||
if (mem_per_token == 0) {
|
||||
mem_per_token = ggml_used_mem(ctx0)/N;
|
||||
}
|
||||
|
||||
#if 0
|
||||
printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
|
||||
ggml_used_mem(ctx0)/1024.0/1024.0,
|
||||
lctx.get_buf_max_mem(0)/1024.0/1024.0,
|
||||
lctx.get_buf_max_mem(1)/1024.0/1024.0,
|
||||
lctx.work_buffer.size()/1024.0/1024.0,
|
||||
n_past, N);
|
||||
#endif
|
||||
|
||||
ggml_free(ctx0);
|
||||
|
||||
// measure the performance only for the single-token evals
|
||||
if (N == 1) {
|
||||
lctx.t_eval_us += ggml_time_us() - t_start_us;
|
||||
@@ -3180,10 +3269,47 @@ struct llama_context * llama_new_context_with_model(
|
||||
ctx->embedding.resize(hparams.n_embd);
|
||||
}
|
||||
|
||||
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
|
||||
#ifdef LLAMA_USE_ALLOCATOR
|
||||
{
|
||||
static const size_t tensor_alignment = 32;
|
||||
// the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
|
||||
ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
|
||||
|
||||
// create measure allocator
|
||||
ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
|
||||
|
||||
// build worst-case graph
|
||||
int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
|
||||
int n_past = hparams.n_ctx - n_tokens;
|
||||
llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
||||
ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
|
||||
|
||||
// measure memory requirements for the graph
|
||||
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
|
||||
|
||||
fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
|
||||
|
||||
// debug - for comparison with scratch buffer
|
||||
//size_t prev_req =
|
||||
// MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
|
||||
// MEM_REQ_SCRATCH1().at(ctx->model.type) +
|
||||
// MEM_REQ_EVAL().at(ctx->model.type);
|
||||
//fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
|
||||
|
||||
// recreate allocator with exact memory requirements
|
||||
ggml_allocr_free(ctx->alloc);
|
||||
|
||||
ctx->buf_alloc.resize(alloc_size);
|
||||
ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
|
||||
}
|
||||
#else
|
||||
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
|
||||
#endif
|
||||
|
||||
#ifdef LLAMA_USE_SCRATCH
|
||||
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
|
||||
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
@@ -3253,9 +3379,6 @@ struct llama_context * llama_init_from_file(
|
||||
}
|
||||
|
||||
void llama_free(struct llama_context * ctx) {
|
||||
if (ctx->model_owner) {
|
||||
delete &ctx->model;
|
||||
}
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user