Compare commits

..

22 Commits

Author SHA1 Message Date
Henri Vasserman
acc111caf9 Allow old Make to build server. (#2098)
Also make server build by default.

Tested with Make 3.82
2023-07-04 15:38:04 +03:00
ZhouYuChen
23c7c6fc91 Update Makefile: clean simple (#2097) 2023-07-04 14:15:16 +02:00
Erik Scholz
698efad5fb CI: make the brew update temporarily optional. (#2092)
until they decide to fix the brew installation in the macos runners.
see the open issues. eg https://github.com/actions/runner-images/pull/7710
2023-07-04 01:50:12 +02:00
Govlzkoy
14a2cc71f6 [ggml] fix index for ne03 value in ggml_cl_mul_f32 (#2088) 2023-07-04 07:50:00 +08:00
Henri Vasserman
1cf14ccef1 fix server crashes (#2076) 2023-07-04 00:05:23 +03:00
Howard Su
cc45a7feb8 Fix crash of test-tokenizer-0 under Debug build (#2064)
* Fix crash of test-tokenizer-0 under Debug build

* Change per comment
2023-07-03 20:43:55 +02:00
Howard Su
55dbb915cc [llama] No need to check file version when loading vocab score (#2079) 2023-07-03 19:58:58 +08:00
WangHaoranRobin
d7d2e6a0f0 server: add option to output probabilities for completion (#1962)
* server: add option to output probabilities for completion
* server: fix issue when handling probability output for incomplete tokens for multibyte character generation
* server: fix llama_sample_top_k order
* examples/common.h: put all bool variables in gpt_params together
2023-07-03 00:38:44 +03:00
Georgi Gerganov
46088f7231 ggml : fix build with OpenBLAS (close #2066) 2023-07-02 09:46:46 +03:00
Johannes Gäßler
0bc2cdfc87 Better CUDA synchronization logic (#2057) 2023-07-01 21:49:44 +02:00
Johannes Gäßler
befb3a3562 Test-based VRAM scratch size + context adjustment (#2056) 2023-07-01 21:47:26 +02:00
Daniel Drake
b213227067 cmake : don't force -mcpu=native on aarch64 (#2063)
It's currently not possible to cross-compile llama.cpp for aarch64
because CMakeLists.txt forces -mcpu=native for that target.

-mcpu=native doesn't make sense if your build host is not the
target architecture, and clang rejects it for that reason, aborting the
build. This can be easily reproduced using the current Android NDK to build
for aarch64 on an x86_64 host.

If there is not a specific CPU-tuning target for aarch64 then -mcpu
should be omitted completely. I think that makes sense, there is not
enough variance in the aarch64 instruction set to warrant a fixed -mcpu
optimization at this point. And if someone is building natively and wishes
to enable any possible optimizations for the host device, then there is
already the LLAMA_NATIVE option available.

Fixes #495.
2023-07-01 21:31:44 +03:00
Aaron Miller
2f8cd979ec metal : release buffers when freeing metal context (#2062) 2023-07-01 21:14:59 +03:00
Judd
471aab6e4c convert : add support of baichuan-7b (#2055)
Co-authored-by: Judd <foldl@boxvest.com>
2023-07-01 20:00:25 +03:00
Georgi Gerganov
463f2f4c4f llama : fix return value of llama_load_session_file_internal (#2022) 2023-07-01 19:05:09 +03:00
Rand Xie
cb44dbc7de llama : catch llama_load_session_file_internal exceptions (#2022)
* convert checks in llama_load_session_file to throw and handle them

* make llama_load_session_file_internal static

* address feedbacks to avoid using exceptions
2023-07-01 19:02:58 +03:00
Georgi Gerganov
79f634a19d embd-input : fix returning ptr to temporary 2023-07-01 18:46:00 +03:00
Georgi Gerganov
04606a1599 train : fix compile warning 2023-07-01 18:45:44 +03:00
Qingyou Meng
b1ca8f36a9 ggml : disable GGML_TASK_INIT and GGML_TASK_FINALIZE by default (#1995)
Will not be scheduled unless explicitly enabled.
2023-07-01 18:42:43 +03:00
Howard Su
b8c8dda75f Use unsigned for random seed (#2006)
* Use unsigned for random seed. Keep -1 as the value to use a time based seed.

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-29 06:15:15 -07:00
LostRuins
96a712ca1b Porting the improved K-Quant CUDA kernels to OpenCL (#1966)
* Added broken new q4k quant

* xx + ib0

* Fix q2_k fast kernel

* Use preprocessor for QK_K

* Add q6_k fast matmul kernel

* ported q3k speedup successfully

* ported q2k and q5k speedups

* remove old dot kernels and template

* fixed global const struct types

* fixing address spaces

* fixed string too long CI issue

---------

Co-authored-by: 0cc4m <picard12@live.de>
2023-06-29 05:56:43 +02:00
m3ndax
d3494bb86b llama : replacing auto &kv with const auto &kv (#2041)
* Replacing auto &kv with const auto &kv

* Create codacy.yml

* Delete codacy.yml
2023-06-28 21:39:08 +03:00
24 changed files with 725 additions and 307 deletions

View File

@@ -111,6 +111,7 @@ jobs:
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
@@ -129,6 +130,7 @@ jobs:
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update

View File

@@ -386,11 +386,6 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
if (MSVC)
# TODO: arm msvc?
else()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
# Apple M1, M2, etc.
# Raspberry Pi 3, 4, Zero 2 (64-bit)
add_compile_options(-mcpu=native)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
# Raspberry Pi 1, Zero
add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access)

View File

@@ -1,11 +1,5 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple libembdinput.so embd-input-test
ifdef LLAMA_BUILD_SERVER
BUILD_TARGETS += server
LLAMA_SERVER_VERBOSE ?= 1
server: private CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
endif
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server libembdinput.so embd-input-test
default: $(BUILD_TARGETS)
@@ -61,6 +55,10 @@ else
CXXFLAGS += -DNDEBUG
endif
ifdef LLAMA_SERVER_VERBOSE
CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
endif
# warnings
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
@@ -272,7 +270,7 @@ libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
clean:
rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch embd-input-test build-info.h
rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h
#
# Examples

View File

@@ -85,6 +85,7 @@ as the main playground for developing new features for the [ggml](https://github
- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B)
**Bindings:**

View File

@@ -136,7 +136,7 @@ def find_n_mult(n_ff: int, n_embd: int) -> int:
calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
if calc_ff == n_ff:
return n_mult
return 1
raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
@dataclass
class Params:
@@ -321,6 +321,10 @@ class Tensor(metaclass=ABCMeta):
@abstractmethod
def permute(self, n_head: int) -> 'Tensor': ...
@abstractmethod
def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
@abstractmethod
def part(self, n_part: int) -> 'UnquantizedTensor': ...
@abstractmethod
def to_ggml(self) -> 'GGMLCompatibleTensor': ...
@@ -345,6 +349,14 @@ class UnquantizedTensor(Tensor):
def to_ggml(self) -> 'UnquantizedTensor':
return self
def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
r = self.ndarray.shape[0] // 3
return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
def part(self, n_part: int) -> 'UnquantizedTensor':
r = self.ndarray.shape[0] // 3
return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
def permute(self, n_head: int) -> 'UnquantizedTensor':
return UnquantizedTensor(permute(self.ndarray, n_head))
@@ -642,6 +654,19 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
return lazy_tensor.load().permute(n_head)
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
def load() -> Tensor:
return lazy_tensor.load().permute_part(n_part, n_head)
s = lazy_tensor.shape.copy()
s[0] = s[0] // 3
return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
def load() -> Tensor:
return lazy_tensor.load().part(n_part)
s = lazy_tensor.shape.copy()
s[0] = s[0] // 3
return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
out: LazyModel = {}
@@ -650,11 +675,17 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
out["output.weight"] = model["lm_head.weight"]
for i in itertools.count():
if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
if f"model.layers.{i}.self_attn.q_proj.weight" in model:
out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
out[f"layers.{i}.attention.wk.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head)
out[f"layers.{i}.attention.wv.weight"] = part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
else:
break
out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"]

View File

@@ -110,7 +110,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
params.seed = std::stoi(argv[i]);
params.seed = std::stoul(argv[i]);
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;

View File

@@ -22,7 +22,7 @@
int32_t get_num_physical_cores();
struct gpt_params {
int32_t seed = -1; // RNG seed
uint32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
@@ -31,7 +31,7 @@ struct gpt_params {
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
bool low_vram = 0; // if true, reduce VRAM usage at the cost of performance
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
// sampling parameters
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
@@ -59,6 +59,7 @@ struct gpt_params {
std::string lora_adapter = ""; // lora adapter path
std::string lora_base = ""; // base model path for the lora adapter
bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs

View File

@@ -210,9 +210,12 @@ llama_token sampling_id(struct MyModel* mymodel) {
const char * sampling(struct MyModel * mymodel) {
llama_context * ctx = mymodel->ctx;
int id = sampling_id(mymodel);
std::string ret;
if (id == llama_token_eos()) ret = "</s>";
else ret = llama_token_to_str(ctx, id);
static std::string ret;
if (id == llama_token_eos()) {
ret = "</s>";
} else {
ret = llama_token_to_str(ctx, id);
}
eval_id(mymodel, id);
return ret.c_str();
}

View File

@@ -5,7 +5,6 @@
#include "llama.h"
#include "build-info.h"
extern "C" {
typedef struct MyModel {
@@ -14,14 +13,13 @@ typedef struct MyModel {
int n_past = 0;
} MyModel;
struct MyModel* create_mymodel(int argc, char ** argv);
bool eval_float(void* model, float* input, int N);
bool eval_tokens(void* model, std::vector<llama_token> tokens);
bool eval_id(struct MyModel* mymodel, int id);
bool eval_string(struct MyModel* mymodel, const char* str);
const char* sampling(struct MyModel* mymodel);
const char * sampling(struct MyModel* mymodel);
llama_token sampling_id(struct MyModel* mymodel);
void free_mymodel(struct MyModel* mymodel);

View File

@@ -24,11 +24,11 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed < 0) {
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {

View File

@@ -242,7 +242,7 @@ Example usage: `--logit-bias 29905-inf`
### RNG Seed
- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).
The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.

View File

@@ -94,11 +94,11 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed < 0) {
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {

View File

@@ -136,11 +136,11 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed < 0) {
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {

View File

@@ -152,7 +152,7 @@ node .
`mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).
`seed`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
`seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).
`ignore_eos`: Ignore end of stream token and continue generating (default: false).

View File

@@ -26,6 +26,17 @@ struct server_params {
int32_t write_timeout = 600;
};
// completion token output with probabilities
struct completion_token_output {
struct token_prob {
llama_token tok;
float prob;
};
std::vector<token_prob> probs;
llama_token tok;
};
static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
size_t i;
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
@@ -86,6 +97,40 @@ static void server_log(const char * level, const char * function, int line,
fflush(stdout);
}
// format incomplete utf-8 multibyte character for output
static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
std::string out = token == -1 ? "" : llama_token_to_str(ctx, token);
// if first bit is 1, meaning it's a partial character
if (out.size() > 0 && (out[0] & 0x80) == 0x80) {
std::stringstream ss;
ss<< std::hex << (out[0] & 0xff);
std::string res ( ss.str() );
out = "byte: \\x" + res;
}
return out;
}
// convert a vector of completion_token_output to json
static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> probs) {
json out = json::array();
for (const auto & prob : probs) {
json probs_for_token = json::array();
for (const auto & p : prob.probs) {
std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
probs_for_token.push_back(json {
{ "tok_str", tok_str },
{ "prob", p.prob },
});
}
std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
out.push_back(json {
{"content", tok_str},
{"probs", probs_for_token},
});
}
return out;
}
static bool server_verbose = false;
#if SERVER_VERBOSE != 1
@@ -107,6 +152,7 @@ struct llama_server_context {
bool stream = false;
bool has_next_token = false;
std::string generated_text;
std::vector<completion_token_output> generated_token_probs;
size_t num_tokens_predicted = 0;
size_t n_past = 0;
@@ -142,6 +188,7 @@ struct llama_server_context {
num_tokens_predicted = 0;
generated_text = "";
generated_text.reserve(params.n_ctx);
generated_token_probs.clear();
truncated = false;
stopped_eos = false;
stopped_word = false;
@@ -221,8 +268,9 @@ struct llama_server_context {
llama_set_rng_seed(ctx, params.seed);
}
llama_token nextToken() {
llama_token result = -1;
completion_token_output nextToken() {
completion_token_output result;
result.tok = -1;
if (embd.size() >= (size_t)params.n_ctx) {
// Reset context
@@ -261,7 +309,8 @@ struct llama_server_context {
if (params.n_predict == 0) {
has_next_token = false;
return llama_token_eos();
result.tok = llama_token_eos();
return result;
}
// out of user input, sample next token
@@ -278,7 +327,7 @@ struct llama_server_context {
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
const bool penalize_nl = params.penalize_nl;
llama_token id = 0;
const int32_t n_probs = params.n_probs;
{
auto * logits = llama_get_logits(ctx);
@@ -312,35 +361,42 @@ struct llama_server_context {
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx, &candidates_p);
result.tok = llama_sample_token_greedy(ctx, &candidates_p);
if (n_probs > 0) {
llama_sample_softmax(ctx, &candidates_p);
}
} else {
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100;
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
llama_sample_typical(ctx, &candidates_p, typical_p, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
size_t min_keep = std::max(1, n_probs);
llama_sample_top_k(ctx, &candidates_p, top_k, min_keep);
llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep);
llama_sample_typical(ctx, &candidates_p, typical_p, min_keep);
llama_sample_top_p(ctx, &candidates_p, top_p, min_keep);
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token(ctx, &candidates_p);
result.tok = llama_sample_token(ctx, &candidates_p);
}
}
for (size_t i = 0; i < std::min(candidates_p.size, (size_t) n_probs); ++i) {
result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p});
}
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id);
last_n_tokens.push_back(result.tok);
num_tokens_predicted++;
}
// add it to the context
embd.push_back(id);
result = id;
embd.push_back(result.tok);
// decrement remaining sampling budget
--n_remain;
@@ -382,12 +438,16 @@ struct llama_server_context {
return stop_pos;
}
std::string doCompletion() {
const llama_token token = nextToken();
completion_token_output doCompletion() {
const completion_token_output token_with_probs = nextToken();
const std::string token_text = token == -1 ? "" : llama_token_to_str(ctx, token);
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(ctx, token_with_probs.tok);
generated_text += token_text;
if (params.n_probs > 0) {
generated_token_probs.push_back(token_with_probs);
}
if (multibyte_pending > 0) {
multibyte_pending -= token_text.size();
} else if (token_text.size() == 1) {
@@ -416,8 +476,8 @@ struct llama_server_context {
}
LOG_VERBOSE("next token", {
{ "token", token },
{ "token_text", llama_token_to_str(ctx, token) },
{ "token", token_with_probs.tok },
{ "token_text", tokens_to_output_formatted_string(ctx, token_with_probs.tok) },
{ "has_next_token", has_next_token },
{ "n_remain", n_remain },
{ "num_tokens_predicted", num_tokens_predicted },
@@ -427,7 +487,7 @@ struct llama_server_context {
{ "stopping_word", stopping_word },
});
return token_text;
return token_with_probs;
}
std::vector<float> getEmbedding() {
@@ -669,6 +729,7 @@ static json format_generation_settings(llama_server_context & llama) {
{ "ignore_eos", ignore_eos },
{ "stream", llama.stream },
{ "logit_bias", llama.params.logit_bias },
{ "n_probs", llama.params.n_probs },
};
}
@@ -678,8 +739,9 @@ static json format_embedding_response(llama_server_context & llama) {
};
}
static json format_final_response(llama_server_context & llama, const std::string & content) {
return json {
static json format_final_response(llama_server_context & llama, const std::string & content, const std::vector<completion_token_output> & probs) {
json res = json {
{ "content", content },
{ "stop", true },
{ "model", llama.params.model_alias },
@@ -692,13 +754,25 @@ static json format_final_response(llama_server_context & llama, const std::strin
{ "stopped_limit", llama.stopped_limit },
{ "stopping_word", llama.stopping_word },
};
if (llama.params.n_probs > 0) {
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
}
return res;
}
static json format_partial_response(const std::string & content) {
return json {
static json format_partial_response(llama_server_context & llama, const std::string & content, const std::vector<completion_token_output> & probs) {
json res = json {
{ "content", content },
{ "stop", false },
};
if (llama.params.n_probs > 0) {
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
}
return res;
}
static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
@@ -728,6 +802,7 @@ static void parse_options_completion(const json & body, llama_server_context & l
llama.params.n_keep = body.value("n_keep", default_params.n_keep);
llama.params.seed = body.value("seed", default_params.seed);
llama.params.prompt = body.value("prompt", default_params.prompt);
llama.params.n_probs = body.value("n_probs", default_params.n_probs);
llama.params.logit_bias.clear();
if (body.value("ignore_eos", false)) {
@@ -830,7 +905,8 @@ int main(int argc, char ** argv) {
size_t stop_pos = std::string::npos;
while (llama.has_next_token) {
const std::string token_text = llama.doCompletion();
const completion_token_output token_with_probs = llama.doCompletion();
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
stop_pos = llama.findStoppingStrings(llama.generated_text,
token_text.size(), STOP_FULL);
@@ -844,7 +920,7 @@ int main(int argc, char ** argv) {
llama.generated_text.end());
}
const json data = format_final_response(llama, llama.generated_text);
const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs);
llama_print_timings(llama.ctx);
@@ -853,9 +929,11 @@ int main(int argc, char ** argv) {
} else {
const auto chunked_content_provider = [&](size_t, DataSink & sink) {
size_t sent_count = 0;
size_t sent_token_probs_index = 0;
while (llama.has_next_token) {
const std::string token_text = llama.doCompletion();
const completion_token_output token_with_probs = llama.doCompletion();
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
if (llama.multibyte_pending > 0) {
continue;
}
@@ -878,10 +956,22 @@ int main(int argc, char ** argv) {
const std::string to_send = llama.generated_text.substr(pos, stop_pos);
sent_count += to_send.size();
std::vector<completion_token_output> probs_output = {};
if (llama.params.n_probs > 0) {
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
if (probs_pos < probs_stop_pos) {
probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
}
sent_token_probs_index = probs_stop_pos;
}
const json data = llama.has_next_token
? format_partial_response(to_send)
? format_partial_response(llama, to_send, probs_output)
// Generation is done, send extra information.
: format_final_response(llama, to_send);
: format_final_response(llama, to_send, llama.generated_token_probs);
const std::string str =
"data: " +

View File

@@ -2671,7 +2671,8 @@ struct train_params {
const char * fn_checkpoint_out;
const char * fn_model_out;
int seed;
uint32_t seed;
int n_ctx;
int n_embd;
int n_mult;
@@ -2768,7 +2769,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p
fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in);
fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out);
fprintf(stderr, " --model-out FNAME path to save ggml model (default '%s')\n", params->fn_model_out);
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n");
fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx);
fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd);
fprintf(stderr, " --mult N Mult size used for new models, influences feedforward size. (default %d)\n", params->n_mult);
@@ -3034,10 +3035,10 @@ int main(int argc, char ** argv) {
return 1;
}
if (params.seed < 0) {
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
printf("%s: seed: %d\n", __func__, params.seed);
printf("%s: seed: %u\n", __func__, params.seed);
srand(params.seed);
struct llama_context_params llama_params = llama_context_default_params();

View File

@@ -214,6 +214,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
#endif
struct ggml_tensor_extra_gpu {
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
};
static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -1970,7 +1975,6 @@ inline void ggml_cuda_op_add(
} else {
GGML_ASSERT(false);
}
CUDA_CHECK(cudaGetLastError());
(void) src1;
(void) dst;
@@ -2002,7 +2006,6 @@ inline void ggml_cuda_op_mul(
// compute
mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
}
(void) dst;
@@ -2023,7 +2026,6 @@ inline void ggml_cuda_op_silu(
// compute
silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
(void) src1;
(void) dst;
@@ -2046,7 +2048,6 @@ inline void ggml_cuda_op_rms_norm(
// compute
rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
(void) src1;
(void) dst;
@@ -2125,7 +2126,6 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
GGML_ASSERT(false);
break;
}
CUDA_CHECK(cudaGetLastError());
#ifdef GGML_CUDA_DMMV_F16
if (src1_convert_f16) {
@@ -2202,7 +2202,6 @@ inline void ggml_cuda_op_rope(
// compute
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
(void) dst;
(void) src0_ddq_i;
@@ -2226,7 +2225,6 @@ inline void ggml_cuda_op_diag_mask_inf(
// compute
diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
(void) dst;
(void) src0_ddq_i;
@@ -2248,7 +2246,6 @@ inline void ggml_cuda_op_soft_max(
// compute
soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
(void) src1;
(void) dst;
@@ -2344,10 +2341,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
// if multiple GPUs are used they need to wait for the main GPU to finish
// if multiple devices are used they need to wait for the main device
// here an event is recorded that signifies that the main device has finished calculating the input data
if (split && g_device_count > 1) {
CUDA_CHECK(cudaSetDevice(g_main_device));
CUDA_CHECK(cudaDeviceSynchronize());
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
}
for (int id = 0; id < g_device_count; ++id) {
@@ -2373,6 +2371,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
int64_t row_diff = row_high - row_low;
cudaSetDevice(id);
cudaStream_t cudaStream_main = g_cudaStreams_main[id];
// wait for main GPU data if necessary
if (split && id != g_main_device) {
CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
}
if (src0_on_device && src0_is_contiguous) {
if (src0_is_f32) {
@@ -2448,8 +2452,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
}
const int64_t i11 = i13*ne12 + i12;
cudaStream_t cudaStream_main = g_cudaStreams_main[id];
// for split tensors the data begins at i0 == i0_offset_low
char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
@@ -2509,6 +2511,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
// do the computation
op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
CUDA_CHECK(cudaGetLastError());
// copy dst to host or other device if necessary
if (!dst_on_device) {
@@ -2538,6 +2541,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
}
}
// signify to main device that other device is done
if (split && g_device_count > 1 && id != g_main_device) {
CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
}
}
}
}
@@ -2549,7 +2557,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
}
CUDA_CHECK(cudaSetDevice(id));
CUDA_CHECK(cudaDeviceSynchronize());
if (src0_asq[id] > 0) {
ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
@@ -2564,6 +2571,21 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
}
}
// main device waits for all other devices to be finished
if (split && g_device_count > 1) {
CUDA_CHECK(cudaSetDevice(g_main_device));
for (int id = 0; id < g_device_count; ++id) {
if (id != g_main_device) {
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
}
}
}
if (dst->backend == GGML_BACKEND_CPU) {
CUDA_CHECK(cudaSetDevice(g_main_device));
CUDA_CHECK(cudaDeviceSynchronize());
}
}
void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2803,25 +2825,32 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
extra->data_device[id] = buf;
if (backend == GGML_BACKEND_GPU_SPLIT) {
CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
}
}
tensor->extra = extra;
}
void ggml_cuda_free_data(struct ggml_tensor * tensor) {
if (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) {
if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
return;
}
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
for (int id = 0; id < g_device_count; ++id) {
if (extra->data_device[id] == nullptr) {
continue;
if (extra->data_device[id] != nullptr) {
CUDA_CHECK(cudaSetDevice(id));
CUDA_CHECK(cudaFree(extra->data_device[id]));
}
CUDA_CHECK(cudaSetDevice(id));
CUDA_CHECK(cudaFree(extra->data_device[id]));
if (extra->events[id] != nullptr) {
CUDA_CHECK(cudaSetDevice(id));
CUDA_CHECK(cudaEventDestroy(extra->events[id]));
}
}
delete extra;

View File

@@ -8,10 +8,6 @@ extern "C" {
#define GGML_CUDA_MAX_DEVICES 16
struct ggml_tensor_extra_gpu {
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
};
void ggml_init_cublas(void);
void ggml_cuda_set_tensor_split(const float * tensor_split);

View File

@@ -202,7 +202,9 @@ struct ggml_metal_context * ggml_metal_init(void) {
void ggml_metal_free(struct ggml_metal_context * ctx) {
fprintf(stderr, "%s: deallocating\n", __func__);
for (int i = 0; i < ctx->n_buffers; ++i) {
[ctx->buffers[i].metal release];
}
free(ctx);
}

View File

@@ -21,11 +21,19 @@
#define CL_DMMV_BLOCK_SIZE 32
#ifndef K_QUANTS_PER_ITERATION
#define K_QUANTS_PER_ITERATION 1
#else
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
#endif
#define MULTILINE_QUOTE(...) #__VA_ARGS__
static std::string program_source = MULTILINE_QUOTE(
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
@@ -175,7 +183,9 @@ void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float
*v0 = vload_half(0, &x[ib + 0]);
*v1 = vload_half(0, &x[ib + 1]);
}
);
static std::string k_quants_source = MULTILINE_QUOTE(
inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m)
{
if (j < 4)
@@ -199,7 +209,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa
const int is = 8 * n + l / 16;
const uint8_t q = x[i].qs[32 * n + l];
__global float *y = yy + i * 256 + 128 * n;
__global float *y = yy + i * QK_K + 128 * n;
const float dall = vload_half(0, &x[i].d);
const float dmin = vload_half(0, &x[i].dmin);
@@ -231,7 +241,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
float d_all = vload_half(0, &x[i].d);
float dl = d_all * (us - 32);
__global float *y = yy + i * 256 + 128 * n + 32 * j;
__global float *y = yy + i * QK_K + 128 * n + 32 * j;
const __global uint8_t *q = x[i].qs + 32 * n;
const __global uint8_t *hm = x[i].hmask;
@@ -248,7 +258,7 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa
const int is = 2 * il;
const int n = 4;
__global float *y = yy + i * 256 + 64 * il + n * ir;
__global float *y = yy + i * QK_K + 64 * il + n * ir;
const float dall = vload_half(0, &x[i].d);
const float dmin = vload_half(0, &x[i].dmin);
@@ -277,7 +287,7 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa
const int ir = tid % 16;
const int is = 2 * il;
__global float *y = yy + i * 256 + 64 * il + 2 * ir;
__global float *y = yy + i * QK_K + 64 * il + 2 * ir;
const float dall = vload_half(0, &x[i].d);
const float dmin = vload_half(0, &x[i].dmin);
@@ -309,7 +319,7 @@ __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __globa
const int il = tid - 32 * ip;
const int is = 8 * ip + il / 16;
__global float *y = yy + i * 256 + 128 * ip + il;
__global float *y = yy + i * QK_K + 128 * ip + il;
const float d = vload_half(0, &x[i].d);
@@ -323,161 +333,383 @@ __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __globa
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
}
__kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
void vec_dot_q2_K(__global const struct block_q2_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
const int row = get_group_id(0);
int n = iqs / 128;
int r = iqs - 128 * n;
int l = r / 8;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
__global const float *y = yy + 128 * n + l;
__global const uint8_t *q = x[ib].qs + 32 * n + l;
__global const uint8_t *s = x[ib].scales + 8 * n;
__global const struct block_q2_K * x = xx + ib0;
const float dall = vload_half(0, &x[ib].d);
const float dmin = vload_half(0, &x[ib].dmin);
const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1
float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
+ y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
+ y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
+ y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
+ y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
+ y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
+ y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
+ y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
const int step = 16/K_QUANTS_PER_ITERATION;
*result = sum;
}
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const int in = tid - step*im; // 0...15 or 0...7
void vec_dot_q3_K(__global const struct block_q3_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
const int q_offset = 32*im + l0;
const int s_offset = 8*im;
const int y_offset = 128*im + l0;
const uint32_t kmask1 = 0x03030303;
const uint32_t kmask2 = 0x0f0f0f0f;
tmp[16 * ix + tid] = 0;
uint32_t aux[3];
uint32_t utmp[4];
uint32_t aux[4];
const uint8_t * d = (const uint8_t *)aux;
const uint8_t * m = (const uint8_t *)(aux + 2);
int n = iqs/128;
int r = iqs - 128*n;
int l = r/8;
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
__global const float * y = yy + 128*n + l;
__global const uint8_t * q = x[ib].qs + 32*n + l;
__global const uint8_t * hm = x[ib].hmask + l;
const int8_t * s = (const int8_t *)utmp + 8*n;
__global const float * y = yy + i * QK_K + y_offset;
__global const uint8_t * q = x[i].qs + q_offset;
aux[0] = x[ib].scales[0] | x[ib].scales[1] << 8 | x[ib].scales[2] << 16 | x[ib].scales[3] << 24;
aux[1] = x[ib].scales[4] | x[ib].scales[5] << 8 | x[ib].scales[6] << 16 | x[ib].scales[7] << 24;
aux[2] = x[ib].scales[8] | x[ib].scales[9] << 8 | x[ib].scales[10] << 16 | x[ib].scales[11] << 24;
const float dall = vload_half(0, &x[i].d);
const float dmin = vload_half(0, &x[i].dmin);
utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
__global const uint32_t * a = (__global const uint32_t *)(x[i].scales + s_offset);
aux[0] = a[0] & 0x0f0f0f0f;
aux[1] = a[1] & 0x0f0f0f0f;
aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
const float dall = vload_half(0, &x[ib].d);
const uint8_t m = 1 << (4*n);
float sum1 = 0, sum2 = 0;
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
+ y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
+ y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
+ y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
+ y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
+ y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
+ y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
+y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
+ y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
+ y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
+ y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
+ y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
+ y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
+ y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
+ y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
+ y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
}
tmp[16 * ix + tid] += dall * sum1 - dmin * sum2;
*result = sum * dall;
}
void vec_dot_q4_K(__global const struct block_q4_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
const int j = iqs / 64; // j is in 0...3
const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
const int is = 2*j; // is is in 0...6 in steps of 2
__global const float * y = yy + 64*j + ir;
__global const uint8_t * q = x[ib].qs + 32*j + ir;
const float dall = vload_half(0, &x[ib].d);
const float dmin = vload_half(0, &x[ib].dmin);
uint8_t sc, m;
get_scale_min_k4(is + 0, x[ib].scales, &sc, &m);
const float d1 = dall * sc;
const float m1 = dmin * m;
get_scale_min_k4(is + 1, x[ib].scales, &sc, &m);
const float d2 = dall * sc;
const float m2 = dmin * m;
float sum = 0;
for (int k = 0; k < 4; ++k) {
sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1);
sum += y[k + 32] * (d2 * (q[k] >> 4) - m2);
}
*result = sum;
// sum up partial sums and write back result
barrier(CLK_LOCAL_MEM_FENCE);
for (int s=16; s>0; s>>=1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid == 0) {
dst[row] = tmp[0];
}
}
void vec_dot_q5_K(__global const struct block_q5_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
__kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
const uint16_t kmask1 = 0x0303;
const uint16_t kmask2 = 0x0f0f;
const int j = iqs / 64;
const int ir = (iqs - 64*j)/2;
const int is = 2*j;
const int row = get_group_id(0);
__global const float * y = yy + 64*j + ir;
__global const uint8_t * ql = x[ib].qs + 32*j + ir;
__global const uint8_t * qh = x[ib].qh + ir;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const float dall = vload_half(0, &x[ib].d);
const float dmin = vload_half(0, &x[ib].dmin);
__global const struct block_q3_K * x = xx + ib0;
uint8_t sc, m;
get_scale_min_k4(is + 0, x[ib].scales, &sc, &m);
const float d1 = dall * sc;
const float m1 = dmin * m;
get_scale_min_k4(is + 1, x[ib].scales, &sc, &m);
const float d2 = dall * sc;
const float m2 = dmin * m;
const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1
const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
const int step = 16/K_QUANTS_PER_ITERATION;
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const int in = tid - step*im; // 0....15 or 0...7
const uint8_t m = 1 << (4*im);
const int l0 = n*in; // 0...15 or 0...14 in steps of 2
const int q_offset = 32*im + l0;
const int y_offset = 128*im + l0;
uint16_t utmp[4];
const int8_t * s = (const int8_t *)utmp;
const uint16_t s_shift = 4*im;
tmp[16 * ix + tid] = 0;
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
__global const float * y = yy + i * QK_K + y_offset;
__global const uint8_t * q = x[i].qs + q_offset;
__global const uint8_t * h = x[i].hmask + l0;
__global const uint16_t * a = (__global const uint16_t *)x[i].scales;
utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
const float d = vload_half(0, &x[i].d);
float sum = 0;
for (int l = 0; l < n; ++l) {
sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
+ y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
+ y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
+ y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
+ y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
+ y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
+ y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
}
tmp[16 * ix + tid] += d * sum;
uint8_t hm = 1 << is;
float sum = 0;
for (int k = 0; k < 4; ++k) {
sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
}
hm <<= 1;
for (int k = 0; k < 4; ++k) {
sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2);
}
*result = sum;
// sum up partial sums and write back result
barrier(CLK_LOCAL_MEM_FENCE);
for (int s=16; s>0; s>>=1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid == 0) {
dst[row] = tmp[0];
}
}
void vec_dot_q6_K(__global const struct block_q6_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
__kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
//to rename it later, just to test now
const uint16_t kmask1 = 0x3f3f;
const uint16_t kmask2 = 0x0f0f;
const uint16_t kmask3 = 0xc0c0;
const int ip = iqs / 128; // 0 or 1
const int il = (iqs - 128*ip)/8; // 0...15
const int is = 8*ip;
const int row = get_group_id(0);
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
__global const float * y = yy + 128*ip + il;
const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15
const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;
const float d = vload_half(0, &x[ib].d);
const int step = 8/K_QUANTS_PER_ITERATION;
__global const uint8_t * ql = x[ib].ql + 64*ip + il;
__global const uint8_t * qh = x[ib].qh + 32*ip + il;
__global const int8_t * sc = x[ib].scales + is;
const int il = tid/step; // 0...3
const int ir = tid - step*il;// 0...3
const int n = 2*K_QUANTS_PER_ITERATION;
*result = y[ 0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32)
+ y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32)
+ y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32)
+ y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32)
+ y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32)
+ y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32)
+ y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32)
+ y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32);
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
const int in = il%2;
const int l0 = n*(2*ir + in);
const int q_offset = 32*im + l0;
const int y_offset = 64*im + l0;
uint16_t aux[4];
const uint8_t * sc = (const uint8_t *)aux;
__global const struct block_q4_K * x = xx + ib0;
tmp[16 * ix + tid] = 0;
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
__global const uint8_t * q1 = x[i].qs + q_offset;
__global const uint8_t * q2 = q1 + 64;
__global const float * y1 = yy + i*QK_K + y_offset;
__global const float * y2 = y1 + 128;
const float dall = vload_half(0, &x[i].d);
const float dmin = vload_half(0, &x[i].dmin);
__global const uint16_t * a = (__global const uint16_t *)x[i].scales;
aux[0] = a[im+0] & kmask1;
aux[1] = a[im+2] & kmask1;
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
float4 s = (float4)(0.f);
float smin = 0;
for (int l = 0; l < n; ++l) {
s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
}
tmp[16 * ix + tid] += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
}
// sum up partial sums and write back result
barrier(CLK_LOCAL_MEM_FENCE);
for (int s=16; s>0; s>>=1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid == 0) {
dst[row] = tmp[0];
}
}
__kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
const uint16_t kmask1 = 0x3f3f;
const uint16_t kmask2 = 0x0f0f;
const uint16_t kmask3 = 0xc0c0;
const int row = get_group_id(0);
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const int tid = get_local_id(0)/2; // 0...15
const int ix = get_local_id(0)%2;
const int il = tid/4; // 0...3
const int ir = tid - 4*il;// 0...3
const int n = 2;
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
const int in = il%2;
const int l0 = n*(2*ir + in);
const int q_offset = 32*im + l0;
const int y_offset = 64*im + l0;
const uint8_t hm1 = 1 << (2*im);
const uint8_t hm2 = hm1 << 4;
uint16_t aux[4];
const uint8_t * sc = (const uint8_t *)aux;
__global const struct block_q5_K * x = xx + ib0;
tmp[16 * ix + tid] = 0;
for (int i = ix; i < num_blocks_per_row; i += 2) {
__global const uint8_t * ql1 = x[i].qs + q_offset;
__global const uint8_t * ql2 = ql1 + 64;
__global const uint8_t * qh = x[i].qh + l0;
__global const float * y1 = yy + i*QK_K + y_offset;
__global const float * y2 = y1 + 128;
const float dall = vload_half(0, &x[i].d);
const float dmin = vload_half(0, &x[i].dmin);
__global const uint16_t * a = (__global const uint16_t *)x[i].scales;
aux[0] = a[im+0] & kmask1;
aux[1] = a[im+2] & kmask1;
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
float4 sum = (float4)(0.f);
float smin = 0;
for (int l = 0; l < n; ++l) {
sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
+ y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
+ y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
+ y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
+ y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
+ (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
}
tmp[16 * ix + tid] += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
}
// sum up partial sums and write back result
barrier(CLK_LOCAL_MEM_FENCE);
for (int s=16; s>0; s>>=1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid == 0) {
dst[row] = tmp[0];
}
}
__kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, __local float* tmp, __global const float * yy, __global float * dst, const int ncols) {
const int row = get_group_id(0);
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
__global const struct block_q6_K * x = xx + ib0;
const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0, 1
const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const int in = tid - step*im; // 0...15 or 0...7
#if K_QUANTS_PER_ITERATION == 1
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
const int is = 0;
#else
const int l0 = 4 * in; // 0, 4, 8, ..., 28
const int is = in / 4;
#endif
const int ql_offset = 64*im + l0;
const int qh_offset = 32*im + l0;
const int s_offset = 8*im + is;
const int y_offset = 128*im + l0;
tmp[16 * ix + tid] = 0; // partial sum for thread in warp
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
__global const float * y = yy + i * QK_K + y_offset;
__global const uint8_t * ql = x[i].ql + ql_offset;
__global const uint8_t * qh = x[i].qh + qh_offset;
__global const int8_t * s = x[i].scales + s_offset;
const float d = vload_half(0, &x[i].d);
#if K_QUANTS_PER_ITERATION == 1
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
+ y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
+ y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
+ y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
tmp[16 * ix + tid] += sum;
#else
float sum = 0;
for (int l = 0; l < 4; ++l) {
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
+ y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
+ y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
}
tmp[16 * ix + tid] += sum;
#endif
}
// sum up partial sums and write back result
barrier(CLK_LOCAL_MEM_FENCE);
for (int s=16; s>0; s>>=1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid == 0) {
dst[row] = tmp[0];
}
}
);
@@ -549,44 +781,6 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
}
);
std::string dequant_mul_mat_vec_k_template = MULTILINE_QUOTE(
__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
const int block_size = get_local_size(0);
const int row = get_group_id(0);
const int tid = get_local_id(0);
const int iter_stride = 256;
const int vals_per_iter = iter_stride / block_size;
const int num_blocks_per_row = ncols / 256;
const int ib0 = row*num_blocks_per_row;
tmp[tid] = 0;
for (int i = 0; i < ncols; i += iter_stride) {
const int col = i + vals_per_iter*tid;
const int ib = ib0 + col/256; // x block index
const int iqs = col%256; // x quant index
const int iybs = col - col%256; // y block start index
// dequantize
float v;
DOT_KERNEL(x, ib, iqs, y + iybs, &v);
tmp[tid] += v;
}
// sum up partial sums and write back result
barrier(CLK_LOCAL_MEM_FENCE);
for (int s=block_size/2; s>0; s>>=1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid == 0) {
dst[row] = tmp[0];
}
}
);
std::string mul_template = MULTILINE_QUOTE(
__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
@@ -649,18 +843,6 @@ std::array<std::string, 2> mul_str_values = {
"mul_f32", "float"
};
std::array<std::string, 3> dmmv_k_str_keys = {
"KERNEL_NAME", "X_TYPE", "DOT_KERNEL"
};
std::array<std::string, 15> dmmv_k_str_values = {
"dequantize_mul_mat_vec_q2_K", "struct block_q2_K", "vec_dot_q2_K",
"dequantize_mul_mat_vec_q3_K", "struct block_q3_K", "vec_dot_q3_K",
"dequantize_mul_mat_vec_q4_K", "struct block_q4_K", "vec_dot_q4_K",
"dequantize_mul_mat_vec_q5_K", "struct block_q5_K", "vec_dot_q5_K",
"dequantize_mul_mat_vec_q6_K", "struct block_q6_K", "vec_dot_q6_K",
};
std::string& replace(std::string& s, const std::string& from, const std::string& to) {
size_t pos = 0;
while ((pos = s.find(from, pos)) != std::string::npos) {
@@ -673,6 +855,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
std::string generate_kernels() {
std::stringstream src;
src << program_source << '\n';
src << k_quants_source << '\n';
for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
std::string dequant_kernel = dequant_template;
std::string dmmv_kernel = dequant_mul_mat_vec_template;
@@ -690,13 +873,6 @@ std::string generate_kernels() {
}
src << mul_kernel << '\n';
}
for (size_t i = 0; i < dmmv_k_str_values.size(); i += dmmv_k_str_keys.size()) {
std::string dmmv_k_kernel = dequant_mul_mat_vec_k_template;
for (size_t j = 0; j < dmmv_k_str_keys.size(); j++) {
replace(dmmv_k_kernel, dmmv_k_str_keys[j], dmmv_k_str_values[i + j]);
}
src << dmmv_k_kernel << '\n';
}
return src.str();
}
@@ -729,10 +905,11 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
exit(1);
}
const char* compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math "
"-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1";
std::string compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math "
"-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1 "
"-DQK_K=256 -DK_QUANTS_PER_ITERATION=" + std::to_string(K_QUANTS_PER_ITERATION);
err = clBuildProgram(p, 0, NULL, compile_opts, NULL, NULL);
err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL);
if(err < 0) {
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
@@ -1199,7 +1376,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];

62
ggml.c
View File

@@ -3846,6 +3846,41 @@ static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
// WARN:
// Mis-confguration can lead to problem that's hard to reason about:
// * At best it crash or talks nosense.
// * At worst it talks slightly difference but hard to perceive.
//
// An op has to enable INIT or FINALIZE when any of it's branch needs that pass.
// Take care about compile options (e.g., GGML_USE_xxx).
static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 };
static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
static void ggml_setup_op_has_task_pass(void) {
{ // INIT
bool * p = GGML_OP_HAS_INIT;
p[GGML_OP_ACC ] = true;
p[GGML_OP_MUL_MAT ] = true;
p[GGML_OP_OUT_PROD ] = true;
p[GGML_OP_SET ] = true;
p[GGML_OP_GET_ROWS_BACK ] = true;
p[GGML_OP_DIAG_MASK_INF ] = true;
p[GGML_OP_DIAG_MASK_ZERO ] = true;
p[GGML_OP_CONV_1D_S1_PH ] = true;
p[GGML_OP_CONV_1D_S2_PH ] = true;
p[GGML_OP_CONV_2D_SK_P0 ] = true;
p[GGML_OP_FLASH_ATTN_BACK ] = true;
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
}
{ // FINALIZE
bool * p = GGML_OP_HAS_FINALIZE;
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
}
}
//
// ggml context
//
@@ -4267,6 +4302,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
ggml_cl_init();
#endif
ggml_setup_op_has_task_pass();
is_first_call = false;
}
@@ -16791,9 +16828,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
if (node_n != -1) {
/* FINALIZE */
struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
params.nth = node->n_tasks;
ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
if (GGML_OP_HAS_FINALIZE[node->op]) {
params.nth = node->n_tasks;
ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
}
}
// distribute new work or execute it direct if 1T
@@ -16805,10 +16844,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
state->shared->perf_node_start_cycles = ggml_perf_cycles();
state->shared->perf_node_start_time_us = ggml_perf_time_us();
params.nth = node->n_tasks;
/* INIT */
params.type = GGML_TASK_INIT;
params.nth = node->n_tasks;
ggml_compute_forward(&params, node);
if (GGML_OP_HAS_INIT[node->op]) {
params.type = GGML_TASK_INIT;
ggml_compute_forward(&params, node);
}
if (node->n_tasks == 1) {
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
@@ -16816,9 +16858,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
params.type = GGML_TASK_COMPUTE;
ggml_compute_forward(&params, node);
params.type = GGML_TASK_FINALIZE;
ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
if (GGML_OP_HAS_FINALIZE[node->op]) {
params.type = GGML_TASK_FINALIZE;
ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
}
} else {
break;
}

3
ggml.h
View File

@@ -444,6 +444,9 @@ extern "C" {
// compute types
// NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
// This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
enum ggml_task_type {
GGML_TASK_INIT = 0,
GGML_TASK_COMPUTE,

View File

@@ -66,6 +66,7 @@ enum e_model {
MODEL_65B,
};
static const size_t kB = 1024;
static const size_t MB = 1024*1024;
// computed for n_ctx == 2048
@@ -129,6 +130,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
return k_sizes;
}
// amount of VRAM needed per batch size to hold temporary results
// the values for 3b and 65b are not derived from testing but instead chosen conservatively
static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
{
static std::map<e_model, size_t> k_sizes = {
{ MODEL_3B, 512ull * kB },
{ MODEL_7B, 512ull * kB },
{ MODEL_13B, 640ull * kB },
{ MODEL_30B, 768ull * kB },
{ MODEL_65B, 1536ull * kB },
};
return k_sizes;
}
// amount of VRAM needed per batch size and context to hold temporary results
// the values for 3b and 65b are not derived from testing but instead chosen conservatively
static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
{
static std::map<e_model, size_t> k_sizes = {
{ MODEL_3B, 128ull },
{ MODEL_7B, 128ull },
{ MODEL_13B, 160ull },
{ MODEL_30B, 208ull },
{ MODEL_65B, 416ull },
};
return k_sizes;
}
// default hparams (LLaMA 7B)
struct llama_hparams {
uint32_t n_vocab = 32000;
@@ -165,8 +194,8 @@ struct llama_layer {
};
struct llama_kv_cache {
struct ggml_tensor * k;
struct ggml_tensor * v;
struct ggml_tensor * k = NULL;
struct ggml_tensor * v = NULL;
struct ggml_context * ctx = NULL;
@@ -253,7 +282,13 @@ struct llama_model {
struct llama_context {
llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
#ifdef GGML_USE_METAL
~llama_context() {
if (ctx_metal) {
ggml_metal_free(ctx_metal);
}
}
#endif
std::mt19937 rng;
bool has_evaluated_once = false;
@@ -446,9 +481,7 @@ struct llama_file_loader {
std::string word = file.read_string(len);
float score = 0.0f;
if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) {
file.read_raw(&score, sizeof(score));
}
file.read_raw(&score, sizeof(score));
vocab.token_to_id[word] = i;
@@ -777,7 +810,7 @@ static bool kv_cache_init(
struct llama_context_params llama_context_default_params() {
struct llama_context_params result = {
/*.seed =*/ -1,
/*.seed =*/ LLAMA_DEFAULT_SEED,
/*.n_ctx =*/ 512,
/*.n_batch =*/ 512,
/*.gpu_layers =*/ 0,
@@ -1112,11 +1145,14 @@ static void llama_model_load_internal(
fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
ggml_cuda_set_scratch_size(0); // disable scratch
} else {
vram_scratch = n_batch * MB;
const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
ggml_cuda_set_scratch_size(vram_scratch);
if (n_gpu_layers > 0) {
fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
__func__, vram_scratch / MB);
fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
__func__, vram_scratch_base / kB, vram_scratch_per_context,
(vram_scratch + MB - 1) / MB); // round up
}
}
#endif // GGML_USE_CUBLAS
@@ -2541,7 +2577,7 @@ struct llama_context * llama_new_context_with_model(
llama_context * ctx = new llama_context(*model, model->vocab);
if (params.seed < 0) {
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
@@ -2723,7 +2759,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
// create a name -> tensor map of the model to accelerate lookups
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
for (auto & kv: model.tensors_by_name) {
for (const auto & kv: model.tensors_by_name) {
model_tensors.insert(kv);
}
@@ -2974,8 +3010,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
#define LLAMA_MAX_RNG_STATE (64*1024)
void llama_set_rng_seed(struct llama_context * ctx, int seed) {
if (seed < 0) {
void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
if (seed == LLAMA_DEFAULT_SEED) {
seed = time(NULL);
}
ctx->rng.seed(seed);
@@ -3219,7 +3255,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
return nread;
}
bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
llama_file file(path_session, "rb");
// sanity checks
@@ -3273,6 +3309,15 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
return true;
}
bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
try {
return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
} catch (const std::exception & err) {
fprintf(stderr, "error loading session file: %s\n", err.what());
return false;
}
}
bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
llama_file file(path_session, "wb");

14
llama.h
View File

@@ -46,6 +46,8 @@
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 1
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
@@ -81,11 +83,11 @@ extern "C" {
typedef void (*llama_progress_callback)(float progress, void *ctx);
struct llama_context_params {
int seed; // RNG seed, -1 for random
int n_ctx; // text context
int n_batch; // prompt processing batch size
int n_gpu_layers; // number of layers to store in VRAM
int main_gpu; // the GPU that is used for scratch and small tensors
uint32_t seed; // RNG seed, -1 for random
int32_t n_ctx; // text context
int32_t n_batch; // prompt processing batch size
int32_t n_gpu_layers; // number of layers to store in VRAM
int32_t main_gpu; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
// called with a progress value between 0 and 1, pass NULL to disable
llama_progress_callback progress_callback;
@@ -196,7 +198,7 @@ extern "C" {
LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
// Sets the current rng seed.
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
// Returns the maximum size in bytes of the state (rng, logits, embedding
// and kv_cache) - will often be smaller after compacting tokens