mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-26 14:23:22 +02:00
Compare commits
5 Commits
master-fb6
...
master-0cd
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0cd22e190a | ||
|
|
6456a4eb9f | ||
|
|
cdd5350892 | ||
|
|
738ace394a | ||
|
|
699b1ad7fe |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -16,6 +16,7 @@ build-debug/
|
||||
build-release/
|
||||
build-static/
|
||||
build-cublas/
|
||||
build-opencl/
|
||||
build-no-accel/
|
||||
build-sanitize-addr/
|
||||
build-sanitize-thread/
|
||||
|
||||
@@ -9,7 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
**Hot topics:**
|
||||
|
||||
- Qauntization formats `Q4` and `Q5` have changed - requantize any old models [(info)](https://github.com/ggerganov/llama.cpp/pull/1405)
|
||||
- Quantization formats `Q4` and `Q5` have changed - requantize any old models [(info)](https://github.com/ggerganov/llama.cpp/pull/1405)
|
||||
- [Roadmap May 2023](https://github.com/ggerganov/llama.cpp/discussions/1220)
|
||||
|
||||
<details>
|
||||
@@ -333,12 +333,12 @@ Several quantization methods are supported. They differ in the resulting model d
|
||||
|
||||
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
|
||||
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
|
||||
| 7B | perplexity | 5.9066 | 6.1620 | 6.0910 | 5.9862 | 5.9481 | 5.9069 |
|
||||
| 7B | perplexity | 5.9066 | 6.1565 | 6.0910 | 5.9862 | 5.9481 | 5.9069 |
|
||||
| 7B | file size | 13.0G | 4.0G | 4.8G | 4.4G | 4.8G | 7.1G |
|
||||
| 7B | ms/tok @ 4th | 128 | 50 | 54 | 75 | 83 | 75 |
|
||||
| 7B | ms/tok @ 8th | 123 | 44 | 52 | 53 | 58 | 72 |
|
||||
| 7B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 |
|
||||
| 13B | perplexity | 5.2543 | 5.3863 | 5.3607 | 5.2856 | 5.2706 | 5.2548 |
|
||||
| 13B | perplexity | 5.2543 | 5.3860 | 5.3607 | 5.2856 | 5.2706 | 5.2548 |
|
||||
| 13B | file size | 25.0G | 7.6G | 9.1G | 8.4G | 9.1G | 14G |
|
||||
| 13B | ms/tok @ 4th | 239 | 93 | 101 | 150 | 164 | 141 |
|
||||
| 13B | ms/tok @ 8th | 240 | 81 | 96 | 96 | 104 | 136 |
|
||||
|
||||
@@ -56,9 +56,6 @@ int main(int argc, char ** argv) {
|
||||
// tokenize the prompt
|
||||
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
// determine newline token
|
||||
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
|
||||
|
||||
if (params.verbose_prompt) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
|
||||
209
ggml-opencl.c
209
ggml-opencl.c
@@ -12,109 +12,129 @@
|
||||
#define MULTILINE_QUOTE(...) #__VA_ARGS__
|
||||
const char * clblast_dequant = MULTILINE_QUOTE(
|
||||
|
||||
typedef uchar uint8_t;
|
||||
typedef int int32_t;
|
||||
typedef uint uint32_t;
|
||||
|
||||
constant uint QK4_0 = 32;
|
||||
struct block_q4_0
|
||||
{
|
||||
float d;
|
||||
uchar qs[16];
|
||||
uint8_t qs[QK4_0 / 2];
|
||||
};
|
||||
|
||||
__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
|
||||
const uint i = get_global_id(0) / 32;
|
||||
const uint l = get_local_id(0);
|
||||
|
||||
const float d = blocks[i].d;
|
||||
|
||||
const uchar vi = blocks[i].qs[l];
|
||||
|
||||
const uint index = i*32 + l*2;
|
||||
result[index + 0] = ((vi & 0xf) - 8)*d;
|
||||
result[index + 1] = ((vi >> 4) - 8)*d;
|
||||
}
|
||||
|
||||
constant uint QK4_1 = 32;
|
||||
struct block_q4_1
|
||||
{
|
||||
float d;
|
||||
float m;
|
||||
uchar qs[16];
|
||||
uint8_t qs[QK4_1 / 2];
|
||||
};
|
||||
|
||||
__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
|
||||
const uint i = get_global_id(0) / 32;
|
||||
const uint l = get_local_id(0);
|
||||
|
||||
const float d = blocks[i].d;
|
||||
const float m = blocks[i].m;
|
||||
|
||||
const uchar vi = blocks[i].qs[l];
|
||||
|
||||
const uint index = i*32 + l*2;
|
||||
result[index + 0] = (vi & 0xf) * d + m;
|
||||
result[index + 1] = (vi >> 4) * d + m;
|
||||
}
|
||||
|
||||
struct block_q5_0
|
||||
constant uint QK5_0 = 32;
|
||||
struct __attribute__ ((packed)) block_q5_0
|
||||
{
|
||||
float d;
|
||||
uint qh;
|
||||
uchar qs[16];
|
||||
half d;
|
||||
uint32_t qh;
|
||||
uint8_t qs[QK5_0 / 2];
|
||||
};
|
||||
|
||||
__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) {
|
||||
const uint i = get_global_id(0) / 32;
|
||||
const uint l = get_local_id(0);
|
||||
|
||||
const float d = blocks[i].d;
|
||||
|
||||
const uchar vi = blocks[i].qs[l];
|
||||
|
||||
const uint l2 = l * 2;
|
||||
|
||||
const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
|
||||
const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
|
||||
|
||||
const uint index = i*32 + l2;
|
||||
result[index + 0] = (((vi & 0xf) | vh0) - 16)*d;
|
||||
result[index + 1] = (((vi >> 4) | vh1) - 16)*d;
|
||||
}
|
||||
|
||||
constant uint QK5_1 = 32;
|
||||
struct block_q5_1
|
||||
{
|
||||
ushort d;
|
||||
ushort m;
|
||||
uint qh;
|
||||
uchar qs[16];
|
||||
half d;
|
||||
half m;
|
||||
uint32_t qh;
|
||||
uint8_t qs[QK5_1 / 2];
|
||||
};
|
||||
|
||||
__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) {
|
||||
const uint i = get_global_id(0) / 32;
|
||||
const uint l = get_local_id(0);
|
||||
|
||||
const float d = vload_half(0, (__global half*) &blocks[i].d);
|
||||
const float m = vload_half(0, (__global half*) &blocks[i].m);
|
||||
|
||||
const uchar vi = blocks[i].qs[l];
|
||||
|
||||
const uint l2 = l * 2;
|
||||
|
||||
const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
|
||||
const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
|
||||
|
||||
const uint index = i*32 + l2;
|
||||
result[index + 0] = ((vi & 0xf) | vh0)*d + m;
|
||||
result[index + 1] = ((vi >> 4) | vh1)*d + m;
|
||||
}
|
||||
|
||||
constant uint QK8_0 = 32;
|
||||
struct block_q8_0
|
||||
{
|
||||
float d;
|
||||
char qs[32];
|
||||
uint8_t qs[QK8_0];
|
||||
};
|
||||
|
||||
__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
|
||||
const uint i = get_global_id(0) / 32;
|
||||
const uint l = get_local_id(0);
|
||||
|
||||
result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
|
||||
__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
|
||||
constant uint qk = QK4_0;
|
||||
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
const int x0 = (x[i].qs[j] & 0xf) - 8;
|
||||
const int x1 = (x[i].qs[j] >> 4) - 8;
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d;
|
||||
y[i*qk + j + qk/2] = x1*d;
|
||||
}
|
||||
|
||||
__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
|
||||
constant uint qk = QK4_1;
|
||||
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = x[i].d;
|
||||
const float m = x[i].m;
|
||||
|
||||
const int x0 = (x[i].qs[j] & 0xf);
|
||||
const int x1 = (x[i].qs[j] >> 4);
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d + m;
|
||||
y[i*qk + j + qk/2] = x1*d + m;
|
||||
}
|
||||
|
||||
__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
|
||||
constant uint qk = QK5_0;
|
||||
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = vload_half(0, (__global half*) &x[i].d);
|
||||
|
||||
uint32_t qh = x[i].qh;
|
||||
|
||||
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
||||
|
||||
const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
|
||||
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d;
|
||||
y[i*qk + j + qk/2] = x1*d;
|
||||
}
|
||||
|
||||
__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
|
||||
constant uint qk = QK5_1;
|
||||
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = vload_half(0, (__global half*) &x[i].d);
|
||||
const float m = vload_half(0, (__global half*) &x[i].m);
|
||||
|
||||
uint32_t qh = x[i].qh;
|
||||
|
||||
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
||||
|
||||
const int x0 = (x[i].qs[j] & 0xf) | xh_0;
|
||||
const int x1 = (x[i].qs[j] >> 4) | xh_1;
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d + m;
|
||||
y[i*qk + j + qk/2] = x1*d + m;
|
||||
}
|
||||
|
||||
__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
|
||||
constant uint qk = QK8_0;
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = x[i].d;
|
||||
y[i*qk + j] = x[i].qs[j]*d;
|
||||
}
|
||||
|
||||
);
|
||||
@@ -128,20 +148,6 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define QK5_0 32
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // delta
|
||||
uint8_t qh[4]; // 5-th bit of quants
|
||||
uint8_t qs[QK5_0 / 2]; // nibbles / quants
|
||||
} block_q5_0;
|
||||
|
||||
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
uint32_t qh; // 5-th bit of quants
|
||||
uint8_t qs[QK5_0 / 2]; // nibbles / quants
|
||||
} cl_block_q5_0;
|
||||
|
||||
static cl_platform_id platform;
|
||||
static cl_device_id device;
|
||||
static cl_context context;
|
||||
@@ -252,7 +258,6 @@ void ggml_cl_sgemm_wrapper(
|
||||
cl_kernel kernel;
|
||||
size_t global = n * k, local, size_qb;
|
||||
bool dequant;
|
||||
cl_block_q5_0* cl_host_b;
|
||||
|
||||
switch (btype) {
|
||||
case GGML_TYPE_F32:
|
||||
@@ -274,18 +279,7 @@ void ggml_cl_sgemm_wrapper(
|
||||
dequant = true;
|
||||
kernel = kernel_q5_0;
|
||||
local = 16;
|
||||
// For some reason OpenCL seems to be incapable of working with structs of size 22.
|
||||
// 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU...
|
||||
// TODO Find the reason, fix and remove workaround.
|
||||
const block_q5_0* b = (const block_q5_0*) host_b;
|
||||
cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32);
|
||||
for (size_t i = 0; i < global / 32; i++) {
|
||||
cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d);
|
||||
memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t));
|
||||
memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2);
|
||||
}
|
||||
host_b = (const float*) cl_host_b;
|
||||
size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32;
|
||||
size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
dequant = true;
|
||||
@@ -364,7 +358,4 @@ void ggml_cl_sgemm_wrapper(
|
||||
clWaitForEvents(1, &ev_c);
|
||||
clReleaseEvent(ev_sgemm);
|
||||
clReleaseEvent(ev_c);
|
||||
if (btype == GGML_TYPE_Q5_0) {
|
||||
free((void*) cl_host_b);
|
||||
}
|
||||
}
|
||||
|
||||
133
llama.cpp
133
llama.cpp
@@ -50,49 +50,49 @@ static const size_t MB = 1024*1024;
|
||||
|
||||
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
||||
{
|
||||
static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
{ MODEL_65B, 1024ull * MB },
|
||||
};
|
||||
return _MEM_REQ_SCRATCH0;
|
||||
return k_sizes;
|
||||
}
|
||||
|
||||
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
||||
{
|
||||
static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
{ MODEL_65B, 1024ull * MB },
|
||||
};
|
||||
return _MEM_REQ_SCRATCH1;
|
||||
return k_sizes;
|
||||
}
|
||||
|
||||
// 2*n_embd*n_ctx*n_layer*sizeof(float16)
|
||||
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
||||
{
|
||||
static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_7B, 1026ull * MB },
|
||||
{ MODEL_13B, 1608ull * MB },
|
||||
{ MODEL_30B, 3124ull * MB },
|
||||
{ MODEL_65B, 5120ull * MB },
|
||||
};
|
||||
return _MEM_REQ_KV_SELF;
|
||||
return k_sizes;
|
||||
}
|
||||
|
||||
// this is mostly needed for temporary mul_mat buffers to dequantize the data
|
||||
// not actually needed if BLAS is disabled
|
||||
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
||||
{
|
||||
static std::map<e_model, size_t> _MEM_REQ_EVAL = {
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_7B, 768ull * MB },
|
||||
{ MODEL_13B, 1024ull * MB },
|
||||
{ MODEL_30B, 1280ull * MB },
|
||||
{ MODEL_65B, 1536ull * MB },
|
||||
};
|
||||
return _MEM_REQ_EVAL;
|
||||
return k_sizes;
|
||||
}
|
||||
|
||||
// default hparams (LLaMA 7B)
|
||||
@@ -586,12 +586,12 @@ struct llama_model_loader {
|
||||
std::unique_ptr<llama_mmap> mapping;
|
||||
|
||||
llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
|
||||
auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
|
||||
auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
|
||||
file_loaders.emplace_back(first_file);
|
||||
uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
|
||||
for (uint32_t i = 1; i < n_parts; i++) {
|
||||
std::string fname = fname_base + "." + std::to_string(i);
|
||||
auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
|
||||
auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
|
||||
file_loaders.emplace_back(ith_file);
|
||||
if (ith_file->hparams != first_file->hparams) {
|
||||
throw format("llama.cpp: hparams inconsistent between files");
|
||||
@@ -638,7 +638,7 @@ struct llama_model_loader {
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
|
||||
struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
|
||||
auto it = tensors_map.name_to_idx.find(name);
|
||||
if (it == tensors_map.name_to_idx.end()) {
|
||||
throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
|
||||
@@ -667,7 +667,7 @@ struct llama_model_loader {
|
||||
return tensor;
|
||||
}
|
||||
|
||||
void done_getting_tensors() {
|
||||
void done_getting_tensors() const {
|
||||
if (num_ggml_tensors_created != tensors_map.tensors.size()) {
|
||||
throw std::string("llama.cpp: file contained more tensors than expected");
|
||||
}
|
||||
@@ -934,7 +934,8 @@ static void llama_model_load_internal(
|
||||
|
||||
auto & ctx = model.ctx;
|
||||
|
||||
size_t ctx_size, mmapped_size;
|
||||
size_t ctx_size;
|
||||
size_t mmapped_size;
|
||||
ml->calc_sizes(&ctx_size, &mmapped_size);
|
||||
fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
|
||||
|
||||
@@ -1074,7 +1075,7 @@ static bool llama_eval_internal(
|
||||
const auto & model = lctx.model;
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
auto & kv_self = model.kv_self;
|
||||
const auto & kv_self = model.kv_self;
|
||||
|
||||
LLAMA_ASSERT(!!kv_self.ctx);
|
||||
|
||||
@@ -1318,7 +1319,7 @@ static bool llama_eval_internal(
|
||||
}
|
||||
|
||||
// extract embeddings
|
||||
if (lctx.embedding.size()) {
|
||||
if (!lctx.embedding.empty()) {
|
||||
auto & embedding_out = lctx.embedding;
|
||||
|
||||
embedding_out.resize(n_embd);
|
||||
@@ -1369,6 +1370,8 @@ struct llama_sp_symbol {
|
||||
size_t n;
|
||||
};
|
||||
|
||||
static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
|
||||
|
||||
struct llama_sp_bigram {
|
||||
struct comparator {
|
||||
bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
|
||||
@@ -1401,7 +1404,7 @@ struct llama_tokenizer {
|
||||
sym.prev = index - 1;
|
||||
sym.next = offs == text.size() ? -1 : index + 1;
|
||||
index++;
|
||||
symbols_.emplace_back(std::move(sym));
|
||||
symbols_.emplace_back(sym);
|
||||
}
|
||||
|
||||
// seed the work queue with all possible 2-character tokens.
|
||||
@@ -1492,7 +1495,7 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
|
||||
llama_tokenizer tokenizer(vocab);
|
||||
std::vector<llama_vocab::id> output;
|
||||
|
||||
if (text.size() == 0) {
|
||||
if (text.empty()) {
|
||||
return output;
|
||||
}
|
||||
|
||||
@@ -1728,7 +1731,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
|
||||
const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
|
||||
if (token_iter == last_tokens + last_tokens_size) {
|
||||
continue;
|
||||
}
|
||||
@@ -1872,7 +1875,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
// Find max element
|
||||
auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
|
||||
auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
|
||||
return a.logit < b.logit;
|
||||
});
|
||||
|
||||
@@ -1925,7 +1928,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
nthread = std::thread::hardware_concurrency();
|
||||
}
|
||||
|
||||
std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
|
||||
std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
|
||||
/*vocab_only*/ false));
|
||||
llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
|
||||
|
||||
@@ -1979,7 +1982,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
} else if (tensor.type == GGML_TYPE_F16) {
|
||||
f32_conv_buf.resize(nelements * sizeof(float));
|
||||
f32_data = (float *) f32_conv_buf.addr;
|
||||
auto f16_data = (const ggml_fp16_t *) tensor.data;
|
||||
const auto * f16_data = (const ggml_fp16_t *) tensor.data;
|
||||
for (size_t i = 0; i < nelements; i++) {
|
||||
f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
|
||||
}
|
||||
@@ -2010,21 +2013,31 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
size_t first = counter; counter += chunk_size;
|
||||
if (first >= nelements) {
|
||||
if (!local_hist.empty()) {
|
||||
for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
|
||||
for (int j=0; j<int(local_hist.size()); ++j) {
|
||||
hist_cur[j] += local_hist[j];
|
||||
}
|
||||
new_size += local_size;
|
||||
}
|
||||
break;
|
||||
}
|
||||
lock.unlock();
|
||||
size_t last = std::min(nelements, first + chunk_size);
|
||||
if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
|
||||
if (local_hist.empty()) {
|
||||
local_hist.resize(hist_cur.size(), 0);
|
||||
}
|
||||
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
|
||||
}
|
||||
};
|
||||
if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
|
||||
for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
|
||||
if ((int) workers.size() < nthread_use - 1) {
|
||||
workers.resize(nthread_use - 1);
|
||||
}
|
||||
for (int it = 0; it < nthread_use - 1; ++it) {
|
||||
workers[it] = std::thread(compute);
|
||||
}
|
||||
compute();
|
||||
for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
|
||||
for (int it = 0; it < nthread_use - 1; ++it) {
|
||||
workers[it].join();
|
||||
}
|
||||
}
|
||||
|
||||
printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
|
||||
@@ -2222,7 +2235,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
||||
fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
|
||||
model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
|
||||
|
||||
size_t ctx_size, mmapped_size;
|
||||
size_t ctx_size;
|
||||
size_t mmapped_size;
|
||||
model_loader->calc_sizes(&ctx_size, &mmapped_size);
|
||||
base_buf.resize(ctx_size);
|
||||
|
||||
@@ -2261,8 +2275,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
||||
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||
}
|
||||
|
||||
std::string name(length, 0);
|
||||
fin.read(&name[0], length);
|
||||
std::string name;
|
||||
{
|
||||
char buf[1024];
|
||||
fin.read(buf, length);
|
||||
name = std::string(buf, length);
|
||||
}
|
||||
|
||||
// check for lora suffix and get the type of tensor
|
||||
const std::string lora_suffix = ".lora";
|
||||
@@ -2277,7 +2295,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
||||
base_name.erase(pos);
|
||||
// fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
|
||||
|
||||
if (model_tensors.find(base_name.data()) == model_tensors.end()) {
|
||||
if (model_tensors.find(base_name) == model_tensors.end()) {
|
||||
fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
|
||||
return 1;
|
||||
}
|
||||
@@ -2379,8 +2397,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
||||
lora_tensors.clear();
|
||||
|
||||
n_tensors++;
|
||||
if (n_tensors % 4 == 0)
|
||||
if (n_tensors % 4 == 0) {
|
||||
fprintf(stderr, ".");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2409,7 +2428,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
|
||||
return ctx->model.kv_self.n;
|
||||
}
|
||||
|
||||
#define LLAMA_MAX_RNG_STATE 64*1024
|
||||
#define LLAMA_MAX_RNG_STATE (64*1024)
|
||||
|
||||
void llama_set_rng_seed(struct llama_context * ctx, int seed) {
|
||||
if (seed < 0) {
|
||||
@@ -2450,8 +2469,8 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
|
||||
}
|
||||
|
||||
// Copies the state to the specified destination address
|
||||
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
|
||||
uint8_t * out = dest;
|
||||
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
||||
uint8_t * out = dst;
|
||||
|
||||
// copy rng
|
||||
{
|
||||
@@ -2511,7 +2530,9 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
|
||||
|
||||
if (kv_size) {
|
||||
const size_t elt_size = ggml_element_size(kv_self.k);
|
||||
|
||||
char buffer[4096];
|
||||
|
||||
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
|
||||
ggml_cgraph gf{};
|
||||
gf.n_threads = 1;
|
||||
@@ -2535,10 +2556,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
|
||||
ggml_graph_compute(cpy_ctx, &gf);
|
||||
|
||||
ggml_free(cpy_ctx);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t written = out - dest;
|
||||
const size_t written = out - dst;
|
||||
const size_t max_size = llama_get_state_size(ctx);
|
||||
|
||||
LLAMA_ASSERT(written <= max_size);
|
||||
@@ -2548,15 +2571,15 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
|
||||
|
||||
// Sets the state reading from the specified source address
|
||||
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
||||
const uint8_t * in = src;
|
||||
const uint8_t * inp = src;
|
||||
|
||||
// set rng
|
||||
{
|
||||
size_t rng_size;
|
||||
char rng_buf[LLAMA_MAX_RNG_STATE];
|
||||
|
||||
memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
|
||||
memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
|
||||
memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
|
||||
memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;
|
||||
|
||||
std::stringstream rng_ss;
|
||||
rng_ss.str(std::string(&rng_buf[0], rng_size));
|
||||
@@ -2570,30 +2593,30 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
||||
size_t logits_cap;
|
||||
size_t logits_size;
|
||||
|
||||
memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
|
||||
memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
|
||||
memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
|
||||
memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
|
||||
|
||||
LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
|
||||
|
||||
if (logits_size) {
|
||||
ctx->logits.resize(logits_size);
|
||||
memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
|
||||
memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
|
||||
}
|
||||
|
||||
in += logits_cap * sizeof(float);
|
||||
inp += logits_cap * sizeof(float);
|
||||
}
|
||||
|
||||
// set embeddings
|
||||
{
|
||||
size_t embedding_size;
|
||||
|
||||
memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
|
||||
memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
|
||||
|
||||
LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
|
||||
|
||||
if (embedding_size) {
|
||||
memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
|
||||
in += embedding_size * sizeof(float);
|
||||
memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
|
||||
inp += embedding_size * sizeof(float);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2608,25 +2631,27 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
||||
size_t kv_size;
|
||||
int kv_ntok;
|
||||
|
||||
memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
|
||||
memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
|
||||
memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
|
||||
memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);
|
||||
|
||||
if (kv_size) {
|
||||
LLAMA_ASSERT(kv_self.buf.size == kv_size);
|
||||
|
||||
const size_t elt_size = ggml_element_size(kv_self.k);
|
||||
|
||||
char buffer[4096];
|
||||
|
||||
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
|
||||
ggml_cgraph gf{};
|
||||
gf.n_threads = 1;
|
||||
|
||||
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
|
||||
kin3d->data = (void *) in;
|
||||
in += ggml_nbytes(kin3d);
|
||||
kin3d->data = (void *) inp;
|
||||
inp += ggml_nbytes(kin3d);
|
||||
|
||||
ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
|
||||
vin3d->data = (void *) in;
|
||||
in += ggml_nbytes(vin3d);
|
||||
vin3d->data = (void *) inp;
|
||||
inp += ggml_nbytes(vin3d);
|
||||
|
||||
ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
|
||||
n_embd, kv_ntok, n_layer,
|
||||
@@ -2639,12 +2664,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
|
||||
ggml_graph_compute(cpy_ctx, &gf);
|
||||
|
||||
ggml_free(cpy_ctx);
|
||||
}
|
||||
|
||||
ctx->model.kv_self.n = kv_ntok;
|
||||
}
|
||||
|
||||
const size_t nread = in - src;
|
||||
const size_t nread = inp - src;
|
||||
const size_t max_size = llama_get_state_size(ctx);
|
||||
|
||||
LLAMA_ASSERT(nread <= max_size);
|
||||
@@ -2660,7 +2687,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
|
||||
const uint32_t magic = file.read_u32();
|
||||
const uint32_t version = file.read_u32();
|
||||
|
||||
if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
|
||||
if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
|
||||
fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
|
||||
return false;
|
||||
}
|
||||
|
||||
2
llama.h
2
llama.h
@@ -134,7 +134,7 @@ extern "C" {
|
||||
// Copies the state to the specified destination address.
|
||||
// Destination needs to have allocated enough memory.
|
||||
// Returns the number of bytes copied
|
||||
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
|
||||
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
|
||||
|
||||
// Set the state reading from the specified address
|
||||
// Returns the number of bytes read
|
||||
|
||||
Reference in New Issue
Block a user