mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-12 14:03:20 +02:00
Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e85bb1a8e7 | ||
|
|
3e916a07ac | ||
|
|
947f64f163 | ||
|
|
b83e149ec6 | ||
|
|
4f447a4833 |
@@ -548,35 +548,35 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
|
||||
struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
|
||||
|
||||
randomize_tensor_normal(lora->tok_embeddings_a, rnd);
|
||||
randomize_tensor_normal(lora->tok_embeddings_b, rnd);
|
||||
ggml_set_zero(lora->tok_embeddings_b);
|
||||
randomize_tensor_normal(lora->norm_a, rnd);
|
||||
randomize_tensor_normal(lora->norm_b, rnd);
|
||||
ggml_set_zero(lora->norm_b);
|
||||
randomize_tensor_normal(lora->output_a, rnd);
|
||||
randomize_tensor_normal(lora->output_b, rnd);
|
||||
ggml_set_zero(lora->output_b);
|
||||
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
auto & layer = lora->layers[i];
|
||||
randomize_tensor_normal(layer.attention_norm_a, rnd);
|
||||
randomize_tensor_normal(layer.attention_norm_b, rnd);
|
||||
ggml_set_zero(layer.attention_norm_b);
|
||||
|
||||
randomize_tensor_normal(layer.wq_a, rnd);
|
||||
randomize_tensor_normal(layer.wq_b, rnd);
|
||||
ggml_set_zero(layer.wq_b);
|
||||
randomize_tensor_normal(layer.wk_a, rnd);
|
||||
randomize_tensor_normal(layer.wk_b, rnd);
|
||||
ggml_set_zero(layer.wk_b);
|
||||
randomize_tensor_normal(layer.wv_a, rnd);
|
||||
randomize_tensor_normal(layer.wv_b, rnd);
|
||||
ggml_set_zero(layer.wv_b);
|
||||
randomize_tensor_normal(layer.wo_a, rnd);
|
||||
randomize_tensor_normal(layer.wo_b, rnd);
|
||||
ggml_set_zero(layer.wo_b);
|
||||
|
||||
randomize_tensor_normal(layer.ffn_norm_a, rnd);
|
||||
randomize_tensor_normal(layer.ffn_norm_b, rnd);
|
||||
ggml_set_zero(layer.ffn_norm_b);
|
||||
|
||||
randomize_tensor_normal(layer.w1_a, rnd);
|
||||
randomize_tensor_normal(layer.w1_b, rnd);
|
||||
ggml_set_zero(layer.w1_b);
|
||||
randomize_tensor_normal(layer.w2_a, rnd);
|
||||
randomize_tensor_normal(layer.w2_b, rnd);
|
||||
ggml_set_zero(layer.w2_b);
|
||||
randomize_tensor_normal(layer.w3_a, rnd);
|
||||
randomize_tensor_normal(layer.w3_b, rnd);
|
||||
ggml_set_zero(layer.w3_b);
|
||||
}
|
||||
|
||||
free_random_normal_distribution(rnd);
|
||||
|
||||
@@ -5840,7 +5840,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
|
||||
return ptr;
|
||||
}
|
||||
#ifdef DEBUG_CUDA_MALLOC
|
||||
fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
|
||||
fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
|
||||
(uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
|
||||
#endif
|
||||
void * ptr;
|
||||
@@ -5978,7 +5978,7 @@ void * ggml_cuda_host_malloc(size_t size) {
|
||||
// The allocation error can be bypassed. A null ptr will be returned from this function.
|
||||
// This can fix the OOM error in WSL.
|
||||
cudaGetLastError();
|
||||
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
|
||||
fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
|
||||
size/1024.0/1024.0, cudaGetErrorString(err));
|
||||
return nullptr;
|
||||
}
|
||||
@@ -6356,6 +6356,7 @@ static int64_t get_row_rounding(ggml_type type) {
|
||||
case GGML_TYPE_Q8_0:
|
||||
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
return 1;
|
||||
case GGML_TYPE_Q2_K:
|
||||
return max_compute_capability >= CC_RDNA2 ? 128 : 32;
|
||||
@@ -6378,6 +6379,7 @@ static int64_t get_row_rounding(ggml_type type) {
|
||||
case GGML_TYPE_Q8_0:
|
||||
return 64;
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
return 1;
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
|
||||
14
ggml-metal.m
14
ggml-metal.m
@@ -345,10 +345,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
}
|
||||
}
|
||||
|
||||
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MiB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||
if (ctx->device.maxTransferRate != 0) {
|
||||
GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
||||
GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MiB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
||||
} else {
|
||||
GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
|
||||
}
|
||||
@@ -541,11 +541,11 @@ bool ggml_metal_add_buffer(
|
||||
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
||||
|
||||
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||
|
||||
++ctx->n_buffers;
|
||||
} else {
|
||||
@@ -565,11 +565,11 @@ bool ggml_metal_add_buffer(
|
||||
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
||||
|
||||
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
|
||||
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
|
||||
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
|
||||
if (i + size_step < size) {
|
||||
GGML_METAL_LOG_INFO("\n");
|
||||
}
|
||||
|
||||
94
ggml.c
94
ggml.c
@@ -9611,10 +9611,12 @@ static void ggml_compute_forward_out_prod_f32(
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
GGML_ASSERT(ne0 == ne00);
|
||||
GGML_ASSERT(ne1 == ne10);
|
||||
GGML_ASSERT(ne2 == ne02);
|
||||
GGML_ASSERT(ne02 == ne12);
|
||||
GGML_ASSERT(ne03 == ne13);
|
||||
GGML_ASSERT(ne2 == ne12);
|
||||
GGML_ASSERT(ne3 == ne13);
|
||||
GGML_ASSERT(ne03 == ne13);
|
||||
|
||||
// we don't support permuted src0 or src1
|
||||
GGML_ASSERT(nb00 == sizeof(float));
|
||||
@@ -9625,18 +9627,25 @@ static void ggml_compute_forward_out_prod_f32(
|
||||
// GGML_ASSERT(nb1 <= nb2);
|
||||
// GGML_ASSERT(nb2 <= nb3);
|
||||
|
||||
GGML_ASSERT(ne0 == ne00);
|
||||
GGML_ASSERT(ne1 == ne10);
|
||||
GGML_ASSERT(ne2 == ne02);
|
||||
GGML_ASSERT(ne3 == ne03);
|
||||
|
||||
// nb01 >= nb00 - src0 is not transposed
|
||||
// compute by src0 rows
|
||||
|
||||
// TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
|
||||
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
// TODO: #if defined(GGML_USE_CLBLAST)
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
bool use_blas = ggml_is_matrix(src0) &&
|
||||
ggml_is_matrix(src1) &&
|
||||
ggml_is_contiguous(src0) &&
|
||||
(ggml_is_contiguous(src1) || ggml_is_transposed(src1));
|
||||
#endif
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
|
||||
if (use_blas) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
|
||||
return;
|
||||
}
|
||||
@@ -9645,6 +9654,50 @@ static void ggml_compute_forward_out_prod_f32(
|
||||
return;
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (use_blas) {
|
||||
if (params->ith != 0) { // All threads other than the first do no work.
|
||||
return;
|
||||
}
|
||||
// Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
|
||||
// src0: (k,n)
|
||||
// src1: (k,m)
|
||||
// dst: (m,n)
|
||||
//
|
||||
// Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
|
||||
// Also expressed as (major,minor)
|
||||
// a: (m,k): so src1 transposed
|
||||
// b: (k,n): so src0
|
||||
// c: (m,n)
|
||||
//
|
||||
// However, if ggml_is_transposed(src1) is true, then
|
||||
// src1->data already contains a transposed version, so sgemm mustn't
|
||||
// transpose it further.
|
||||
|
||||
int n = src0->ne[0];
|
||||
int k = src0->ne[1];
|
||||
int m = src1->ne[0];
|
||||
|
||||
int transposeA, lda;
|
||||
|
||||
if (!ggml_is_transposed(src1)) {
|
||||
transposeA = CblasTrans;
|
||||
lda = m;
|
||||
} else {
|
||||
transposeA = CblasNoTrans;
|
||||
lda = k;
|
||||
}
|
||||
|
||||
float * a = (float *) ((char *) src1->data);
|
||||
float * b = (float *) ((char *) src0->data);
|
||||
float * c = (float *) ((char *) dst->data);
|
||||
|
||||
cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
|
||||
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
// dst[:,:,:,:] = 0
|
||||
// for i2,i3:
|
||||
// for i1:
|
||||
@@ -18399,24 +18452,29 @@ int gguf_find_key(const struct gguf_context * ctx, const char * key) {
|
||||
}
|
||||
|
||||
const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
|
||||
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
||||
return ctx->kv[key_id].key.data;
|
||||
}
|
||||
|
||||
enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
|
||||
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
||||
return ctx->kv[key_id].type;
|
||||
}
|
||||
|
||||
enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
|
||||
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
||||
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
||||
return ctx->kv[key_id].value.arr.type;
|
||||
}
|
||||
|
||||
const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
|
||||
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
||||
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
||||
return ctx->kv[key_id].value.arr.data;
|
||||
}
|
||||
|
||||
const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
|
||||
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
||||
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
||||
struct gguf_kv * kv = &ctx->kv[key_id];
|
||||
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
|
||||
@@ -18424,70 +18482,90 @@ const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i
|
||||
}
|
||||
|
||||
int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
|
||||
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
||||
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
||||
return ctx->kv[key_id].value.arr.n;
|
||||
}
|
||||
|
||||
uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
|
||||
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
||||
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
|
||||
return ctx->kv[key_id].value.uint8;
|
||||
}
|
||||
|
||||
int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
|
||||
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
||||
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
|
||||
return ctx->kv[key_id].value.int8;
|
||||
}
|
||||
|
||||
uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
|
||||
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
||||
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
|
||||
return ctx->kv[key_id].value.uint16;
|
||||
}
|
||||
|
||||
int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
|
||||
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
||||
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
|
||||
return ctx->kv[key_id].value.int16;
|
||||
}
|
||||
|
||||
uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
|
||||
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
||||
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
|
||||
return ctx->kv[key_id].value.uint32;
|
||||
}
|
||||
|
||||
int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
|
||||
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
||||
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
|
||||
return ctx->kv[key_id].value.int32;
|
||||
}
|
||||
|
||||
float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
|
||||
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
||||
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
|
||||
return ctx->kv[key_id].value.float32;
|
||||
}
|
||||
|
||||
uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
|
||||
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
||||
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
|
||||
return ctx->kv[key_id].value.uint64;
|
||||
}
|
||||
|
||||
int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
|
||||
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
||||
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
|
||||
return ctx->kv[key_id].value.int64;
|
||||
}
|
||||
|
||||
double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
|
||||
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
||||
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
|
||||
return ctx->kv[key_id].value.float64;
|
||||
}
|
||||
|
||||
bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
|
||||
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
||||
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
|
||||
return ctx->kv[key_id].value.bool_;
|
||||
}
|
||||
|
||||
const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
|
||||
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
||||
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
|
||||
return ctx->kv[key_id].value.str.data;
|
||||
}
|
||||
|
||||
const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
|
||||
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
||||
GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
|
||||
GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
|
||||
return &ctx->kv[key_id].value;
|
||||
}
|
||||
|
||||
int gguf_get_n_tensors(const struct gguf_context * ctx) {
|
||||
return ctx->header.n_tensors;
|
||||
}
|
||||
|
||||
1
ggml.h
1
ggml.h
@@ -2045,6 +2045,7 @@ extern "C" {
|
||||
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
|
||||
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
|
||||
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
|
||||
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|
||||
|
||||
163
llama.cpp
163
llama.cpp
@@ -604,6 +604,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
|
||||
return LLAMA_ROPE_SCALING_UNSPECIFIED;
|
||||
}
|
||||
|
||||
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
|
||||
switch (type) {
|
||||
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
|
||||
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
|
||||
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
|
||||
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
|
||||
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
|
||||
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
|
||||
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
|
||||
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
|
||||
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
|
||||
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
|
||||
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
|
||||
default: return format("unknown type %d", type);
|
||||
}
|
||||
}
|
||||
|
||||
static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
|
||||
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
|
||||
|
||||
switch (type) {
|
||||
case GGUF_TYPE_STRING:
|
||||
return gguf_get_val_str(ctx_gguf, i);
|
||||
case GGUF_TYPE_ARRAY:
|
||||
{
|
||||
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
|
||||
int arr_n = gguf_get_arr_n(ctx_gguf, i);
|
||||
const void * data = gguf_get_arr_data(ctx_gguf, i);
|
||||
std::stringstream ss;
|
||||
ss << "[";
|
||||
for (int j = 0; j < arr_n; j++) {
|
||||
if (arr_type == GGUF_TYPE_STRING) {
|
||||
std::string val = gguf_get_arr_str(ctx_gguf, i, j);
|
||||
// escape quotes
|
||||
replace_all(val, "\\", "\\\\");
|
||||
replace_all(val, "\"", "\\\"");
|
||||
ss << '"' << val << '"';
|
||||
} else if (arr_type == GGUF_TYPE_ARRAY) {
|
||||
ss << "???";
|
||||
} else {
|
||||
ss << gguf_data_to_str(arr_type, data, j);
|
||||
}
|
||||
if (j < arr_n - 1) {
|
||||
ss << ", ";
|
||||
}
|
||||
}
|
||||
ss << "]";
|
||||
return ss.str();
|
||||
}
|
||||
default:
|
||||
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// ggml helpers
|
||||
//
|
||||
@@ -1087,9 +1141,9 @@ enum e_model {
|
||||
MODEL_70B,
|
||||
};
|
||||
|
||||
static const size_t kB = 1024;
|
||||
static const size_t MB = 1024*kB;
|
||||
static const size_t GB = 1024*MB;
|
||||
static const size_t kiB = 1024;
|
||||
static const size_t MiB = 1024*kiB;
|
||||
static const size_t GiB = 1024*MiB;
|
||||
|
||||
struct llama_hparams {
|
||||
bool vocab_only;
|
||||
@@ -1327,6 +1381,9 @@ struct llama_model {
|
||||
|
||||
int n_gpu_layers;
|
||||
|
||||
// gguf metadata
|
||||
std::unordered_map<std::string, std::string> gguf_kv;
|
||||
|
||||
// context
|
||||
struct ggml_context * ctx = NULL;
|
||||
|
||||
@@ -1488,7 +1545,7 @@ static bool llama_kv_cache_init(
|
||||
vram_kv_cache += ggml_nbytes(cache.k);
|
||||
}
|
||||
if (vram_kv_cache > 0) {
|
||||
LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
|
||||
LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -1785,10 +1842,10 @@ struct llama_model_loader {
|
||||
case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
|
||||
case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
|
||||
default:
|
||||
{
|
||||
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
||||
ftype = LLAMA_FTYPE_ALL_F32;
|
||||
} break;
|
||||
{
|
||||
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
||||
ftype = LLAMA_FTYPE_ALL_F32;
|
||||
} break;
|
||||
}
|
||||
|
||||
// this is a way to mark that we have "guessed" the file type
|
||||
@@ -1802,10 +1859,20 @@ struct llama_model_loader {
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_kv; i++) {
|
||||
const char * name = gguf_get_key(ctx_gguf, i);
|
||||
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
|
||||
const char * name = gguf_get_key(ctx_gguf, i);
|
||||
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
|
||||
const std::string type_name =
|
||||
type == GGUF_TYPE_ARRAY
|
||||
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
|
||||
: gguf_type_name(type);
|
||||
|
||||
LLAMA_LOG_INFO("%s: - kv %3d: %42s %-8s\n", __func__, i, name, gguf_type_name(type));
|
||||
std::string value = gguf_kv_to_str(ctx_gguf, i);
|
||||
const size_t MAX_VALUE_LEN = 40;
|
||||
if (value.size() > MAX_VALUE_LEN) {
|
||||
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
|
||||
}
|
||||
|
||||
// print type counts
|
||||
@@ -2100,6 +2167,17 @@ static void llm_load_hparams(
|
||||
|
||||
auto & hparams = model.hparams;
|
||||
|
||||
// get metadata as string
|
||||
for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
|
||||
enum gguf_type type = gguf_get_kv_type(ctx, i);
|
||||
if (type == GGUF_TYPE_ARRAY) {
|
||||
continue;
|
||||
}
|
||||
const char * name = gguf_get_key(ctx, i);
|
||||
const std::string value = gguf_kv_to_str(ctx, i);
|
||||
model.gguf_kv.emplace(name, value);
|
||||
}
|
||||
|
||||
// get general kv
|
||||
GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
|
||||
|
||||
@@ -2543,8 +2621,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
|
||||
LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
|
||||
LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
|
||||
if (ml.n_bytes < GB) {
|
||||
LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
|
||||
if (ml.n_bytes < GiB) {
|
||||
LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
|
||||
}
|
||||
@@ -2582,7 +2660,7 @@ static void llm_load_tensors(
|
||||
|
||||
ml.calc_sizes(ctx_size, mmapped_size);
|
||||
|
||||
LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
|
||||
|
||||
// create the ggml context
|
||||
{
|
||||
@@ -3231,7 +3309,7 @@ static void llm_load_tensors(
|
||||
ctx_size +
|
||||
mmapped_size - vram_weights; // weights in VRAM not in memory
|
||||
|
||||
LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
|
||||
LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);
|
||||
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
||||
@@ -3250,7 +3328,7 @@ static void llm_load_tensors(
|
||||
#endif // GGML_USE_CUBLAS
|
||||
|
||||
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
|
||||
LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
|
||||
LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
|
||||
#else
|
||||
(void) n_gpu_layers;
|
||||
#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
@@ -7962,7 +8040,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
workers.clear();
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
||||
int64_t tot_count = 0;
|
||||
for (size_t i = 0; i < hist_cur.size(); i++) {
|
||||
hist_all[i] += hist_cur[i];
|
||||
@@ -8502,7 +8580,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
|
||||
{
|
||||
const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
|
||||
LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
|
||||
LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
|
||||
}
|
||||
|
||||
// resized during inference
|
||||
@@ -8547,7 +8625,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
// measure memory requirements for the graph
|
||||
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
|
||||
|
||||
LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
|
||||
LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
|
||||
|
||||
// recreate allocator with exact memory requirements
|
||||
ggml_allocr_free(ctx->alloc);
|
||||
@@ -8561,7 +8639,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
#endif
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
ggml_cuda_set_scratch_size(alloc_size);
|
||||
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
|
||||
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
|
||||
|
||||
// calculate total VRAM usage
|
||||
auto add_tensor = [](const ggml_tensor * t, size_t & size) {
|
||||
@@ -8581,10 +8659,10 @@ struct llama_context * llama_new_context_with_model(
|
||||
size_t ctx_vram_size = alloc_size + kv_vram_size;
|
||||
size_t total_vram_size = model_vram_size + ctx_vram_size;
|
||||
|
||||
LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
|
||||
LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
|
||||
total_vram_size / 1024.0 / 1024.0,
|
||||
model_vram_size / 1024.0 / 1024.0,
|
||||
ctx_vram_size / 1024.0 / 1024.0);
|
||||
ctx_vram_size / 1024.0 / 1024.0);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -8605,7 +8683,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
|
||||
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
|
||||
|
||||
LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);
|
||||
|
||||
#define LLAMA_METAL_CHECK_BUF(result) \
|
||||
if (!(result)) { \
|
||||
@@ -8671,6 +8749,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
|
||||
return model->hparams.rope_freq_scale_train;
|
||||
}
|
||||
|
||||
int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
|
||||
const auto & it = model->gguf_kv.find(key);
|
||||
if (it == model->gguf_kv.end()) {
|
||||
if (buf_size > 0) {
|
||||
buf[0] = '\0';
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
return snprintf(buf, buf_size, "%s", it->second.c_str());
|
||||
}
|
||||
|
||||
int llama_model_meta_count(const struct llama_model * model) {
|
||||
return (int)model->gguf_kv.size();
|
||||
}
|
||||
|
||||
int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
|
||||
if (i < 0 || i >= (int)model->gguf_kv.size()) {
|
||||
if (buf_size > 0) {
|
||||
buf[0] = '\0';
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
auto it = model->gguf_kv.begin();
|
||||
std::advance(it, i);
|
||||
return snprintf(buf, buf_size, "%s", it->first.c_str());
|
||||
}
|
||||
|
||||
int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
|
||||
if (i < 0 || i >= (int)model->gguf_kv.size()) {
|
||||
if (buf_size > 0) {
|
||||
buf[0] = '\0';
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
auto it = model->gguf_kv.begin();
|
||||
std::advance(it, i);
|
||||
return snprintf(buf, buf_size, "%s", it->second.c_str());
|
||||
}
|
||||
|
||||
int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
|
||||
return snprintf(buf, buf_size, "%s %s %s",
|
||||
llama_model_arch_name(model->arch).c_str(),
|
||||
|
||||
17
llama.h
17
llama.h
@@ -301,6 +301,23 @@ extern "C" {
|
||||
// Get the model's RoPE frequency scaling factor
|
||||
LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
|
||||
|
||||
// Functions to access the model's GGUF metadata scalar values
|
||||
// - The functions return the length of the string on success, or -1 on failure
|
||||
// - The output string is always null-terminated and cleared on failure
|
||||
// - GGUF array values are not supported by these functions
|
||||
|
||||
// Get metadata value as a string by key name
|
||||
LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
|
||||
|
||||
// Get the number of metadata key/value pairs
|
||||
LLAMA_API int llama_model_meta_count(const struct llama_model * model);
|
||||
|
||||
// Get metadata key name by index
|
||||
LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
|
||||
|
||||
// Get metadata value as a string by index
|
||||
LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
|
||||
|
||||
// Get a string describing the model type
|
||||
LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user