Compare commits

..

5 Commits
b1520 ... b1525

Author SHA1 Message Date
slaren
e85bb1a8e7 llama : add functions to get the model's metadata (#4013)
* llama : add functions to get the model's metadata

* format -> std::to_string

* better documentation
2023-11-17 17:17:37 +02:00
gwjr
3e916a07ac finetune : speed-up ggml_compute_forward_out_prod_f32 via BLAS (#4079)
* Remove logically superfluous assertions and order by dimension

* Use cblas_sgemm() to implement ggml_compute_forward_out_prod()

* Remove ggml_compute_forward_out_prod_use_blas(), fix compiling errors on cmake/zig, remove trailing whitespace

* Add openBLAS support for sgemm() in compute_forward_out_prod()
2023-11-17 16:48:19 +02:00
Andrew Godfrey
947f64f163 finetune : zero the loraB initial vectors (#4082)
* finetune : zero the loraB initial vectors

Without this, the first iteration is starting out far from the base model, instead of exactly on it.
Zeroing loraB is what the paper recommends. loralib also zeroes at least one of the init vector pairs
(though it departs from the paper in using a different distribution for the other vector, in some cases).

* tabs to spaces

* Use ggml_set_zero instead of adding a new function
2023-11-17 11:23:11 +01:00
Andrew Godfrey
b83e149ec6 cuda : get_row_rounding F32 (#4095)
* Fix #4017

* Update ggml-cuda.cu

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

---------

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
2023-11-17 10:01:15 +02:00
Georgi Gerganov
4f447a4833 llama : fix data units (#4101)
* llama : fix data units

ggml-ci

* Revert "llama : fix data units"

This reverts commit f5feac831f.

* llama : disambiguate data units

ggml-ci
2023-11-17 10:00:15 +02:00
7 changed files with 267 additions and 52 deletions

View File

@@ -548,35 +548,35 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
randomize_tensor_normal(lora->tok_embeddings_a, rnd);
randomize_tensor_normal(lora->tok_embeddings_b, rnd);
ggml_set_zero(lora->tok_embeddings_b);
randomize_tensor_normal(lora->norm_a, rnd);
randomize_tensor_normal(lora->norm_b, rnd);
ggml_set_zero(lora->norm_b);
randomize_tensor_normal(lora->output_a, rnd);
randomize_tensor_normal(lora->output_b, rnd);
ggml_set_zero(lora->output_b);
for (uint32_t i = 0; i < n_layer; ++i) {
auto & layer = lora->layers[i];
randomize_tensor_normal(layer.attention_norm_a, rnd);
randomize_tensor_normal(layer.attention_norm_b, rnd);
ggml_set_zero(layer.attention_norm_b);
randomize_tensor_normal(layer.wq_a, rnd);
randomize_tensor_normal(layer.wq_b, rnd);
ggml_set_zero(layer.wq_b);
randomize_tensor_normal(layer.wk_a, rnd);
randomize_tensor_normal(layer.wk_b, rnd);
ggml_set_zero(layer.wk_b);
randomize_tensor_normal(layer.wv_a, rnd);
randomize_tensor_normal(layer.wv_b, rnd);
ggml_set_zero(layer.wv_b);
randomize_tensor_normal(layer.wo_a, rnd);
randomize_tensor_normal(layer.wo_b, rnd);
ggml_set_zero(layer.wo_b);
randomize_tensor_normal(layer.ffn_norm_a, rnd);
randomize_tensor_normal(layer.ffn_norm_b, rnd);
ggml_set_zero(layer.ffn_norm_b);
randomize_tensor_normal(layer.w1_a, rnd);
randomize_tensor_normal(layer.w1_b, rnd);
ggml_set_zero(layer.w1_b);
randomize_tensor_normal(layer.w2_a, rnd);
randomize_tensor_normal(layer.w2_b, rnd);
ggml_set_zero(layer.w2_b);
randomize_tensor_normal(layer.w3_a, rnd);
randomize_tensor_normal(layer.w3_b, rnd);
ggml_set_zero(layer.w3_b);
}
free_random_normal_distribution(rnd);

View File

@@ -5840,7 +5840,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
return ptr;
}
#ifdef DEBUG_CUDA_MALLOC
fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
(uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
#endif
void * ptr;
@@ -5978,7 +5978,7 @@ void * ggml_cuda_host_malloc(size_t size) {
// The allocation error can be bypassed. A null ptr will assigned out of this function.
// This can fixed the OOM error in WSL.
cudaGetLastError();
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
size/1024.0/1024.0, cudaGetErrorString(err));
return nullptr;
}
@@ -6356,6 +6356,7 @@ static int64_t get_row_rounding(ggml_type type) {
case GGML_TYPE_Q8_0:
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
case GGML_TYPE_F16:
case GGML_TYPE_F32:
return 1;
case GGML_TYPE_Q2_K:
return max_compute_capability >= CC_RDNA2 ? 128 : 32;
@@ -6378,6 +6379,7 @@ static int64_t get_row_rounding(ggml_type type) {
case GGML_TYPE_Q8_0:
return 64;
case GGML_TYPE_F16:
case GGML_TYPE_F32:
return 1;
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:

View File

@@ -345,10 +345,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
}
}
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MiB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
if (ctx->device.maxTransferRate != 0) {
GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MiB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
} else {
GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
}
@@ -541,11 +541,11 @@ bool ggml_metal_add_buffer(
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (ctx->buffers[ctx->n_buffers].metal == nil) {
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
return false;
}
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB", __func__, name, size_aligned / 1024.0 / 1024.0);
++ctx->n_buffers;
} else {
@@ -565,11 +565,11 @@ bool ggml_metal_add_buffer(
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (ctx->buffers[ctx->n_buffers].metal == nil) {
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
return false;
}
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
if (i + size_step < size) {
GGML_METAL_LOG_INFO("\n");
}

94
ggml.c
View File

@@ -9611,10 +9611,12 @@ static void ggml_compute_forward_out_prod_f32(
const int ith = params->ith;
const int nth = params->nth;
GGML_ASSERT(ne0 == ne00);
GGML_ASSERT(ne1 == ne10);
GGML_ASSERT(ne2 == ne02);
GGML_ASSERT(ne02 == ne12);
GGML_ASSERT(ne03 == ne13);
GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne3 == ne13);
GGML_ASSERT(ne03 == ne13);
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == sizeof(float));
@@ -9625,18 +9627,25 @@ static void ggml_compute_forward_out_prod_f32(
// GGML_ASSERT(nb1 <= nb2);
// GGML_ASSERT(nb2 <= nb3);
GGML_ASSERT(ne0 == ne00);
GGML_ASSERT(ne1 == ne10);
GGML_ASSERT(ne2 == ne02);
GGML_ASSERT(ne3 == ne03);
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
// TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
// TODO: #if defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
bool use_blas = ggml_is_matrix(src0) &&
ggml_is_matrix(src1) &&
ggml_is_contiguous(src0) &&
(ggml_is_contiguous(src1) || ggml_is_transposed(src1));
#endif
if (params->type == GGML_TASK_INIT) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
if (use_blas) {
return;
}
#endif
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
return;
}
@@ -9645,6 +9654,50 @@ static void ggml_compute_forward_out_prod_f32(
return;
}
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (use_blas) {
if (params->ith != 0) { // All threads other than the first do no work.
return;
}
// Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
// src0: (k,n)
// src1: (k,m)
// dst: (m,n)
//
// Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
// Also expressed as (major,minor)
// a: (m,k): so src1 transposed
// b: (k,n): so src0
// c: (m,n)
//
// However, if ggml_is_transposed(src1) is true, then
// src1->data already contains a transposed version, so sgemm mustn't
// transpose it further.
int n = src0->ne[0];
int k = src0->ne[1];
int m = src1->ne[0];
int transposeA, lda;
if (!ggml_is_transposed(src1)) {
transposeA = CblasTrans;
lda = m;
} else {
transposeA = CblasNoTrans;
lda = k;
}
float * a = (float *) ((char *) src1->data);
float * b = (float *) ((char *) src0->data);
float * c = (float *) ((char *) dst->data);
cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
return;
}
#endif
// dst[:,:,:,:] = 0
// for i2,i3:
// for i1:
@@ -18399,24 +18452,29 @@ int gguf_find_key(const struct gguf_context * ctx, const char * key) {
}
const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
return ctx->kv[key_id].key.data;
}
enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
return ctx->kv[key_id].type;
}
enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
return ctx->kv[key_id].value.arr.type;
}
const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
return ctx->kv[key_id].value.arr.data;
}
const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
struct gguf_kv * kv = &ctx->kv[key_id];
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
@@ -18424,70 +18482,90 @@ const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i
}
int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
return ctx->kv[key_id].value.arr.n;
}
uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
return ctx->kv[key_id].value.uint8;
}
int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
return ctx->kv[key_id].value.int8;
}
uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
return ctx->kv[key_id].value.uint16;
}
int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
return ctx->kv[key_id].value.int16;
}
uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
return ctx->kv[key_id].value.uint32;
}
int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
return ctx->kv[key_id].value.int32;
}
float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
return ctx->kv[key_id].value.float32;
}
uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
return ctx->kv[key_id].value.uint64;
}
int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
return ctx->kv[key_id].value.int64;
}
double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
return ctx->kv[key_id].value.float64;
}
bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
return ctx->kv[key_id].value.bool_;
}
const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
return ctx->kv[key_id].value.str.data;
}
const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
return &ctx->kv[key_id].value;
}
int gguf_get_n_tensors(const struct gguf_context * ctx) {
return ctx->header.n_tensors;
}

1
ggml.h
View File

@@ -2045,6 +2045,7 @@ extern "C" {
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);

163
llama.cpp
View File

@@ -604,6 +604,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
return LLAMA_ROPE_SCALING_UNSPECIFIED;
}
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
switch (type) {
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
default: return format("unknown type %d", type);
}
}
static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
switch (type) {
case GGUF_TYPE_STRING:
return gguf_get_val_str(ctx_gguf, i);
case GGUF_TYPE_ARRAY:
{
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
int arr_n = gguf_get_arr_n(ctx_gguf, i);
const void * data = gguf_get_arr_data(ctx_gguf, i);
std::stringstream ss;
ss << "[";
for (int j = 0; j < arr_n; j++) {
if (arr_type == GGUF_TYPE_STRING) {
std::string val = gguf_get_arr_str(ctx_gguf, i, j);
// escape quotes
replace_all(val, "\\", "\\\\");
replace_all(val, "\"", "\\\"");
ss << '"' << val << '"';
} else if (arr_type == GGUF_TYPE_ARRAY) {
ss << "???";
} else {
ss << gguf_data_to_str(arr_type, data, j);
}
if (j < arr_n - 1) {
ss << ", ";
}
}
ss << "]";
return ss.str();
}
default:
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
}
}
//
// ggml helpers
//
@@ -1087,9 +1141,9 @@ enum e_model {
MODEL_70B,
};
static const size_t kB = 1024;
static const size_t MB = 1024*kB;
static const size_t GB = 1024*MB;
static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;
static const size_t GiB = 1024*MiB;
struct llama_hparams {
bool vocab_only;
@@ -1327,6 +1381,9 @@ struct llama_model {
int n_gpu_layers;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
// context
struct ggml_context * ctx = NULL;
@@ -1488,7 +1545,7 @@ static bool llama_kv_cache_init(
vram_kv_cache += ggml_nbytes(cache.k);
}
if (vram_kv_cache > 0) {
LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
}
}
#endif
@@ -1785,10 +1842,10 @@ struct llama_model_loader {
case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
default:
{
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
ftype = LLAMA_FTYPE_ALL_F32;
} break;
{
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
ftype = LLAMA_FTYPE_ALL_F32;
} break;
}
// this is a way to mark that we have "guessed" the file type
@@ -1802,10 +1859,20 @@ struct llama_model_loader {
}
for (int i = 0; i < n_kv; i++) {
const char * name = gguf_get_key(ctx_gguf, i);
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
const char * name = gguf_get_key(ctx_gguf, i);
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
: gguf_type_name(type);
LLAMA_LOG_INFO("%s: - kv %3d: %42s %-8s\n", __func__, i, name, gguf_type_name(type));
std::string value = gguf_kv_to_str(ctx_gguf, i);
const size_t MAX_VALUE_LEN = 40;
if (value.size() > MAX_VALUE_LEN) {
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
}
LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
}
// print type counts
@@ -2100,6 +2167,17 @@ static void llm_load_hparams(
auto & hparams = model.hparams;
// get metadata as string
for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
enum gguf_type type = gguf_get_kv_type(ctx, i);
if (type == GGUF_TYPE_ARRAY) {
continue;
}
const char * name = gguf_get_key(ctx, i);
const std::string value = gguf_kv_to_str(ctx, i);
model.gguf_kv.emplace(name, value);
}
// get general kv
GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
@@ -2543,8 +2621,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
if (ml.n_bytes < GB) {
LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
if (ml.n_bytes < GiB) {
LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
} else {
LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
}
@@ -2582,7 +2660,7 @@ static void llm_load_tensors(
ml.calc_sizes(ctx_size, mmapped_size);
LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
// create the ggml context
{
@@ -3231,7 +3309,7 @@ static void llm_load_tensors(
ctx_size +
mmapped_size - vram_weights; // weights in VRAM not in memory
LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3250,7 +3328,7 @@ static void llm_load_tensors(
#endif // GGML_USE_CUBLAS
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
#else
(void) n_gpu_layers;
#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -7962,7 +8040,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
workers.clear();
}
LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
int64_t tot_count = 0;
for (size_t i = 0; i < hist_cur.size(); i++) {
hist_all[i] += hist_cur[i];
@@ -8502,7 +8580,7 @@ struct llama_context * llama_new_context_with_model(
{
const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
}
// resized during inference
@@ -8547,7 +8625,7 @@ struct llama_context * llama_new_context_with_model(
// measure memory requirements for the graph
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
// recreate allocator with exact memory requirements
ggml_allocr_free(ctx->alloc);
@@ -8561,7 +8639,7 @@ struct llama_context * llama_new_context_with_model(
#endif
#ifdef GGML_USE_CUBLAS
ggml_cuda_set_scratch_size(alloc_size);
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
// calculate total VRAM usage
auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8581,10 +8659,10 @@ struct llama_context * llama_new_context_with_model(
size_t ctx_vram_size = alloc_size + kv_vram_size;
size_t total_vram_size = model_vram_size + ctx_vram_size;
LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
total_vram_size / 1024.0 / 1024.0,
model_vram_size / 1024.0 / 1024.0,
ctx_vram_size / 1024.0 / 1024.0);
ctx_vram_size / 1024.0 / 1024.0);
#endif
}
@@ -8605,7 +8683,7 @@ struct llama_context * llama_new_context_with_model(
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);
#define LLAMA_METAL_CHECK_BUF(result) \
if (!(result)) { \
@@ -8671,6 +8749,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
return model->hparams.rope_freq_scale_train;
}
int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
const auto & it = model->gguf_kv.find(key);
if (it == model->gguf_kv.end()) {
if (buf_size > 0) {
buf[0] = '\0';
}
return -1;
}
return snprintf(buf, buf_size, "%s", it->second.c_str());
}
int llama_model_meta_count(const struct llama_model * model) {
return (int)model->gguf_kv.size();
}
int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
if (i < 0 || i >= (int)model->gguf_kv.size()) {
if (buf_size > 0) {
buf[0] = '\0';
}
return -1;
}
auto it = model->gguf_kv.begin();
std::advance(it, i);
return snprintf(buf, buf_size, "%s", it->first.c_str());
}
int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
if (i < 0 || i >= (int)model->gguf_kv.size()) {
if (buf_size > 0) {
buf[0] = '\0';
}
return -1;
}
auto it = model->gguf_kv.begin();
std::advance(it, i);
return snprintf(buf, buf_size, "%s", it->second.c_str());
}
int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
return snprintf(buf, buf_size, "%s %s %s",
llama_model_arch_name(model->arch).c_str(),

17
llama.h
View File

@@ -301,6 +301,23 @@ extern "C" {
// Get the model's RoPE frequency scaling factor
LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
// Functions to access the model's GGUF metadata scalar values
// - The functions return the length of the string on success, or -1 on failure
// - The output string is always null-terminated and cleared on failure
// - GGUF array values are not supported by these functions
// Get metadata value as a string by key name
LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
// Get the number of metadata key/value pairs
LLAMA_API int llama_model_meta_count(const struct llama_model * model);
// Get metadata key name by index
LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
// Get metadata value as a string by index
LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
// Get a string describing the model type
LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);