mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-23 16:37:33 +03:00
Compare commits
11 Commits
master-7a9
...
master-29b
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
29b7baab67 | ||
|
|
4a7129acd2 | ||
|
|
6b6dbc8910 | ||
|
|
2a2e63ce05 | ||
|
|
e899bf54b2 | ||
|
|
fbd4d38c64 | ||
|
|
58e6c9f36f | ||
|
|
36d07532ef | ||
|
|
6f1ee4b640 | ||
|
|
8520fc310e | ||
|
|
b3f460e941 |
10
README.md
10
README.md
@@ -17,7 +17,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
The main goal is to run the model using 4-bit quantization on a MacBook
|
||||
|
||||
- Plain C/C++ implementation without dependencies
|
||||
- Apple silicon first-class citizen - optimized via ARM NEON
|
||||
- Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework
|
||||
- AVX2 support for x86 architectures
|
||||
- Mixed F16 / F32 precision
|
||||
- 4-bit quantization support
|
||||
@@ -323,14 +323,6 @@ or with light image:
|
||||
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
```
|
||||
|
||||
## Limitations
|
||||
|
||||
- Probably the token sampling can be improved
|
||||
- The Accelerate framework is actually currently unused since I found that for tensor shapes typical for the Decoder,
|
||||
there is no benefit compared to the ARM_NEON intrinsics implementation. Of course, it's possible that I simply don't
|
||||
know how to utilize it properly. But in any case, you can even disable it with `LLAMA_NO_ACCELERATE=1 make` and the
|
||||
performance will be the same, since no BLAS calls are invoked by the current implementation
|
||||
|
||||
### Contributing
|
||||
|
||||
- Contributors can open PRs
|
||||
|
||||
173
ggml.c
173
ggml.c
@@ -2638,7 +2638,7 @@ static inline int ggml_up(int n, int m) {
|
||||
|
||||
// assert that pointer is aligned to GGML_MEM_ALIGN
|
||||
#define ggml_assert_aligned(ptr) \
|
||||
assert(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
|
||||
GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@@ -4566,7 +4566,7 @@ static void ggml_compute_forward_dup_f16(
|
||||
|
||||
if (src0->nb[0] == sizeof(ggml_fp16_t)) {
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
int id = 0;
|
||||
size_t id = 0;
|
||||
const size_t rs = ne00*nb00;
|
||||
|
||||
for (int i03 = 0; i03 < ne03; i03++) {
|
||||
@@ -4582,7 +4582,7 @@ static void ggml_compute_forward_dup_f16(
|
||||
}
|
||||
}
|
||||
} else if (dst->type == GGML_TYPE_F32) {
|
||||
int id = 0;
|
||||
size_t id = 0;
|
||||
float * dst_ptr = (float *) dst->data;
|
||||
|
||||
for (int i03 = 0; i03 < ne03; i03++) {
|
||||
@@ -4604,7 +4604,7 @@ static void ggml_compute_forward_dup_f16(
|
||||
//printf("%s: this is not optimal - fix me\n", __func__);
|
||||
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
int id = 0;
|
||||
size_t id = 0;
|
||||
float * dst_ptr = (float *) dst->data;
|
||||
|
||||
for (int i03 = 0; i03 < ne03; i03++) {
|
||||
@@ -4620,7 +4620,7 @@ static void ggml_compute_forward_dup_f16(
|
||||
}
|
||||
}
|
||||
} else if (dst->type == GGML_TYPE_F16) {
|
||||
int id = 0;
|
||||
size_t id = 0;
|
||||
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
|
||||
|
||||
for (int i03 = 0; i03 < ne03; i03++) {
|
||||
@@ -4670,7 +4670,7 @@ static void ggml_compute_forward_dup_f32(
|
||||
|
||||
if (src0->nb[0] == sizeof(float)) {
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
int id = 0;
|
||||
size_t id = 0;
|
||||
const size_t rs = ne00*nb00;
|
||||
|
||||
for (int i03 = 0; i03 < ne03; i03++) {
|
||||
@@ -4686,7 +4686,7 @@ static void ggml_compute_forward_dup_f32(
|
||||
}
|
||||
}
|
||||
} else if (dst->type == GGML_TYPE_F16) {
|
||||
int id = 0;
|
||||
size_t id = 0;
|
||||
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
|
||||
|
||||
for (int i03 = 0; i03 < ne03; i03++) {
|
||||
@@ -4708,7 +4708,7 @@ static void ggml_compute_forward_dup_f32(
|
||||
//printf("%s: this is not optimal - fix me\n", __func__);
|
||||
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
int id = 0;
|
||||
size_t id = 0;
|
||||
float * dst_ptr = (float *) dst->data;
|
||||
|
||||
for (int i03 = 0; i03 < ne03; i03++) {
|
||||
@@ -4724,7 +4724,7 @@ static void ggml_compute_forward_dup_f32(
|
||||
}
|
||||
}
|
||||
} else if (dst->type == GGML_TYPE_F16) {
|
||||
int id = 0;
|
||||
size_t id = 0;
|
||||
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
|
||||
|
||||
for (int i03 = 0; i03 < ne03; i03++) {
|
||||
@@ -5858,13 +5858,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
|
||||
if (ggml_is_contiguous(src0) &&
|
||||
ggml_is_contiguous(src1) && ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {
|
||||
|
||||
//// disable BLAS for Q4_0 and Q4_1
|
||||
//// looks like there is no benefit and we only waste a lot of memory
|
||||
//if (src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1) {
|
||||
// return false;
|
||||
//}
|
||||
|
||||
//printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);
|
||||
/*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -5957,19 +5951,17 @@ static void ggml_compute_forward_mul_mat_f32(
|
||||
|
||||
for (int i03 = 0; i03 < ne03; i03++) {
|
||||
for (int i02 = 0; i02 < ne02; i02++) {
|
||||
const float * x = (float *) (src0->data);
|
||||
const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
|
||||
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
||||
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
// zT = y * xT
|
||||
{
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne10,
|
||||
0.0f, d, ne01);
|
||||
}
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne10,
|
||||
0.0f, d, ne01);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6205,7 +6197,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
||||
for (int i03 = 0; i03 < ne03; i03++) {
|
||||
for (int i02 = 0; i02 < ne02; i02++) {
|
||||
{
|
||||
int id = 0;
|
||||
size_t id = 0;
|
||||
for (int i01 = 0; i01 < ne01; ++i01) {
|
||||
for (int i00 = 0; i00 < ne00; ++i00) {
|
||||
wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
|
||||
@@ -6216,43 +6208,14 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
||||
const float * x = wdata;
|
||||
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
||||
|
||||
// float * z = wdata + ne00*ne01;
|
||||
|
||||
// z = x * yT
|
||||
//{
|
||||
// cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
// ne01, ne11, ne00,
|
||||
// 1.0f, x, ne00,
|
||||
// y, ne00,
|
||||
// 0.0f, z, ne11);
|
||||
//}
|
||||
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
// transpose z
|
||||
//for (int j = 0; j < ne11; ++j) {
|
||||
// for (int i = 0; i < ne01; ++i) {
|
||||
// d[j*ne01 + i] = z[i*ne11 + j];
|
||||
// }
|
||||
//}
|
||||
|
||||
{
|
||||
#if 1
|
||||
// zT = y * xT
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne00,
|
||||
x, ne00,
|
||||
0.0f, d, ne01);
|
||||
#else
|
||||
// zT = (xT * y)T
|
||||
cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
|
||||
ne01, ne11, ne10,
|
||||
1.0f, x, ne00,
|
||||
y, ne00,
|
||||
0.0f, d, ne01);
|
||||
#endif
|
||||
}
|
||||
// zT = y * xT
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne10,
|
||||
0.0f, d, ne01);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6266,7 +6229,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
||||
if (nb01 >= nb00) {
|
||||
ggml_fp16_t * const wdata = params->wdata;
|
||||
|
||||
int id = 0;
|
||||
size_t id = 0;
|
||||
for (int i13 = 0; i13 < ne13; ++i13) {
|
||||
for (int i12 = 0; i12 < ne12; ++i12) {
|
||||
for (int i11 = 0; i11 < ne11; ++i11) {
|
||||
@@ -6354,8 +6317,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
||||
|
||||
float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
|
||||
|
||||
assert(ne00 % 32 == 0);
|
||||
|
||||
for (int ic = 0; ic < ne11; ++ic) {
|
||||
ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00);
|
||||
}
|
||||
@@ -6511,7 +6472,7 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
|
||||
for (int i03 = 0; i03 < ne03; i03++) {
|
||||
for (int i02 = 0; i02 < ne02; i02++) {
|
||||
{
|
||||
int id = 0;
|
||||
size_t id = 0;
|
||||
for (int i01 = 0; i01 < ne01; ++i01) {
|
||||
//for (int i00 = 0; i00 < ne00; ++i00) {
|
||||
// wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
|
||||
@@ -6524,43 +6485,14 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
|
||||
const float * x = wdata;
|
||||
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
||||
|
||||
// float * z = wdata + ne00*ne01;
|
||||
|
||||
// z = x * yT
|
||||
//{
|
||||
// cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
// ne01, ne11, ne00,
|
||||
// 1.0f, x, ne00,
|
||||
// y, ne00,
|
||||
// 0.0f, z, ne11);
|
||||
//}
|
||||
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
// transpose z
|
||||
//for (int j = 0; j < ne11; ++j) {
|
||||
// for (int i = 0; i < ne01; ++i) {
|
||||
// d[j*ne01 + i] = z[i*ne11 + j];
|
||||
// }
|
||||
//}
|
||||
|
||||
{
|
||||
#if 1
|
||||
// zT = y * xT
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne00,
|
||||
x, ne00,
|
||||
0.0f, d, ne01);
|
||||
#else
|
||||
// zT = (xT * y)T
|
||||
cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
|
||||
ne01, ne11, ne10,
|
||||
1.0f, x, ne00,
|
||||
y, ne00,
|
||||
0.0f, d, ne01);
|
||||
#endif
|
||||
}
|
||||
// zT = y * xT
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne10,
|
||||
0.0f, d, ne01);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6811,7 +6743,7 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
|
||||
for (int i03 = 0; i03 < ne03; i03++) {
|
||||
for (int i02 = 0; i02 < ne02; i02++) {
|
||||
{
|
||||
int id = 0;
|
||||
size_t id = 0;
|
||||
for (int i01 = 0; i01 < ne01; ++i01) {
|
||||
//for (int i00 = 0; i00 < ne00; ++i00) {
|
||||
// wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
|
||||
@@ -6824,43 +6756,14 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
|
||||
const float * x = wdata;
|
||||
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
||||
|
||||
// float * z = wdata + ne00*ne01;
|
||||
|
||||
// z = x * yT
|
||||
//{
|
||||
// cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
// ne01, ne11, ne00,
|
||||
// 1.0f, x, ne00,
|
||||
// y, ne00,
|
||||
// 0.0f, z, ne11);
|
||||
//}
|
||||
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
// transpose z
|
||||
//for (int j = 0; j < ne11; ++j) {
|
||||
// for (int i = 0; i < ne01; ++i) {
|
||||
// d[j*ne01 + i] = z[i*ne11 + j];
|
||||
// }
|
||||
//}
|
||||
|
||||
{
|
||||
#if 1
|
||||
// zT = y * xT
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne00,
|
||||
x, ne00,
|
||||
0.0f, d, ne01);
|
||||
#else
|
||||
// zT = (xT * y)T
|
||||
cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
|
||||
ne01, ne11, ne10,
|
||||
1.0f, x, ne00,
|
||||
y, ne00,
|
||||
0.0f, d, ne01);
|
||||
#endif
|
||||
}
|
||||
// zT = y * xT
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne10,
|
||||
0.0f, d, ne01);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
66
llama.cpp
66
llama.cpp
@@ -168,9 +168,11 @@ struct llama_context {
|
||||
|
||||
int64_t t_sample_us = 0;
|
||||
int64_t t_eval_us = 0;
|
||||
int64_t t_p_eval_us = 0;
|
||||
|
||||
int32_t n_sample = 0; // number of tokens sampled
|
||||
int32_t n_eval = 0; // number of eval calls
|
||||
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
||||
|
||||
llama_model model;
|
||||
llama_vocab vocab;
|
||||
@@ -239,7 +241,7 @@ static bool kv_cache_init(
|
||||
const int n_mem = n_layer*n_ctx;
|
||||
const int n_elements = n_embd*n_mem;
|
||||
|
||||
cache.buf.resize(2*n_elements*ggml_type_size(wtype) + 2u*MB);
|
||||
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = cache.buf.size();
|
||||
@@ -267,14 +269,16 @@ static void kv_cache_free(struct llama_kv_cache & cache) {
|
||||
|
||||
struct llama_context_params llama_context_default_params() {
|
||||
struct llama_context_params result = {
|
||||
/*.n_ctx =*/ 512,
|
||||
/*.n_parts =*/ -1,
|
||||
/*.seed =*/ 0,
|
||||
/*.f16_kv =*/ false,
|
||||
/*.logits_all =*/ false,
|
||||
/*.vocab_only =*/ false,
|
||||
/*.use_mlock =*/ false,
|
||||
/*.embedding =*/ false,
|
||||
/*.n_ctx =*/ 512,
|
||||
/*.n_parts =*/ -1,
|
||||
/*.seed =*/ 0,
|
||||
/*.f16_kv =*/ false,
|
||||
/*.logits_all =*/ false,
|
||||
/*.vocab_only =*/ false,
|
||||
/*.use_mlock =*/ false,
|
||||
/*.embedding =*/ false,
|
||||
/*.progress_callback =*/ nullptr,
|
||||
/*.progress_callback_user_data =*/ nullptr,
|
||||
};
|
||||
|
||||
return result;
|
||||
@@ -290,7 +294,9 @@ static bool llama_model_load(
|
||||
int n_ctx,
|
||||
int n_parts,
|
||||
ggml_type memory_type,
|
||||
bool vocab_only) {
|
||||
bool vocab_only,
|
||||
llama_progress_callback progress_callback,
|
||||
void *progress_callback_user_data) {
|
||||
fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
||||
|
||||
const int64_t t_start_us = ggml_time_us();
|
||||
@@ -576,6 +582,10 @@ static bool llama_model_load(
|
||||
|
||||
std::vector<uint8_t> tmp;
|
||||
|
||||
if (progress_callback) {
|
||||
progress_callback(0.0, progress_callback_user_data);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_parts; ++i) {
|
||||
const int part_id = i;
|
||||
//const int part_id = n_parts - i - 1;
|
||||
@@ -589,6 +599,10 @@ static bool llama_model_load(
|
||||
|
||||
fin = std::ifstream(fname_part, std::ios::binary);
|
||||
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
|
||||
|
||||
fin.seekg(0, fin.end);
|
||||
const size_t file_size = fin.tellg();
|
||||
|
||||
fin.seekg(file_offset);
|
||||
|
||||
// load weights
|
||||
@@ -764,6 +778,11 @@ static bool llama_model_load(
|
||||
model.n_loaded++;
|
||||
|
||||
// progress
|
||||
if (progress_callback) {
|
||||
double current_file_progress = double(size_t(fin.tellg()) - file_offset) / double(file_size - file_offset);
|
||||
double current_progress = (double(i) + current_file_progress) / double(n_parts);
|
||||
progress_callback(current_progress, progress_callback_user_data);
|
||||
}
|
||||
if (model.n_loaded % 8 == 0) {
|
||||
fprintf(stderr, ".");
|
||||
fflush(stderr);
|
||||
@@ -786,6 +805,10 @@ static bool llama_model_load(
|
||||
|
||||
lctx.t_load_us = ggml_time_us() - t_start_us;
|
||||
|
||||
if (progress_callback) {
|
||||
progress_callback(1.0, progress_callback_user_data);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -896,8 +919,7 @@ static bool llama_eval_internal(
|
||||
struct ggml_tensor * KQ_scaled =
|
||||
ggml_scale(ctx0,
|
||||
KQ,
|
||||
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
|
||||
);
|
||||
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)));
|
||||
|
||||
// KQ_masked = mask_past(KQ_scaled)
|
||||
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
|
||||
@@ -913,7 +935,7 @@ static bool llama_eval_internal(
|
||||
ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
1, 2, 0, 3),
|
||||
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
|
||||
ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
|
||||
|
||||
// KQV = transpose(V) * KQ_soft_max
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
||||
@@ -1050,6 +1072,10 @@ static bool llama_eval_internal(
|
||||
lctx.t_eval_us += ggml_time_us() - t_start_us;
|
||||
lctx.n_eval++;
|
||||
}
|
||||
else if (N > 1) {
|
||||
lctx.t_p_eval_us += ggml_time_us() - t_start_us;
|
||||
lctx.n_p_eval += N;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -1617,7 +1643,8 @@ struct llama_context * llama_init_from_file(
|
||||
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
||||
|
||||
if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type,
|
||||
params.vocab_only)) {
|
||||
params.vocab_only, params.progress_callback,
|
||||
params.progress_callback_user_data)) {
|
||||
fprintf(stderr, "%s: failed to load model\n", __func__);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
@@ -1790,12 +1817,14 @@ void llama_print_timings(struct llama_context * ctx) {
|
||||
|
||||
const int32_t n_sample = std::max(1, ctx->n_sample);
|
||||
const int32_t n_eval = std::max(1, ctx->n_eval);
|
||||
const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
|
||||
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_sample_us, n_sample, 1e-3f * ctx->t_sample_us / n_sample);
|
||||
fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_eval_us, n_eval, 1e-3f * ctx->t_eval_us / n_eval);
|
||||
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
|
||||
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
|
||||
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_sample_us, n_sample, 1e-3f * ctx->t_sample_us / n_sample);
|
||||
fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3f * ctx->t_p_eval_us, n_p_eval, 1e-3f * ctx->t_p_eval_us / n_p_eval);
|
||||
fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_eval_us, n_eval, 1e-3f * ctx->t_eval_us / n_eval);
|
||||
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
|
||||
}
|
||||
|
||||
void llama_reset_timings(struct llama_context * ctx) {
|
||||
@@ -1803,6 +1832,7 @@ void llama_reset_timings(struct llama_context * ctx) {
|
||||
|
||||
ctx->t_sample_us = ctx->n_sample = 0;
|
||||
ctx->t_eval_us = ctx->n_eval = 0;
|
||||
ctx->t_p_eval_us = ctx->n_p_eval = 0;
|
||||
}
|
||||
|
||||
const char * llama_print_system_info(void) {
|
||||
|
||||
9
llama.h
9
llama.h
@@ -45,6 +45,8 @@ extern "C" {
|
||||
|
||||
} llama_token_data;
|
||||
|
||||
typedef void (*llama_progress_callback)(double progress, void *ctx);
|
||||
|
||||
struct llama_context_params {
|
||||
int n_ctx; // text context
|
||||
int n_parts; // -1 for default
|
||||
@@ -55,6 +57,11 @@ extern "C" {
|
||||
bool vocab_only; // only load the vocabulary, no weights
|
||||
bool use_mlock; // force system to keep model in RAM
|
||||
bool embedding; // embedding mode only
|
||||
|
||||
// called with a progress value between 0 and 1, pass NULL to disable
|
||||
llama_progress_callback progress_callback;
|
||||
// context pointer passed to the progress callback
|
||||
void * progress_callback_user_data;
|
||||
};
|
||||
|
||||
LLAMA_API struct llama_context_params llama_context_default_params();
|
||||
@@ -123,7 +130,7 @@ extern "C" {
|
||||
|
||||
// TODO: improve the last_n_tokens interface ?
|
||||
LLAMA_API llama_token llama_sample_top_p_top_k(
|
||||
llama_context * ctx,
|
||||
struct llama_context * ctx,
|
||||
const llama_token * last_n_tokens_data,
|
||||
int last_n_tokens_size,
|
||||
int top_k,
|
||||
|
||||
9
main.cpp
9
main.cpp
@@ -300,6 +300,10 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if (!params.input_prefix.empty()) {
|
||||
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
|
||||
fprintf(stderr, "\n\n");
|
||||
@@ -472,6 +476,11 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
std::string buffer;
|
||||
if (!params.input_prefix.empty()) {
|
||||
buffer += params.input_prefix;
|
||||
printf("%s", buffer.c_str());
|
||||
}
|
||||
|
||||
std::string line;
|
||||
bool another_line = true;
|
||||
do {
|
||||
|
||||
@@ -155,6 +155,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
exit(0);
|
||||
} else if (arg == "--random-prompt") {
|
||||
params.random_prompt = true;
|
||||
} else if (arg == "--in-prefix") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.input_prefix = argv[i];
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
gpt_print_usage(argc, argv, params);
|
||||
@@ -187,6 +193,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
|
||||
fprintf(stderr, " prompt to start generation with (default: empty)\n");
|
||||
fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
|
||||
fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
|
||||
fprintf(stderr, " -f FNAME, --file FNAME\n");
|
||||
fprintf(stderr, " prompt file to start generation.\n");
|
||||
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
|
||||
|
||||
1
utils.h
1
utils.h
@@ -30,6 +30,7 @@ struct gpt_params {
|
||||
|
||||
std::string model = "models/lamma-7B/ggml-model.bin"; // model path
|
||||
std::string prompt = "";
|
||||
std::string input_prefix = ""; // string to prefix user inputs with
|
||||
|
||||
|
||||
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
||||
|
||||
Reference in New Issue
Block a user