Compare commits

...

5 Commits

Author SHA1 Message Date
Howard Su
58970a4c39 Leverage mmap for offloading tensors to GPU (#1597)
* Rebase to latest

* Show progress

* Add assert to make sure we only allocate temp buffer for non-CPU backend tensor

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2023-06-12 14:44:16 +02:00
Kawrakow
8c0a10e64d metal : fix failure to load model (#1817)
The number of buffers in the ggml context was left unitialized.
This leads to sporadic failures to load the model on
startup. It is actually strange that the failure occurred so
infrequantly.

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-12 14:31:36 +03:00
Kerfuffle
fa84c4b3e8 Fix issue where interactive mode crashes when input exceeds ctx size (#1789)
* Fix issue where interactive mode in the main example crashes when input exceeds ctx size

* Ensure the context size is at least 8 tokens in the main example.

Closes #1768
2023-06-11 08:19:17 -06:00
Kyle Liang
12b063f0ec Fixed WSL cuda's OOM error (#1594)
* In the function , add the cuda error bypass.

* remove excessive codes and prints

---------

Co-authored-by: liang <liangmanlai@126.com>
2023-06-11 15:20:52 +02:00
Ryan Landay
31d2b5f4a4 Update SHA256SUMS with current hashes for models quantized using q4_0 (#1798) 2023-06-11 12:38:53 +03:00
10 changed files with 85 additions and 120 deletions

View File

@@ -1,6 +1,6 @@
700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_0.bin
ec2f2d1f0dfb73b72a4cbac7fa121abbe04c37ab327125a38248f930c0f09ddf models/7B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_1.bin
@@ -8,7 +8,7 @@ ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_0.bin
fad169e6f0f575402cf75945961cb4a8ecd824ba4da6be2af831f320c4348fa5 models/13B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_1.bin
@@ -18,7 +18,7 @@ e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/con
24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth
1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth
7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_0.bin
d2a441403944819492ec8c2002cc36fa38468149bfb4b7b4c52afc7bd9a7166d models/30B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_1.bin
@@ -32,7 +32,7 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/con
72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth
d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth
60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_0.bin
cde053439fa4910ae454407e2717cc46cc2c2b4995c00c93297a2b52e790fa92 models/65B/ggml-model-q4_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_1.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_0.bin
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_1.bin

View File

@@ -632,6 +632,9 @@ void console_set_color(console_state & con_st, console_color_t color) {
case CONSOLE_COLOR_USER_INPUT:
fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN);
break;
case CONSOLE_COLOR_ERROR:
fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_RED);
break;
}
con_st.color = color;
fflush(con_st.out);

View File

@@ -112,7 +112,8 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
enum console_color_t {
CONSOLE_COLOR_DEFAULT=0,
CONSOLE_COLOR_PROMPT,
CONSOLE_COLOR_USER_INPUT
CONSOLE_COLOR_USER_INPUT,
CONSOLE_COLOR_ERROR
};
struct console_state {

View File

@@ -81,6 +81,9 @@ int main(int argc, char ** argv) {
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
} else if (params.n_ctx < 8) {
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
@@ -331,6 +334,19 @@ int main(int argc, char ** argv) {
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict
if (embd.size() > 0) {
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
// --prompt or --file which uses the same value.
auto max_embd_size = n_ctx - 4;
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
if ((int)embd.size() > max_embd_size) {
auto skipped_tokens = embd.size() - max_embd_size;
console_set_color(con_st, CONSOLE_COLOR_ERROR);
printf("<<input too long: skipped %ld token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
fflush(stdout);
embd.resize(max_embd_size);
}
// infinite text generation via context swapping
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)

View File

@@ -1105,6 +1105,9 @@ void * ggml_cuda_host_malloc(size_t size) {
void * ptr = nullptr;
cudaError_t err = cudaMallocHost((void **) &ptr, size);
if (err != cudaSuccess) {
// The allocation error can be bypassed. A null ptr will assigned out of this function.
// This can fixed the OOM error in WSL.
cudaGetLastError();
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
size/1024.0/1024.0, cudaGetErrorString(err));
return nullptr;
@@ -1710,8 +1713,7 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
(void) dst;
}
void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
FILE * fp = fopen(fname, "rb");
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
int nrows = ggml_nrows(tensor);
const size_t nb1 = tensor->nb[1];
ggml_backend backend = tensor->backend;
@@ -1745,35 +1747,19 @@ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const
int64_t nrows_split = row_high - row_low;
const size_t offset_split = offset + row_low*nb1;
const size_t offset_split = row_low*nb1;
const size_t size = ggml_nbytes_split(tensor, nrows_split);
void * buf;
CUDA_CHECK(cudaMalloc(&buf, size));
void * buf_host = malloc(size);
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset_split, SEEK_SET);
#else
int ret = fseek(fp, (long) offset_split, SEEK_SET);
#endif
GGML_ASSERT(ret == 0); // same
size_t ret2 = fread(buf_host, size, 1, fp);
if (ret2 != 1) {
fprintf(stderr, "unexpectedly reached end of file");
exit(1);
}
void * buf_host = (char*)data + offset_split;
cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
free(buf_host);
extra->data_device[id] = buf;
}
tensor->extra = extra;
fclose(fp);
}
void ggml_cuda_free_data(struct ggml_tensor * tensor) {

View File

@@ -24,7 +24,8 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
void * ggml_cuda_host_malloc(size_t size);
void ggml_cuda_host_free(void * ptr);
void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
void ggml_cuda_free_data(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
void ggml_cuda_set_main_device(int main_device);

View File

@@ -86,6 +86,7 @@ struct ggml_metal_context * ggml_metal_init(void) {
ctx->device = MTLCreateSystemDefaultDevice();
ctx->queue = [ctx->device newCommandQueue];
ctx->n_buffers = 0;
// determine if we can use MPS
if (MPSSupportsMTLDevice(ctx->device)) {

View File

@@ -1167,7 +1167,7 @@ size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct g
return 0;
}
void ggml_cl_transform_tensor(ggml_tensor * tensor) {
void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
const int64_t ne0 = tensor->ne[0];
const int64_t ne1 = tensor->ne[1];
const int64_t ne2 = tensor->ne[2];
@@ -1179,6 +1179,7 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
size_t q_size;
cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
tensor->data = data;
// copy tensor to device
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) {
@@ -1190,35 +1191,5 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
CL_CHECK(clFinish(queue));
tensor->data = dst;
tensor->backend = GGML_BACKEND_GPU;
}
void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
cl_int err;
FILE * fp = fopen(fname, "rb");
const size_t size = ggml_nbytes(tensor);
cl_mem dst;
CL_CHECK((dst = clCreateBuffer(context, CL_MEM_READ_ONLY, size, nullptr, &err), err));
void * buf_host = malloc(size);
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
#else
int ret = fseek(fp, (long) offset, SEEK_SET);
#endif
GGML_ASSERT(ret == 0); // same
size_t ret2 = fread(buf_host, size, 1, fp);
if (ret2 != 1) {
fprintf(stderr, "unexpectedly reached end of file");
exit(1);
}
clEnqueueWriteBuffer(queue, dst, CL_TRUE, 0, size, buf_host, 0, nullptr, nullptr);
tensor->data = dst;
free(buf_host);
fclose(fp);
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
}

View File

@@ -18,8 +18,7 @@ void ggml_cl_host_free(void * ptr);
void ggml_cl_free_data(const struct ggml_tensor* tensor);
void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, size_t offset);
void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
#ifdef __cplusplus
}

107
llama.cpp
View File

@@ -707,6 +707,9 @@ struct llama_model_loader {
struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
struct ggml_tensor * tensor;
if (backend != GGML_BACKEND_CPU) {
ggml_set_no_alloc(ggml_ctx, true);
}
if (lt.ne.size() == 2) {
tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
} else {
@@ -716,6 +719,9 @@ struct llama_model_loader {
ggml_set_name(tensor, lt.name.c_str());
LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
if (backend != GGML_BACKEND_CPU) {
ggml_set_no_alloc(ggml_ctx, use_mmap);
}
tensor->backend = backend;
lt.ggml_tensor = tensor;
num_ggml_tensors_created++;
@@ -731,6 +737,7 @@ struct llama_model_loader {
void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
size_t data_size = 0;
size_t prefetch_size = 0;
size_t lock_size = 0;
for (const llama_load_tensor & lt : tensors_map.tensors) {
data_size += lt.size;
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -740,11 +747,6 @@ struct llama_model_loader {
if (use_mmap) {
mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
if (!lmlock) {
// Don't call the callback since the actual loading will be lazy
// and we can't measure it.
progress_callback = NULL;
}
if (lmlock) {
lmlock->init(mapping->addr);
}
@@ -752,20 +754,49 @@ struct llama_model_loader {
size_t done_size = 0;
for (llama_load_tensor & lt : tensors_map.tensors) {
if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
continue;
}
if (progress_callback) {
progress_callback((float) done_size / data_size, progress_callback_user_data);
}
LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
lt.data = (uint8_t *) lt.ggml_tensor->data;
load_data_for(lt);
lt.ggml_tensor->data = lt.data;
done_size += lt.size;
if (use_mmap && lmlock) {
lmlock->grow_to(done_size);
// allocate temp buffer if not using mmap
if (!use_mmap && lt.data == NULL) {
GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor));
}
load_data_for(lt);
switch(lt.ggml_tensor->backend) {
case GGML_BACKEND_CPU:
lt.ggml_tensor->data = lt.data;
if (use_mmap && lmlock) {
lock_size += lt.size;
lmlock->grow_to(lock_size);
}
break;
#if defined(GGML_USE_CUBLAS)
case GGML_BACKEND_GPU:
case GGML_BACKEND_GPU_SPLIT:
ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
if (!use_mmap) {
free(lt.data);
}
break;
#elif defined(GGML_USE_CLBLAST)
case GGML_BACKEND_GPU:
ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
if (!use_mmap) {
free(lt.data);
}
break;
#endif
default:
continue;
}
done_size += lt.size;
}
}
@@ -1141,7 +1172,7 @@ static void llama_model_load_internal(
if (backend == GGML_BACKEND_GPU) {
vram_weights +=
ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
}
}
@@ -1196,58 +1227,14 @@ static void llama_model_load_internal(
model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
}
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
#if defined(GGML_USE_CUBLAS)
{
ggml_cuda_set_tensor_split(tensor_split);
size_t done_size = 0;
size_t data_size = 0;
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
data_size += lt.size;
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
done_size += lt.size;
}
}
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
ggml_backend backend = lt.ggml_tensor->backend;
if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
continue;
}
if (progress_callback) {
progress_callback((float) done_size / data_size, progress_callback_user_data);
}
ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
done_size += lt.size;
}
}
#elif defined(GGML_USE_CLBLAST)
{
size_t done_size = 0;
size_t data_size = 0;
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
data_size += lt.size;
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
done_size += lt.size;
}
}
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
continue;
}
if (progress_callback) {
progress_callback((float) done_size / data_size, progress_callback_user_data);
}
ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
done_size += lt.size;
}
}
#else
(void) n_batch;
(void) tensor_split;
#endif
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
if (progress_callback) {
progress_callback(1.0f, progress_callback_user_data);
}