Compare commits

...

2 Commits

Author SHA1 Message Date
mgroeber9110
c2df36d60d llama : consistently catch and throw only exceptions deriving from std::exception (#1599)
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 23:24:29 +03:00
kiltyj
9d0693bce3 metal : use shared buffers between CPU and GPU (#1696)
* Use MTLDevice.newBufferWithBytesNoCopy to share buffers between CPU and GPU

* Page-align buffers used by Metal

* Remove trailing whitespace

* Only import unistd.h for Metal builds

* metal : remove unnecessary copies

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-06-05 23:24:04 +03:00
4 changed files with 68 additions and 45 deletions

View File

@@ -195,14 +195,25 @@ bool ggml_metal_add_buffer(
}
}
size_t page_size = getpagesize();
size_t aligned_size = size;
if ((aligned_size % page_size) != 0) {
aligned_size += (page_size - (aligned_size % page_size));
}
ctx->buffers[ctx->n_buffers].name = name;
ctx->buffers[ctx->n_buffers].data = data;
ctx->buffers[ctx->n_buffers].size = size;
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytes:data length:size options:MTLResourceStorageModeShared];
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];
if (ctx->buffers[ctx->n_buffers].metal == nil) {
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
return false;
} else {
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
}
++ctx->n_buffers;
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, size / 1024.0 / 1024.0);
}
return true;

8
ggml.c
View File

@@ -22,6 +22,10 @@
#include <float.h>
#include <limits.h>
#ifdef GGML_USE_METAL
#include <unistd.h>
#endif
// if C99 - static_assert is noop
// ref: https://stackoverflow.com/a/53923785/4039976
#ifndef static_assert
@@ -122,7 +126,11 @@ typedef void* thread_ret_t;
#else
inline static void* ggml_aligned_malloc(size_t size) {
void* aligned_memory = NULL;
#ifdef GGML_USE_METAL
int result = posix_memalign(&aligned_memory, getpagesize(), size);
#else
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
#endif
if (result != 0) {
// Handle allocation failure
return NULL;

View File

@@ -405,13 +405,29 @@ struct llama_buffer {
llama_buffer() = default;
void resize(size_t len) {
#ifdef GGML_USE_METAL
free(addr);
int result = posix_memalign((void **) &addr, getpagesize(), len);
if (result == 0) {
memset(addr, 0, len);
}
else {
addr = NULL;
}
#else
delete[] addr;
addr = new uint8_t[len];
#endif
size = len;
}
~llama_buffer() {
#ifdef GGML_USE_METAL
free(addr);
#else
delete[] addr;
#endif
addr = NULL;
}
// disable copy and move

View File

@@ -53,7 +53,6 @@ enum e_model {
MODEL_65B,
};
static const size_t MB = 1024*1024;
// computed for n_ctx == 2048
@@ -290,15 +289,15 @@ template <typename T>
static T checked_mul(T a, T b) {
T ret = a * b;
if (a != 0 && ret / a != b) {
throw format("overflow multiplying %llu * %llu",
(unsigned long long) a, (unsigned long long) b);
throw std::runtime_error(format("overflow multiplying %llu * %llu",
(unsigned long long) a, (unsigned long long) b));
}
return ret;
}
static size_t checked_div(size_t a, size_t b) {
if (b == 0 || a % b != 0) {
throw format("error dividing %zu / %zu", a, b);
throw std::runtime_error(format("error dividing %zu / %zu", a, b));
}
return a / b;
}
@@ -362,7 +361,7 @@ struct llama_load_tensor {
const auto & first_shard = shards.at(0);
for (const auto & shard : shards) {
if (shard.type != first_shard.type) {
throw format("inconsistent tensor shard type in '%s'", name.c_str());
throw std::runtime_error(format("inconsistent tensor shard type in '%s'", name.c_str()));
}
}
type = first_shard.type;
@@ -385,8 +384,8 @@ struct llama_load_tensor {
const auto & first_shard = shards.at(0);
for (const auto & shard : shards) {
if (shard.ne != first_shard.ne) {
throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str());
throw std::runtime_error(format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()));
}
}
ne = first_shard.ne;
@@ -464,8 +463,8 @@ struct llama_file_loader {
}
}
throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
magic, version);
throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
magic, version));
}
void read_hparams() {
hparams.n_vocab = file.read_u32();
@@ -505,7 +504,7 @@ struct llama_file_loader {
file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
std::string name = file.read_string(name_len);
if (n_dims < 1 || n_dims > 2) {
throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
}
switch (shard.type) {
case GGML_TYPE_F32:
@@ -522,7 +521,7 @@ struct llama_file_loader {
case GGML_TYPE_Q6_K:
break;
default: {
throw format("unrecognized tensor type %u\n", shard.type);
throw std::runtime_error(format("unrecognized tensor type %u\n", shard.type));
}
}
@@ -631,7 +630,7 @@ struct llama_model_loader {
auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
file_loaders.emplace_back(ith_file);
if (ith_file->hparams != first_file->hparams) {
throw format("llama.cpp: hparams inconsistent between files");
throw std::runtime_error(format("llama.cpp: hparams inconsistent between files"));
}
}
if (!llama_mmap::SUPPORTED) {
@@ -661,7 +660,7 @@ struct llama_model_loader {
uint32_t guess_n_parts() const {
auto it = tensors_map.name_to_idx.find("tok_embeddings.weight");
if (it == tensors_map.name_to_idx.end()) {
throw std::string("missing tok_embeddings.weight");
throw std::runtime_error(std::string("missing tok_embeddings.weight"));
}
const llama_load_tensor & lt = tensors_map.tensors.at(it->second);
return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
@@ -678,12 +677,12 @@ struct llama_model_loader {
struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
auto it = tensors_map.name_to_idx.find(name);
if (it == tensors_map.name_to_idx.end()) {
throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
}
llama_load_tensor & lt = tensors_map.tensors.at(it->second);
if (lt.ne != ne) {
throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
}
return get_tensor_for(lt, backend);
@@ -707,7 +706,7 @@ struct llama_model_loader {
void done_getting_tensors() const {
if (num_ggml_tensors_created != tensors_map.tensors.size()) {
throw std::string("llama.cpp: file contained more tensors than expected");
throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
}
}
@@ -995,7 +994,7 @@ static void llama_model_load_internal(
if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"));
}
}
@@ -1003,7 +1002,7 @@ static void llama_model_load_internal(
if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
throw std::runtime_error(format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"));
}
}
@@ -1034,7 +1033,7 @@ static void llama_model_load_internal(
model.ctx = ggml_init(params);
if (!model.ctx) {
throw format("ggml_init() failed");
throw std::runtime_error(format("ggml_init() failed"));
}
}
@@ -1215,8 +1214,8 @@ static bool llama_model_load(
llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
vocab_only, progress_callback, progress_callback_user_data);
return true;
} catch (const std::string & err) {
fprintf(stderr, "error loading model: %s\n", err.c_str());
} catch (const std::exception & err) {
fprintf(stderr, "error loading model: %s\n", err.what());
return false;
}
}
@@ -1281,12 +1280,6 @@ static bool llama_eval_internal(
ggml_set_name(embd, "embd");
memcpy(embd->data, tokens, N*ggml_element_size(embd));
#ifdef GGML_USE_METAL
if (lctx.ctx_metal && N == 1) {
ggml_metal_set_tensor(lctx.ctx_metal, embd);
}
#endif
struct ggml_tensor * cur;
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
@@ -1484,12 +1477,6 @@ static bool llama_eval_internal(
}
ggml_graph_compute(ctx0, &gf);
if (lctx.ctx_metal) {
// We need to sync the CPU KV cache with the GPU KV cache
ggml_metal_set_tensor(lctx.ctx_metal, kv_self.k);
ggml_metal_set_tensor(lctx.ctx_metal, kv_self.v);
}
}
#else
ggml_graph_compute(ctx0, &gf);
@@ -2133,8 +2120,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
// K-quants
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -2142,8 +2130,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
default: throw format("invalid output file type %d\n", ftype);
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
}
if (nthread <= 0) {
@@ -2244,7 +2232,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
}
} else {
throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
throw std::runtime_error(format("type %s unsupported for integer quantization", ggml_type_name(tensor.type)));
}
printf("quantizing .. ");
@@ -2446,8 +2434,8 @@ int llama_model_quantize(
try {
llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
return 0;
} catch (const std::string & err) {
fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
} catch (const std::exception & err) {
fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
return 1;
}
}
@@ -2700,8 +2688,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
try {
return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
} catch (const std::string & err) {
fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
} catch (const std::exception & err) {
fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
return 1;
}
}