Compare commits

..

4 Commits

Author SHA1 Message Date
xloem
ea3a0ad6b6 llama : update stubs for systems without mmap and mlock (#1266)
Co-authored-by: John Doe <john.doe@example.com>
2023-05-01 15:58:51 +03:00
Kerfuffle
2bdc09646d ggml : fix ggml_used_mem() (#1264) 2023-05-01 14:56:07 +03:00
Georgi Gerganov
70269cae37 llama : fix session load / save (#1263) 2023-05-01 14:54:59 +03:00
slaren
b925f1f1b0 cuBLAS: fall back to pageable memory if pinned alloc fails (#1233)
* cuBLAS: fall back to pageable memory if pinned alloc fails

* cuBLAS: do not use pinned memory if env variable GGML_CUDA_NO_PINNED is set
2023-05-01 13:32:22 +02:00
6 changed files with 157 additions and 81 deletions

View File

@@ -161,23 +161,22 @@ int main(int argc, char ** argv) {
std::vector<llama_token> session_tokens;
if (!path_session.empty()) {
fprintf(stderr, "%s: attempting to load saved session from %s..\n", __func__, path_session.c_str());
fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
// REVIEW - fopen to check for existing session
// fopen to check for existing session
FILE * fp = std::fopen(path_session.c_str(), "rb");
if (fp != NULL) {
std::fclose(fp);
session_tokens.resize(params.n_ctx);
size_t n_token_count_out = 0;
const size_t n_session_bytes = llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out);
if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
return 1;
}
session_tokens.resize(n_token_count_out);
if (n_session_bytes > 0) {
fprintf(stderr, "%s: loaded %zu bytes of session data!\n", __func__, n_session_bytes);
} else {
fprintf(stderr, "%s: could not load session file, will recreate\n", __func__);
}
fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
} else {
fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
}
@@ -214,7 +213,7 @@ int main(int argc, char ** argv) {
}
// number of tokens to keep when resetting context
if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
params.n_keep = (int)embd_inp.size();
}
@@ -329,7 +328,7 @@ int main(int argc, char ** argv) {
// insert n_left/2 tokens at the start of embd from last_n_tokens
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
// REVIEW - stop saving session if we run out of context
// stop saving session if we run out of context
path_session = "";
//printf("\n---\n");
@@ -355,6 +354,7 @@ int main(int argc, char ** argv) {
n_session_consumed++;
if (n_session_consumed >= (int) session_tokens.size()) {
++i;
break;
}
}

View File

@@ -355,8 +355,18 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src,
}
void * ggml_cuda_host_malloc(size_t size) {
void * ptr;
CUDA_CHECK(cudaMallocHost((void **) &ptr, size));
if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
return nullptr;
}
void * ptr = nullptr;
cudaError_t err = cudaMallocHost((void **) &ptr, size);
if (err != cudaSuccess) {
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
size/1024.0/1024.0, cudaGetErrorString(err));
return nullptr;
}
return ptr;
}

2
ggml.c
View File

@@ -4411,7 +4411,7 @@ void ggml_free(struct ggml_context * ctx) {
}
size_t ggml_used_mem(const struct ggml_context * ctx) {
return ctx->objects_end->offs + ctx->objects_end->size;
return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
}
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {

View File

@@ -243,7 +243,8 @@ struct llama_mmap {
#else
static constexpr bool SUPPORTED = false;
llama_mmap(struct llama_file *) {
llama_mmap(struct llama_file *, bool prefetch = true) {
(void)prefetch;
throw std::string("mmap not supported");
}
#endif
@@ -382,8 +383,13 @@ struct llama_mlock {
#else
static constexpr bool SUPPORTED = false;
void raw_lock(const void * addr, size_t size) {
size_t lock_granularity() {
return (size_t) 65536;
}
bool raw_lock(const void * addr, size_t size) {
fprintf(stderr, "warning: mlock not supported on this system\n");
return false;
}
void raw_unlock(const void * addr, size_t size) {}
@@ -395,6 +401,8 @@ struct llama_buffer {
uint8_t * addr = NULL;
size_t size = 0;
llama_buffer() = default;
void resize(size_t size) {
delete[] addr;
addr = new uint8_t[size];
@@ -404,27 +412,59 @@ struct llama_buffer {
~llama_buffer() {
delete[] addr;
}
// disable copy and move
llama_buffer(const llama_buffer&) = delete;
llama_buffer(llama_buffer&&) = delete;
llama_buffer& operator=(const llama_buffer&) = delete;
llama_buffer& operator=(llama_buffer&&) = delete;
};
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
struct llama_ctx_buffer {
uint8_t * addr = NULL;
bool is_cuda;
size_t size = 0;
llama_ctx_buffer() = default;
void resize(size_t size) {
if (addr) {
ggml_cuda_host_free(addr);
}
free();
addr = (uint8_t *) ggml_cuda_host_malloc(size);
if (addr) {
is_cuda = true;
}
else {
// fall back to pageable memory
addr = new uint8_t[size];
is_cuda = false;
}
this->size = size;
}
~llama_ctx_buffer() {
void free() {
if (addr) {
ggml_cuda_host_free(addr);
if (is_cuda) {
ggml_cuda_host_free(addr);
}
else {
delete[] addr;
}
}
addr = NULL;
}
~llama_ctx_buffer() {
free();
}
// disable copy and move
llama_ctx_buffer(const llama_ctx_buffer&) = delete;
llama_ctx_buffer(llama_ctx_buffer&&) = delete;
llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
};
#else
typedef llama_buffer llama_ctx_buffer;

136
llama.cpp
View File

@@ -727,8 +727,7 @@ struct llama_model_loader {
LLAMA_ASSERT(offset == lt.size);
} else if (lt.split_type == SPLIT_BY_COLUMNS) {
// Let's load the data into temporary buffers to ensure the OS performs large loads.
std::vector<llama_buffer> tmp_bufs;
tmp_bufs.resize(lt.shards.size());
std::vector<llama_buffer> tmp_bufs(lt.shards.size());
for (size_t i = 0; i < lt.shards.size(); i++) {
llama_load_tensor_shard & shard = lt.shards.at(i);
llama_file & file = file_loaders.at(shard.file_idx)->file;
@@ -2567,6 +2566,85 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
return nread;
}
bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
llama_file file(path_session, "rb");
// sanity checks
{
const uint32_t magic = file.read_u32();
const uint32_t version = file.read_u32();
if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
return false;
}
llama_hparams session_hparams;
file.read_raw(&session_hparams, sizeof(llama_hparams));
if (session_hparams != ctx->model.hparams) {
fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
return false;
}
}
// load the prompt
{
const uint32_t n_token_count = file.read_u32();
if (n_token_count > n_token_capacity) {
fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
return false;
}
file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
*n_token_count_out = n_token_count;
}
// restore the context state
{
const size_t n_state_size_cur = file.size - file.tell();
const size_t n_state_size_exp = llama_get_state_size(ctx);
if (n_state_size_cur != n_state_size_exp) {
fprintf(stderr, "%s : the state size in session file didn't match! expected %zu, got %zu\n", __func__, n_state_size_exp, n_state_size_cur);
return false;
}
std::vector<uint8_t> state_data(n_state_size_cur);
file.read_raw(state_data.data(), n_state_size_cur);
llama_set_state_data(ctx, state_data.data());
}
return true;
}
bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
llama_file file(path_session, "wb");
file.write_u32(LLAMA_SESSION_MAGIC);
file.write_u32(LLAMA_SESSION_VERSION);
file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
// save the prompt
file.write_u32((uint32_t) n_token_count);
file.write_raw(tokens, sizeof(llama_token) * n_token_count);
// save the context state
{
const size_t n_state_size = llama_get_state_size(ctx);
std::vector<uint8_t> state_data(n_state_size);
llama_copy_state_data(ctx, state_data.data());
file.write_raw(state_data.data(), n_state_size);
}
return true;
}
int llama_eval(
struct llama_context * ctx,
const llama_token * tokens,
@@ -2694,57 +2772,3 @@ const char * llama_print_system_info(void) {
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
return ctx->model.tensors_by_name;
}
size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
// TODO leverage mmap
llama_file file(path_session, "rb");
const uint32_t magic = file.read_u32();
const uint32_t version = file.read_u32();
if (!(magic == 'ggsn' && version == 0)) {
fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
return 0;
}
llama_hparams session_hparams;
file.read_raw(&session_hparams, sizeof(llama_hparams));
// REVIEW
if (session_hparams != ctx->model.hparams) {
fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
return 0;
}
const uint32_t n_token_count = file.read_u32();
LLAMA_ASSERT(n_token_capacity >= n_token_count);
file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
*n_token_count_out = n_token_count;
const size_t n_state_size = file.size - file.tell();
const size_t n_orig_state_size = llama_get_state_size(ctx);
if (n_state_size != n_orig_state_size) {
fprintf(stderr, "%s : failed to validate state size\n", __func__);
}
std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
file.read_raw(state_data.get(), n_state_size);
return llama_set_state_data(ctx, state_data.get());
}
size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
// TODO save temp & swap
llama_file file(path_session, "wb");
const size_t n_state_size = llama_get_state_size(ctx);
std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
llama_copy_state_data(ctx, state_data.get());
file.write_u32('ggsn'); // magic
file.write_u32(0); // version
file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
file.write_u32((uint32_t) n_token_count); // REVIEW
file.write_raw(tokens, sizeof(llama_token) * n_token_count);
file.write_raw(state_data.get(), n_state_size);
return n_state_size; // REVIEW
}

12
llama.h
View File

@@ -19,9 +19,11 @@
# define LLAMA_API
#endif
#define LLAMA_FILE_VERSION 1
#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
#define LLAMA_FILE_VERSION 1
#define LLAMA_FILE_MAGIC 'ggjt'
#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
#define LLAMA_SESSION_MAGIC 'ggsn'
#define LLAMA_SESSION_VERSION 0
#ifdef __cplusplus
extern "C" {
@@ -138,8 +140,8 @@ extern "C" {
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
// Save/load session file
LLAMA_API size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
LLAMA_API size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
// Run the llama inference to obtain the logits and probabilities for the next token.
// tokens + n_tokens is the provided batch of new tokens to process