Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-04-16 16:27:32 +03:00)

Compare commits: b3051...sl/rpc-bac (3 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 5f8720fb7b | |
| | a7060dffdd | |
| | 6c276deb9d | |
Makefile (29 changes)
```diff
@@ -67,6 +67,10 @@ ifeq ($(UNAME_S),Darwin)
 endif
 endif

+ifdef LLAMA_RPC
+	BUILD_TARGETS += rpc-server
+endif
+
 default: $(BUILD_TARGETS)

 test: $(TEST_TARGETS)
@@ -416,6 +420,11 @@ ifdef LLAMA_BLIS
 	MK_LDFLAGS += -lblis -L/usr/local/lib
 endif # LLAMA_BLIS

+ifdef LLAMA_RPC
+	MK_CPPFLAGS += -DGGML_USE_RPC
+	OBJS        += ggml-rpc.o
+endif # LLAMA_RPC
+
 ifdef LLAMA_CUBLAS
 # LLAMA_CUBLAS is deprecated and will be removed in the future
 	LLAMA_CUDA := 1
@@ -626,11 +635,26 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 endif
 endif # LLAMA_METAL

+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
+
 ifndef LLAMA_NO_LLAMAFILE
 sgemm.o: sgemm.cpp sgemm.h ggml.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif

+ifdef LLAMA_RPC
+ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+endif # LLAMA_RPC
+
 GF_CC := $(CC)
 include scripts/get-flags.mk

@@ -710,14 +734,9 @@ unicode.o: unicode.cpp unicode.h
 unicode-data.o: unicode-data.cpp unicode-data.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
-
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
-COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
-
 common.o: common/common.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
```
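With these rules in place, RPC support is opt-in at build time: something like `make LLAMA_RPC=1 rpc-server` (a plausible invocation inferred from the rules above, not a command taken from this diff) compiles `ggml-rpc.o` with `-DGGML_USE_RPC` and links the standalone `rpc-server` binary, and any other target built with `LLAMA_RPC=1` gets the RPC backend compiled in as well.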
README.md (14 changes)
````diff
@@ -2,12 +2,12 @@

 ![llama](https://github.com/ggerganov/llama.cpp/assets/1991296/4f529336-2c234-465e-ae46-778c9f02afbb)

-[](https://opensource.org/licenses/MIT)
-[](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
-[](https://conan.io/center/llama-cpp)
+[](https://opensource.org/licenses/MIT) [](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

+[](https://conan.io/center/llama-cpp)
+
 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

 ### Recent API changes
@@ -388,14 +388,6 @@ In order to build llama.cpp you have four different options.
   CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read
   the instructions for use and activate this options in this document below.

-### Homebrew
-
-On Mac and Linux, the homebrew package manager can be used via
-```
-brew install llama.cpp
-```
-The formula is automatically updated with new `llama.cpp` releases.
-
 ### Metal Build

 On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
````
ggml-alloc.c

```diff
@@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
                 // this tensor was allocated without ggml-backend
                 return;
             }
-            ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
+            ggml_backend_view_init(tensor);
         }
     } else {
         if (tensor->data == NULL) {
@@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
             if (t->view_src == NULL) {
                 ggml_tallocr_alloc(&tallocr, t);
             } else if (t->buffer == NULL) {
-                ggml_backend_view_init(buffer, t);
+                ggml_backend_view_init(t);
             }
         } else {
             if (t->view_src != NULL && t->buffer == NULL) {
                 // view of a pre-allocated tensor
-                ggml_backend_view_init(buffer, t);
+                ggml_backend_view_init(t);
             }
         }
     }
```
ggml-backend.c

```diff
@@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
 bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
     ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
     if (dst_buf->iface.cpy_tensor) {
-        return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
+        return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
     }
     return false;
 }
@@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,

 // utils

-void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+void ggml_backend_view_init(struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
     GGML_ASSERT(tensor->view_src != NULL);
     GGML_ASSERT(tensor->view_src->buffer != NULL);
     GGML_ASSERT(tensor->view_src->data != NULL);

-    tensor->buffer = buffer;
+    tensor->buffer = tensor->view_src->buffer;
     tensor->data   = (char *)tensor->view_src->data + tensor->view_offs;
-    ggml_backend_buffer_init_tensor(buffer, tensor);
+    ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
 }

 void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
@@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
     struct ggml_tensor * dst = node_copies[id];
     if (dst->view_src != NULL) {
         graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
-        ggml_backend_view_init(dst->view_src->buffer, dst);
+        ggml_backend_view_init(dst);
     }
     else {
         ggml_backend_tensor_copy(src, dst);
```
ggml-backend.h

```diff
@@ -225,7 +225,7 @@ extern "C" {

     // Tensor initialization
     GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
-    GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);

 #ifdef __cplusplus
```
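The net effect of the `ggml_backend_view_init` change is that a view tensor no longer needs its buffer passed in: the buffer is taken from `view_src`, which the asserts above already required to be allocated. A minimal sketch of a call site under the new signature (the helper and its name are illustrative, not from this diff):

```cpp
#include "ggml.h"
#include "ggml-backend.h"

// illustrative helper: create a full-length view of an already-allocated
// tensor and initialize it; the view inherits src's buffer via view_src
static struct ggml_tensor * make_full_view(struct ggml_context * ctx, struct ggml_tensor * src) {
    struct ggml_tensor * view = ggml_view_1d(ctx, src, ggml_nelements(src), 0);
    ggml_backend_view_init(view); // previously: ggml_backend_view_init(view->view_src->buffer, view)
    return view;
}
```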
ggml-rpc.cpp

```diff
@@ -491,7 +491,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
     if (remote_ptr != 0) {
         ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
             ggml_backend_rpc_buffer_interface,
-            new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC"},
+            new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
             remote_size);
         return buffer;
     } else {
@@ -692,7 +692,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const
 GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
     ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
         /* .endpoint = */ endpoint,
-        /* .name     = */ "RPC",
+        /* .name     = */ "RPC[" + std::string(endpoint) + "]",
     };

     ggml_backend_t backend = new ggml_backend {
```
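Embedding the endpoint in the buffer and backend names (`RPC[host:port]` instead of a bare `RPC`) lets logs and debug output tell multiple RPC servers apart, which matters now that a single model can be spread across several of them.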
llama.cpp (288 changes)
```diff
@@ -1702,13 +1702,12 @@ struct llama_mlock {
 };
 using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

-// NOTE: avoid ever using this except for building the token_to_piece caches
-static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
+static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), special);
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(model, token, result.data(), result.size(), special);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
         GGML_ASSERT(check == -n_tokens);
     }
     else {
```
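The helper above relies on the two-pass convention of the public `llama_token_to_piece` API: a negative return value is the negated buffer size the caller must provide. A standalone sketch of the same pattern (the wrapper name is illustrative):

```cpp
#include <string>
#include <vector>

#include "llama.h"

// illustrative wrapper: call once with a small buffer; on a negative return,
// resize to the required size and call again
static std::string token_to_piece(const llama_model * model, llama_token token, bool special) {
    std::vector<char> buf(8, 0);
    int32_t n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), special);
    if (n < 0) {
        buf.resize((size_t) -n);
        n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size(), special);
    }
    return std::string(buf.data(), (size_t) n);
}
```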
```diff
@@ -2163,9 +2162,7 @@ struct llama_vocab {
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data>       id_to_token;

-    std::vector<id>    cache_special_tokens;
-    std::vector<token> cache_token_to_piece;         // llama_token_to_piece(special = false);
-    std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
+    std::vector<id> special_tokens_cache;

     std::map<std::pair<std::string, std::string>, int> bpe_ranks;
```
```diff
@@ -2372,13 +2369,34 @@ struct llama_context {
     struct llama_control_vector cvec;
 };

+static size_t llama_get_device_count(const llama_model & model) {
+    size_t count = 1;
+#if defined(GGML_USE_CUDA)
+    count = ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    count = ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    count = ggml_backend_vk_get_device_count();
+#endif
+#if defined(GGML_USE_RPC)
+    count += model.rpc_servers.size();
+#endif
+    return count;
+    GGML_UNUSED(model);
+}
+
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
     ggml_backend_buffer_type_t buft = nullptr;

-#ifdef GGML_USE_RPC
-    std::string endpoint = model.rpc_servers[gpu];
-    buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_RPC)
+    int dev_count = (int)llama_get_device_count(model);
+    int rpc_count = (int)model.rpc_servers.size();
+    if (gpu >= dev_count - rpc_count) {
+        const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+        return ggml_backend_rpc_buffer_type(endpoint);
+    }
+#endif
+#if defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUDA)
     buft = ggml_backend_cuda_buffer_type(gpu);
```
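The convention introduced here is that local devices occupy the low indices and RPC servers are appended at the end, so `gpu >= dev_count - rpc_count` identifies an RPC device and `gpu - dev_count + rpc_count` is its index into `model.rpc_servers`. A small sketch of that arithmetic (the function name is illustrative):

```cpp
// illustrative: map a global device index to an RPC server index, or -1 for
// a local device; e.g. 2 local GPUs + 2 RPC servers gives dev_count = 4, and
// device 3 maps to rpc_servers[1]
static int rpc_server_index(int device, int dev_count, int rpc_count) {
    if (device >= dev_count - rpc_count) {
        return device - (dev_count - rpc_count);
    }
    return -1;
}
```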
```diff
@@ -2426,29 +2444,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
     GGML_UNUSED(tensor_split);
 }

-static size_t llama_get_device_count(const llama_model & model) {
-#if defined(GGML_USE_RPC)
-    return model.rpc_servers.size();
-#elif defined(GGML_USE_CUDA)
-    return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    return ggml_backend_vk_get_device_count();
-#else
-    return 1;
-#endif
-    GGML_UNUSED(model);
-}
-
 static size_t llama_get_device_memory(const llama_model & model, int device) {
 #if defined(GGML_USE_RPC)
-    size_t total;
-    size_t free;
-    std::string endpoint = model.rpc_servers[device];
-    ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
-    return free;
-#elif defined(GGML_USE_CUDA)
+    int dev_count = (int)llama_get_device_count(model);
+    int rpc_count = (int)model.rpc_servers.size();
+    if (device >= dev_count - rpc_count) {
+        size_t total;
+        size_t free;
+        const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+        ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
+        return free;
+    }
+#endif
+#if defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &free, &total);
```
```diff
@@ -4595,14 +4603,20 @@ static void llm_load_vocab(
             vocab.special_cls_id  = 101;
             vocab.special_mask_id = 103;
             vocab.add_space_prefix = false;
-        } else if (tokenizer_model == "gpt2") {
-            vocab.type = LLAMA_VOCAB_TYPE_BPE;
+        } else {
+            if (tokenizer_model == "gpt2") {
+                vocab.type = LLAMA_VOCAB_TYPE_BPE;

-            const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-            if (add_space_prefix_keyidx != -1) {
-                vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+                const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+                if (add_space_prefix_keyidx != -1) {
+                    vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+                }
+            } else {
+                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
+                LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
+                vocab.type = LLAMA_VOCAB_TYPE_SPM;
+                return;
             }

             // read bpe merges and populate bpe ranks
             const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
             if (merges_keyidx == -1) {
```
```diff
@@ -4636,8 +4650,6 @@ static void llm_load_vocab(
         vocab.special_pad_id  = -1;
         vocab.special_cls_id  = -1;
         vocab.special_mask_id = -1;
-    } else {
-        throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
     }

     // for now, only BPE models have pre-tokenizers
```
```diff
@@ -4832,38 +4844,17 @@ static void llm_load_vocab(
     {
         for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
             if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
-                vocab.cache_special_tokens.push_back(id);
+                vocab.special_tokens_cache.push_back(id);
             }
         }

-        std::sort( vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
+        std::sort( vocab.special_tokens_cache.begin(), vocab.special_tokens_cache.end(),
             [&] (const llama_vocab::id a, const llama_vocab::id b) {
                 return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
             }
         );

-        LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
-    }
-
-    // build token to piece caches
-    {
-        size_t size_cache = 0;
-
-        std::vector<llama_vocab::token> cache_token_to_piece        (n_vocab);
-        std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
-
-        for (uint32_t id = 0; id < n_vocab; ++id) {
-            cache_token_to_piece[id]         = llama_token_to_piece(&model, id, false);
-            cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
-
-            size_cache += cache_token_to_piece[id].size();
-            size_cache += cache_token_to_piece_special[id].size();
-        }
-
-        std::swap(vocab.cache_token_to_piece,         cache_token_to_piece);
-        std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
-
-        LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: special tokens cache size = %u.\n", __func__, (uint32_t)vocab.special_tokens_cache.size());
     }
 }
```
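Note that the comparator sorts the cache longest-text-first; presumably this is so that when special tokens are matched against raw text in `tokenizer_st_partition` below, a longer special token wins over any shorter token that is a prefix of it.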
```diff
@@ -13253,7 +13244,7 @@ struct fragment_buffer_variant {

 static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
     // for each special token
-    for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
+    for (const llama_vocab::id special_id : vocab.special_tokens_cache) {
         const auto & special_token = vocab.id_to_token[special_id].text;

         // for each text fragment
```
```diff
@@ -14412,7 +14403,7 @@ void llama_sample_repetition_penalties(

 void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
     GGML_ASSERT(ctx);
-    int64_t t_start_sample_us = ggml_time_us();
+    const int64_t t_start_sample_us = ggml_time_us();

     bool allow_eog = false;
     for (const auto & stack : grammar->stacks) {
```
```diff
@@ -14424,13 +14415,12 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
     candidates_decoded.reserve(candidates->size);

     std::vector<llama_grammar_candidate> candidates_grammar;
     candidates_grammar.reserve(candidates->size);

     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(id);
+        const std::string piece = llama_token_to_piece(ctx, id, false);

         if (llama_token_is_eog(&ctx->model, id)) {
             if (!allow_eog) {
```
```diff
@@ -14630,7 +14620,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }

-    const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(token);
+    const std::string piece = llama_token_to_piece(ctx, token, false);

     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece, grammar->partial_utf8);
```
```diff
@@ -16167,7 +16157,7 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-    if (params.rpc_servers != nullptr) {
+    if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
         // split the servers set them into model->rpc_servers
         std::string servers(params.rpc_servers);
         size_t pos = 0;
```
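With the non-empty check in place, an empty `rpc_servers` string now behaves the same as `nullptr`. A minimal usage sketch, assuming the `rpc_servers` field of `llama_model_params` referenced above (the endpoints and model path are hypothetical):

```cpp
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    // comma-separated list; split into model->rpc_servers by the code above
    mparams.rpc_servers = "192.168.1.10:50052,192.168.1.11:50052";

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, llama_context_default_params());
    // ... run inference ...
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```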
```diff
@@ -16325,17 +16315,7 @@ struct llama_context * llama_new_context_with_model(

     if (!hparams.vocab_only) {
         // initialize backends
-#if defined(GGML_USE_RPC)
-        for (auto & server : model->rpc_servers) {
-            ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        }
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
             if (ctx->backend_metal == nullptr) {
```
```diff
@@ -16427,6 +16407,18 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         }
 #endif
+#if defined(GGML_USE_RPC)
+        for (int i = 0; i < (int)model->rpc_servers.size(); i++) {
+            const char * endpoint = model->rpc_servers[i].c_str();
+            ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
+#endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
```
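Taken together with the previous hunk, RPC backends are no longer an exclusive `#if`/`#elif` alternative to Metal/CUDA/etc.: the local GPU backends are initialized first and one RPC backend per configured server is appended afterwards, matching the device ordering used by `llama_get_device_count` and `llama_default_buffer_type_offload`.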
```diff
@@ -18313,83 +18305,69 @@ static std::string llama_decode_text(const std::string & text) {

 // does not write null-terminator to buf
 int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
-    // if we have a cache - use it
-    {
-        const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
-
-        if (!cache.empty()) {
-            const auto & res = cache.at(token);
-            if (length < (int) res.size()) {
-                return -(int) res.size();
-            }
-            memcpy(buf, res.c_str(), res.size());
-            return res.size();
-        }
-    }
-
     if (0 <= token && token < llama_n_vocab(model)) {
         switch (llama_vocab_get_type(model->vocab)) {
             case LLAMA_VOCAB_TYPE_WPM:
             case LLAMA_VOCAB_TYPE_SPM: {
                 // NOTE: we accept all unsupported token types,
                 // suppressing them like CONTROL tokens.
                 if (llama_is_normal_token(model->vocab, token)) {
                     std::string result = model->vocab.id_to_token[token].text;
                     llama_unescape_whitespace(result);
                     if (length < (int) result.length()) {
                         return -(int) result.length();
                     }
                     memcpy(buf, result.c_str(), result.length());
                     return result.length();
                 } else if (
                         (llama_is_user_defined_token(model->vocab, token)) ||
                         (llama_is_control_token     (model->vocab, token) && special)) {
                     std::string result = model->vocab.id_to_token[token].text;
                     if (length < (int) result.length()) {
                         return -(int) result.length();
                     }
                     memcpy(buf, result.c_str(), result.length());
                     return result.length();
                 } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
                     if (length < 3) {
                         return -3;
                     }
                     memcpy(buf, "\xe2\x96\x85", 3);
                     return 3;
                 } else if (llama_is_byte_token(model->vocab, token)) {
                     if (length < 1) {
                         return -1;
                     }
                     buf[0] = llama_token_to_byte(model->vocab, token);
                     return 1;
                 }
                 break;
             }
             case LLAMA_VOCAB_TYPE_BPE: {
                 // NOTE: we accept all unsupported token types,
                 // suppressing them like CONTROL tokens.
                 if (llama_is_normal_token(model->vocab, token)) {
                     std::string result = model->vocab.id_to_token[token].text;
                     result = llama_decode_text(result);
                     if (length < (int) result.length()) {
                         return -(int) result.length();
                     }
                     memcpy(buf, result.c_str(), result.length());
                     return result.length();
                 } else if (
                         (llama_is_user_defined_token(model->vocab, token)) ||
                         (llama_is_control_token     (model->vocab, token) && special)) {
                     std::string result = model->vocab.id_to_token[token].text;
                     if (length < (int) result.length()) {
                         return -(int) result.length();
                     }
                     memcpy(buf, result.c_str(), result.length());
                     return result.length();
                 }
                 break;
             }
             default:
                 GGML_ASSERT(false);
         }
     }
     return 0;
```
llama.h (4 changes)
```diff
@@ -424,8 +424,8 @@ extern "C" {

     LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);

-    LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
-    LLAMA_API enum llama_rope_type  llama_rope_type  (const struct llama_model * model);
+    LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
+    LLAMA_API enum llama_rope_type  llama_rope_type  (const struct llama_model * model);

     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
```