Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-02-19 14:13:22 +02:00)

Compare commits

9 Commits
| SHA1 |
|---|
| f86b9d152c |
| 5b27975479 |
| 1b26d7151a |
| 45b8032b9c |
| 21431197a1 |
| 340484161f |
| 1665ad8bf1 |
| 0ec5fdb5ce |
| cae8f50b1a |
.gitignore (vendored): 1 change

@@ -48,6 +48,7 @@ models-mnt
/llama-bench
/llava-cli
/lookahead
/lookup
/main
/metal
/perplexity
CMakeLists.txt

@@ -291,12 +291,7 @@ if (LLAMA_CUBLAS)
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})

if (LLAMA_STATIC)
if (WIN32)
# As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
else ()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
else()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
Makefile: 17 changes

@@ -2,7 +2,7 @@
BUILD_TARGETS = \
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead tests/test-c.o
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup tests/test-c.o

# Binaries only useful for tests
TEST_TARGETS = \

@@ -439,15 +439,9 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
endif # LLAMA_CLBLAST

ifdef LLAMA_HIPBLAS

ifeq ($(wildcard /opt/rocm),)
ROCM_PATH ?= /usr
GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
else
ROCM_PATH ?= /opt/rocm
GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
endif
HIPCC ?= $(ROCM_PATH)/bin/hipcc
ROCM_PATH ?= /opt/rocm
HIPCC ?= $(ROCM_PATH)/bin/hipcc
GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
LLAMA_CUDA_DMMV_X ?= 32
LLAMA_CUDA_MMV_Y ?= 1
LLAMA_CUDA_KQUANTS_ITER ?= 2

@@ -645,6 +639,9 @@ parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

ifdef LLAMA_METAL
metal: examples/metal/metal.cpp ggml.o $(OBJS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
common/common.h

@@ -240,3 +240,4 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);

// Dump the KV cache view showing individual sequences in each cell (long output).
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
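A minimal usage sketch of these debug helpers, assuming a valid `llama_context * ctx` and the KV-cache-view API from llama.h (the lookup example further down drives the same calls):

    // Track one sequence id per cell; widen if several sequences share the cache.
    llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);

    // After each llama_decode() of interest:
    llama_kv_cache_view_update(ctx, &kvc_view);
    dump_kv_cache_view(kvc_view, 80);        // compact, aggregate view
    dump_kv_cache_view_seqs(kvc_view, 40);   // long output, per-sequence detail

    llama_kv_cache_view_free(&kvc_view);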
convert-lora-to-ggml.py

@@ -3,6 +3,7 @@ from __future__ import annotations

import json
import os
import re
import struct
import sys
from typing import Any, BinaryIO, Sequence

@@ -10,15 +11,43 @@ from typing import Any, BinaryIO, Sequence
import numpy as np
import torch

from pathlib import Path
if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf

NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}

HF_SUBLAYER_TO_GGML = {
    "self_attn.q_proj": "attn_q",
    "self_attn.k_proj": "attn_k",
    "self_attn.v_proj": "attn_v",
    "self_attn.o_proj": "attn_output",
    "mlp.gate_proj": "ffn_gate",
    "mlp.down_proj": "ffn_down",
    "mlp.up_proj": "ffn_up",
    "input_layernorm": "attn_norm",
    "post_attention_layernorm": "ffn_norm",
}

def translate_tensor_name(t: str) -> str:
    match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
    if match:
        nn = match.group(1)
        sub_layer = match.group(2)
        lora_type = match.group(3)

        sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
        if sub_layer_renamed is None:
            print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
            sys.exit(1)

        output_string = (
            f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
        )
        return output_string
    else:
        print(f"Error: unrecognized tensor {t}")
        sys.exit(1)

def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
    fout.write(b"ggla"[::-1])  # magic (ggml lora)
    fout.write(struct.pack("i", 1))  # file version

@@ -32,7 +61,9 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
    fout.write(struct.pack("i", int(params["lora_alpha"])))

def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
def write_tensor_header(
    self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
) -> None:
    sname = name.encode("utf-8")
    fout.write(
        struct.pack(

@@ -47,12 +78,11 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty
    fout.seek((fout.tell() + 31) & -32)

if len(sys.argv) < 2:
    print(f"Usage: python {sys.argv[0]} <path> [arch]")
if len(sys.argv) != 2:
    print(f"Usage: python {sys.argv[0]} <path>")
    print(
        "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
    )
    print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
    sys.exit(1)

input_json = os.path.join(sys.argv[1], "adapter_config.json")

@@ -60,14 +90,6 @@ input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")

model = torch.load(input_model, map_location="cpu")
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"

if arch_name not in gguf.MODEL_ARCH_NAMES.values():
    print(f"Error: unsupported architecture {arch_name}")
    sys.exit(1)

arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
name_map = gguf.TensorNameMap(arch, 200)  # 200 layers ought to be enough for anyone

with open(input_json, "r") as f:
    params = json.load(f)

@@ -95,7 +117,6 @@ with open(output_path, "wb") as fout:

    write_file_header(fout, params)
    for k, v in model.items():
        orig_k = k
        if k.endswith(".default.weight"):
            k = k.replace(".default.weight", ".weight")
        if k in ["llama_proj.weight", "llama_proj.bias"]:

@@ -108,32 +129,7 @@ with open(output_path, "wb") as fout:
        v = v.float()

        t = v.detach().numpy()

        prefix = "base_model.model."
        if k.startswith(prefix):
            k = k[len(prefix) :]

        lora_suffixes = (".lora_A.weight", ".lora_B.weight")
        if k.endswith(lora_suffixes):
            suffix = k[-len(lora_suffixes[0]):]
            k = k[: -len(lora_suffixes[0])]
        else:
            print(f"Error: unrecognized tensor name {orig_k}")
            sys.exit(1)

        tname = name_map.get_name(k)
        if tname is None:
            print(f"Error: could not map tensor name {orig_k}")
            print(" Note: the arch parameter must be specified if the model is not llama")
            sys.exit(1)

        if suffix == ".lora_A.weight":
            tname += ".weight.loraA"
        elif suffix == ".lora_B.weight":
            tname += ".weight.loraB"
        else:
            assert False

        tname = translate_tensor_name(k)
        print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
        write_tensor_header(fout, tname, t.shape, t.dtype)
        t.tofile(fout)
examples/CMakeLists.txt

@@ -33,6 +33,7 @@ else()
    add_subdirectory(simple)
    add_subdirectory(speculative)
    add_subdirectory(lookahead)
    add_subdirectory(lookup)
    add_subdirectory(train-text-from-scratch)
    if (LLAMA_METAL)
        add_subdirectory(metal)
examples/finetune/finetune.cpp

@@ -1620,6 +1620,8 @@ int main(int argc, char ** argv) {
opt->params.adam.gclip = params.common.adam_gclip;
opt->params.adam.eps_f = params.common.adam_eps_f;

ggml_allocr * alloc = NULL;

printf("%s: init model\n", __func__);
bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train);

@@ -1723,9 +1725,10 @@ int main(int argc, char ** argv) {

// allocate input tensors
mem_input_data.resize(max_input_size);
ggml_allocr_t alloc_inps = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
ggml_allocr_alloc(alloc_inps, tokens_input);
ggml_allocr_alloc(alloc_inps, target_probs);
alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
ggml_allocr_alloc(alloc, tokens_input);
ggml_allocr_alloc(alloc, target_probs);
ggml_allocr_free(alloc);

// context for compute tensors without their data
const size_t estimated_compute_size_wo_data = (

@@ -1752,7 +1755,7 @@ int main(int argc, char ** argv) {
// find best evaluation order
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
ctx_compute = ggml_init(ctx_compute_params);
ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment);
alloc = ggml_allocr_new_measure(tensor_alignment);
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gf->order = (enum ggml_cgraph_eval_order) order;
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);

@@ -1785,7 +1788,7 @@ int main(int argc, char ** argv) {
// allocate compute tensors
mem_compute_data.resize(max_compute_size);
ctx_compute = ggml_init(ctx_compute_params);
ggml_allocr_t alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gf->order = best_order;
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);

@@ -1801,8 +1804,6 @@ int main(int argc, char ** argv) {
params.common.use_checkpointing
);
ggml_allocr_free(alloc);
ggml_allocr_free(alloc_inps);

// tokenize data
std::vector<llama_token> train_tokens;
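The allocator changes above follow ggml's measure-then-allocate pattern: first build the graph against a measuring allocator to learn the required buffer size, then rebuild it against a real allocator backed by that buffer. A rough sketch of the pattern, reusing names from the hunk (tensor_alignment, mem_compute_data) and assuming `ggml_allocr_max_size` is available to read back the measured size, as in the surrounding example code:

    // 1) measure pass: no real buffer, just track the peak allocation size
    ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment);
    // ... build the compute graph, calling ggml_allocr_alloc() on its tensors ...
    size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment;
    ggml_allocr_free(alloc);

    // 2) real pass: back the allocator with a buffer of the measured size
    std::vector<uint8_t> mem_compute_data(max_compute_size);
    alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
    // ... rebuild the same graph; tensors now receive addresses inside mem_compute_data ...
    ggml_allocr_free(alloc);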
examples/lookup/CMakeLists.txt (new file, 5 lines)

@@ -0,0 +1,5 @@
set(TARGET lookup)
add_executable(${TARGET} lookup.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/lookup/README.md (new file, 13 lines)

@@ -0,0 +1,13 @@
# llama.cpp/examples/lookup

Demonstration of Prompt Lookup Decoding

https://github.com/apoorvumang/prompt-lookup-decoding

The two key parameters for lookup decoding are `max_ngram_size` and `n_draft`. The first determines how many ngrams to use when searching through the prompt for a match, and the second specifies how many subsequent tokens to draft if a match is found.

More info:

https://github.com/ggerganov/llama.cpp/pull/4484
https://github.com/ggerganov/llama.cpp/issues/4226
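To make those parameters concrete, here is a stripped-down sketch of the matching idea on plain token IDs (editorial illustration, not the llama.cpp API; the actual implementation is the lookup.cpp file that follows): take the last `max_ngram_size` tokens of the context, look for an earlier occurrence of that n-gram, and if one is found, reuse the `n_draft` tokens that followed it as the draft.

    #include <vector>

    // Sketch: propose a draft continuation for the token sequence `inp`.
    static std::vector<int> prompt_lookup_draft(const std::vector<int> & inp,
                                                int max_ngram_size, int n_draft) {
        const int n = (int) inp.size();
        for (int ngram_size = max_ngram_size; ngram_size > 0; --ngram_size) {
            if (n < 2 * ngram_size) {
                continue;
            }
            const int * ngram = &inp[n - ngram_size]; // the most recent n-gram
            for (int i = 0; i + 2 * ngram_size <= n; ++i) {
                bool match = true;
                for (int j = 0; j < ngram_size; ++j) {
                    if (inp[i + j] != ngram[j]) { match = false; break; }
                }
                if (match && i + ngram_size + n_draft < n) {
                    // draft the tokens that followed the earlier occurrence
                    return std::vector<int>(inp.begin() + i + ngram_size,
                                            inp.begin() + i + ngram_size + n_draft);
                }
            }
        }
        return {}; // no match: fall back to normal decoding
    }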
examples/lookup/lookup.cpp (new file, 229 lines)

@@ -0,0 +1,229 @@
#include "common.h"
#include "llama.h"

#include <cmath>
#include <cstdio>
#include <string>
#include <vector>

int main(int argc, char ** argv){
    gpt_params params;

    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }

    // maximum n-grams to search for in prompt
    const int max_ngram_size = 3;

    // length of the candidate / draft sequence, if match is found
    const int n_draft = 10;

    const bool dump_kv_cache = params.dump_kv_cache;

#ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("lookup", "log"));
    LOG_TEE("Log start\n");
    log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS

    // init llama.cpp
    llama_backend_init(params.numa);

    llama_model * model = NULL;
    llama_context * ctx = NULL;

    // load the model
    std::tie(model, ctx) = llama_init_from_gpt_params(params);

    // tokenize the prompt
    const bool add_bos = llama_should_add_bos_token(model);
    LOG("add_bos tgt: %d\n", add_bos);

    std::vector<llama_token> inp;
    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);

    const int max_context_size = llama_n_ctx(ctx);
    const int max_tokens_list_size = max_context_size - 4;

    if ((int) inp.size() > max_tokens_list_size) {
        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
        return 1;
    }

    fprintf(stderr, "\n\n");

    for (auto id : inp) {
        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
    }

    fflush(stderr);

    const int n_input = inp.size();

    const auto t_enc_start = ggml_time_us();

    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
    llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));

    const auto t_enc_end = ggml_time_us();

    int n_predict = 0;
    int n_drafted = 0;
    int n_accept = 0;

    int n_past = inp.size();

    bool has_eos = false;

    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);

    std::vector<llama_token> draft;

    llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);

    // debug
    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);

    const auto t_dec_start = ggml_time_us();

    while (true) {
        // debug
        if (dump_kv_cache) {
            llama_kv_cache_view_update(ctx, &kvc_view);
            dump_kv_cache_view_seqs(kvc_view, 40);
        }

        // print current draft sequence
        LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str());

        int i_dft = 0;
        while (true) {
            // sample from the target model
            llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft);

            llama_sampling_accept(ctx_sampling, ctx, id, true);

            const std::string token_str = llama_token_to_piece(ctx, id);

            if (!params.use_color) {
                printf("%s", token_str.c_str());
            }

            if (id == llama_token_eos(model)) {
                has_eos = true;
            }

            ++n_predict;

            // check if the target token matches the draft
            if (i_dft < (int) draft.size() && id == draft[i_dft]) {
                LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
                ++n_accept;
                ++n_past;
                ++i_dft;
                inp.push_back(id);

                if (params.use_color) {
                    // color accepted draft token
                    printf("\033[34m%s\033[0m", token_str.c_str());
                    fflush(stdout);
                }
                continue;
            }

            if (params.use_color) {
                printf("%s", token_str.c_str());
            }
            fflush(stdout);

            LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());

            draft.clear();
            draft.push_back(id);
            inp.push_back(id);
            break;
        }

        if ((params.n_predict > 0 && n_predict > params.n_predict) || has_eos) {
            break;
        }

        // KV cache management
        // clean the cache of draft tokens that weren't accepted
        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);

        llama_batch_clear(batch_tgt);
        llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);

        // generate n_pred tokens through prompt lookup
        auto prompt_lookup = [&]() -> void {
            int inp_size = inp.size();
            for (int ngram_size = max_ngram_size ; ngram_size > 0; --ngram_size){
                const llama_token * ngram = &inp[inp_size - ngram_size];

                for (int i = 0; i <= (int) inp_size - (ngram_size * 2); ++i) {
                    bool match = true;
                    for (int j = 0; j < ngram_size; ++j) {
                        if (inp[i + j] != ngram[j]) {
                            match = false;
                            break;
                        }
                    }

                    if (match) {
                        const int startIdx = i + ngram_size;
                        const int endIdx = startIdx + n_draft;
                        if (endIdx < inp_size) {
                            for (int j = startIdx; j < endIdx; ++j) {
                                LOG(" - draft candidate %d: %d\n", j, inp[j]);
                                draft.push_back(inp[j]);
                                llama_batch_add(batch_tgt, inp[j], n_past + (j - startIdx) + 1, { 0 }, true);
                                ++n_drafted;
                            }
                            return;
                        }
                    }
                }
            }
            return;
        };

        prompt_lookup();

        llama_decode(ctx, batch_tgt);
        ++n_past;

        draft.erase(draft.begin());
    }

    auto t_dec_end = ggml_time_us();

    LOG_TEE("\n\n");

    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));

    LOG_TEE("\n");
    LOG_TEE("n_draft = %d\n", n_draft);
    LOG_TEE("n_predict = %d\n", n_predict);
    LOG_TEE("n_drafted = %d\n", n_drafted);
    LOG_TEE("n_accept = %d\n", n_accept);
    LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);

    LOG_TEE("\ntarget:\n");
    llama_print_timings(ctx);

    llama_sampling_free(ctx_sampling);
    llama_batch_free(batch_tgt);

    llama_free(ctx);
    llama_free_model(model);

    llama_backend_free();

    fprintf(stderr, "\n\n");

    return 0;
}
examples/server/server.cpp

@@ -10,8 +10,7 @@
// crash the server in debug mode, otherwise send an http 500 error
#define CPPHTTPLIB_NO_EXCEPTIONS 1
#endif
// increase max payload length to allow use of larger context size
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576

#include "httplib.h"
#include "json.hpp"

@@ -2414,7 +2413,7 @@ json oaicompat_completion_params_parse(
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0);

if (body.count("grammar") != 0) {
if (llama_params.count("grammar") != 0) {
llama_params["grammar"] = json_value(body, "grammar", json::object());
}

@@ -2645,9 +2644,6 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con

int main(int argc, char **argv)
{
#if SERVER_VERBOSE != 1
log_disable();
#endif
// own arguments required by this example
gpt_params params;
server_params sparams;

@@ -2702,7 +2698,7 @@ int main(int argc, char **argv)
}

// API key is invalid or not provided
res.set_content("Unauthorized: Invalid API Key", "text/plain; charset=utf-8");
res.set_content("Unauthorized: Invalid API Key", "text/plain");
res.status = 401; // Unauthorized

LOG_WARNING("Unauthorized: Invalid API Key", {});

@@ -2717,28 +2713,28 @@ int main(int argc, char **argv)
// this is only called if no index.html is found in the public --path
svr.Get("/", [](const httplib::Request &, httplib::Response &res)
{
res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html; charset=utf-8");
res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html");
return false;
});

// this is only called if no index.js is found in the public --path
svr.Get("/index.js", [](const httplib::Request &, httplib::Response &res)
{
res.set_content(reinterpret_cast<const char *>(&index_js), index_js_len, "text/javascript; charset=utf-8");
res.set_content(reinterpret_cast<const char *>(&index_js), index_js_len, "text/javascript");
return false;
});

// this is only called if no index.html is found in the public --path
svr.Get("/completion.js", [](const httplib::Request &, httplib::Response &res)
{
res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript; charset=utf-8");
res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript");
return false;
});

// this is only called if no index.html is found in the public --path
svr.Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response &res)
{
res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript; charset=utf-8");
res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript");
return false;
});

@@ -2749,7 +2745,7 @@ int main(int argc, char **argv)
{ "user_name", llama.name_user.c_str() },
{ "assistant_name", llama.name_assistant.c_str() }
};
res.set_content(data.dump(), "application/json; charset=utf-8");
res.set_content(data.dump(), "application/json");
});

svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)

@@ -2763,12 +2759,12 @@ int main(int argc, char **argv)
std::string completion_text;
task_result result = llama.next_result(task_id);
if (!result.error && result.stop) {
res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
}
else
{
res.status = 404;
res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
res.set_content(result.result_json["content"], "text/plain");
return;
}
} else {

@@ -2839,7 +2835,7 @@ int main(int argc, char **argv)
}}
};

res.set_content(models.dump(), "application/json; charset=utf-8");
res.set_content(models.dump(), "application/json");
});

// TODO: add mount point without "/v1" prefix -- how?

@@ -2861,10 +2857,10 @@ int main(int argc, char **argv)

res.set_content(oaicompat_result.dump(-1, ' ', false,
json::error_handler_t::replace),
"application/json; charset=utf-8");
"application/json");
} else {
res.status = 500;
res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
res.set_content(result.result_json["content"], "text/plain");
return;
}
} else {

@@ -2928,12 +2924,12 @@ int main(int argc, char **argv)
task_result result = llama.next_result(task_id);
if (!result.error && result.stop)
{
res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
}
else
{
res.status = 404;
res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
res.set_content(result.result_json["content"], "text/plain");
return;
}
} else {

@@ -2982,11 +2978,11 @@ int main(int argc, char **argv)
svr.Get("/model.json", [&llama](const httplib::Request &, httplib::Response &res)
{
const json data = llama.get_model_props();
return res.set_content(data.dump(), "application/json; charset=utf-8");
return res.set_content(data.dump(), "application/json");
});

svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res)
{ return res.set_content("", "application/json; charset=utf-8"); });
{ return res.set_content("", "application/json"); });

svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res)
{

@@ -2997,7 +2993,7 @@ int main(int argc, char **argv)
tokens = llama.tokenize(body["content"], false);
}
const json data = format_tokenizer_response(tokens);
return res.set_content(data.dump(), "application/json; charset=utf-8");
return res.set_content(data.dump(), "application/json");
});

svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res)

@@ -3011,7 +3007,7 @@ int main(int argc, char **argv)
}

const json data = format_detokenized_response(content);
return res.set_content(data.dump(), "application/json; charset=utf-8");
return res.set_content(data.dump(), "application/json");
});

svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res)

@@ -3028,7 +3024,7 @@ int main(int argc, char **argv)
}
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
task_result result = llama.next_result(task_id);
return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
return res.set_content(result.result_json.dump(), "application/json");
});

svr.set_logger(log_server_request);

@@ -3049,7 +3045,7 @@ int main(int argc, char **argv)
{
snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
}
res.set_content(buf, "text/plain; charset=utf-8");
res.set_content(buf, "text/plain");
res.status = 500;
});

@@ -3057,15 +3053,15 @@ int main(int argc, char **argv)
{
if (res.status == 401)
{
res.set_content("Unauthorized", "text/plain; charset=utf-8");
res.set_content("Unauthorized", "text/plain");
}
if (res.status == 400)
{
res.set_content("Invalid request", "text/plain; charset=utf-8");
res.set_content("Invalid request", "text/plain");
}
else if (res.status == 404)
{
res.set_content("File Not Found", "text/plain; charset=utf-8");
res.set_content("File Not Found", "text/plain");
res.status = 404;
}
});
llama.cpp: 155 changes

@@ -1505,10 +1505,6 @@ struct llama_context {

// decode output (2-dimensional array: [n_tokens][n_vocab])
std::vector<float> logits;
#ifndef NDEBUG
// guard against access to unset logits
std::vector<bool> logits_valid;
#endif
bool logits_all = false;

// input embedding (1-dimensional array: [n_embd])

@@ -6154,14 +6150,6 @@ static int llama_decode_internal(
{
auto & logits_out = lctx.logits;

#ifndef NDEBUG
auto & logits_valid = lctx.logits_valid;
logits_valid.clear();
logits_valid.resize(n_tokens);

logits_out.clear();
#endif

if (batch.logits) {
logits_out.resize(n_vocab * n_tokens);
for (uint32_t i = 0; i < n_tokens; i++) {

@@ -6169,22 +6157,13 @@ static int llama_decode_internal(
continue;
}
memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
#ifndef NDEBUG
logits_valid[i] = true;
#endif
}
} else if (lctx.logits_all) {
logits_out.resize(n_vocab * n_tokens);
memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
#ifndef NDEBUG
std::fill(logits_valid.begin(), logits_valid.end(), true);
#endif
} else {
logits_out.resize(n_vocab);
memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
#ifndef NDEBUG
logits_valid[n_tokens - 1] = true;
#endif
}
}

@@ -8647,60 +8626,53 @@ static int llama_apply_lora_from_file_internal(

const int64_t t_start_lora_us = ggml_time_us();

llama_file fin(path_lora, "rb");
auto fin = std::ifstream(path_lora, std::ios::binary);
if (!fin) {
LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
return 1;
}

// verify magic and version
{
uint32_t magic = fin.read_u32();
if (magic != LLAMA_FILE_MAGIC_GGLA) {
LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
return 1;
}
uint32_t magic;
fin.read((char *) &magic, sizeof(magic));
uint32_t format_version;
fin.read((char *) &format_version, sizeof(format_version));

uint32_t format_version = fin.read_u32();
if (format_version != 1) {
LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
return 1;
}
}

int32_t lora_r = fin.read_u32();
int32_t lora_alpha = fin.read_u32();
int32_t lora_r;
int32_t lora_alpha;
fin.read((char *) &lora_r, sizeof(lora_r));
fin.read((char *) &lora_alpha, sizeof(lora_alpha));
float scaling = scale * (float)lora_alpha / (float)lora_r;

LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);

// create a name -> tensor map of the model to accelerate lookups
// find the max tensor size to estimate the required temporary buffer size
size_t max_tensor_size = 0;
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
for (const auto & kv : model.tensors_by_name) {
model_tensors.insert(kv);
size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
max_tensor_size = std::max(max_tensor_size, f32_size);
}

// create a temporary ggml context to store the lora tensors
// TODO: use ggml-alloc
size_t lora_ctx_size = max_tensor_size * 3;
LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
std::vector<uint8_t> lora_buf(lora_ctx_size);

// todo: calculate size from biggest possible tensor
std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
struct ggml_init_params params;
params.mem_size = lora_buf.size();
params.mem_buffer = lora_buf.data();
params.no_alloc = false;

using unique_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>;

unique_context lora_ctx(nullptr, ggml_free);
lora_ctx.reset(ggml_init(params));
ggml_context * lora_ctx = ggml_init(params);
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;

// create a name -> tensor map of the model to accelerate lookups
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
for (const auto & kv : model.tensors_by_name) {
model_tensors.insert(kv);
}

// load base model
std::unique_ptr<llama_model_loader> ml;

unique_context base_ctx(nullptr, ggml_free);
ggml_context * base_ctx = NULL;
std::vector<uint8_t> base_buf;
if (path_base_model) {
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);

@@ -8709,7 +8681,6 @@ static int llama_apply_lora_from_file_internal(
size_t ctx_size;
size_t mmapped_size;
ml->calc_sizes(ctx_size, mmapped_size);

base_buf.resize(ctx_size);

ggml_init_params base_params;

@@ -8717,9 +8688,9 @@ static int llama_apply_lora_from_file_internal(
base_params.mem_buffer = base_buf.data();
base_params.no_alloc = ml->use_mmap;

base_ctx.reset(ggml_init(base_params));
base_ctx = ggml_init(base_params);

// maybe this should be in llama_model_loader
// maybe this should in llama_model_loader
if (ml->use_mmap) {
ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
}

@@ -8732,35 +8703,27 @@ static int llama_apply_lora_from_file_internal(
std::vector<uint8_t> work_buffer;

while (true) {
if (fin.tell() == fin.size) {
// eof
break;
}

int32_t n_dims;
int32_t name_len;
int32_t length;
int32_t ftype;

fin.read_raw(&n_dims, sizeof(n_dims));
fin.read_raw(&name_len, sizeof(name_len));
fin.read_raw(&ftype, sizeof(ftype));

if (n_dims != 1 && n_dims != 2) {
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
return 1;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
if (fin.eof()) {
break;
}

int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
fin.read_raw(&ne[i], sizeof(ne[i]));
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
}

std::string name;
{
GGML_ASSERT(name_len <= 1024);
char buf[1024];
fin.read_raw(buf, name_len);
name = std::string(buf, name_len);
fin.read(buf, length);
name = std::string(buf, length);
}

// check for lora suffix and get the type of tensor

@@ -8774,7 +8737,7 @@ static int llama_apply_lora_from_file_internal(
std::string lora_type = name.substr(pos + lora_suffix.length());
std::string base_name = name;
base_name.erase(pos);
// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());
// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());

if (model_tensors.find(base_name) == model_tensors.end()) {
LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());

@@ -8793,15 +8756,22 @@ static int llama_apply_lora_from_file_internal(
return false;
}
}
ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]);
ggml_set_name(lora_tensor, name.c_str());
ggml_tensor * lora_tensor;
if (n_dims == 2) {
lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
}
else {
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
return 1;
}
ggml_set_name(lora_tensor, "lora_tensor");

// load tensor data
size_t offset = fin.tell();
size_t offset = fin.tellg();
size_t tensor_data_size = ggml_nbytes(lora_tensor);
offset = (offset + 31) & -32;
fin.seek(offset, SEEK_SET);
fin.read_raw(lora_tensor->data, tensor_data_size);
fin.seekg(offset);
fin.read((char*)lora_tensor->data, tensor_data_size);

lora_tensors[name] = lora_tensor;

@@ -8831,11 +8801,13 @@ static int llama_apply_lora_from_file_internal(

// load from base model
if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
// TODO: throw
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
return 1;
}

base_t = ml->create_tensor(base_ctx.get(), base_name, { dest_t->ne[0], dest_t->ne[1] }, GGML_BACKEND_CPU);
// TODO: not tested!! maybe not working!
base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
ml->load_data_for(base_t);
} else {
base_t = dest_t;

@@ -8864,45 +8836,43 @@ static int llama_apply_lora_from_file_internal(
}

// w = w + BA*s
ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB);
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
offload_func(BA);
ggml_set_name(BA, "BA");

if (scaling != 1.0f) {
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx.get(), scaling);
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
ggml_set_name(scale_tensor, "scale_tensor");

BA = ggml_scale_inplace(lora_ctx.get(), BA, scale_tensor);
BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
offload_func(BA);
ggml_set_name(BA, "BA_scaled");
}

ggml_tensor * r;
if (base_t == dest_t) {
r = ggml_add_inplace(lora_ctx.get(), dest_t, BA);
r = ggml_add_inplace(lora_ctx, dest_t, BA);
offload_func_force_inplace(r);
ggml_set_name(r, "r_add_inplace");
}
else {
r = ggml_add(lora_ctx.get(), base_t, BA);
r = ggml_add(lora_ctx, base_t, BA);
offload_func(r);
ggml_set_name(r, "r_add");

r = ggml_cpy(lora_ctx.get(), r, dest_t);
r = ggml_cpy(lora_ctx, r, dest_t);
offload_func(r);
ggml_set_name(r, "r_cpy");
}

struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get());
struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
ggml_build_forward_expand(gf, r);

ggml_graph_compute_helper(work_buffer, gf, n_threads);

// the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other
GGML_ASSERT(lora_tensors.size() == 2);

// we won't need these tensors again, reset the context to save memory
lora_ctx.reset(ggml_init(params));
ggml_free(lora_ctx);
lora_ctx = ggml_init(params);
lora_tensors.clear();

n_tensors++;
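For reference, the graph built in the hunk above computes the standard LoRA merge: the base weight is updated as w = w + (BA) * s, where s = scale * lora_alpha / lora_r. A plain-arithmetic sketch of that update (editorial illustration only, not the ggml API; row-major matrices with illustrative shapes):

    // W is [n_out x n_in], A is [r x n_in], B is [n_out x r].
    void apply_lora_delta(float * W, const float * A, const float * B,
                          int n_out, int n_in, int r, float scaling) {
        for (int i = 0; i < n_out; ++i) {
            for (int j = 0; j < n_in; ++j) {
                float ba = 0.0f;
                for (int k = 0; k < r; ++k) {
                    ba += B[i*r + k] * A[k*n_in + j]; // (B*A)[i][j]
                }
                W[i*n_in + j] += scaling * ba;        // w = w + BA*s
            }
        }
    }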
@@ -8912,6 +8882,12 @@ static int llama_apply_lora_from_file_internal(
}
}

// TODO: this should be in a destructor, it will leak on failure
ggml_free(lora_ctx);
if (base_ctx) {
ggml_free(base_ctx);
}

const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);

@@ -10076,7 +10052,6 @@ float * llama_get_logits(struct llama_context * ctx) {
}

float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
assert(ctx->logits_valid.at(i));
return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
}
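The `logits_valid` bookkeeping above only matters for rows whose logits were actually requested during decoding. A minimal usage sketch of the related public API (assuming `ctx`, `tokens`, `n_tokens`, and `n_past` from surrounding code; `llama_batch_add`'s last argument requests logits for that position, as in lookup.cpp, and `llama_get_logits_ith` reads back that row):

    // Request logits only for the last token of the batch, then read that row.
    llama_batch batch = llama_batch_init(n_tokens, 0, 1);
    for (int i = 0; i < n_tokens; ++i) {
        const bool want_logits = (i == n_tokens - 1);
        llama_batch_add(batch, tokens[i], n_past + i, { 0 }, want_logits);
    }

    if (llama_decode(ctx, batch) != 0) {
        // handle decode failure
    }

    // valid only for positions where logits were requested; in debug builds,
    // other indices would trip the logits_valid assert shown above
    const float * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);

    llama_batch_free(batch);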