mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-26 14:23:22 +02:00
Compare commits
14 Commits
master-ea1
...
master-481
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
afd220d9c6 | ||
|
|
481044d50c | ||
|
|
563cdc391d | ||
|
|
8d4a855c24 | ||
|
|
b6b268d441 | ||
|
|
3cd8dde0d1 | ||
|
|
4870e455b3 | ||
|
|
483bab2e3d | ||
|
|
404e1da38e | ||
|
|
4cc053b6d5 | ||
|
|
0ba5a3a9a5 | ||
|
|
2e17dfd80a | ||
|
|
20a1a4e09c | ||
|
|
ad072fc5ad |
@@ -16,11 +16,7 @@ elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
|
||||
./quantize $arg2
|
||||
elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
|
||||
./main $arg2
|
||||
elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
|
||||
python3 ./download-pth.py $arg2
|
||||
elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
|
||||
echo "Downloading model..."
|
||||
python3 ./download-pth.py "$1" "$2"
|
||||
echo "Converting PTH to GGML..."
|
||||
for i in `ls $1/$2/ggml-model-f16.bin*`; do
|
||||
if [ -f "${i/f16/q4_0}" ]; then
|
||||
@@ -39,8 +35,6 @@ else
|
||||
echo " ex: \"/models/7B/\" 1"
|
||||
echo " --quantize (-q): Optimize with quantization process ggml"
|
||||
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
|
||||
echo " --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
|
||||
echo " ex: \"/models/\" 7B"
|
||||
echo " --all-in-one (-a): Execute --download, --convert & --quantize"
|
||||
echo " --all-in-one (-a): Execute --convert & --quantize"
|
||||
echo " ex: \"/models/\" 7B"
|
||||
fi
|
||||
|
||||
@@ -218,6 +218,9 @@ add_library(utils OBJECT
|
||||
target_include_directories(utils PUBLIC .)
|
||||
target_compile_features(utils PUBLIC cxx_std_11) # don't bump
|
||||
target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS})
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(utils PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
|
||||
add_library(ggml OBJECT
|
||||
ggml.c
|
||||
@@ -226,6 +229,9 @@ add_library(ggml OBJECT
|
||||
target_include_directories(ggml PUBLIC .)
|
||||
target_compile_features(ggml PUBLIC c_std_11) # don't bump
|
||||
target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS})
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
|
||||
add_library(llama
|
||||
llama.cpp
|
||||
@@ -234,6 +240,10 @@ add_library(llama
|
||||
target_include_directories(llama PUBLIC .)
|
||||
target_compile_features(llama PUBLIC cxx_std_11) # don't bump
|
||||
target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS})
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
|
||||
endif()
|
||||
|
||||
#
|
||||
# Executables
|
||||
|
||||
3
Makefile
3
Makefile
@@ -156,7 +156,8 @@ endif
|
||||
ifneq ($(filter ppc64%,$(UNAME_M)),)
|
||||
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
|
||||
ifneq (,$(findstring POWER9,$(POWER9_M)))
|
||||
CFLAGS += -mpower9-vector
|
||||
CFLAGS += -mcpu=power9
|
||||
CXXFLAGS += -mcpu=power9
|
||||
endif
|
||||
# Require c++23's std::byteswap for big-endian support.
|
||||
ifeq ($(UNAME_M),ppc64)
|
||||
|
||||
@@ -7,8 +7,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
**Hot topics:**
|
||||
|
||||
- [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457)
|
||||
- New C-style API is now available: https://github.com/ggerganov/llama.cpp/pull/370
|
||||
- [Added Alpaca support](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
|
||||
- Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
|
||||
- Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
|
||||
|
||||
|
||||
@@ -36,7 +36,8 @@ fname_out = sys.argv[3]
|
||||
|
||||
fout = open(fname_out, "wb")
|
||||
|
||||
fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
|
||||
fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
|
||||
fout.write(struct.pack("i", 1)) # file version
|
||||
fout.write(struct.pack("i", n_vocab))
|
||||
fout.write(struct.pack("i", n_embd))
|
||||
fout.write(struct.pack("i", n_mult))
|
||||
@@ -49,27 +50,21 @@ fout.write(struct.pack("i", 4))
|
||||
# This loop unchanged from convert-pth-to-ggml.py:
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
if tokenizer.is_unknown(i):
|
||||
# "<unk>" token (translated as ??)
|
||||
text = " \u2047 ".encode("utf-8")
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
elif tokenizer.is_control(i):
|
||||
# "<s>"/"</s>" tokens
|
||||
fout.write(struct.pack("i", 0))
|
||||
text = b""
|
||||
elif tokenizer.is_byte(i):
|
||||
# "<U+XX>" tokens (which may be invalid UTF-8)
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
if len(piece) != 6:
|
||||
print("Invalid token: " + piece)
|
||||
print(f"Invalid token: {piece}")
|
||||
sys.exit(1)
|
||||
byte_value = int(piece[3:-1], 16)
|
||||
fout.write(struct.pack("i", 1))
|
||||
fout.write(struct.pack("B", byte_value))
|
||||
text = struct.pack("B", byte_value)
|
||||
else:
|
||||
# normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces.
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
||||
|
||||
def write_header(shape, dst_name, ftype_cur):
|
||||
sname = dst_name.encode('utf-8')
|
||||
|
||||
@@ -1,66 +0,0 @@
|
||||
import os
|
||||
import sys
|
||||
from tqdm import tqdm
|
||||
import requests
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
print("Usage: download-pth.py dir-model model-type\n")
|
||||
print(" model-type: Available models 7B, 13B, 30B or 65B")
|
||||
sys.exit(1)
|
||||
|
||||
modelsDir = sys.argv[1]
|
||||
model = sys.argv[2]
|
||||
|
||||
num = {
|
||||
"7B": 1,
|
||||
"13B": 2,
|
||||
"30B": 4,
|
||||
"65B": 8,
|
||||
}
|
||||
|
||||
if model not in num:
|
||||
print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Downloading model {model}")
|
||||
|
||||
files = ["checklist.chk", "params.json"]
|
||||
|
||||
for i in range(num[model]):
|
||||
files.append(f"consolidated.0{i}.pth")
|
||||
|
||||
resolved_path = os.path.abspath(os.path.join(modelsDir, model))
|
||||
os.makedirs(resolved_path, exist_ok=True)
|
||||
|
||||
for file in files:
|
||||
dest_path = os.path.join(resolved_path, file)
|
||||
|
||||
if os.path.exists(dest_path):
|
||||
print(f"Skip file download, it already exists: {file}")
|
||||
continue
|
||||
|
||||
url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
|
||||
response = requests.get(url, stream=True)
|
||||
with open(dest_path, 'wb') as f:
|
||||
with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
|
||||
for chunk in response.iter_content(chunk_size=1024):
|
||||
if chunk:
|
||||
f.write(chunk)
|
||||
t.update(len(chunk))
|
||||
|
||||
files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
|
||||
for file in files2:
|
||||
dest_path = os.path.join(modelsDir, file)
|
||||
|
||||
if os.path.exists(dest_path):
|
||||
print(f"Skip file download, it already exists: {file}")
|
||||
continue
|
||||
|
||||
url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
|
||||
response = requests.get(url, stream=True)
|
||||
with open(dest_path, 'wb') as f:
|
||||
with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
|
||||
for chunk in response.iter_content(chunk_size=1024):
|
||||
if chunk:
|
||||
f.write(chunk)
|
||||
t.update(len(chunk))
|
||||
159
ggml.c
159
ggml.c
@@ -1,5 +1,5 @@
|
||||
// Defines CLOCK_MONOTONIC on Linux
|
||||
#define _POSIX_C_SOURCE 199309L
|
||||
// Defines CLOCK_MONOTONIC and asprintf on Linux
|
||||
#define _GNU_SOURCE
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#endif
|
||||
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <time.h>
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
@@ -31,7 +32,6 @@
|
||||
#else
|
||||
// ref: https://github.com/ggerganov/whisper.cpp/issues/168
|
||||
#include <windows.h>
|
||||
#include <errno.h>
|
||||
#endif
|
||||
|
||||
typedef volatile LONG atomic_int;
|
||||
@@ -83,6 +83,17 @@ typedef void* thread_ret_t;
|
||||
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
||||
#endif
|
||||
|
||||
#define GGML_MLOCK_SUPPORT 0
|
||||
|
||||
#ifdef __has_include
|
||||
#if __has_include(<sys/mman.h>)
|
||||
#undef GGML_MLOCK_SUPPORT
|
||||
#define GGML_MLOCK_SUPPORT 1
|
||||
#include <sys/mman.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
/*#define GGML_PERF*/
|
||||
#define GGML_DEBUG 0
|
||||
#define GGML_GELU_FP16
|
||||
@@ -164,6 +175,39 @@ typedef double ggml_float;
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
||||
|
||||
#elif defined(__POWER9_VECTOR__)
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
/* the inline asm below is about 12% faster than the lookup method */
|
||||
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
register float f;
|
||||
register double d;
|
||||
__asm__(
|
||||
"mtfprd %0,%2\n"
|
||||
"xscvhpdp %0,%0\n"
|
||||
"frsp %1,%0\n" :
|
||||
/* temp */ "=d"(d),
|
||||
/* out */ "=f"(f):
|
||||
/* in */ "r"(h));
|
||||
return f;
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
register double d;
|
||||
register ggml_fp16_t r;
|
||||
__asm__( /* xscvdphp can work on double or single precision */
|
||||
"xscvdphp %0,%2\n"
|
||||
"mffprd %1,%0\n" :
|
||||
/* temp */ "=d"(d),
|
||||
/* out */ "=r"(r):
|
||||
/* in */ "f"(f));
|
||||
return r;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// FP16 <-> FP32
|
||||
@@ -261,6 +305,7 @@ static float table_f32_f16[1 << 16];
|
||||
|
||||
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
||||
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
||||
// This is also true for POWER9.
|
||||
#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
|
||||
|
||||
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
||||
@@ -451,7 +496,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, void * restric
|
||||
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
|
||||
assert(k % QK == 0);
|
||||
|
||||
#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__)
|
||||
#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__)
|
||||
const int nb = k / QK;
|
||||
const size_t bs = sizeof(float) + QK/2;
|
||||
|
||||
@@ -461,7 +506,52 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
|
||||
uint8_t pp[QK/2];
|
||||
#endif
|
||||
|
||||
#if __ARM_NEON
|
||||
#if defined(__POWER9_VECTOR__)
|
||||
#if QK == 32
|
||||
const vector float v85 = vec_splats(8.5f);
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
vector float srcv [8];
|
||||
vector float asrcv[8];
|
||||
vector float amaxv[8];
|
||||
|
||||
for (int l = 0; l < 8; l++) srcv[l] = *(vector float *)(x + i*32 + 4*l);
|
||||
for (int l = 0; l < 8; l++) asrcv[l] = vec_abs(srcv[l]);
|
||||
|
||||
for (int l = 0; l < 4; l++) amaxv[2*l] = vec_max(asrcv[2*l], asrcv[2*l+1]);
|
||||
//for (int l = 0; l < 2; l++) amaxv[4*l] = vec_max(amaxv[4*l], amaxv[4*l+2]);
|
||||
amaxv[0] = vec_max(amaxv[0], amaxv[2]);
|
||||
amaxv[4] = vec_max(amaxv[4], amaxv[6]);
|
||||
//for (int l = 0; l < 1; l++) amaxv[8*l] = vec_max(amaxv[8*l], amaxv[8*l+4]);
|
||||
amaxv[0] = vec_max(amaxv[0], amaxv[4]);
|
||||
|
||||
amax = MAX(
|
||||
MAX(vec_extract(amaxv[0], 0), vec_extract(amaxv[0], 1)),
|
||||
MAX(vec_extract(amaxv[0], 2), vec_extract(amaxv[0], 3)));
|
||||
|
||||
const float d = amax / ((1 << 3) - 1);
|
||||
const float id = d ? 1.0/d : 0.0;
|
||||
|
||||
*(float *)pd = d;
|
||||
pd += bs;
|
||||
|
||||
const vector float vid = vec_splats(id);
|
||||
for (int l = 0; l < 8; l++) {
|
||||
const vector float vf = vec_madd(srcv[l], vid, v85);
|
||||
const vector signed int vi = vec_signed(vf);
|
||||
|
||||
pb[2*l + 0] = vec_extract(vi, 0) | (vec_extract(vi, 1) << 4);
|
||||
pb[2*l + 1] = vec_extract(vi, 2) | (vec_extract(vi, 3) << 4);
|
||||
}
|
||||
|
||||
//memcpy(pb, pp, sizeof(pp));
|
||||
pb += bs;
|
||||
}
|
||||
#else
|
||||
#error "not implemented for QK"
|
||||
#endif
|
||||
#elif __ARM_NEON
|
||||
#if QK == 32
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
@@ -2344,6 +2434,7 @@ struct ggml_context {
|
||||
size_t mem_size;
|
||||
void * mem_buffer;
|
||||
bool mem_buffer_owned;
|
||||
bool mem_buffer_mlocked;
|
||||
|
||||
int n_objects;
|
||||
|
||||
@@ -2619,16 +2710,19 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
||||
}
|
||||
|
||||
*ctx = (struct ggml_context) {
|
||||
/*.mem_size =*/ params.mem_size,
|
||||
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
|
||||
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
|
||||
/*.n_objects =*/ 0,
|
||||
/*.objects_begin =*/ NULL,
|
||||
/*.objects_end =*/ NULL,
|
||||
/*.scratch =*/ { 0, 0, NULL, },
|
||||
/*.scratch_save =*/ { 0, 0, NULL, },
|
||||
/*.mem_size =*/ params.mem_size,
|
||||
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
|
||||
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
|
||||
/*.mem_buffer_mlocked =*/ false,
|
||||
/*.n_objects =*/ 0,
|
||||
/*.objects_begin =*/ NULL,
|
||||
/*.objects_end =*/ NULL,
|
||||
/*.scratch =*/ { 0, 0, NULL, },
|
||||
/*.scratch_save =*/ { 0, 0, NULL, },
|
||||
};
|
||||
|
||||
GGML_ASSERT(ctx->mem_buffer != NULL); // check for allocation failure
|
||||
|
||||
ggml_assert_aligned(ctx->mem_buffer);
|
||||
|
||||
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
|
||||
@@ -2651,6 +2745,14 @@ void ggml_free(struct ggml_context * ctx) {
|
||||
GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
|
||||
__func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
|
||||
|
||||
#if GGML_MLOCK_SUPPORT
|
||||
if (ctx->mem_buffer_mlocked) {
|
||||
if (munlock(ctx->mem_buffer, ctx->mem_size)) {
|
||||
fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (ctx->mem_buffer_owned) {
|
||||
free(ctx->mem_buffer);
|
||||
}
|
||||
@@ -2679,6 +2781,37 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
|
||||
return result;
|
||||
}
|
||||
|
||||
bool ggml_mlock_supported(void) {
|
||||
return GGML_MLOCK_SUPPORT;
|
||||
}
|
||||
|
||||
#if GGML_MLOCK_SUPPORT
|
||||
#ifdef __APPLE__
|
||||
#define MLOCK_SUGGESTION "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or\n" \
|
||||
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l)."
|
||||
#else
|
||||
#define MLOCK_SUGGESTION "Try increasing RLIMIT_MLOCK (ulimit -l)."
|
||||
#endif
|
||||
bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
|
||||
if (ctx->mem_buffer_mlocked) {
|
||||
return true;
|
||||
}
|
||||
if (mlock(ctx->mem_buffer, ctx->mem_size)) {
|
||||
int ret = asprintf(err_p, "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
|
||||
ctx->mem_size, strerror(errno));
|
||||
GGML_ASSERT(ret >= 0);
|
||||
return false;
|
||||
}
|
||||
ctx->mem_buffer_mlocked = true;
|
||||
return true;
|
||||
}
|
||||
#else // GGML_MLOCK_SUPPORT
|
||||
bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
|
||||
*err_p = strdup("can't mlock because it's not supported on this system");
|
||||
return false;
|
||||
}
|
||||
#endif // GGML_MLOCK_SUPPORT
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_impl(
|
||||
|
||||
3
ggml.h
3
ggml.h
@@ -343,6 +343,9 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
|
||||
|
||||
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
|
||||
|
||||
bool ggml_mlock_supported(void);
|
||||
bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
|
||||
88
llama.cpp
88
llama.cpp
@@ -102,6 +102,9 @@ struct llama_context {
|
||||
// decode output (2-dimensional array: [n_tokens][n_vocab])
|
||||
std::vector<float> logits;
|
||||
bool logits_all = false;
|
||||
|
||||
// input embedding (1-dimensional array: [n_embd])
|
||||
std::vector<float> embedding;
|
||||
};
|
||||
|
||||
struct llama_context_params llama_context_default_params() {
|
||||
@@ -112,6 +115,8 @@ struct llama_context_params llama_context_default_params() {
|
||||
/*.f16_kv =*/ false,
|
||||
/*.logits_all =*/ false,
|
||||
/*.vocab_only =*/ false,
|
||||
/*.use_mlock =*/ false,
|
||||
/*.embedding =*/ false,
|
||||
};
|
||||
|
||||
return result;
|
||||
@@ -592,8 +597,6 @@ static bool llama_model_load(
|
||||
fin.close();
|
||||
}
|
||||
|
||||
lctx.logits.reserve(lctx.model.hparams.n_ctx);
|
||||
|
||||
lctx.t_load_us = ggml_time_us() - t_start_us;
|
||||
|
||||
return true;
|
||||
@@ -727,11 +730,13 @@ static bool llama_eval_internal(
|
||||
|
||||
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
||||
struct ggml_tensor * V_trans =
|
||||
ggml_permute(ctx0,
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
1, 2, 0, 3);
|
||||
ggml_cpy(ctx0,
|
||||
ggml_permute(ctx0,
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
1, 2, 0, 3),
|
||||
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
|
||||
|
||||
// KQV = transpose(V) * KQ_soft_max
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
||||
@@ -789,6 +794,9 @@ static bool llama_eval_internal(
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
// used at the end to optionally extract the embeddings
|
||||
struct ggml_tensor * embeddings = NULL;
|
||||
|
||||
// norm
|
||||
{
|
||||
inpL = ggml_rms_norm(ctx0, inpL);
|
||||
@@ -797,6 +805,8 @@ static bool llama_eval_internal(
|
||||
inpL = ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.norm, inpL),
|
||||
inpL);
|
||||
|
||||
embeddings = inpL;
|
||||
}
|
||||
|
||||
// lm_head
|
||||
@@ -819,15 +829,26 @@ static bool llama_eval_internal(
|
||||
//embd_w.resize(n_vocab*N);
|
||||
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
|
||||
|
||||
auto & logits_out = lctx.logits;
|
||||
// extract logits
|
||||
{
|
||||
auto & logits_out = lctx.logits;
|
||||
|
||||
if (lctx.logits_all) {
|
||||
logits_out.resize(n_vocab * N);
|
||||
memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
|
||||
} else {
|
||||
// return result for just the last token
|
||||
logits_out.resize(n_vocab);
|
||||
memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
||||
if (lctx.logits_all) {
|
||||
logits_out.resize(n_vocab * N);
|
||||
memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
|
||||
} else {
|
||||
// return result for just the last token
|
||||
logits_out.resize(n_vocab);
|
||||
memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
||||
}
|
||||
}
|
||||
|
||||
// extract embeddings
|
||||
if (lctx.embedding.size()) {
|
||||
auto & embedding_out = lctx.embedding;
|
||||
|
||||
embedding_out.resize(n_embd);
|
||||
memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
|
||||
}
|
||||
|
||||
if (mem_per_token == 0) {
|
||||
@@ -1408,17 +1429,44 @@ struct llama_context * llama_init_from_file(
|
||||
|
||||
ggml_type type_memory = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
||||
|
||||
if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory, params.vocab_only)) {
|
||||
if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory,
|
||||
params.vocab_only)) {
|
||||
fprintf(stderr, "%s: failed to load model\n", __func__);
|
||||
delete ctx;
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (params.use_mlock) {
|
||||
char *err;
|
||||
if (!ggml_mlock(ctx->model.ctx, &err)) {
|
||||
fprintf(stderr, "%s\n", err);
|
||||
free(err);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// reserve memory for context buffers
|
||||
{
|
||||
const auto & hparams = ctx->model.hparams;
|
||||
if (params.logits_all) {
|
||||
ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
|
||||
} else {
|
||||
ctx->logits.reserve(hparams.n_ctx);
|
||||
}
|
||||
|
||||
if (params.embedding){
|
||||
ctx->embedding.reserve(hparams.n_embd);
|
||||
}
|
||||
}
|
||||
|
||||
return ctx;
|
||||
}
|
||||
|
||||
void llama_free(struct llama_context * ctx) {
|
||||
ggml_free(ctx->model.ctx);
|
||||
if (ctx->model.ctx) {
|
||||
ggml_free(ctx->model.ctx);
|
||||
}
|
||||
|
||||
delete ctx;
|
||||
}
|
||||
@@ -1482,6 +1530,10 @@ float * llama_get_logits(struct llama_context * ctx) {
|
||||
return ctx->logits.data();
|
||||
}
|
||||
|
||||
float * llama_get_embeddings(struct llama_context * ctx) {
|
||||
return ctx->embedding.data();
|
||||
}
|
||||
|
||||
const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
|
||||
if (token >= llama_n_vocab(ctx)) {
|
||||
return nullptr;
|
||||
|
||||
6
llama.h
6
llama.h
@@ -53,6 +53,8 @@ extern "C" {
|
||||
bool f16_kv; // use fp16 for KV cache
|
||||
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
||||
bool vocab_only; // only load the vocabulary, no weights
|
||||
bool use_mlock; // force system to keep model in RAM
|
||||
bool embedding; // embedding mode only
|
||||
};
|
||||
|
||||
LLAMA_API struct llama_context_params llama_context_default_params();
|
||||
@@ -108,6 +110,10 @@ extern "C" {
|
||||
// Cols: n_vocab
|
||||
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
|
||||
|
||||
// Get the embeddings for the input
|
||||
// shape: [n_embd] (1-dimensional)
|
||||
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
||||
|
||||
// Token Id -> String. Uses the vocabulary in the provided context
|
||||
LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
|
||||
|
||||
|
||||
45
main.cpp
45
main.cpp
@@ -199,6 +199,8 @@ int main(int argc, char ** argv) {
|
||||
lparams.seed = params.seed;
|
||||
lparams.f16_kv = params.memory_f16;
|
||||
lparams.logits_all = params.perplexity;
|
||||
lparams.use_mlock = params.use_mlock;
|
||||
lparams.embedding = params.embedding;
|
||||
|
||||
ctx = llama_init_from_file(params.model.c_str(), lparams);
|
||||
|
||||
@@ -258,6 +260,9 @@ int main(int argc, char ** argv) {
|
||||
params.interactive = true;
|
||||
}
|
||||
|
||||
// determine newline token
|
||||
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
@@ -289,6 +294,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
|
||||
|
||||
int last_n_size = params.repeat_last_n;
|
||||
std::vector<llama_token> last_n_tokens(last_n_size);
|
||||
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
|
||||
@@ -321,6 +327,27 @@ int main(int argc, char ** argv) {
|
||||
// the first thing we will do is to output the prompt, so set color accordingly
|
||||
set_console_state(CONSOLE_STATE_PROMPT);
|
||||
|
||||
if (params.embedding){
|
||||
embd = embd_inp;
|
||||
|
||||
if (embd.size() > 0) {
|
||||
if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
const auto embeddings = llama_get_embeddings(ctx);
|
||||
|
||||
// TODO: print / use the embeddings
|
||||
|
||||
if (params.use_color) {
|
||||
printf(ANSI_COLOR_RESET);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
while (remaining_tokens > 0 || params.interactive) {
|
||||
// predict
|
||||
if (embd.size() > 0) {
|
||||
@@ -359,6 +386,16 @@ int main(int argc, char ** argv) {
|
||||
last_n_tokens.push_back(id);
|
||||
}
|
||||
|
||||
// replace end of text token with newline token when in interactive mode
|
||||
if (id == llama_token_eos() && params.interactive) {
|
||||
id = llama_token_newline.front();
|
||||
if (params.antiprompt.size() != 0) {
|
||||
// tokenize and inject first reverse prompt
|
||||
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
|
||||
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
|
||||
}
|
||||
}
|
||||
|
||||
// add it to the context
|
||||
embd.push_back(id);
|
||||
|
||||
@@ -451,12 +488,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// end of text token
|
||||
if (embd.back() == llama_token_eos()) {
|
||||
if (params.interactive) {
|
||||
is_interacting = true;
|
||||
} else {
|
||||
fprintf(stderr, " [end of text]\n");
|
||||
break;
|
||||
}
|
||||
fprintf(stderr, " [end of text]\n");
|
||||
break;
|
||||
}
|
||||
|
||||
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
|
||||
|
||||
@@ -57,6 +57,7 @@ def main():
|
||||
# )
|
||||
|
||||
args = parser.parse_args()
|
||||
args.models_path = os.path.abspath(args.models_path)
|
||||
|
||||
if not os.path.isfile(args.quantize_script_path):
|
||||
print(
|
||||
|
||||
11
utils.cpp
11
utils.cpp
@@ -1,3 +1,5 @@
|
||||
#include "ggml.h"
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
#include <cassert>
|
||||
@@ -117,12 +119,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
params.model = argv[i];
|
||||
} else if (arg == "-i" || arg == "--interactive") {
|
||||
params.interactive = true;
|
||||
} else if (arg == "--embedding") {
|
||||
params.embedding = true;
|
||||
} else if (arg == "--interactive-start") {
|
||||
params.interactive = true;
|
||||
} else if (arg == "--interactive-first") {
|
||||
params.interactive_start = true;
|
||||
} else if (arg == "-ins" || arg == "--instruct") {
|
||||
params.instruct = true;
|
||||
} else if (arg == "--color") {
|
||||
params.use_color = true;
|
||||
} else if (arg == "--mlock") {
|
||||
params.use_mlock = true;
|
||||
} else if (arg == "-r" || arg == "--reverse-prompt") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -190,6 +198,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n");
|
||||
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
|
||||
if (ggml_mlock_supported()) {
|
||||
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
}
|
||||
fprintf(stderr, " -m FNAME, --model FNAME\n");
|
||||
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
5
utils.h
5
utils.h
@@ -32,16 +32,21 @@ struct gpt_params {
|
||||
std::string model = "models/lamma-7B/ggml-model.bin"; // model path
|
||||
std::string prompt = "";
|
||||
|
||||
|
||||
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
||||
|
||||
bool memory_f16 = false; // use f16 instead of f32 for memory kv
|
||||
bool random_prompt = false; // do not randomize prompt if none provided
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
bool interactive = false; // interactive mode
|
||||
|
||||
bool embedding = false; // get only sentence embedding
|
||||
bool interactive_start = false; // wait for user input immediately
|
||||
|
||||
bool instruct = false; // instruction mode (used for Alpaca models)
|
||||
bool ignore_eos = false; // do not stop generating after eos
|
||||
bool perplexity = false; // compute perplexity over the prompt
|
||||
bool use_mlock = false; // use mlock to keep model in memory
|
||||
};
|
||||
|
||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
|
||||
|
||||
Reference in New Issue
Block a user