mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-23 16:37:33 +03:00
Compare commits
17 Commits
b1074
...
gguf-64bit
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
33a5517d87 | ||
|
|
b61b170005 | ||
|
|
09b6da741e | ||
|
|
6d369a1558 | ||
|
|
bc3eaf262e | ||
|
|
be726c57ee | ||
|
|
ba335ff5b2 | ||
|
|
3656b3ce81 | ||
|
|
4f0547e4a3 | ||
|
|
5f1fffd2d4 | ||
|
|
c7d92e6dfe | ||
|
|
61d1a2895e | ||
|
|
741ca7dd1c | ||
|
|
72f895c923 | ||
|
|
50526f37eb | ||
|
|
04f4b1eb10 | ||
|
|
7592375403 |
@@ -30,6 +30,9 @@ bool gguf_ex_write(const std::string & fname) {
|
||||
gguf_set_val_u32 (ctx, "some.parameter.uint32", 0x12345678);
|
||||
gguf_set_val_i32 (ctx, "some.parameter.int32", -0x12345679);
|
||||
gguf_set_val_f32 (ctx, "some.parameter.float32", 0.123456789f);
|
||||
gguf_set_val_u64 (ctx, "some.parameter.uint64", 0x123456789abcdef0ull);
|
||||
gguf_set_val_i64 (ctx, "some.parameter.int64", -0x123456789abcdef1ll);
|
||||
gguf_set_val_f64 (ctx, "some.parameter.float64", 0.1234567890123456789);
|
||||
gguf_set_val_bool(ctx, "some.parameter.bool", true);
|
||||
gguf_set_val_str (ctx, "some.parameter.string", "hello world");
|
||||
|
||||
|
||||
@@ -604,7 +604,12 @@ int main(int argc, char ** argv) {
|
||||
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
last_n_repeat, alpha_frequency, alpha_presence);
|
||||
if (!penalize_nl) {
|
||||
logits[llama_token_nl(ctx)] = nl_logit;
|
||||
for (size_t idx = 0; idx < candidates_p.size; idx++) {
|
||||
if (candidates_p.data[idx].id == llama_token_nl(ctx)) {
|
||||
candidates_p.data[idx].logit = nl_logit;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (grammar != NULL) {
|
||||
|
||||
12
flake.lock
generated
12
flake.lock
generated
@@ -5,11 +5,11 @@
|
||||
"systems": "systems"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1685518550,
|
||||
"narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
|
||||
"lastModified": 1692799911,
|
||||
"narHash": "sha256-3eihraek4qL744EvQXsK1Ha6C3CR7nnT8X2qWap4RNk=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
|
||||
"rev": "f9e7cf818399d17d347f847525c5a5a8032e4e44",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1685931219,
|
||||
"narHash": "sha256-8EWeOZ6LKQfgAjB/USffUSELPRjw88A+xTcXnOUvO5M=",
|
||||
"lastModified": 1692913444,
|
||||
"narHash": "sha256-1SvMQm2DwofNxXVtNWWtIcTh7GctEVrS/Xel/mdc6iY=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "7409480d5c8584a1a83c422530419efe4afb0d19",
|
||||
"rev": "18324978d632ffc55ef1d928e81630c620f4f447",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
43
flake.nix
43
flake.nix
@@ -6,6 +6,9 @@
|
||||
outputs = { self, nixpkgs, flake-utils }:
|
||||
flake-utils.lib.eachDefaultSystem (system:
|
||||
let
|
||||
name = "llama.cpp";
|
||||
src = ./.;
|
||||
meta.mainProgram = "llama";
|
||||
inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
|
||||
buildInputs = with pkgs; [ openmpi ];
|
||||
osSpecific = with pkgs; buildInputs ++
|
||||
@@ -31,7 +34,7 @@
|
||||
with pkgs; [ openblas ]
|
||||
);
|
||||
pkgs = import nixpkgs { inherit system; };
|
||||
nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
|
||||
nativeBuildInputs = with pkgs; [ cmake ninja pkgconfig ];
|
||||
llama-python =
|
||||
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
|
||||
postPatch = ''
|
||||
@@ -44,35 +47,35 @@
|
||||
mv $out/bin/server $out/bin/llama-server
|
||||
'';
|
||||
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
|
||||
in {
|
||||
in
|
||||
{
|
||||
packages.default = pkgs.stdenv.mkDerivation {
|
||||
name = "llama.cpp";
|
||||
src = ./.;
|
||||
postPatch = postPatch;
|
||||
nativeBuildInputs = nativeBuildInputs;
|
||||
buildInputs = osSpecific;
|
||||
inherit name src meta postPatch nativeBuildInputs buildInputs postInstall;
|
||||
cmakeFlags = cmakeFlags
|
||||
++ (if isAarch64 && isDarwin then [
|
||||
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
|
||||
"-DLLAMA_METAL=ON"
|
||||
] else [
|
||||
"-DLLAMA_BLAS=ON"
|
||||
"-DLLAMA_BLAS_VENDOR=OpenBLAS"
|
||||
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
|
||||
"-DLLAMA_METAL=ON"
|
||||
] else [
|
||||
"-DLLAMA_BLAS=ON"
|
||||
"-DLLAMA_BLAS_VENDOR=OpenBLAS"
|
||||
]);
|
||||
postInstall = postInstall;
|
||||
meta.mainProgram = "llama";
|
||||
};
|
||||
packages.opencl = pkgs.stdenv.mkDerivation {
|
||||
name = "llama.cpp";
|
||||
src = ./.;
|
||||
postPatch = postPatch;
|
||||
nativeBuildInputs = nativeBuildInputs;
|
||||
inherit name src meta postPatch nativeBuildInputs postInstall;
|
||||
buildInputs = with pkgs; buildInputs ++ [ clblast ];
|
||||
cmakeFlags = cmakeFlags ++ [
|
||||
"-DLLAMA_CLBLAST=ON"
|
||||
];
|
||||
postInstall = postInstall;
|
||||
meta.mainProgram = "llama";
|
||||
};
|
||||
packages.rocm = pkgs.stdenv.mkDerivation {
|
||||
inherit name src meta postPatch nativeBuildInputs postInstall;
|
||||
buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ];
|
||||
cmakeFlags = cmakeFlags ++ [
|
||||
"-DLLAMA_HIPBLAS=1"
|
||||
"-DCMAKE_C_COMPILER=hipcc"
|
||||
"-DCMAKE_CXX_COMPILER=hipcc"
|
||||
"-DCMAKE_POSITION_INDEPENDENT_CODE=ON"
|
||||
];
|
||||
};
|
||||
apps.llama-server = {
|
||||
type = "app";
|
||||
|
||||
137
ggml.c
137
ggml.c
@@ -19394,7 +19394,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct gguf_str {
|
||||
uint32_t n;
|
||||
uint64_t n; // GGUFv2
|
||||
char * data;
|
||||
};
|
||||
|
||||
@@ -19408,9 +19408,12 @@ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
|
||||
[GGUF_TYPE_FLOAT32] = sizeof(float),
|
||||
[GGUF_TYPE_BOOL] = sizeof(bool),
|
||||
[GGUF_TYPE_STRING] = sizeof(struct gguf_str),
|
||||
[GGUF_TYPE_UINT64] = sizeof(uint64_t),
|
||||
[GGUF_TYPE_INT64] = sizeof(int64_t),
|
||||
[GGUF_TYPE_FLOAT64] = sizeof(double),
|
||||
[GGUF_TYPE_ARRAY] = 0, // undefined
|
||||
};
|
||||
static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
|
||||
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
|
||||
|
||||
static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
|
||||
[GGUF_TYPE_UINT8] = "u8",
|
||||
@@ -19423,8 +19426,11 @@ static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
|
||||
[GGUF_TYPE_BOOL] = "bool",
|
||||
[GGUF_TYPE_STRING] = "str",
|
||||
[GGUF_TYPE_ARRAY] = "arr",
|
||||
[GGUF_TYPE_UINT64] = "u64",
|
||||
[GGUF_TYPE_INT64] = "i64",
|
||||
[GGUF_TYPE_FLOAT64] = "f64",
|
||||
};
|
||||
static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
|
||||
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
|
||||
|
||||
union gguf_value {
|
||||
uint8_t uint8;
|
||||
@@ -19434,6 +19440,9 @@ union gguf_value {
|
||||
uint32_t uint32;
|
||||
int32_t int32;
|
||||
float float32;
|
||||
uint64_t uint64;
|
||||
int64_t int64;
|
||||
double float64;
|
||||
bool bool_;
|
||||
|
||||
struct gguf_str str;
|
||||
@@ -19441,7 +19450,7 @@ union gguf_value {
|
||||
struct {
|
||||
enum gguf_type type;
|
||||
|
||||
uint32_t n;
|
||||
uint64_t n; // GGUFv2
|
||||
void * data;
|
||||
} arr;
|
||||
};
|
||||
@@ -19449,8 +19458,6 @@ union gguf_value {
|
||||
struct gguf_kv {
|
||||
struct gguf_str key;
|
||||
|
||||
uint32_t n_bytes; // TODO: is this actually needed?
|
||||
|
||||
enum gguf_type type;
|
||||
union gguf_value value;
|
||||
};
|
||||
@@ -19458,15 +19465,15 @@ struct gguf_kv {
|
||||
struct gguf_header {
|
||||
uint32_t magic;
|
||||
uint32_t version;
|
||||
uint32_t n_tensors;
|
||||
uint32_t n_kv;
|
||||
uint64_t n_tensors; // GGUFv2
|
||||
uint64_t n_kv; // GGUFv2
|
||||
};
|
||||
|
||||
struct gguf_tensor_info {
|
||||
struct gguf_str name;
|
||||
|
||||
uint32_t n_dims;
|
||||
uint32_t ne[GGML_MAX_DIMS];
|
||||
uint64_t ne[GGML_MAX_DIMS];
|
||||
|
||||
enum ggml_type type;
|
||||
|
||||
@@ -19497,19 +19504,32 @@ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset)
|
||||
return n == size;
|
||||
}
|
||||
|
||||
static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
|
||||
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||
// Read one length-prefixed string in the current (GGUFv2) layout: a 64-bit
// byte count followed by that many bytes of character data.
//
// file   - stream to read from
// p      - receives the length (p->n) and a calloc'ed buffer (p->data) that is
//          one byte larger than the payload, so the result is NUL-terminated
// offset - running byte offset, advanced by gguf_fread_el
//
// Returns true when both the length and the payload were read. On failure
// p->data is either NULL (length unreadable/invalid, or allocation failed) or
// a buffer the caller still has to free (payload read failed).
static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) {
    p->n    = 0;
    p->data = NULL;

    // The length comes straight from the file, so treat it as untrusted.
    if (!gguf_fread_el(file, &p->n, sizeof(p->n), offset)) {
        p->n = 0;
        return false;
    }

    // p->n + 1 must be representable as size_t, otherwise the allocation size
    // wraps around (to 0 for n == UINT64_MAX) and the read below overruns it.
    if (p->n >= SIZE_MAX) {
        fprintf(stderr, "%s: invalid string length\n", __func__);
        p->n = 0;
        return false;
    }

    // TODO: how to avoid mallocs for strings?
    p->data = calloc((size_t) p->n + 1, 1);
    if (p->data == NULL) {
        fprintf(stderr, "%s: failed to allocate string buffer\n", __func__);
        p->n = 0;
        return false;
    }

    return gguf_fread_el(file, p->data, (size_t) p->n, offset);
}
|
||||
|
||||
// Read one length-prefixed string in the legacy GGUFv1 layout: a 32-bit byte
// count followed by that many bytes of character data.
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
//
// file   - stream to read from
// p      - receives the length (p->n, widened from 32 bits) and a calloc'ed
//          buffer (p->data) one byte larger than the payload, so the result is
//          NUL-terminated
// offset - running byte offset, advanced by gguf_fread_el
//
// Returns true when both the length and the payload were read. On failure
// p->data is either NULL (length unreadable/invalid, or allocation failed) or
// a buffer the caller still has to free (payload read failed).
static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) {
    p->n    = 0;
    p->data = NULL;

    uint32_t n = 0;
    if (!gguf_fread_el(file, &n, sizeof(n), offset)) {
        return false;
    }

    // Widen before adding the terminator byte: in 32-bit arithmetic n + 1
    // wraps to 0 for n == UINT32_MAX, giving an empty buffer and a 4 GiB read.
    const size_t len = n;
    if (len >= SIZE_MAX) {
        // only reachable where size_t is 32 bits wide
        fprintf(stderr, "%s: invalid string length\n", __func__);
        return false;
    }

    p->data = calloc(len + 1, 1);
    if (p->data == NULL) {
        fprintf(stderr, "%s: failed to allocate string buffer\n", __func__);
        return false;
    }
    p->n = n;

    return gguf_fread_el(file, p->data, len, offset);
}
|
||||
|
||||
struct gguf_context * gguf_init_empty(void) {
|
||||
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
|
||||
|
||||
@@ -19565,8 +19585,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
ctx->data = NULL;
|
||||
|
||||
ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
|
||||
ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
|
||||
ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
|
||||
|
||||
if (ctx->header.version == 1) {
|
||||
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||
uint32_t n_tensors = 0;
|
||||
uint32_t n_kv = 0;
|
||||
|
||||
ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset);
|
||||
ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset);
|
||||
|
||||
ctx->header.n_tensors = n_tensors;
|
||||
ctx->header.n_kv = n_kv;
|
||||
} else {
|
||||
ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
|
||||
ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read header\n", __func__);
|
||||
@@ -19576,6 +19609,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||
bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur;
|
||||
if (ctx->header.version == 1) {
|
||||
gguf_fread_str = gguf_fread_str_v1;
|
||||
}
|
||||
|
||||
// read the kv pairs
|
||||
{
|
||||
ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
|
||||
@@ -19585,9 +19624,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
//fprintf(stderr, "%s: reading kv %d\n", __func__, i);
|
||||
|
||||
ok = ok && gguf_fread_str(file, &kv->key, &offset);
|
||||
//ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset);
|
||||
ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
|
||||
ok = ok && gguf_fread_str(file, &kv->key, &offset);
|
||||
ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
|
||||
|
||||
//fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
|
||||
|
||||
@@ -19599,12 +19637,23 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
|
||||
case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
|
||||
case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
|
||||
case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break;
|
||||
case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break;
|
||||
case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
|
||||
case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
|
||||
case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
|
||||
case GGUF_TYPE_ARRAY:
|
||||
{
|
||||
ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
|
||||
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
|
||||
|
||||
if (ctx->header.version == 1) {
|
||||
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||
uint32_t n = 0;
|
||||
ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset);
|
||||
kv->value.arr.n = n;
|
||||
} else {
|
||||
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
|
||||
}
|
||||
|
||||
switch (kv->value.arr.type) {
|
||||
case GGUF_TYPE_UINT8:
|
||||
@@ -19614,6 +19663,9 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
case GGUF_TYPE_UINT32:
|
||||
case GGUF_TYPE_INT32:
|
||||
case GGUF_TYPE_FLOAT32:
|
||||
case GGUF_TYPE_UINT64:
|
||||
case GGUF_TYPE_INT64:
|
||||
case GGUF_TYPE_FLOAT64:
|
||||
case GGUF_TYPE_BOOL:
|
||||
{
|
||||
kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
||||
@@ -19660,7 +19712,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
ok = ok && gguf_fread_str(file, &info->name, &offset);
|
||||
ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
|
||||
for (uint32_t j = 0; j < info->n_dims; ++j) {
|
||||
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
|
||||
if (ctx->header.version == 1) {
|
||||
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||
uint32_t t = 0;
|
||||
ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset);
|
||||
info->ne[j] = t;
|
||||
} else {
|
||||
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
|
||||
}
|
||||
}
|
||||
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
|
||||
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
|
||||
@@ -19954,6 +20013,18 @@ float gguf_get_val_f32(struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.float32;
|
||||
}
|
||||
|
||||
uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.uint64;
|
||||
}
|
||||
|
||||
int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.int64;
|
||||
}
|
||||
|
||||
double gguf_get_val_f64(struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.float64;
|
||||
}
|
||||
|
||||
bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
|
||||
return ctx->kv[i].value.bool_;
|
||||
}
|
||||
@@ -20056,6 +20127,27 @@ void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
|
||||
ctx->kv[idx].value.float32 = val;
|
||||
}
|
||||
|
||||
void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
|
||||
const int idx = gguf_get_or_add_key(ctx, key);
|
||||
|
||||
ctx->kv[idx].type = GGUF_TYPE_UINT64;
|
||||
ctx->kv[idx].value.uint64 = val;
|
||||
}
|
||||
|
||||
void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
|
||||
const int idx = gguf_get_or_add_key(ctx, key);
|
||||
|
||||
ctx->kv[idx].type = GGUF_TYPE_INT64;
|
||||
ctx->kv[idx].value.int64 = val;
|
||||
}
|
||||
|
||||
void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
|
||||
const int idx = gguf_get_or_add_key(ctx, key);
|
||||
|
||||
ctx->kv[idx].type = GGUF_TYPE_FLOAT64;
|
||||
ctx->kv[idx].value.float64 = val;
|
||||
}
|
||||
|
||||
void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
|
||||
const int idx = gguf_get_or_add_key(ctx, key);
|
||||
|
||||
@@ -20106,6 +20198,9 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
|
||||
case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
|
||||
case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
|
||||
case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
|
||||
case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break;
|
||||
case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break;
|
||||
case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break;
|
||||
case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
|
||||
case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
|
||||
case GGUF_TYPE_ARRAY:
|
||||
@@ -20267,6 +20362,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
|
||||
case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
|
||||
case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
|
||||
case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
|
||||
case GGUF_TYPE_UINT64: gguf_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break;
|
||||
case GGUF_TYPE_INT64: gguf_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break;
|
||||
case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
|
||||
case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
|
||||
case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
|
||||
case GGUF_TYPE_ARRAY:
|
||||
@@ -20282,6 +20380,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf,
|
||||
case GGUF_TYPE_UINT32:
|
||||
case GGUF_TYPE_INT32:
|
||||
case GGUF_TYPE_FLOAT32:
|
||||
case GGUF_TYPE_UINT64:
|
||||
case GGUF_TYPE_INT64:
|
||||
case GGUF_TYPE_FLOAT64:
|
||||
case GGUF_TYPE_BOOL:
|
||||
{
|
||||
gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
||||
|
||||
11
ggml.h
11
ggml.h
@@ -216,7 +216,7 @@
|
||||
#define GGML_EXIT_ABORTED 1
|
||||
|
||||
#define GGUF_MAGIC 0x46554747 // "GGUF"
|
||||
#define GGUF_VERSION 1
|
||||
#define GGUF_VERSION 2
|
||||
|
||||
#define GGUF_DEFAULT_ALIGNMENT 32
|
||||
|
||||
@@ -1827,6 +1827,9 @@ extern "C" {
|
||||
GGUF_TYPE_BOOL = 7,
|
||||
GGUF_TYPE_STRING = 8,
|
||||
GGUF_TYPE_ARRAY = 9,
|
||||
GGUF_TYPE_UINT64 = 10,
|
||||
GGUF_TYPE_INT64 = 11,
|
||||
GGUF_TYPE_FLOAT64 = 12,
|
||||
GGUF_TYPE_COUNT, // marks the end of the enum
|
||||
};
|
||||
|
||||
@@ -1867,6 +1870,9 @@ extern "C" {
|
||||
GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
|
||||
GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
|
||||
GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
|
||||
GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i);
|
||||
GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i);
|
||||
GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i);
|
||||
GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
|
||||
GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
|
||||
@@ -1886,6 +1892,9 @@ extern "C" {
|
||||
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
|
||||
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
|
||||
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
|
||||
GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
|
||||
GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
|
||||
GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
|
||||
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
|
||||
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
|
||||
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
|
||||
|
||||
@@ -13,7 +13,7 @@ from typing import Any, IO, List, Optional
|
||||
#
|
||||
|
||||
GGUF_MAGIC = 0x46554747
|
||||
GGUF_VERSION = 1
|
||||
GGUF_VERSION = 2
|
||||
GGUF_DEFAULT_ALIGNMENT = 32
|
||||
|
||||
# general
|
||||
@@ -365,6 +365,9 @@ class GGUFValueType(IntEnum):
|
||||
BOOL = 7
|
||||
STRING = 8
|
||||
ARRAY = 9
|
||||
UINT64 = 10
|
||||
INT64 = 11
|
||||
FLOAT64 = 12
|
||||
|
||||
@staticmethod
|
||||
def get_type(val):
|
||||
@@ -378,6 +381,7 @@ class GGUFValueType(IntEnum):
|
||||
return GGUFValueType.BOOL
|
||||
elif isinstance(val, int):
|
||||
return GGUFValueType.INT32
|
||||
# TODO: need help with 64-bit types in Python
|
||||
else:
|
||||
print("Unknown type: "+str(type(val)))
|
||||
sys.exit()
|
||||
@@ -400,8 +404,8 @@ class GGUFWriter:
|
||||
def write_header_to_file(self):
|
||||
self.fout.write(struct.pack("<I", GGUF_MAGIC))
|
||||
self.fout.write(struct.pack("<I", GGUF_VERSION))
|
||||
self.fout.write(struct.pack("<I", self.ti_data_count))
|
||||
self.fout.write(struct.pack("<I", self.kv_data_count))
|
||||
self.fout.write(struct.pack("<Q", self.ti_data_count))
|
||||
self.fout.write(struct.pack("<Q", self.kv_data_count))
|
||||
self.flush()
|
||||
# print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
|
||||
|
||||
@@ -444,6 +448,18 @@ class GGUFWriter:
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.FLOAT32)
|
||||
|
||||
def add_uint64(self, key: str, val: int):
    """Append `key` followed by `val` tagged as a UINT64 value."""
    value_type = GGUFValueType.UINT64
    self.add_key(key)
    self.add_val(val, value_type)
|
||||
|
||||
def add_int64(self, key: str, val: int):
    """Append `key` followed by `val` tagged as an INT64 value."""
    value_type = GGUFValueType.INT64
    self.add_key(key)
    self.add_val(val, value_type)
|
||||
|
||||
def add_float64(self, key: str, val: float):
    """Append `key` followed by `val` tagged as a FLOAT64 value."""
    value_type = GGUFValueType.FLOAT64
    self.add_key(key)
    self.add_val(val, value_type)
|
||||
|
||||
def add_bool(self, key: str, val: bool):
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.BOOL)
|
||||
@@ -483,17 +499,23 @@ class GGUFWriter:
|
||||
self.kv_data += struct.pack("<i", val)
|
||||
elif vtype == GGUFValueType.FLOAT32:
|
||||
self.kv_data += struct.pack("<f", val)
|
||||
elif vtype == GGUFValueType.UINT64:
|
||||
self.kv_data += struct.pack("<Q", val)
|
||||
elif vtype == GGUFValueType.INT64:
|
||||
self.kv_data += struct.pack("<q", val)
|
||||
elif vtype == GGUFValueType.FLOAT64:
|
||||
self.kv_data += struct.pack("<d", val)
|
||||
elif vtype == GGUFValueType.BOOL:
|
||||
self.kv_data += struct.pack("?", val)
|
||||
elif vtype == GGUFValueType.STRING:
|
||||
encoded_val = val.encode("utf8") if isinstance(val, str) else val
|
||||
self.kv_data += struct.pack("<I", len(encoded_val))
|
||||
self.kv_data += struct.pack("<Q", len(encoded_val))
|
||||
self.kv_data += encoded_val
|
||||
elif vtype == GGUFValueType.ARRAY:
|
||||
ltype = set([GGUFValueType.get_type(item) for item in val])
|
||||
assert len(ltype) == 1, "All items in a GGUF array should be of the same type"
|
||||
self.kv_data += struct.pack("<I", list(ltype)[0])
|
||||
self.kv_data += struct.pack("<I", len(val))
|
||||
self.kv_data += struct.pack("<Q", len(val))
|
||||
for item in val:
|
||||
self.add_val(item, add_vtype=False)
|
||||
else:
|
||||
@@ -507,12 +529,12 @@ class GGUFWriter:
|
||||
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
|
||||
|
||||
encoded_name = name.encode("utf8")
|
||||
self.ti_data += struct.pack("<I", len(encoded_name))
|
||||
self.ti_data += struct.pack("<Q", len(encoded_name))
|
||||
self.ti_data += encoded_name
|
||||
n_dims = len(tensor_shape)
|
||||
self.ti_data += struct.pack("<I", n_dims)
|
||||
for i in range(n_dims):
|
||||
self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
|
||||
self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
|
||||
if raw_dtype is None:
|
||||
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
|
||||
else:
|
||||
|
||||
41
llama.cpp
41
llama.cpp
@@ -1,9 +1,6 @@
|
||||
// Defines fileno on msys:
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#endif
|
||||
|
||||
#include "llama.h"
|
||||
@@ -62,6 +59,9 @@
|
||||
#include <cinttypes>
|
||||
#include <climits>
|
||||
#include <cstdarg>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
@@ -955,10 +955,10 @@ struct llama_vocab {
|
||||
id linefeed_id = 13;
|
||||
|
||||
int find_bpe_rank(std::string token_left, std::string token_right) const {
|
||||
replace_all(token_left, " ", "Ġ");
|
||||
replace_all(token_left, "\n", "Ċ");
|
||||
replace_all(token_right, " ", "Ġ");
|
||||
replace_all(token_right, "\n", "Ċ");
|
||||
replace_all(token_left, " ", "\u0120");
|
||||
replace_all(token_left, "\n", "\u010A");
|
||||
replace_all(token_right, " ", "\u0120");
|
||||
replace_all(token_right, "\n", "\u010A");
|
||||
|
||||
auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
|
||||
if (it == bpe_ranks.end()) {
|
||||
@@ -1144,11 +1144,13 @@ static bool llama_kv_cache_init(
|
||||
|
||||
// GGUF container versions known to the loader; each enumerator's numeric
// value is the version number it is named after.
enum llama_fver {
    GGUF_FILE_VERSION_V1 = 1, // legacy layout
    GGUF_FILE_VERSION_V2 = 2, // layout introduced alongside V1
};
|
||||
|
||||
static const char * llama_file_version_name(llama_fver version) {
|
||||
switch (version) {
|
||||
case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
|
||||
case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
|
||||
case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)";
|
||||
}
|
||||
|
||||
return "unknown";
|
||||
@@ -3887,7 +3889,7 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
|
||||
|
||||
// Calculate absolute value of second derivatives
|
||||
for (size_t i = 0; i < second_derivatives.size(); ++i) {
|
||||
second_derivatives[i] = abs(second_derivatives[i]);
|
||||
second_derivatives[i] = std::abs(second_derivatives[i]);
|
||||
}
|
||||
|
||||
// Normalize the second derivatives
|
||||
@@ -4653,6 +4655,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
|
||||
std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
|
||||
|
||||
llama_model model;
|
||||
llm_load_arch(*ml, model);
|
||||
llm_load_hparams(*ml, model, 0, 0, 0);
|
||||
|
||||
const size_t align = GGUF_DEFAULT_ALIGNMENT;
|
||||
struct gguf_context * ctx_out = gguf_init_empty();
|
||||
|
||||
@@ -4678,6 +4684,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
++n_feed_forward_w2;
|
||||
}
|
||||
}
|
||||
if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
|
||||
LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
|
||||
__func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
|
||||
}
|
||||
|
||||
int i_attention_wv = 0;
|
||||
int i_feed_forward_w2 = 0;
|
||||
@@ -4754,8 +4764,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
|
||||
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
|
||||
int nx = tensor->ne[0];
|
||||
int ny = tensor->ne[1];
|
||||
if (nx % QK_K == 0 && ny % QK_K == 0) {
|
||||
if (nx % QK_K == 0) {
|
||||
new_type = GGML_TYPE_Q6_K;
|
||||
}
|
||||
} else if (name.find("attn_v.weight") != std::string::npos) {
|
||||
@@ -4769,6 +4778,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
||||
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
|
||||
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
|
||||
if (model.type == MODEL_70B) {
|
||||
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
|
||||
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
|
||||
// nearly negligible increase in model size by quantizing this tensor with more bits:
|
||||
if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
++i_attention_wv;
|
||||
} else if (name.find("ffn_down.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
||||
@@ -4798,8 +4813,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
|
||||
int nx = tensor->ne[0];
|
||||
int ny = tensor->ne[1];
|
||||
if (nx % QK_K != 0 || ny % QK_K != 0) {
|
||||
LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
|
||||
if (nx % QK_K != 0) {
|
||||
LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
|
||||
convert_incompatible_tensor = true;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user