mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-26 14:23:22 +02:00
Compare commits
7 Commits
master-112
...
master-46e
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
46ef5b5fcf | ||
|
|
c63bb1d16a | ||
|
|
3b6cfe7c92 | ||
|
|
800c9635b4 | ||
|
|
deb7dfca4b | ||
|
|
bac66994cf | ||
|
|
519c981f8b |
@@ -387,11 +387,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
#else
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
} else if (arg == "--mul-mat-q" || arg == "-mmq") {
|
||||
} else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
params.mul_mat_q = true;
|
||||
params.mul_mat_q = false;
|
||||
#else
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n");
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
} else if (arg == "--low-vram" || arg == "-lv") {
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
@@ -599,11 +599,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stdout, " number of layers to store in VRAM\n");
|
||||
fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
|
||||
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
|
||||
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
|
||||
fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
|
||||
fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
|
||||
fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
|
||||
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
|
||||
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
|
||||
fprintf(stdout, " -nommq, --no-mul-mat-q\n");
|
||||
fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
|
||||
fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
|
||||
#endif
|
||||
fprintf(stdout, " --mtest compute maximum memory usage\n");
|
||||
fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
|
||||
|
||||
@@ -68,7 +68,7 @@ struct gpt_params {
|
||||
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
|
||||
|
||||
bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
|
||||
bool mul_mat_q = false; // if true, use experimental mul_mat_q kernels
|
||||
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
|
||||
bool memory_f16 = true; // use f16 instead of f32 for memory kv
|
||||
bool random_prompt = false; // do not randomize prompt if none provided
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
|
||||
31
convert.py
31
convert.py
@@ -69,7 +69,10 @@ SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
|
||||
'I32': DT_I32,
|
||||
}
|
||||
|
||||
class GGMLFileType(enum.Enum):
|
||||
# TODO: match this with `llama_ftype`
|
||||
# TODO: rename to LLAMAFileType
|
||||
# TODO: move to `gguf.py`
|
||||
class GGMLFileType(enum.IntEnum):
|
||||
AllF32 = 0
|
||||
MostlyF16 = 1 # except 1d tensors
|
||||
|
||||
@@ -101,6 +104,8 @@ class Params:
|
||||
n_head_kv: int
|
||||
f_norm_eps: float
|
||||
|
||||
ftype: Optional[GGMLFileType] = None
|
||||
|
||||
@staticmethod
|
||||
def find_n_mult(n_ff: int, n_embd: int) -> int:
|
||||
# hardcoded magic range
|
||||
@@ -738,6 +743,9 @@ class OutputFile:
|
||||
self.gguf.add_head_count_kv (params.n_head_kv)
|
||||
self.gguf.add_layer_norm_rms_eps (params.f_norm_eps)
|
||||
|
||||
if params.ftype:
|
||||
self.gguf.add_file_type(params.ftype)
|
||||
|
||||
def add_meta_vocab(self, vocab: Vocab) -> None:
|
||||
tokens = []
|
||||
scores = []
|
||||
@@ -956,7 +964,7 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> Union[BpeVocab, Sentence
|
||||
path = path3
|
||||
else:
|
||||
raise FileNotFoundError(
|
||||
f"Could not find tokenizer.model in {path} or its parent; "
|
||||
f"Could not find {vocab_file} in {path} or its parent; "
|
||||
"if it's in another directory, pass the directory as --vocab-dir")
|
||||
|
||||
print(f"Loading vocab file '{path}', type '{vocabtype}'")
|
||||
@@ -1020,6 +1028,12 @@ def main(args_in: Optional[List[str]] = None) -> None:
|
||||
" - LLaMA v2: --ctx 4096\n")
|
||||
params.n_ctx = args.ctx
|
||||
|
||||
if args.outtype:
|
||||
params.ftype = {
|
||||
"f32": GGMLFileType.AllF32,
|
||||
"f16": GGMLFileType.MostlyF16,
|
||||
}[args.outtype]
|
||||
|
||||
print(f"params = {params}")
|
||||
|
||||
vocab: Vocab
|
||||
@@ -1040,11 +1054,14 @@ def main(args_in: Optional[List[str]] = None) -> None:
|
||||
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
|
||||
vocab = load_vocab(vocab_dir, args.vocabtype)
|
||||
|
||||
model = model_plus.model
|
||||
model = convert_model_names(model, params)
|
||||
output_type = pick_output_type(model, args.outtype)
|
||||
model = convert_to_output_type(model, output_type)
|
||||
outfile = args.outfile or default_outfile(model_plus.paths, output_type)
|
||||
model = model_plus.model
|
||||
model = convert_model_names(model, params)
|
||||
ftype = pick_output_type(model, args.outtype)
|
||||
model = convert_to_output_type(model, ftype)
|
||||
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
|
||||
|
||||
params.ftype = ftype
|
||||
print(f"Writing {outfile}, format {ftype}")
|
||||
|
||||
OutputFile.write_all(outfile, params, model, vocab)
|
||||
print(f"Wrote {outfile}")
|
||||
|
||||
@@ -72,23 +72,30 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
if (params.embedding){
|
||||
if (embd_inp.size() > 0) {
|
||||
if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
const int n_embd = llama_n_embd(ctx);
|
||||
const auto embeddings = llama_get_embeddings(ctx);
|
||||
|
||||
for (int i = 0; i < n_embd; i++) {
|
||||
printf("%f ", embeddings[i]);
|
||||
}
|
||||
printf("\n");
|
||||
if (embd_inp.size() > (size_t)params.n_ctx) {
|
||||
fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n",
|
||||
__func__, embd_inp.size(), params.n_ctx);
|
||||
return 1;
|
||||
}
|
||||
|
||||
while (!embd_inp.empty()) {
|
||||
int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
|
||||
if (llama_eval(ctx, embd_inp.data(), n_tokens, n_past, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
n_past += n_tokens;
|
||||
embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
|
||||
}
|
||||
|
||||
const int n_embd = llama_n_embd(ctx);
|
||||
const auto embeddings = llama_get_embeddings(ctx);
|
||||
|
||||
for (int i = 0; i < n_embd; i++) {
|
||||
printf("%f ", embeddings[i]);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
llama_print_timings(ctx);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
@@ -671,12 +671,11 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
fprintf(stdout, " number of layers to store in VRAM\n");
|
||||
fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
|
||||
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
|
||||
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
|
||||
fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
|
||||
fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
|
||||
fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
|
||||
fprintf(stdout, " -nommq, --no-mul-mat-q\n");
|
||||
fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
|
||||
fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
|
||||
#endif
|
||||
fprintf(stdout, " -m FNAME, --model FNAME\n");
|
||||
fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
|
||||
@@ -867,12 +866,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
|
||||
#endif // GGML_USE_CUBLAS
|
||||
}
|
||||
else if (arg == "--mul-mat-q" || arg == "-mmq")
|
||||
else if (arg == "--no-mul-mat-q" || arg == "-nommq")
|
||||
{
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
params.mul_mat_q = true;
|
||||
params.mul_mat_q = false;
|
||||
#else
|
||||
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {});
|
||||
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
|
||||
#endif // GGML_USE_CUBLAS
|
||||
}
|
||||
else if (arg == "--main-gpu" || arg == "-mg")
|
||||
|
||||
39
ggml-cuda.cu
39
ggml-cuda.cu
@@ -287,7 +287,7 @@ static int g_device_count = -1;
|
||||
static int g_main_device = 0;
|
||||
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
|
||||
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
||||
static bool g_mul_mat_q = false;
|
||||
static bool g_mul_mat_q = true;
|
||||
|
||||
static void * g_scratch_buffer = nullptr;
|
||||
static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
|
||||
@@ -3979,24 +3979,29 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
|
||||
|
||||
// the CUDA soft max implementation differs from the CPU implementation
|
||||
// instead of doubles floats are used
|
||||
// values are also not normalized to the maximum value by subtracting it in the exponential function
|
||||
// theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
|
||||
static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
|
||||
const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
const int block_size = blockDim.y;
|
||||
const int tid = threadIdx.y;
|
||||
|
||||
float tmp = 0.0;
|
||||
|
||||
for (int block_start = 0; block_start < ncols; block_start += block_size) {
|
||||
const int col = block_start + tid;
|
||||
|
||||
if (col >= ncols) {
|
||||
break;
|
||||
}
|
||||
float max_val = -INFINITY;
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const int i = row*ncols + col;
|
||||
const float val = expf(x[i]);
|
||||
max_val = max(max_val, x[i]);
|
||||
}
|
||||
|
||||
// find the max value in the block
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
|
||||
}
|
||||
|
||||
float tmp = 0.f;
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const int i = row*ncols + col;
|
||||
const float val = expf(x[i] - max_val);
|
||||
tmp += val;
|
||||
dst[i] = val;
|
||||
}
|
||||
@@ -4007,15 +4012,11 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
|
||||
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
||||
}
|
||||
|
||||
for (int block_start = 0; block_start < ncols; block_start += block_size) {
|
||||
const int col = block_start + tid;
|
||||
|
||||
if (col >= ncols) {
|
||||
break;
|
||||
}
|
||||
const float inv_tmp = 1.f / tmp;
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const int i = row*ncols + col;
|
||||
dst[i] /= tmp;
|
||||
dst[i] *= inv_tmp;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
4
gguf.py
4
gguf.py
@@ -26,6 +26,7 @@ KEY_GENERAL_DESCRIPTION = "general.description"
|
||||
KEY_GENERAL_LICENSE = "general.license"
|
||||
KEY_GENERAL_SOURCE_URL = "general.source.url"
|
||||
KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
|
||||
KEY_GENERAL_FILE_TYPE = "general.file_type"
|
||||
|
||||
# LLM
|
||||
KEY_LLM_CONTEXT_LENGTH = "{arch}.context_length"
|
||||
@@ -595,6 +596,9 @@ class GGUFWriter:
|
||||
def add_source_hf_repo(self, repo: str):
|
||||
self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)
|
||||
|
||||
def add_file_type(self, ftype: int):
|
||||
self.add_uint32(KEY_GENERAL_FILE_TYPE, ftype)
|
||||
|
||||
def add_name(self, name: str):
|
||||
self.add_string(KEY_GENERAL_NAME, name)
|
||||
|
||||
|
||||
164
k_quants.c
164
k_quants.c
@@ -77,6 +77,11 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
||||
}
|
||||
return 1/iscale;
|
||||
}
|
||||
bool return_early = false;
|
||||
if (rmse_type < 0) {
|
||||
rmse_type = -rmse_type;
|
||||
return_early = true;
|
||||
}
|
||||
int weight_type = rmse_type%2;
|
||||
float sumlx = 0;
|
||||
float suml2 = 0;
|
||||
@@ -89,56 +94,9 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
||||
suml2 += w*l*l;
|
||||
}
|
||||
float scale = sumlx/suml2;
|
||||
if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale;
|
||||
float best = scale * sumlx;
|
||||
for (int itry = 0; itry < 3; ++itry) {
|
||||
iscale = 1/scale;
|
||||
float slx = 0;
|
||||
float sl2 = 0;
|
||||
bool changed = false;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
int l = nearest_int(iscale * x[i]);
|
||||
l = MAX(-nmax, MIN(nmax-1, l));
|
||||
if (l + nmax != L[i]) { changed = true; }
|
||||
float w = weight_type == 1 ? x[i] * x[i] : 1.f;
|
||||
slx += w*x[i]*l;
|
||||
sl2 += w*l*l;
|
||||
}
|
||||
if (!changed || sl2 == 0 || slx*slx <= best*sl2) { break; }
|
||||
for (int i = 0; i < n; ++i) {
|
||||
int l = nearest_int(iscale * x[i]);
|
||||
L[i] = nmax + MAX(-nmax, MIN(nmax-1, l));
|
||||
}
|
||||
sumlx = slx; suml2 = sl2;
|
||||
scale = sumlx/suml2;
|
||||
best = scale * sumlx;
|
||||
}
|
||||
for (int itry = 0; itry < 5; ++itry) {
|
||||
int n_changed = 0;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
float w = weight_type == 1 ? x[i]*x[i] : 1;
|
||||
int l = L[i] - nmax;
|
||||
float slx = sumlx - w*x[i]*l;
|
||||
if (slx > 0) {
|
||||
float sl2 = suml2 - w*l*l;
|
||||
int new_l = nearest_int(x[i] * sl2 / slx);
|
||||
new_l = MAX(-nmax, MIN(nmax-1, new_l));
|
||||
if (new_l != l) {
|
||||
slx += w*x[i]*new_l;
|
||||
sl2 += w*new_l*new_l;
|
||||
if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) {
|
||||
L[i] = nmax + new_l; sumlx = slx; suml2 = sl2;
|
||||
scale = sumlx / suml2; best = scale * sumlx;
|
||||
++n_changed;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!n_changed) { break; }
|
||||
}
|
||||
if (rmse_type < 3) {
|
||||
return scale;
|
||||
}
|
||||
for (int is = -4; is <= 4; ++is) {
|
||||
for (int is = -9; is <= 9; ++is) {
|
||||
if (is == 0) {
|
||||
continue;
|
||||
}
|
||||
@@ -221,12 +179,17 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
|
||||
return 1/iscale;
|
||||
}
|
||||
|
||||
static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, int ntry) {
|
||||
static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
|
||||
int ntry, float alpha) {
|
||||
float min = x[0];
|
||||
float max = x[0];
|
||||
float sum_x = 0;
|
||||
float sum_x2 = 0;
|
||||
for (int i = 1; i < n; ++i) {
|
||||
if (x[i] < min) min = x[i];
|
||||
if (x[i] > max) max = x[i];
|
||||
sum_x += x[i];
|
||||
sum_x2 += x[i]*x[i];
|
||||
}
|
||||
if (max == min) {
|
||||
for (int i = 0; i < n; ++i) L[i] = 0;
|
||||
@@ -254,7 +217,7 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
|
||||
for (int i = 0; i < n; ++i) {
|
||||
sum += x[i] - scale*L[i];
|
||||
}
|
||||
min = sum/n;
|
||||
min = alpha*min + (1 - alpha)*sum/n;
|
||||
if (min > 0) min = 0;
|
||||
iscale = 1/scale;
|
||||
if (!did_change) break;
|
||||
@@ -263,6 +226,82 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
|
||||
return scale;
|
||||
}
|
||||
|
||||
static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
|
||||
uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
|
||||
float rmin, float rdelta, int nstep, bool use_mad) {
|
||||
float min = x[0];
|
||||
float max = x[0];
|
||||
float sum_w = weights[0];
|
||||
float sum_x = sum_w * x[0];
|
||||
for (int i = 1; i < n; ++i) {
|
||||
if (x[i] < min) min = x[i];
|
||||
if (x[i] > max) max = x[i];
|
||||
float w = weights[i];
|
||||
sum_w += w;
|
||||
sum_x += w * x[i];
|
||||
}
|
||||
if (min > 0) min = 0;
|
||||
if (max == min) {
|
||||
for (int i = 0; i < n; ++i) L[i] = 0;
|
||||
*the_min = -min;
|
||||
return 0.f;
|
||||
}
|
||||
float iscale = nmax/(max - min);
|
||||
float scale = 1/iscale;
|
||||
float best_mad = 0;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
int l = nearest_int(iscale*(x[i] - min));
|
||||
L[i] = MAX(0, MIN(nmax, l));
|
||||
float diff = scale * L[i] + min - x[i];
|
||||
diff = use_mad ? fabsf(diff) : diff * diff;
|
||||
float w = weights[i];
|
||||
best_mad += w * diff;
|
||||
}
|
||||
if (nstep < 1) {
|
||||
*the_min = -min;
|
||||
return scale;
|
||||
}
|
||||
for (int is = 0; is <= nstep; ++is) {
|
||||
iscale = (rmin + rdelta*is + nmax)/(max - min);
|
||||
float sum_l = 0, sum_l2 = 0, sum_xl = 0;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
int l = nearest_int(iscale*(x[i] - min));
|
||||
l = MAX(0, MIN(nmax, l));
|
||||
Laux[i] = l;
|
||||
float w = weights[i];
|
||||
sum_l += w*l;
|
||||
sum_l2 += w*l*l;
|
||||
sum_xl += w*l*x[i];
|
||||
}
|
||||
float D = sum_w * sum_l2 - sum_l * sum_l;
|
||||
if (D > 0) {
|
||||
float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
|
||||
float this_min = (sum_l2 * sum_x - sum_l * sum_xl)/D;
|
||||
if (this_min > 0) {
|
||||
this_min = 0;
|
||||
this_scale = sum_xl / sum_l2;
|
||||
}
|
||||
float mad = 0;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
float diff = this_scale * Laux[i] + this_min - x[i];
|
||||
diff = use_mad ? fabsf(diff) : diff * diff;
|
||||
float w = weights[i];
|
||||
mad += w * diff;
|
||||
}
|
||||
if (mad < best_mad) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
L[i] = Laux[i];
|
||||
}
|
||||
best_mad = mad;
|
||||
scale = this_scale;
|
||||
min = this_min;
|
||||
}
|
||||
}
|
||||
}
|
||||
*the_min = -min;
|
||||
return scale;
|
||||
}
|
||||
|
||||
#if QK_K == 256
|
||||
static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
|
||||
if (j < 4) {
|
||||
@@ -281,6 +320,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
|
||||
const int nb = k / QK_K;
|
||||
|
||||
uint8_t L[QK_K];
|
||||
uint8_t Laux[16];
|
||||
float weights[16];
|
||||
float mins[QK_K/16];
|
||||
float scales[QK_K/16];
|
||||
|
||||
@@ -291,7 +332,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
|
||||
float max_scale = 0; // as we are deducting the min, scales are always positive
|
||||
float max_min = 0;
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
scales[j] = make_qkx1_quants(16, 3, x + 16*j, L + 16*j, &mins[j], 5);
|
||||
for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]);
|
||||
scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true);
|
||||
float scale = scales[j];
|
||||
if (scale > max_scale) {
|
||||
max_scale = scale;
|
||||
@@ -637,6 +679,8 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
|
||||
const int nb = k / QK_K;
|
||||
|
||||
uint8_t L[QK_K];
|
||||
uint8_t Laux[32];
|
||||
float weights[32];
|
||||
float mins[QK_K/32];
|
||||
float scales[QK_K/32];
|
||||
|
||||
@@ -645,7 +689,12 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
|
||||
float max_scale = 0; // as we are deducting the min, scales are always positive
|
||||
float max_min = 0;
|
||||
for (int j = 0; j < QK_K/32; ++j) {
|
||||
scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 5);
|
||||
//scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
|
||||
float sum_x2 = 0;
|
||||
for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
|
||||
float av_x = sqrtf(sum_x2/32);
|
||||
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
|
||||
scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
|
||||
float scale = scales[j];
|
||||
if (scale > max_scale) {
|
||||
max_scale = scale;
|
||||
@@ -798,6 +847,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
|
||||
uint8_t L[QK_K];
|
||||
float mins[QK_K/32];
|
||||
float scales[QK_K/32];
|
||||
float weights[32];
|
||||
uint8_t Laux[32];
|
||||
#else
|
||||
int8_t L[QK_K];
|
||||
float scales[QK_K/16];
|
||||
@@ -810,7 +861,12 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
|
||||
float max_scale = 0; // as we are deducting the min, scales are always positive
|
||||
float max_min = 0;
|
||||
for (int j = 0; j < QK_K/32; ++j) {
|
||||
scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 5);
|
||||
//scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
|
||||
float sum_x2 = 0;
|
||||
for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
|
||||
float av_x = sqrtf(sum_x2/32);
|
||||
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
|
||||
scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
|
||||
float scale = scales[j];
|
||||
if (scale > max_scale) {
|
||||
max_scale = scale;
|
||||
|
||||
58
llama.cpp
58
llama.cpp
@@ -995,6 +995,16 @@ struct llama_model_loader {
|
||||
} break;
|
||||
}
|
||||
|
||||
// this is a way to mark that we have "guessed" the file type
|
||||
ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
|
||||
|
||||
{
|
||||
const int kid = gguf_find_key(ctx_gguf, "general.file_type");
|
||||
if (kid >= 0) {
|
||||
ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_kv; i++) {
|
||||
const char * name = gguf_get_key(ctx_gguf, i);
|
||||
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
|
||||
@@ -1197,7 +1207,11 @@ struct llama_model_loader {
|
||||
// load LLaMA models
|
||||
//
|
||||
|
||||
const char * llama_model_ftype_name(enum llama_ftype ftype) {
|
||||
std::string llama_model_ftype_name(enum llama_ftype ftype) {
|
||||
if (ftype & LLAMA_FTYPE_GUESSED) {
|
||||
return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
|
||||
}
|
||||
|
||||
switch (ftype) {
|
||||
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
||||
case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
|
||||
@@ -1426,7 +1440,7 @@ static void llama_model_load_internal(
|
||||
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
|
||||
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
|
||||
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
|
||||
LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype));
|
||||
LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
|
||||
LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml->n_elements*1e-9);
|
||||
|
||||
// general kv
|
||||
@@ -2239,18 +2253,11 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
|
||||
}
|
||||
|
||||
static std::string llama_escape_whitespace(const std::string& text) {
|
||||
std::string result;
|
||||
bool escaping = false;
|
||||
result += "\xe2\x96\x81";
|
||||
std::string result = "\xe2\x96\x81";
|
||||
for (size_t offs = 0; offs < text.length(); ++offs) {
|
||||
if (text[offs] == ' ') {
|
||||
if (!escaping) {
|
||||
result += "\xe2\x96\x81";
|
||||
escaping = true;
|
||||
}
|
||||
}
|
||||
else {
|
||||
escaping = false;
|
||||
result += "\xe2\x96\x81";
|
||||
} else {
|
||||
result += text[offs];
|
||||
}
|
||||
}
|
||||
@@ -3450,6 +3457,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
// copy the KV pairs from the input file
|
||||
gguf_set_kv (ctx_out, model_loader->ctx_gguf);
|
||||
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
|
||||
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
|
||||
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
int n_attention_wv = 0;
|
||||
@@ -3547,24 +3555,40 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
new_type = GGML_TYPE_Q6_K;
|
||||
}
|
||||
} else if (name.find("attn_v.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
||||
new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
||||
use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
||||
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
|
||||
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
|
||||
++i_attention_wv;
|
||||
} else if (name.find("ffn_down.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
||||
new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
||||
use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
|
||||
//else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K;
|
||||
++i_feed_forward_w2;
|
||||
} else if (name.find("attn_output.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
||||
}
|
||||
// This can be used to reduce the size of the Q5_K_S model.
|
||||
// The associated PPL increase is fully in line with the size reduction
|
||||
//else {
|
||||
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
|
||||
//}
|
||||
bool convert_incompatible_tensor = false;
|
||||
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
||||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
|
||||
@@ -4294,7 +4318,7 @@ int llama_model_n_embd(const struct llama_model * model) {
|
||||
}
|
||||
|
||||
int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
|
||||
return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype));
|
||||
return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype).c_str());
|
||||
}
|
||||
|
||||
int llama_model_quantize(
|
||||
|
||||
2
llama.h
2
llama.h
@@ -103,6 +103,8 @@ extern "C" {
|
||||
LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
|
||||
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
|
||||
typedef struct llama_token_data {
|
||||
|
||||
@@ -17,6 +17,8 @@ static std::string unescape_whitespace(llama_context* ctx, const std::vector<lla
|
||||
static const std::map<std::string, std::vector<llama_token>> & k_tests() {
|
||||
static std::map<std::string, std::vector<llama_token>> _k_tests = {
|
||||
{ " ", {1, 259, }, },
|
||||
{ " ", { 1, 1678, }, },
|
||||
{ " ", { 1, 268, }, },
|
||||
{ "\t", { 1, 29871, 12, }, },
|
||||
{ "\n", { 1, 29871, 13, }, },
|
||||
{ "\t\n", { 1, 29871, 12, 13, }, },
|
||||
@@ -38,6 +40,12 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
|
||||
243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598,
|
||||
313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681,
|
||||
313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
|
||||
{ "Hello", { 1, 15043 }, },
|
||||
{ " Hello", { 1, 29871, 15043 }, },
|
||||
{ " Hello", { 1, 259, 15043 }, },
|
||||
{ " Hello", { 1, 1678, 15043 }, },
|
||||
{ " Hello", { 1, 268, 15043 }, },
|
||||
{ " Hello\n Hello", { 1, 268, 15043, 13, 1678, 15043 }, },
|
||||
};
|
||||
|
||||
return _k_tests;
|
||||
@@ -106,7 +114,8 @@ int main(int argc, char **argv) {
|
||||
|
||||
if (!correct) {
|
||||
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
|
||||
fprintf(stderr, "%s : detokenized to: '%s'\n", __func__, unescape_whitespace(ctx, test_kv.second).c_str());
|
||||
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
|
||||
unescape_whitespace(ctx, res).c_str(), unescape_whitespace(ctx, test_kv.second).c_str());
|
||||
fprintf(stderr, "%s : expected tokens: ", __func__);
|
||||
for (const auto & t : test_kv.second) {
|
||||
fprintf(stderr, "%6d, ", t);
|
||||
|
||||
@@ -11,18 +11,11 @@
|
||||
#include <locale>
|
||||
|
||||
static std::string escape_whitespace(const std::string& text) {
|
||||
std::string result;
|
||||
bool escaping = false;
|
||||
result += "\xe2\x96\x81";
|
||||
std::string result = "\xe2\x96\x81";
|
||||
for (size_t offs = 0; offs < text.length(); ++offs) {
|
||||
if (text[offs] == ' ') {
|
||||
if (!escaping) {
|
||||
result += "\xe2\x96\x81";
|
||||
escaping = true;
|
||||
}
|
||||
}
|
||||
else {
|
||||
escaping = false;
|
||||
result += "\xe2\x96\x81";
|
||||
} else {
|
||||
result += text[offs];
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user