Compare commits

..

5 Commits

Author SHA1 Message Date
Iwan Kawrakow
0e826d12a5 quantize: be able to specify the token embedding tensor type 2024-03-22 16:27:34 +02:00
Iwan Kawrakow
7883796f71 quantize: be able to specify the output tensor type 2024-03-22 16:11:34 +02:00
Vaibhav Srivastav
b2075fd6a5 ci : add CURL flag for the mac builds (#6214) 2024-03-22 09:53:43 +02:00
Georgi Gerganov
95d576b48e metal : pad n_ctx by 32 (#6177)
* metal : require ne00 >= 128 for mat-mat kernels

ggml-ci

* llama : pad n_ctx by 32

ggml-ci
2024-03-22 09:36:03 +02:00
Neo Zhang Jianyu
59c17f02de add blog link (#6222) 2024-03-22 15:19:37 +08:00
8 changed files with 78 additions and 30 deletions

View File

@@ -41,7 +41,7 @@ jobs:
sysctl -a
mkdir build
cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON ..
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@@ -97,7 +97,7 @@ jobs:
sysctl -a
mkdir build
cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON ..
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test

View File

@@ -29,6 +29,7 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
## News
- 2024.3
- A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
- New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
- Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
- Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE.

View File

@@ -101,7 +101,7 @@ int32_t get_num_physical_cores() {
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
void process_escapes(std::string& input) {
void process_escapes(std::string & input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;

View File

@@ -48,6 +48,8 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is";
}
process_escapes(params.prompt);
// init LLM
llama_backend_init();
@@ -78,7 +80,7 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_req;
ctx_params.n_ctx = n_kv_req;
ctx_params.n_batch = std::max(n_len, n_parallel);
ctx_params.n_seq_max = n_parallel;
ctx_params.n_threads = params.n_threads;

View File

@@ -189,6 +189,18 @@ static void prepare_imatrix(const std::string& imatrix_file,
}
}
static ggml_type parse_ggml_type(const char * arg) {
ggml_type result = GGML_TYPE_COUNT;
for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
auto type = ggml_type(j);
const auto * name = ggml_type_name(type);
if (name && strcmp(arg, name) == 0) {
result = type; break;
}
}
return result;
}
int main(int argc, char ** argv) {
if (argc < 3) {
usage(argv[0]);
@@ -203,6 +215,18 @@ int main(int argc, char ** argv) {
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
params.quantize_output_tensor = false;
} else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
if (arg_idx < argc-1) {
params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
if (arg_idx < argc-1) {
params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
params.allow_requantize = true;
} else if (strcmp(argv[arg_idx], "--pure") == 0) {

View File

@@ -11971,27 +11971,34 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
// for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
// with the quantization of the output tensor
if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
int nx = tensor->ne[0];
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
new_type = GGML_TYPE_Q8_0;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
new_type = GGML_TYPE_Q5_K;
}
else if (new_type != GGML_TYPE_Q8_0) {
new_type = GGML_TYPE_Q6_K;
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
new_type = qs.params->output_tensor_type;
} else {
int nx = tensor->ne[0];
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
new_type = GGML_TYPE_Q8_0;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
new_type = GGML_TYPE_Q5_K;
}
else if (new_type != GGML_TYPE_Q8_0) {
new_type = GGML_TYPE_Q6_K;
}
}
} else if (name == "token_embd.weight") {
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
new_type = GGML_TYPE_Q2_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
new_type = GGML_TYPE_IQ3_S;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ3_S;
if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
new_type = qs.params->token_embedding_type;
} else {
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
new_type = GGML_TYPE_Q2_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
new_type = GGML_TYPE_IQ3_S;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ3_S;
}
}
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
@@ -12887,6 +12894,8 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
struct llama_model_quantize_params result = {
/*.nthread =*/ 0,
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
/*.output_tensor_type =*/ GGML_TYPE_COUNT,
/*.token_embedding_type =*/ GGML_TYPE_COUNT,
/*.allow_requantize =*/ false,
/*.quantize_output_tensor =*/ true,
/*.only_copy =*/ false,
@@ -13044,6 +13053,9 @@ struct llama_context * llama_new_context_with_model(
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
// this is necessary due to kv_self.n being padded later during inference
cparams.n_ctx = GGML_PAD(cparams.n_ctx, 32);
// with causal attention, the batch size is limited by the context size
cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);

16
llama.h
View File

@@ -275,13 +275,15 @@ extern "C" {
// model quantization parameters
typedef struct llama_model_quantize_params {
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type
void * imatrix; // pointer to importance matrix data
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // itoken embeddings tensor type
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type
void * imatrix; // pointer to importance matrix data
} llama_model_quantize_params;
// grammar types

View File

@@ -2091,6 +2091,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
}
}
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 128, { 8, 1}, {1, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 128, { 8, 1}, {4, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 64, { 8, 1}, {4, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 64, { 8, 1}, {4, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
for (ggml_type type_a : all_types) {
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
for (int n_mats : {2, 4, 8}) {