Compare commits

..

4 Commits

Author SHA1 Message Date
Georgi Gerganov
7a32fcb3b2 ggml : add Q8_0 quantization format (rename the old one to Q8_1) (ARM NEON) (#1179)
* ggml : add Q8_0 quantization format (rename the old one to Q8_1)

* tests : fix test-quantize-fns

* ggml : finalize Q8_0 implementation

* ggml : use q4_0_q8_0 and q4_2_q8_0

* ggml : fix Q8_0 dot product bug (ARM)

* ggml : Q8_0 unroll x2

* ggml : fix bug - using wrong block type

* ggml : extend quantize_fns_t with "vec_dot_type"

* ggml : fix Q8_0 to use 255 values out of 256

* ggml : fix assert using wrong QK4_2 instead of QK4_3
2023-04-25 23:40:51 +03:00
unbounded
dd0eabc049 ggml : use full range for Q4_0 and Q4_2 quantization (#729)
* Use full range for q4_0 quantization

By keeping the sign of the highest magnitude, we can make sure the
highest value maps to -8, which is currently unused.
This is a bit of a freebie since it is fully backwards compatible with
the current format.

* Update quantize_row_q4_0 for AVX/AVX2

* Update quantize_row_q4_0 for WASM

Untested

* Update quantize_row_q4_0 for Arm NEON

* Update quantize_row_q4_0 for PowerPC

Untested

* Use full range for q4_2 quantization
2023-04-25 20:20:46 +03:00
xaedes
54bb60e268 ggml : fix bug in ggml_compute_forward_sum_f32 (#1162)
The sum over all rows is now computed instead of just the last row
2023-04-24 23:02:02 +02:00
Georgi Gerganov
8a0f8673ba ggml : export symbols (#1155) 2023-04-24 22:18:25 +03:00
8 changed files with 1109 additions and 862 deletions

View File

@@ -16,6 +16,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
fprintf(stderr, " type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
fprintf(stderr, " type = %d - q4_3\n", LLAMA_FTYPE_MOSTLY_Q4_3);
fprintf(stderr, " type = %d - q8_0\n", LLAMA_FTYPE_MOSTLY_Q8_0);
return 1;
}

View File

@@ -37,6 +37,13 @@ typedef struct {
} block_q4_3;
static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
#define QK8_0 32
typedef struct {
float d; // delta
int8_t qs[QK8_0]; // quants
} block_q8_0;
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
const block_q4_0 * x = (const block_q4_0 *) vx;
@@ -131,6 +138,22 @@ static __global__ void dequantize_block_q4_3(const void * vx, float * y) {
}
}
static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
const block_q8_0 * x = (const block_q8_0 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
const int8_t * pp = x[i].qs;
for (int l = 0; l < QK8_0; l++) {
const int8_t vi = pp[l];
y[i*QK8_0 + l] = vi*d;
}
}
void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK4_0;
dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
@@ -151,6 +174,11 @@ void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t st
dequantize_block_q4_3<<<nb, 1, 0, stream>>>(vx, y);
}
void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK8_0;
dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
}
// buffer pool for cuda
#define MAX_CUDA_BUFFERS 16

View File

@@ -35,6 +35,7 @@ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t st
void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
#ifdef __cplusplus
}

618
ggml.c

File diff suppressed because it is too large Load Diff

1304
ggml.h

File diff suppressed because it is too large Load Diff

View File

@@ -484,6 +484,7 @@ struct llama_file_loader {
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_2:
case GGML_TYPE_Q4_3:
case GGML_TYPE_Q8_0:
break;
default: {
throw format("unrecognized tensor type %u\n", shard.type);
@@ -558,6 +559,7 @@ struct llama_file_saver {
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_2:
case GGML_TYPE_Q4_3:
case GGML_TYPE_Q8_0:
break;
default: LLAMA_ASSERT(false);
}
@@ -848,6 +850,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
return "mostly Q4_1, some F16";
case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
default: return "unknown, may not work";
}
}
@@ -1585,6 +1588,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
default: throw format("invalid output file type %d\n", ftype);
};

View File

@@ -74,6 +74,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
};
LLAMA_API struct llama_context_params llama_context_default_params();

View File

@@ -36,7 +36,7 @@ float array_rmse(const float * a1, const float * a2, size_t n) {
// Total quantization error on test data
float total_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
std::vector<uint8_t> tmp_q(test_size);
std::vector<uint8_t> tmp_q(2*test_size);
std::vector<float> tmp_out(test_size);
qfns.quantize_row_q(test_data, tmp_q.data(), test_size);
@@ -46,7 +46,7 @@ float total_quantization_error(quantize_fns_t & qfns, size_t test_size, const fl
// Total quantization error on test data
float reference_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
std::vector<uint8_t> tmp_q(test_size);
std::vector<uint8_t> tmp_q(2*test_size);
std::vector<float> tmp_out(test_size);
std::vector<float> tmp_out_ref(test_size);
@@ -69,10 +69,10 @@ float dot_product(const float * a1, const float * a2, size_t test_size) {
// Total dot product error
float dot_product_error(quantize_fns_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) {
std::vector<uint8_t> tmp_q1(test_size);
std::vector<uint8_t> tmp_q2(test_size*2);
std::vector<uint8_t> tmp_q1(2*test_size);
std::vector<uint8_t> tmp_q2(2*test_size);
qfns.quantize_row_q(test_data1, tmp_q1.data(), test_size);
qfns.quantize_row_q (test_data1, tmp_q1.data(), test_size);
qfns.quantize_row_q_dot(test_data2, tmp_q2.data(), test_size);
float result = INFINITY;
@@ -125,7 +125,7 @@ int main(int argc, char * argv[]) {
failed = !(total_error < MAX_QUANTIZATION_TOTAL_ERROR);
num_failed += failed;
if (failed || verbose) {
printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
}
const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
@@ -139,7 +139,7 @@ int main(int argc, char * argv[]) {
failed = !(vec_dot_error < MAX_DOT_PRODUCT_ERROR);
num_failed += failed;
if (failed || verbose) {
printf("%5s dot product error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error);
printf("%5s dot product error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error);
}
}
}