Compare commits

...

5 Commits

Author SHA1 Message Date
slaren
8944a13296 Add NVIDIA cuBLAS support (#1044) 2023-04-19 11:22:45 +02:00
slaren
6667401238 Multi-threaded ggml_cpy (#1035)
* Multi-threaded ggml_cpy

* Update ggml.c

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Also fix wdata offset in ggml_compute_forward_add_q_f32

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-04-19 00:53:24 +02:00
Georgi Gerganov
77a73403ca ggml : add new Q4_2 quantization (ARM only) (#1046)
* ggml : Q4_2 ARM

* ggml : add ggml_is_quantized()

* llama : update llama_type_name() with Q4_2 entry

* ggml : speed-up q4_2

- 4 threads: ~100ms -> ~90ms
- 8 threads:  ~55ms -> ~50ms

* ggml : optimize q4_2 using vmlaq_n_f32 + vmulq_n_f32
2023-04-18 23:54:57 +03:00
Georgi Gerganov
50a8a2af97 ggml : scratch that - vmlaq_n_f32 is always better
Had a background process that was messing with the timings
2023-04-18 23:11:23 +03:00
Georgi Gerganov
4caebf6d40 gitignore : vdot 2023-04-18 23:00:08 +03:00
8 changed files with 747 additions and 94 deletions

1
.gitignore vendored
View File

@@ -24,6 +24,7 @@ models/*
/perplexity
/embedding
/benchmark-q4_0-matmult
/vdot
/Pipfile
arm_neon.h

View File

@@ -66,6 +66,7 @@ endif()
# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF)
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -142,6 +143,26 @@ if (LLAMA_OPENBLAS)
endif()
endif()
if (LLAMA_CUBLAS)
cmake_minimum_required(VERSION 3.17)
find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found")
add_compile_definitions(GGML_USE_CUBLAS)
if (LLAMA_STATIC)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
else()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
else()
message(WARNING "cuBLAS not found")
endif()
endif()
if (LLAMA_ALL_WARNINGS)
if (NOT MSVC)
set(c_flags

View File

@@ -97,6 +97,10 @@ ifdef LLAMA_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
LDFLAGS += -lopenblas
endif
ifdef LLAMA_CUBLAS
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
LDFLAGS += -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -L/usr/local/cuda/lib64
endif
ifdef LLAMA_GPROF
CFLAGS += -pg
CXXFLAGS += -pg

View File

@@ -14,6 +14,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
fprintf(stderr, " type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
return 1;
}

796
ggml.c

File diff suppressed because it is too large Load Diff

5
ggml.h
View File

@@ -204,7 +204,8 @@ enum ggml_type {
GGML_TYPE_F16 = 1,
GGML_TYPE_Q4_0 = 2,
GGML_TYPE_Q4_1 = 3,
GGML_TYPE_Q8_0 = 4,
GGML_TYPE_Q4_2 = 4,
GGML_TYPE_Q8_0 = 5,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
@@ -806,6 +807,7 @@ enum ggml_opt_result ggml_opt(
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
//
// system info
@@ -823,6 +825,7 @@ int ggml_cpu_has_f16c(void);
int ggml_cpu_has_fp16_va(void);
int ggml_cpu_has_wasm_simd(void);
int ggml_cpu_has_blas(void);
int ggml_cpu_has_cublas(void);
int ggml_cpu_has_sse3(void);
int ggml_cpu_has_vsx(void);

View File

@@ -478,6 +478,7 @@ struct llama_file_loader {
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_2:
break;
default: {
throw format("unrecognized tensor type %u\n", shard.type);
@@ -550,6 +551,7 @@ struct llama_file_saver {
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_2:
break;
default: LLAMA_ASSERT(false);
}
@@ -838,6 +840,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
return "mostly Q4_1, some F16";
case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
default: return "unknown, may not work";
}
}
@@ -1066,7 +1069,7 @@ static bool llama_eval_internal(
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ggml_cgraph gf = {};
gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads;
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1571,6 +1574,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
switch (ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
default: throw format("invalid output file type %d\n", ftype);
};
@@ -1644,6 +1648,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
{
new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
} break;
case GGML_TYPE_Q4_2:
{
new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
} break;
default:
LLAMA_ASSERT(false);
}
@@ -1955,7 +1963,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
base_t = dest_t;
}
if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1) {
if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1 || base_t->type == GGML_TYPE_Q4_2) {
if (!warned) {
fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
"use a f16 or f32 base model with --lora-base\n", __func__);

View File

@@ -72,6 +72,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
};
LLAMA_API struct llama_context_params llama_context_default_params();