mirror of https://github.com/ggerganov/llama.cpp.git
synced 2026-04-23 16:37:33 +03:00

Compare commits: master-c2d...master-44f (8 commits)
44f906e853, d5b111f53d, 2d43387daf, 7ad7750c5c, 7a74dee6b4, 590250f7a9, f4c55d3bd7, f1465624c2
.gitignore (vendored): 2 changes
@@ -7,6 +7,7 @@
.envrc
.swiftpm
.venv
.clang-tidy
.vs/
.vscode/

@@ -34,6 +35,7 @@ models/*
/benchmark-matmult
/vdot
/Pipfile
/libllama.so

build-info.h
arm_neon.h
Makefile: 18 changes
@@ -243,7 +243,7 @@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
common.o: examples/common.cpp examples/common.h
$(CXX) $(CXXFLAGS) -c $< -o $@

libllama.so: llama.o ggml.o $(OBJS)
libllama.so: llama.o ggml.o ggml-quants-k.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

clean:

@@ -253,28 +253,28 @@ clean:
# Examples
#

main: examples/main/main.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
main: examples/main/main.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@echo
@echo '==== Run ./main -h for help. ===='
@echo

quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o ggml-quants-k.o $(OBJS)
quantize: examples/quantize/quantize.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o ggml-quants-k.o $(OBJS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o ggml-quants-k.o $(OBJS)
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o ggml-quants-k.o $(OBJS)
embedding: examples/embedding/embedding.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o ggml-quants-k.o $(OBJS)
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)

build-info.h: $(wildcard .git/index) scripts/build-info.sh

@@ -289,7 +289,7 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
# Tests
#

benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o ggml-quants-k.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
./$@
README.md: 15 changes
@@ -267,11 +267,11 @@ Any value larger than 0 will offload the computation to the GPU. For example:

Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently several different implementations of it:

- **Accelerate Framework**:
- #### Accelerate Framework:

This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.

- **OpenBLAS**:
- #### OpenBLAS:

This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.

@@ -305,11 +305,11 @@ Building the program with BLAS support may lead to some performance improvements
cmake --build . --config Release
```

- **BLIS**
- #### BLIS

Check [BLIS.md](BLIS.md) for more information.

- **Intel MKL**
- #### Intel MKL

By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you have already sourced the Intel environment script and pass `-DLLAMA_BLAS=ON` to cmake, the MKL version of BLAS will be selected automatically. You may also specify it explicitly:

@@ -317,10 +317,10 @@ Building the program with BLAS support may lead to some performance improvements
mkdir build
cd build
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build . -config Release
cmake --build . --config Release
```

- **cuBLAS**
- #### cuBLAS

This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).

- Using `make`:

@@ -339,7 +339,7 @@ Building the program with BLAS support may lead to some performance improvements

The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
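
For instance, to pin the run to a single GPU you can prefix the command with the variable; the device index, model path, and layer count below are placeholders, not values taken from this diff:

```shell
# run on the first CUDA device only; adjust -ngl to what fits in VRAM
CUDA_VISIBLE_DEVICES=0 ./main -m "path/to/model.bin" -ngl 32 -p "Hello"
```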

- **CLBlast**
- #### CLBlast

OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
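
As a rough sketch of how this backend is typically enabled at build time (the exact flag name for this revision is an assumption; check the full CLBlast section of the README):

```shell
# rebuild with the CLBlast backend enabled (flag name assumed)
make clean
make LLAMA_CLBLAST=1
```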

@@ -684,3 +684,4 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /mode
### Docs

- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
docs/token_generation_performance_tips.md (new file): 40 lines
@@ -0,0 +1,40 @@
# Token generation performance troubleshooting

## Verifying that the model is running on the GPU with cuBLAS
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell
./main -m "path/to/model.bin" -ngl 200000 -p "Please sir, may I have some "
```

When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
```shell
llama_model_load_internal: [cublas] offloading 60 layers to GPU
llama_model_load_internal: [cublas] offloading output layer to GPU
llama_model_load_internal: [cublas] total VRAM used: 17223 MB
... rest of inference
```

If you see these lines, then the GPU is being used.

## Verifying that the CPU is not oversaturated
llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
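
For example, a quick sweep over thread counts makes the saturation point easy to spot; the model path and prompt below are placeholders:

```shell
# compare tokens/second as the thread count doubles;
# stop increasing -t once throughput no longer improves
for t in 1 2 4 8; do
    ./main -m "path/to/model.bin" -t $t -ngl 200000 -n 64 -p "Hello"
done
```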

# Example of runtime flags effect on inference speed benchmark
These runs were tested on the following machine:
GPU: A6000 (48GB VRAM)
CPU: 7 physical cores
RAM: 32GB

Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.ggmlv3.q4_0.bin` (30B parameters, 4bit quantization, GGML)
Run command: `./main -m "path/to/model.bin" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`

Result:

| command | tokens/second (higher is better) |
| - | - |
| -ngl 2000000 | N/A (less than 0.1) |
| -t 7 | 1.7 |
| -t 1 -ngl 2000000 | 5.5 |
| -t 7 -ngl 2000000 | 8.7 |
| -t 4 -ngl 2000000 | 9.1 |
ggml-metal.m: 28 changes
@@ -47,10 +47,11 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(relu);
GGML_METAL_DECL_KERNEL(soft_max);
GGML_METAL_DECL_KERNEL(diag_mask_inf);
GGML_METAL_DECL_KERNEL(get_rows_f16);
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
GGML_METAL_DECL_KERNEL(rms_norm);
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DECL_KERNEL(rope);
GGML_METAL_DECL_KERNEL(cpy_f32_f16);
GGML_METAL_DECL_KERNEL(cpy_f32_f32);

@@ -130,10 +131,11 @@ struct ggml_metal_context * ggml_metal_init(void) {
GGML_METAL_ADD_KERNEL(relu);
GGML_METAL_ADD_KERNEL(soft_max);
GGML_METAL_ADD_KERNEL(diag_mask_inf);
GGML_METAL_ADD_KERNEL(get_rows_f16);
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
GGML_METAL_ADD_KERNEL(rms_norm);
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_ADD_KERNEL(rope);
GGML_METAL_ADD_KERNEL(cpy_f32_f16);
GGML_METAL_ADD_KERNEL(cpy_f32_f32);

@@ -204,6 +206,11 @@ bool ggml_metal_add_buffer(
ctx->buffers[ctx->n_buffers].name = name;
ctx->buffers[ctx->n_buffers].data = data;
ctx->buffers[ctx->n_buffers].size = size;

if (ctx->device.maxBufferLength < aligned_size) {
fprintf(stderr, "%s: buffer '%s' size %zu is larger than buffer maximum of %zu\n", __func__, name, aligned_size, ctx->device.maxBufferLength);
return false;
}
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];

if (ctx->buffers[ctx->n_buffers].metal == nil) {

@@ -493,6 +500,14 @@ void ggml_metal_graph_compute(

// use custom matrix x vector kernel
switch (src0t) {
case GGML_TYPE_F16:
{
GGML_ASSERT(ne02 == ne12);

nth0 = 64;
nth1 = 1;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
} break;
case GGML_TYPE_Q4_0:
{
GGML_ASSERT(ne02 == 1);

@@ -502,14 +517,6 @@ void ggml_metal_graph_compute(
nth1 = 4;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
} break;
case GGML_TYPE_F16:
{
GGML_ASSERT(ne02 == ne12);

nth0 = 32;
nth1 = 1;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
} break;
default: GGML_ASSERT(false && "not implemented");
};

@@ -546,6 +553,7 @@ void ggml_metal_graph_compute(
}

switch (src0->type) {
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
default: GGML_ASSERT(false && "not implemented");
}
ggml-metal.metal

@@ -169,6 +169,22 @@ kernel void kernel_diag_mask_inf(
}
}

kernel void kernel_get_rows_f16(
device const void * src0,
device const int * src1,
device float * dst,
constant int64_t & ne00,
constant uint64_t & nb01,
constant uint64_t & nb1,
uint tpig[[thread_position_in_grid]]) {
const int i = tpig;
const int r = ((device int32_t *) src1)[i];

for (int j = 0; j < ne00; j++) {
dst[i*nb1 + j] = ((device half *) ((device char *) src0 + r*nb01))[j];
}
}

kernel void kernel_get_rows_q4_0(
device const void * src0,
device const int * src1,
ggml-opencl.cpp

@@ -4,6 +4,7 @@
#include <atomic>
#include <sstream>
#include <vector>
#include <limits>

#define CL_TARGET_OPENCL_VERSION 110
#include <clblast.h>

@@ -604,21 +605,44 @@ struct cl_buffer {
static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;

static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) {
static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cl_pool_lock);
cl_int err;

int best_i = -1;
size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
int worst_i = -1;
size_t worst_size = 0; //largest unused buffer seen so far
for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
cl_buffer& b = g_cl_buffer_pool[i];
if (b.size > 0 && b.size >= size) {
cl_mem mem = b.mem;
*actual_size = b.size;
b.size = 0;
return mem;
cl_buffer &b = g_cl_buffer_pool[i];
if (b.size > 0 && b.size >= size && b.size < best_size)
{
best_i = i;
best_size = b.size;
}
if (b.size > 0 && b.size > worst_size)
{
worst_i = i;
worst_size = b.size;
}
}
if(best_i!=-1) //found the smallest buffer that fits our needs
{
cl_buffer& b = g_cl_buffer_pool[best_i];
cl_mem mem = b.mem;
*actual_size = b.size;
b.size = 0;
return mem;
}
if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory
{
cl_buffer& b = g_cl_buffer_pool[worst_i];
cl_mem mem = b.mem;
b.size = 0;
clReleaseMemObject(mem);
}
cl_mem mem;
CL_CHECK((mem = clCreateBuffer(context, flags, size, NULL, &err), err));
CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
*actual_size = size;
return mem;
}

@@ -692,9 +716,10 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
size_t x_size;
size_t d_size;

cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size, CL_MEM_READ_ONLY); // src0
cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size, CL_MEM_WRITE_ONLY); // dst
cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst

for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {

@@ -792,10 +817,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
if (src0->backend == GGML_BACKEND_CL) {
d_X = (cl_mem) src0->data;
} else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
}
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {

@@ -868,10 +893,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
if (src0->backend == GGML_BACKEND_CL) {
d_X = (cl_mem) src0->data;
} else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
}
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size);

bool src1_cont_rows = nb10 == sizeof(float);
bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);

@@ -970,13 +995,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
size_t q_size;
cl_mem d_X;
if (!mul_mat_vec) {
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE);
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
}
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
cl_mem d_Q;
if (src0->backend == GGML_BACKEND_CPU) {
d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
}

cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);

@@ -1143,7 +1168,7 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);

size_t q_size;
cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);

// copy tensor to device
for (int64_t i3 = 0; i3 < ne3; i3++) {
ggml.c: 20 changes
@@ -14753,7 +14753,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
const int64_t * ne = tensor->ne;
const size_t * nb = tensor->nb;

fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %32s\n",
fprintf(fout, "%-6s %-12s %8d %8jd %jd %jd %jd %16zu %16zu %16zu %16zu %16p %32s\n",
ggml_type_name(tensor->type),
ggml_op_name (tensor->op),
tensor->n_dims,

@@ -14767,7 +14767,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
const int64_t * ne = tensor->ne;
const size_t * nb = tensor->nb;

fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %32s\n",
fprintf(fout, "%-6s %-6s %-12s %8d %jd %jd %jd %jd %16zu %16zu %16zu %16zu %8d %16p %32s\n",
arg,
ggml_type_name(tensor->type),
ggml_op_name (tensor->op),

@@ -14796,11 +14796,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
FILE * fout = stdout;

fprintf(fout, "\n");
fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
fprintf(fout, "%-16s %8llu\n", "eval", size_eval);
fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
fprintf(fout, "%-16s %8ju\n", "eval", size_eval);

// header
fprintf(fout, "\n");

@@ -15033,7 +15033,11 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);

fread(data->data, sizeof(char), fsize, fin);
const size_t ret = fread(data->data, sizeof(char), fsize, fin);
if (ret != fsize) {
fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
return result;
}

fclose(fin);
}
llama.cpp: 43 changes
@@ -961,7 +961,6 @@ static void llama_model_load_internal(
model.hparams = ml->file_loaders.at(0)->hparams;
llama_file_version file_version = ml->file_loaders.at(0)->file_version;
auto & hparams = model.hparams;
uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

{
switch (hparams.n_layer) {

@@ -975,6 +974,8 @@ static void llama_model_load_internal(
hparams.n_ctx = n_ctx;
}

const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;

{
fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);

@@ -2198,8 +2199,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
} else {
new_type = quantized_type;
if (tensor.name == "output.weight") new_type = GGML_TYPE_Q6_K;
else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
// TODO: temporary disabled until Metal / OpenCL support is available
// ref: https://github.com/ggerganov/llama.cpp/issues/1711
//if (tensor.name == "output.weight") {
//    new_type = GGML_TYPE_Q6_K;
//}
if (tensor.name.find("attention.wv.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&

@@ -2207,7 +2212,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
(i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
++i_attention_wv;
}
else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&

@@ -2215,10 +2220,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
(i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
++i_feed_forward_w2;
}
else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
if (tensor.name.find("attention.wo.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
}

float * f32_data;
size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
llama_buffer f32_conv_buf;

@@ -2405,17 +2411,30 @@ struct llama_context * llama_init_from_file(
// this allocates all Metal resources and memory buffers
ctx->ctx_metal = ggml_metal_init();

void *data_ptr = NULL;
size_t data_size = 0;
if (params.use_mmap) {
ggml_metal_add_buffer(ctx->ctx_metal, "data", ctx->model.mapping->addr, ctx->model.mapping->size);
ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size);
data_ptr = ctx->model.mapping->addr;
data_size= ctx->model.mapping->size;
} else {
ggml_metal_add_buffer(ctx->ctx_metal, "data", ggml_get_mem_buffer(ctx->model.ctx), ggml_get_mem_size(ctx->model.ctx));
ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size);
data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
data_size= ggml_get_mem_size(ctx->model.ctx);
}

ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size);
ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size);
ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size);
#define LLAMA_METAL_CHECK_BUF(result) \
if (!(result)) { \
fprintf(stderr, "%s: failed to add buffer\n", __func__); \
llama_free(ctx); \
return NULL; \
}

LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));

LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
#undef LLAMA_METAL_CHECK_BUF
}
#endif