Compare commits

...

4 Commits

Author SHA1 Message Date
Georgi Gerganov
44f906e853 metal : add f16 support 2023-06-06 20:21:56 +03:00
LostRuins
d5b111f53d Clblast fixes + enhancements to save VRAM and offload more layers (#1675)
* Use events instead of clFinish, where possible

* OpenCL: Don't load gpu layers into RAM, add mul_f32 kernel

* Reduce queueing overhead for contiguous tensors by using single mul kernel call

* Adapt to #1612 cl_mem malloc changes

* Reduce code duplication between cuda and opencl branches

* Improve implementation

* Clblast fixes + enhancements to save VRAM:

1. Change all Clblast buffers to CL_MEM_READ_WRITE, as the pool malloc currently doesn't properly handle them.
2. When recycling buffers in pool malloc, always assign the SMALLEST available buffer that fits, instead of the FIRST available buffer
3. When failing to recycle a buffer in pool malloc (all too small), instead recycle the largest available free buffer by resizing it.

* change max value size_t to use limits

* removed flags from the CL pool malloc, apply code tidying suggestions.
2023-06-06 19:00:01 +02:00
Georgi Gerganov
2d43387daf ggml : fix builds, add ggml-quants-k.o (close #1712, close #1710) 2023-06-06 10:18:03 +03:00
Georgi Gerganov
7ad7750c5c gitignore : add .clang-tidy 2023-06-06 09:55:25 +03:00
7 changed files with 100 additions and 49 deletions

2
.gitignore vendored
View File

@@ -7,6 +7,7 @@
.envrc
.swiftpm
.venv
.clang-tidy
.vs/
.vscode/
@@ -34,6 +35,7 @@ models/*
/benchmark-matmult
/vdot
/Pipfile
/libllama.so
build-info.h
arm_neon.h

View File

@@ -243,7 +243,7 @@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
common.o: examples/common.cpp examples/common.h
$(CXX) $(CXXFLAGS) -c $< -o $@
libllama.so: llama.o ggml.o $(OBJS)
libllama.so: llama.o ggml.o ggml-quants-k.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
clean:
@@ -253,28 +253,28 @@ clean:
# Examples
#
main: examples/main/main.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
main: examples/main/main.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@echo
@echo '==== Run ./main -h for help. ===='
@echo
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o ggml-quants-k.o $(OBJS)
quantize: examples/quantize/quantize.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o ggml-quants-k.o $(OBJS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o ggml-quants-k.o $(OBJS)
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o ggml-quants-k.o $(OBJS)
embedding: examples/embedding/embedding.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o ggml-quants-k.o $(OBJS)
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
build-info.h: $(wildcard .git/index) scripts/build-info.sh
@@ -289,7 +289,7 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
# Tests
#
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o ggml-quants-k.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
./$@

View File

@@ -47,10 +47,11 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(relu);
GGML_METAL_DECL_KERNEL(soft_max);
GGML_METAL_DECL_KERNEL(diag_mask_inf);
GGML_METAL_DECL_KERNEL(get_rows_f16);
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
GGML_METAL_DECL_KERNEL(rms_norm);
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DECL_KERNEL(rope);
GGML_METAL_DECL_KERNEL(cpy_f32_f16);
GGML_METAL_DECL_KERNEL(cpy_f32_f32);
@@ -130,10 +131,11 @@ struct ggml_metal_context * ggml_metal_init(void) {
GGML_METAL_ADD_KERNEL(relu);
GGML_METAL_ADD_KERNEL(soft_max);
GGML_METAL_ADD_KERNEL(diag_mask_inf);
GGML_METAL_ADD_KERNEL(get_rows_f16);
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
GGML_METAL_ADD_KERNEL(rms_norm);
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_ADD_KERNEL(rope);
GGML_METAL_ADD_KERNEL(cpy_f32_f16);
GGML_METAL_ADD_KERNEL(cpy_f32_f32);
@@ -498,6 +500,14 @@ void ggml_metal_graph_compute(
// use custom matrix x vector kernel
switch (src0t) {
case GGML_TYPE_F16:
{
GGML_ASSERT(ne02 == ne12);
nth0 = 64;
nth1 = 1;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
} break;
case GGML_TYPE_Q4_0:
{
GGML_ASSERT(ne02 == 1);
@@ -507,14 +517,6 @@ void ggml_metal_graph_compute(
nth1 = 4;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
} break;
case GGML_TYPE_F16:
{
GGML_ASSERT(ne02 == ne12);
nth0 = 32;
nth1 = 1;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
} break;
default: GGML_ASSERT(false && "not implemented");
};
@@ -551,6 +553,7 @@ void ggml_metal_graph_compute(
}
switch (src0->type) {
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
default: GGML_ASSERT(false && "not implemented");
}

View File

@@ -169,6 +169,22 @@ kernel void kernel_diag_mask_inf(
}
}
kernel void kernel_get_rows_f16(
device const void * src0,
device const int * src1,
device float * dst,
constant int64_t & ne00,
constant uint64_t & nb01,
constant uint64_t & nb1,
uint tpig[[thread_position_in_grid]]) {
const int i = tpig;
const int r = ((device int32_t *) src1)[i];
for (int j = 0; j < ne00; j++) {
dst[i*nb1 + j] = ((device half *) ((device char *) src0 + r*nb01))[j];
}
}
kernel void kernel_get_rows_q4_0(
device const void * src0,
device const int * src1,

View File

@@ -4,6 +4,7 @@
#include <atomic>
#include <sstream>
#include <vector>
#include <limits>
#define CL_TARGET_OPENCL_VERSION 110
#include <clblast.h>
@@ -604,21 +605,44 @@ struct cl_buffer {
static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;
static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) {
static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cl_pool_lock);
cl_int err;
int best_i = -1;
size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
int worst_i = -1;
size_t worst_size = 0; //largest unused buffer seen so far
for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
cl_buffer& b = g_cl_buffer_pool[i];
if (b.size > 0 && b.size >= size) {
cl_mem mem = b.mem;
*actual_size = b.size;
b.size = 0;
return mem;
cl_buffer &b = g_cl_buffer_pool[i];
if (b.size > 0 && b.size >= size && b.size < best_size)
{
best_i = i;
best_size = b.size;
}
if (b.size > 0 && b.size > worst_size)
{
worst_i = i;
worst_size = b.size;
}
}
if(best_i!=-1) //found the smallest buffer that fits our needs
{
cl_buffer& b = g_cl_buffer_pool[best_i];
cl_mem mem = b.mem;
*actual_size = b.size;
b.size = 0;
return mem;
}
if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory
{
cl_buffer& b = g_cl_buffer_pool[worst_i];
cl_mem mem = b.mem;
b.size = 0;
clReleaseMemObject(mem);
}
cl_mem mem;
CL_CHECK((mem = clCreateBuffer(context, flags, size, NULL, &err), err));
CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
*actual_size = size;
return mem;
}
@@ -692,9 +716,10 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
size_t x_size;
size_t d_size;
cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size, CL_MEM_READ_ONLY); // src0
cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size, CL_MEM_WRITE_ONLY); // dst
cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -792,10 +817,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
if (src0->backend == GGML_BACKEND_CL) {
d_X = (cl_mem) src0->data;
} else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
}
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -868,10 +893,10 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
if (src0->backend == GGML_BACKEND_CL) {
d_X = (cl_mem) src0->data;
} else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
}
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size);
bool src1_cont_rows = nb10 == sizeof(float);
bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
@@ -970,13 +995,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
size_t q_size;
cl_mem d_X;
if (!mul_mat_vec) {
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE);
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
}
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
cl_mem d_Q;
if (src0->backend == GGML_BACKEND_CPU) {
d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
}
cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
@@ -1143,7 +1168,7 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
size_t q_size;
cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
// copy tensor to device
for (int64_t i3 = 0; i3 < ne3; i3++) {

20
ggml.c
View File

@@ -14753,7 +14753,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
const int64_t * ne = tensor->ne;
const size_t * nb = tensor->nb;
fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %32s\n",
fprintf(fout, "%-6s %-12s %8d %8jd %jd %jd %jd %16zu %16zu %16zu %16zu %16p %32s\n",
ggml_type_name(tensor->type),
ggml_op_name (tensor->op),
tensor->n_dims,
@@ -14767,7 +14767,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
const int64_t * ne = tensor->ne;
const size_t * nb = tensor->nb;
fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %32s\n",
fprintf(fout, "%-6s %-6s %-12s %8d %jd %jd %jd %jd %16zu %16zu %16zu %16zu %8d %16p %32s\n",
arg,
ggml_type_name(tensor->type),
ggml_op_name (tensor->op),
@@ -14796,11 +14796,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
FILE * fout = stdout;
fprintf(fout, "\n");
fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
fprintf(fout, "%-16s %8llu\n", "eval", size_eval);
fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
fprintf(fout, "%-16s %8ju\n", "eval", size_eval);
// header
fprintf(fout, "\n");
@@ -15033,7 +15033,11 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
fread(data->data, sizeof(char), fsize, fin);
const size_t ret = fread(data->data, sizeof(char), fsize, fin);
if (ret != fsize) {
fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
return result;
}
fclose(fin);
}

View File

@@ -961,7 +961,6 @@ static void llama_model_load_internal(
model.hparams = ml->file_loaders.at(0)->hparams;
llama_file_version file_version = ml->file_loaders.at(0)->file_version;
auto & hparams = model.hparams;
uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
{
switch (hparams.n_layer) {
@@ -975,6 +974,8 @@ static void llama_model_load_internal(
hparams.n_ctx = n_ctx;
}
const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
{
fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);