Compare commits


7 Commits

8596af4277  ggml : fix bug in ggml_compute_forward_add_q_f32 (#1918)
            l3utterfly, 2023-06-18 14:19:16 +03:00

e1886cf4fe  readme : update Android build instructions (#1922)
            Add steps for using termux on android devices to prevent common errors.
            Mike, 2023-06-18 11:28:26 +03:00

8ab8ba62eb  llama : prevent usage of k-quants when tensor size is not a multiple of 256 (#1921)
            * Fix examples/metal
            * k-quants: prevent usage when tensor size is not divisible by 256
            Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
            Kawrakow, 2023-06-18 11:13:43 +03:00

90cc59d6ab  examples : fix examples/metal (#1920)
            Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
            Kawrakow, 2023-06-18 10:52:10 +03:00

ce2c7d72e2  metal : handle buffers larger than device's maxBufferLength (#1826)
            * metal : handle buffers larger than device's maxBufferLength
            * metal : print more verbose device info + handle errors
            * metal : fix prints for overlapping views
            * metal : minimize view overlap to try to utilize device memory better
            Georgi Gerganov, 2023-06-18 09:09:47 +03:00

57cd69460f  cmake : add CUDA_ARCHITECTURES to new target ggml_static (#1917)
            Howard Su, 2023-06-18 07:29:47 +03:00

b2416493ab  make : do not print help for simple example
            Georgi Gerganov, 2023-06-17 20:55:03 +03:00
9 changed files with 156 additions and 42 deletions

CMakeLists.txt

@@ -492,6 +492,10 @@ if (GGML_SOURCES_CUDA)
     message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
     set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
     set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
+
+    set_property(TARGET ggml_static PROPERTY CUDA_ARCHITECTURES OFF)
+    set_property(TARGET ggml_static PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
+
     set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF)
 endif()

Makefile

@@ -252,7 +252,7 @@ $(info )
 ggml.o: ggml.c ggml.h ggml-cuda.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
-llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
+llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 common.o: examples/common.cpp examples/common.h
@@ -276,9 +276,6 @@ main: examples/main/main.cpp build-info.h ggml.
 simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-	@echo
-	@echo '==== Run ./simple -h for help. ===='
-	@echo
 
 quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

README.md

@@ -617,7 +617,12 @@ And after 4.45 hours, you will have the final perplexity.
 #### Building the Project using Android NDK
 You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
-First, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
+
+First, install the essential packages for termux:
+```
+pkg install clang wget git cmake
+```
+Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
 ```
 $ mkdir build-android
 $ cd build-android

examples/metal/metal.cpp

@@ -40,8 +40,10 @@ int main(int argc, char ** argv) {
     // this allocates all Metal resources and memory buffers
     auto * ctx_metal = ggml_metal_init();
 
-    ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
-    ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));
+    const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
+    const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
+    ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data), max_size_data);
+    ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval), max_size_eval);
 
     // main
     {

ggml-metal.h

@@ -41,12 +41,15 @@ void ggml_metal_free(struct ggml_metal_context * ctx);
 // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
 // - the mapping is used during computation to determine the arguments of the compute kernels
 // - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
+// - max_size specifies the maximum size of a tensor and is used to create shared views such
+//   that it is guaranteed that the tensor will fit in at least one of the views
 //
 bool ggml_metal_add_buffer(
         struct ggml_metal_context * ctx,
                        const char * name,
                              void * data,
-                             size_t size);
+                             size_t size,
+                             size_t max_size);
 
 // set data from host memory into the device
 void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

ggml-metal.m

@@ -183,6 +183,14 @@ struct ggml_metal_context * ggml_metal_init(void) {
 #undef GGML_METAL_ADD_KERNEL
     }
 
+    fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+    fprintf(stderr, "%s: hasUnifiedMemory             = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
+    if (ctx->device.maxTransferRate != 0) {
+        fprintf(stderr, "%s: maxTransferRate              = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
+    } else {
+        fprintf(stderr, "%s: maxTransferRate              = built-in GPU\n", __func__);
+    }
+
     return ctx;
 }
@@ -199,10 +207,13 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
 static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
     //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
 
+    const int64_t tsize = ggml_nbytes(t);
+
+    // find the view that contains the tensor fully
     for (int i = 0; i < ctx->n_buffers; ++i) {
         const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
 
-        if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
+        if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
             *offs = (size_t) ioffs;
 
             //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
@@ -220,7 +231,8 @@ bool ggml_metal_add_buffer(
         struct ggml_metal_context * ctx,
                        const char * name,
                              void * data,
-                             size_t size) {
+                             size_t size,
+                             size_t max_size) {
     if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
         fprintf(stderr, "%s: too many buffers\n", __func__);
         return false;
@@ -237,30 +249,68 @@ bool ggml_metal_add_buffer(
         }
     }
 
-    size_t page_size = getpagesize();
-    size_t aligned_size = size;
-    if ((aligned_size % page_size) != 0) {
-        aligned_size += (page_size - (aligned_size % page_size));
+    const size_t size_page = getpagesize();
+
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    ctx->buffers[ctx->n_buffers].name = name;
-    ctx->buffers[ctx->n_buffers].data = data;
-    ctx->buffers[ctx->n_buffers].size = size;
+    // the buffer fits into the max buffer size allowed by the device
+    if (size_aligned <= ctx->device.maxBufferLength) {
+        ctx->buffers[ctx->n_buffers].name = name;
+        ctx->buffers[ctx->n_buffers].data = data;
+        ctx->buffers[ctx->n_buffers].size = size;
 
-    if (ctx->device.maxBufferLength < aligned_size) {
-        fprintf(stderr, "%s: buffer '%s' size %zu is larger than buffer maximum of %zu\n", __func__, name, aligned_size, ctx->device.maxBufferLength);
-        return false;
-    }
-    ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];
+        ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
 
-    if (ctx->buffers[ctx->n_buffers].metal == nil) {
-        fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
-        return false;
+        if (ctx->buffers[ctx->n_buffers].metal == nil) {
+            fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
+            return false;
+        }
+
+        fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
+
+        ++ctx->n_buffers;
+    } else {
+        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+        // one of the views
+        const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
+        const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
+        const size_t size_view = ctx->device.maxBufferLength;
+
+        for (size_t i = 0; i < size; i += size_step) {
+            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
+
+            ctx->buffers[ctx->n_buffers].name = name;
+            ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
+            ctx->buffers[ctx->n_buffers].size = size_step_aligned;
+
+            ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
+                return false;
+            }
+
+            fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
+            if (i + size_step < size) {
+                fprintf(stderr, "\n");
+            }
+
+            ++ctx->n_buffers;
+        }
     }
 
-    fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
+    fprintf(stderr, ", (%8.2f / %8.2f)",
+            ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
+            ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
 
-    ++ctx->n_buffers;
+    if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
+        fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
+    } else {
+        fprintf(stderr, "\n");
+    }
 
     return true;
@@ -909,4 +959,14 @@ void ggml_metal_graph_compute(
     dispatch_barrier_sync(queue, ^{});
 
     [command_buffers[n_cb - 1] waitUntilCompleted];
+
+    // check status of command buffers
+    // needed to detect if the device ran out-of-memory for example (#1881)
+    for (int i = 0; i < n_cb; i++) {
+        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
+        if (status != MTLCommandBufferStatusCompleted) {
+            fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
+            GGML_ASSERT(false);
+        }
+    }
 }
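
Note: the arithmetic behind the overlapping views above can be checked in isolation. The following standalone C sketch is not part of the diff; it uses made-up sizes and re-implements the same size_ovlp/size_step formulas to verify that a tensor of at most max_size bytes, wherever it starts inside the buffer, always falls entirely within at least one view:

```c
#include <assert.h>
#include <stdio.h>

// standalone check of the view-splitting math in ggml_metal_add_buffer;
// the sizes below are made up and much smaller than a real maxBufferLength
int main(void) {
    const size_t size_page = 4096;           // page size
    const size_t max_buf   = 64*size_page;   // stand-in for device.maxBufferLength
    const size_t max_size  = 10*size_page;   // largest tensor in the buffer
    const size_t size      = 1000*size_page; // total buffer size (needs splitting)

    // same formulas as in the diff above
    const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page;
    const size_t size_step = max_buf - size_ovlp;
    const size_t size_view = max_buf;

    // a tensor of at most max_size bytes, starting at any page-aligned
    // offset, must fall entirely inside at least one view
    for (size_t offs = 0; offs + max_size <= size; offs += size_page) {
        int ok = 0;
        for (size_t i = 0; i < size; i += size_step) {
            const size_t view_end = (i + size_view <= size) ? i + size_view : size;
            if (offs >= i && offs + max_size <= view_end) {
                ok = 1;
                break;
            }
        }
        assert(ok && "tensor does not fit in any view");
    }

    printf("all offsets covered: view = %zu pages, step = %zu pages, overlap = %zu pages\n",
           size_view/size_page, size_step/size_page, size_ovlp/size_page);
    return 0;
}
```

Rounding the overlap up to max_size plus an extra page (the "just in case" comment) is what keeps the guarantee intact when max_size itself is not page-aligned.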

ggml.c

@@ -4154,14 +4154,34 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
     ctx->no_alloc = no_alloc;
 }
 
-void * ggml_get_mem_buffer(struct ggml_context * ctx) {
+void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
     return ctx->mem_buffer;
 }
 
-size_t ggml_get_mem_size(struct ggml_context * ctx) {
+size_t ggml_get_mem_size(const struct ggml_context * ctx) {
     return ctx->mem_size;
 }
 
+size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
+    size_t max_size = 0;
+
+    struct ggml_object * obj = ctx->objects_begin;
+
+    while (obj != NULL) {
+        struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
+
+        const size_t size = ggml_nbytes(tensor);
+
+        if (max_size < size) {
+            max_size = size;
+        }
+
+        obj = obj->next;
+    }
+
+    return max_size;
+}
+
 // IMPORTANT:
 // when creating "opt" tensors, always save and load the scratch buffer
 // this is an error prone process, but it is necessary to support inplace
@@ -7898,7 +7918,7 @@ static void ggml_compute_forward_add_q_f32(
             void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
             float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
-            void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb0));
+            void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
 
             assert(ne00 % 32 == 0);
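
The second hunk is a one-character fix: in ggml's convention nb[0] is the byte stride between consecutive elements while nb[3] is the stride between slices along the fourth dimension, so indexing dst rows with i3*nb0 advanced the pointer by one element instead of one slice whenever ne[3] > 1. A small illustration of the stride convention (values are hypothetical, not taken from ggml):

```c
#include <stdio.h>

// byte offset of row (i1, i2, i3) in a 4-D tensor, following ggml's
// convention: nb[k] is the byte stride of dimension k
static size_t row_offset(const size_t nb[4], int i1, int i2, int i3) {
    return (size_t) i1*nb[1] + (size_t) i2*nb[2] + (size_t) i3*nb[3];
}

int main(void) {
    // hypothetical contiguous f32 tensor with ne = {8, 4, 3, 2}
    const size_t nb[4] = {
        sizeof(float),        // nb[0]: stride between elements
        8*sizeof(float),      // nb[1]: stride between rows
        4*8*sizeof(float),    // nb[2]: stride between matrices
        3*4*8*sizeof(float),  // nb[3]: stride between 3rd-dim slices
    };

    // correct: advancing i3 by one jumps a whole 3x4x8-element block
    printf("i3 = 1 via nb[3]: %zu bytes\n", row_offset(nb, 0, 0, 1)); // 384
    // the old expression used nb[0] instead, advancing by one element
    printf("i3 = 1 via nb[0]: %zu bytes\n", (size_t) 1*nb[0]);        // 4
    return 0;
}
```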

ggml.h

@@ -500,8 +500,9 @@ extern "C" {
     GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
     GGML_API void   ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
 
-    GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
-    GGML_API size_t ggml_get_mem_size  (struct ggml_context * ctx);
+    GGML_API void * ggml_get_mem_buffer     (const struct ggml_context * ctx);
+    GGML_API size_t ggml_get_mem_size       (const struct ggml_context * ctx);
+    GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
 
     GGML_API struct ggml_tensor * ggml_new_tensor(
             struct ggml_context * ctx,
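
A minimal usage sketch for the new accessor, assuming the ggml context API as it exists at this point in the tree (this program is illustrative and not part of the commit):

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // two tensors: 64x64 f32 (16 KiB) and 256x256 f32 (256 KiB)
    ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 256, 256);

    // should report the larger of the two, i.e. 256*256*4 bytes
    const size_t max_size = ggml_get_max_tensor_size(ctx);
    printf("max tensor size = %zu bytes\n", max_size);

    ggml_free(ctx);
    return 0;
}
```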

llama.cpp

@@ -19,6 +19,11 @@
 #ifdef GGML_USE_METAL
 #include "ggml-metal.h"
 #endif
+#ifdef GGML_USE_K_QUANTS
+#ifndef QK_K
+#define QK_K 256
+#endif
+#endif
 
 #include <array>
 #include <ctime>
@@ -2491,6 +2496,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
+            if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
+                quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
+                int nx = tensor.ne.at(0);
+                int ny = tensor.ne.at(1);
+                if (nx % QK_K != 0 || ny % QK_K != 0) {
+                    fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n", nx, ny, QK_K);
+                    fprintf(stderr, "This is required to be able to use k-quants for now!\n");
+                    fprintf(stderr, "========================================================================================\n\n");
+                    throw std::runtime_error("Unsupported tensor size encountered\n");
+                }
+            }
             if (tensor.name == "output.weight") {
                 new_type = GGML_TYPE_Q6_K;
             } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
@@ -2696,16 +2712,21 @@ struct llama_context * llama_init_from_file(
         // this allocates all Metal resources and memory buffers
         ctx->ctx_metal = ggml_metal_init();
 
-        void *data_ptr = NULL;
+        void * data_ptr  = NULL;
         size_t data_size = 0;
+
         if (params.use_mmap) {
-            data_ptr = ctx->model.mapping->addr;
-            data_size= ctx->model.mapping->size;
+            data_ptr  = ctx->model.mapping->addr;
+            data_size = ctx->model.mapping->size;
         } else {
-            data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
-            data_size= ggml_get_mem_size(ctx->model.ctx);
+            data_ptr  = ggml_get_mem_buffer(ctx->model.ctx);
+            data_size = ggml_get_mem_size  (ctx->model.ctx);
         }
 
+        const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+
+        printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+
 #define LLAMA_METAL_CHECK_BUF(result) \
     if (!(result)) { \
         fprintf(stderr, "%s: failed to add buffer\n", __func__); \
@@ -2713,12 +2734,13 @@ struct llama_context * llama_init_from_file(
         return NULL; \
     }
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
+
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
 
 #undef LLAMA_METAL_CHECK_BUF
     }
 #endif
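
For reference, the divisibility rule enforced in the quantization hunk above reduces to a two-line predicate. In the sketch below the helper name is invented and the shapes are illustrative: 4096 x 11008 is a typical 7B-class weight shape that passes, while a 3200-wide layer (3200 % 256 = 128) is the kind of shape this check rejects:

```c
#include <stdbool.h>
#include <stdio.h>

#ifndef QK_K
#define QK_K 256  // k-quant super-block size, as defined in the diff above
#endif

// hypothetical helper mirroring the check in llama_model_quantize_internal
static bool k_quant_compatible(int nx, int ny) {
    return nx % QK_K == 0 && ny % QK_K == 0;
}

int main(void) {
    printf("4096 x 11008: %s\n", k_quant_compatible(4096, 11008) ? "ok" : "rejected");
    printf("3200 x  8640: %s\n", k_quant_compatible(3200,  8640) ? "ok" : "rejected");
    return 0;
}
```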