Compare commits

..

6 Commits
b1996 ... b2002

Author SHA1 Message Date
Sang-Kil Park
e76627bcce py : improve BPE tokenizer support (#5189) 2024-01-29 11:24:19 +02:00
slaren
fbe7dfa53c ggml : add max buffer sizes to opencl and metal backends (#5181) 2024-01-29 10:05:13 +02:00
Eve
172ac82629 cmake : fix Vulkan build (#5182) 2024-01-29 10:04:47 +02:00
Paul Tsochantaris
d2f650cb5b metal : free metal objects (#5161)
* Releasing MTLFunction references after Metal pipeline construction

* Keeping the `ggml_metal_kernel` structure

* Spacing fix

* Whitespace fix
2024-01-28 21:50:16 +02:00
Georgi Gerganov
35dec26cc2 sync : ggml 2024-01-28 19:48:05 +02:00
Georgi Gerganov
d460510c72 ggml : minor type fix (int64_t -> size_t) 2024-01-28 19:47:31 +02:00
6 changed files with 48 additions and 26 deletions

View File

@@ -422,7 +422,13 @@ if (LLAMA_VULKAN)
if (Vulkan_FOUND)
message(STATUS "Vulkan found")
set(GGML_HEADERS_VULKAN ggml-vulkan.h)
set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
add_library(ggml-vulkan STATIC ggml-vulkan.cpp ggml-vulkan.h)
if (BUILD_SHARED_LIBS)
set_target_properties(ggml-vulkan PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
add_compile_definitions(GGML_USE_VULKAN)
@@ -848,6 +854,7 @@ add_library(ggml OBJECT
ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
@@ -928,7 +935,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}" "${GGML_HEADERS_VULKAN}"
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")

View File

@@ -334,7 +334,10 @@ class Params:
class BpeVocab:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
self.vocab = self.bpe_tokenizer["model"]["vocab"]
try:
self.vocab = self.bpe_tokenizer["model"]["vocab"]
except:
self.vocab = self.bpe_tokenizer
added_tokens: dict[str, int]
if fname_added_tokens is not None:
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.

View File

@@ -24,10 +24,7 @@
#define UNUSED(x) (void)(x)
#define GGML_METAL_MAX_KERNELS 256
struct ggml_metal_kernel {
id<MTLFunction> function;
id<MTLComputePipelineState> pipeline;
};
@@ -159,11 +156,10 @@ struct ggml_metal_context {
id<MTLDevice> device;
id<MTLCommandQueue> queue;
id<MTLLibrary> library;
dispatch_queue_t d_queue;
struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS];
struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT];
bool support_simdgroup_reduction;
bool support_simdgroup_mm;
@@ -246,6 +242,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->queue = [ctx->device newCommandQueue];
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
id<MTLLibrary> metal_library;
// load library
{
NSBundle * bundle = nil;
@@ -260,7 +258,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
// pre-compiled library found
NSURL * libURL = [NSURL fileURLWithPath:libPath];
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
if (error) {
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
@@ -302,7 +300,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
//[options setFastMathEnabled:false];
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
metal_library = [ctx->device newLibraryWithSource:src options:options error:&error];
if (error) {
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
@@ -367,8 +365,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
{
NSError * error = nil;
for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
ctx->kernels[i].function = nil;
for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
ctx->kernels[i].pipeline = nil;
}
@@ -380,10 +377,12 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
#define GGML_METAL_ADD_KERNEL(e, name, supported) \
if (supported) { \
struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
kernel->function = [ctx->library newFunctionWithName:@"kernel_"#name]; \
kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:kernel->function error:&error]; \
id<MTLFunction> metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \
kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:metal_function error:&error]; \
[metal_function release]; \
if (error) { \
GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
[metal_library release]; \
return NULL; \
} \
} else { \
@@ -512,23 +511,17 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
}
[metal_library release];
return ctx;
}
static void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
if (ctx->kernels[i].pipeline) {
[ctx->kernels[i].pipeline release];
}
if (ctx->kernels[i].function) {
[ctx->kernels[i].function release];
}
for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
[ctx->kernels[i].pipeline release];
}
[ctx->library release];
[ctx->queue release];
[ctx->device release];
@@ -2382,6 +2375,16 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backen
UNUSED(buft);
}
GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
id<MTLDevice> device = ggml_backend_metal_get_device();
size_t max_size = device.maxBufferLength;
ggml_backend_metal_free_device();
return max_size;
UNUSED(buft);
}
GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
@@ -2400,7 +2403,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
/* .get_name = */ ggml_backend_metal_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
/* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,

View File

@@ -2125,6 +2125,15 @@ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_
GGML_UNUSED(buffer_type);
}
static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
static size_t max_size = -1;
if (max_size == (size_t)-1) {
ggml_cl_init();
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_size, NULL);
}
return max_size;
}
static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) {
//return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend
return ggml_backend_is_cpu(backend);
@@ -2136,7 +2145,7 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
/* .get_name = */ ggml_backend_opencl_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // TODO: return from device info
/* .get_max_size = */ ggml_backend_opencl_buffer_type_get_max_size,
/* .get_alloc_size = */ NULL,
/* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
/* .is_host = */ NULL,

2
ggml.c
View File

@@ -9978,7 +9978,7 @@ static void ggml_compute_forward_mul_mat(
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
const int64_t ne_plane = ne01*ne00;
const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
UNUSED(desired_wsize);
if (params->type == GGML_TASK_INIT) {

View File

@@ -1 +1 @@
c2448f88d17395452a587d0176d19ed87e0f7ce1
f2a9472b23cf27e672ed70a2a6eb078f7b060f18