Compare commits

...

5 Commits

Author SHA1 Message Date
Georgi Gerganov
fab49c685e ggml : update WASM SIMD 2023-05-20 20:00:41 +03:00
Zenix
b8ee340abe feature : support blis and other blas implementation (#1536)
* feature: add blis support

* feature: allow all BLA_VENDOR to be assigned in cmake arguments. align with whisper.cpp pr 927

* fix: version detection for BLA_SIZEOF_INTEGER, recover min version of cmake

* Fix typo in INTEGER

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Fix: blas changes on ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-05-20 17:58:31 +03:00
Henri Vasserman
9ecb30f959 OpenCL: Fixes for older devices. (#1435)
* Remove `constant`

* Rewrite platform and device selection

* Fix Q8_0
2023-05-20 17:57:39 +03:00
Juuso Alasuutari
29cf5596fe llama : define magic numbers as integer constants (#1518) (#1520)
The underlying representation of multibyte character literals is
implementation-defined. This could, at least in principle, cause
cross-build data export/import issues independent of endianness.

Define magic numbers as integer literals to be on the safe side.

Signed-off-by: Juuso Alasuutari <juuso.alasuutari@gmail.com>
2023-05-20 15:58:15 +03:00
Georgi Gerganov
3de84b2606 ggml : add ggml_clamp() (#1539)
* ggml : add ggml_clamp()

* ggml : indentation
2023-05-20 15:34:45 +03:00
10 changed files with 618 additions and 210 deletions

View File

@@ -165,7 +165,7 @@ jobs:
- build: 'clblast'
defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas'
defines: '-DLLAMA_OPENBLAS=ON -DBLAS_LIBRARIES="/LIBPATH:$env:RUNNER_TEMP/openblas/lib" -DOPENBLAS_INC="$env:RUNNER_TEMP/openblas/include"'
defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include"'
steps:
- name: Clone

67
BLIS.md Normal file
View File

@@ -0,0 +1,67 @@
BLIS Installation Manual
------------------------
BLIS is a portable software framework for high-performance BLAS-like dense linear algebra libraries. It has received awards and recognition, including the 2023 James H. Wilkinson Prize for Numerical Software and the 2020 SIAM Activity Group on Supercomputing Best Paper Prize. BLIS provides a new BLAS-like API and a compatibility layer for traditional BLAS routine calls. It offers features such as object-based API, typed API, BLAS and CBLAS compatibility layers.
Project URL: https://github.com/flame/blis
### Prepare:
Compile BLIS:
```bash
git clone https://github.com/flame/blis
cd blis
./configure --enable-cblas -t openmp,pthreads auto
# will install to /usr/local/ by default.
make -j
```
Install BLIS:
```bash
sudo make install
```
We recommend using openmp since it's easier to modify the cores been used.
### llama.cpp compilation
Makefile:
```bash
make LLAMA_BLIS=1 -j
# make LLAMA_BLIS=1 benchmark-matmult
```
CMake:
```bash
mkdir build
cd build
cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
make -j
```
### llama.cpp execution
According to the BLIS documentation, we could set the following
environment variables to modify the behavior of openmp:
```
export GOMP_GPU_AFFINITY="0-19"
export BLIS_NUM_THREADS=14
```
And then run the binaries as normal.
### Intel specific issue
Some might get the error message saying that `libimf.so` cannot be found.
Please follow this [stackoverflow page](https://stackoverflow.com/questions/70687930/intel-oneapi-2022-libimf-so-no-such-file-or-directory-during-openmpi-compila).
### Reference:
1. https://github.com/flame/blis#getting-started
2. https://github.com/flame/blis/blob/master/docs/Multithreading.md

View File

@@ -65,7 +65,8 @@ endif()
# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF)
option(LLAMA_BLAS "llama: use BLAS" OFF)
option(LLAMA_BLAS_VENDOR "llama: BLA_VENDOR from https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" Generic)
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
@@ -145,36 +146,28 @@ if (APPLE AND LLAMA_ACCELERATE)
endif()
endif()
if (LLAMA_OPENBLAS)
if (LLAMA_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
endif()
set(BLA_VENDOR OpenBLAS)
if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
set(BLA_SIZEOF_INTEGER 8)
endif()
set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
find_package(BLAS)
if (BLAS_FOUND)
message(STATUS "OpenBLAS found")
message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
add_compile_options(${BLAS_LINKER_FLAGS})
add_compile_definitions(GGML_USE_OPENBLAS)
add_link_options(${BLAS_LIBRARIES})
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
# find header file
set(OPENBLAS_INCLUDE_SEARCH_PATHS
/usr/include
/usr/include/openblas
/usr/include/openblas-base
/usr/local/include
/usr/local/include/openblas
/usr/local/include/openblas-base
/opt/OpenBLAS/include
$ENV{OpenBLAS_HOME}
$ENV{OpenBLAS_HOME}/include
)
find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
add_compile_options(-I${OPENBLAS_INC})
message("${BLAS_LIBRARIES} ${BLAS_INCLUDE_DIRS}")
include_directories(${BLAS_INCLUDE_DIRS})
else()
message(WARNING "OpenBLAS not found")
message(WARNING "BLAS not found, please refer to "
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
" to set correct LLAMA_BLAS_VENDOR")
endif()
endif()

View File

@@ -122,6 +122,10 @@ ifdef LLAMA_OPENBLAS
LDFLAGS += -lopenblas
endif
endif
ifdef LLAMA_BLIS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
LDFLAGS += -lblis -L/usr/local/lib
endif
ifdef LLAMA_CUBLAS
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include

View File

@@ -56,7 +56,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
- Mixed F16 / F32 precision
- 4-bit, 5-bit and 8-bit integer quantization support
- Runs on the CPU
- OpenBLAS support
- Supports OpenBLAS/Apple BLAS/ARM Performance Lib/ATLAS/BLIS/Intel MKL/NVHPC/ACML/SCSL/SGIMATH and [more](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) in BLAS
- cuBLAS and CLBlast support
The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
@@ -274,10 +274,25 @@ Building the program with BLAS support may lead to some performance improvements
```bash
mkdir build
cd build
cmake .. -DLLAMA_OPENBLAS=ON
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
cmake --build . --config Release
```
- BLIS
Check [BLIS.md](BLIS.md) for more information.
- Intel MKL
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. You may also specify it by:
```bash
mkdir build
cd build
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build . -config Release
```
- cuBLAS
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).

View File

@@ -10,87 +10,77 @@
#include "ggml.h"
#define MULTILINE_QUOTE(...) #__VA_ARGS__
const char * clblast_dequant = MULTILINE_QUOTE(
static const char * program_source = MULTILINE_QUOTE(
typedef char int8_t;
typedef uchar uint8_t;
typedef int int32_t;
typedef uint uint32_t;
constant uint QK4_0 = 32;
struct block_q4_0
struct __attribute__ ((packed)) block_q4_0
{
float d;
uint8_t qs[QK4_0 / 2];
half d;
uint8_t qs[16]; /* QK4_0 / 2 */
};
constant uint QK4_1 = 32;
struct block_q4_1
struct __attribute__ ((packed)) block_q4_1
{
float d;
float m;
uint8_t qs[QK4_1 / 2];
half d;
half m;
uint8_t qs[16]; /* QK4_1 / 2 */
};
constant uint QK5_0 = 32;
struct __attribute__ ((packed)) block_q5_0
{
half d;
uint32_t qh;
uint8_t qs[QK5_0 / 2];
uint8_t qs[16]; /* QK5_0 / 2 */
};
constant uint QK5_1 = 32;
struct block_q5_1
struct __attribute__ ((packed)) block_q5_1
{
half d;
half m;
uint32_t qh;
uint8_t qs[QK5_1 / 2];
uint8_t qs[16]; /* QK5_1 / 2 */
};
constant uint QK8_0 = 32;
struct block_q8_0
struct __attribute__ ((packed)) block_q8_0
{
float d;
uint8_t qs[QK8_0];
half d;
int8_t qs[32]; /* QK8_0 */
};
__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
constant uint qk = QK4_0;
const uint i = get_global_id(0) / qk;
const uint i = get_global_id(0) / 32; /* QK4_0 */
const uint j = get_local_id(0);
const float d = x[i].d;
const float d = vload_half(0, (__global half*) &x[i].d);
const int x0 = (x[i].qs[j] & 0xf) - 8;
const int x1 = (x[i].qs[j] >> 4) - 8;
y[i*qk + j + 0 ] = x0*d;
y[i*qk + j + qk/2] = x1*d;
y[i*32 + j + 0 ] = x0*d;
y[i*32 + j + 16] = x1*d;
}
__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
constant uint qk = QK4_1;
const uint i = get_global_id(0) / qk;
const uint i = get_global_id(0) / 32; /* QK4_1 */
const uint j = get_local_id(0);
const float d = x[i].d;
const float m = x[i].m;
const float d = vload_half(0, (__global half*) &x[i].d);
const float m = vload_half(0, (__global half*) &x[i].m);
const int x0 = (x[i].qs[j] & 0xf);
const int x1 = (x[i].qs[j] >> 4);
y[i*qk + j + 0 ] = x0*d + m;
y[i*qk + j + qk/2] = x1*d + m;
y[i*32 + j + 0 ] = x0*d + m;
y[i*32 + j + 16] = x1*d + m;
}
__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
constant uint qk = QK5_0;
const uint i = get_global_id(0) / qk;
const uint i = get_global_id(0) / 32; /* QK5_0 */
const uint j = get_local_id(0);
const float d = vload_half(0, (__global half*) &x[i].d);
@@ -103,14 +93,12 @@ __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float*
const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
y[i*qk + j + 0 ] = x0*d;
y[i*qk + j + qk/2] = x1*d;
y[i*32 + j + 0 ] = x0*d;
y[i*32 + j + 16] = x1*d;
}
__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
constant uint qk = QK5_1;
const uint i = get_global_id(0) / qk;
const uint i = get_global_id(0) / 32; /* QK5_1 */
const uint j = get_local_id(0);
const float d = vload_half(0, (__global half*) &x[i].d);
@@ -124,28 +112,38 @@ __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float*
const int x0 = (x[i].qs[j] & 0xf) | xh_0;
const int x1 = (x[i].qs[j] >> 4) | xh_1;
y[i*qk + j + 0 ] = x0*d + m;
y[i*qk + j + qk/2] = x1*d + m;
y[i*32 + j + 0 ] = x0*d + m;
y[i*32 + j + 16] = x1*d + m;
}
__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
constant uint qk = QK8_0;
const uint i = get_global_id(0) / qk;
const uint i = get_global_id(0) / 32; /* QK8_0 */
const uint j = get_local_id(0);
const float d = x[i].d;
y[i*qk + j] = x[i].qs[j]*d;
const float d = vload_half(0, (__global half*) &x[i].d);
y[i*32 + j] = x[i].qs[j]*d;
}
);
#define CL_CHECK(err, name) \
do { \
cl_int err_ = (err); \
if (err_ != CL_SUCCESS) { \
fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
exit(1); \
} \
#define CL_CHECK(err) \
do { \
cl_int err_ = (err); \
if (err_ != CL_SUCCESS) { \
fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \
#err, err_, __FILE__, __LINE__); \
exit(1); \
} \
} while (0)
#define CLBLAST_CHECK(err) \
do { \
CLBlastStatusCode err_ = (err); \
if (err_ != CLBlastSuccess) { \
fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \
#err, err_, __FILE__, __LINE__); \
exit(1); \
} \
} while (0)
static cl_platform_id platform;
@@ -188,48 +186,174 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
void ggml_cl_init(void) {
cl_int err = 0;
char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
printf("\nInitializing CLBlast (First Run)...");
printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
cl_uint num_platforms;
clGetPlatformIDs(0, NULL, &num_platforms);
cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
clGetPlatformIDs(num_platforms, platforms, NULL);
platform = platforms[plat_num];
char platform_buffer[1024];
clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL);
cl_uint num_devices;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
device = devices[dev_num];
char device_buffer[1024];
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL);
printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
CL_CHECK(err, "clCreateContext");
queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
CL_CHECK(err, "clCreateCommandQueue");
free(platforms);
free(devices);
struct cl_device;
struct cl_platform {
cl_platform_id id;
unsigned number;
char name[128];
char vendor[128];
struct cl_device * devices;
unsigned n_devices;
struct cl_device * default_device;
};
program = build_program_from_source(context, device, clblast_dequant);
struct cl_device {
struct cl_platform * platform;
cl_device_id id;
unsigned number;
cl_device_type type;
char name[128];
};
enum { NPLAT = 16, NDEV = 16 };
struct cl_platform platforms[NPLAT];
unsigned n_platforms = 0;
struct cl_device devices[NDEV];
unsigned n_devices = 0;
struct cl_device * default_device = NULL;
platform = NULL;
device = NULL;
cl_platform_id platform_ids[NPLAT];
CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms));
for (unsigned i = 0; i < n_platforms; i++) {
struct cl_platform * p = &platforms[i];
p->number = i;
p->id = platform_ids[i];
CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL));
CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL));
cl_device_id device_ids[NDEV];
cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices);
if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
p->n_devices = 0;
} else {
CL_CHECK(clGetDeviceIDsError);
}
p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
p->default_device = NULL;
for (unsigned j = 0; j < p->n_devices; j++) {
struct cl_device * d = &devices[n_devices];
d->number = n_devices++;
d->id = device_ids[j];
d->platform = p;
CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
p->default_device = d;
}
}
if (default_device == NULL && p->default_device != NULL) {
default_device = p->default_device;
}
}
if (n_devices == 0) {
fprintf(stderr, "ggml_opencl: could find any OpenCL devices.\n");
exit(1);
}
char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
char * user_device_string = getenv("GGML_OPENCL_DEVICE");
int user_platform_number = -1;
int user_device_number = -1;
unsigned n;
if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
user_platform_number = (int)n;
}
if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) {
user_device_number = (int)n;
}
struct cl_device * selected_devices = devices;
unsigned n_selected_devices = n_devices;
if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
for (unsigned i = 0; i < n_platforms; i++) {
struct cl_platform * p = &platforms[i];
if (strstr(p->name, user_platform_string) != NULL ||
strstr(p->vendor, user_platform_string) != NULL) {
user_platform_number = (int)i;
break;
}
}
if (user_platform_number == -1) {
fprintf(stderr, "ggml_opencl: no platform matching '%s' was found.\n", user_platform_string);
exit(1);
}
}
if (user_platform_number != -1) {
struct cl_platform * p = &platforms[user_platform_number];
selected_devices = p->devices;
n_selected_devices = p->n_devices;
default_device = p->default_device;
if (n_selected_devices == 0) {
fprintf(stderr, "ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
exit(1);
}
}
if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
for (unsigned i = 0; i < n_selected_devices; i++) {
struct cl_device * d = &selected_devices[i];
if (strstr(d->name, user_device_string) != NULL) {
user_device_number = d->number;
break;
}
}
if (user_device_number == -1) {
fprintf(stderr, "ggml_opencl: no device matching '%s' was found.\n", user_device_string);
exit(1);
}
}
if (user_device_number != -1) {
selected_devices = &devices[user_device_number];
n_selected_devices = 1;
default_device = &selected_devices[0];
}
GGML_ASSERT(n_selected_devices > 0);
if (default_device == NULL) {
default_device = &selected_devices[0];
}
fprintf(stderr, "ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
fprintf(stderr, "ggml_opencl: selecting device: '%s'\n", default_device->name);
if (default_device->type != CL_DEVICE_TYPE_GPU) {
fprintf(stderr, "ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
}
platform = default_device->platform->id;
device = default_device->id;
cl_context_properties properties[] = {
(intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
};
CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
(err != CL_INVALID_PROPERTY && err != CL_INVALID_VALUE ? err :
(queue = clCreateCommandQueue(context, device, 0, &err), err)
)));
program = build_program_from_source(context, device, program_source);
// Prepare dequantize kernels
kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
CL_CHECK(err, "clCreateKernel");
kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
CL_CHECK(err, "clCreateKernel");
kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
CL_CHECK(err, "clCreateKernel");
kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
CL_CHECK(err, "clCreateKernel");
kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
CL_CHECK(err, "clCreateKernel");
CL_CHECK((kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err), err));
CL_CHECK((kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err), err));
CL_CHECK((kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err), err));
CL_CHECK((kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err), err));
CL_CHECK((kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err), err));
}
static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
@@ -242,9 +366,8 @@ static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags
clReleaseMemObject(*buf);
}
cl_int err;
*buf = clCreateBuffer(context, flags, req_size, NULL, &err);
CL_CHECK((*buf = clCreateBuffer(context, flags, req_size, NULL, &err), err));
*cur_size = req_size;
CL_CHECK(err, "clCreateBuffer");
}
void ggml_cl_sgemm_wrapper(
@@ -253,7 +376,6 @@ void ggml_cl_sgemm_wrapper(
const float alpha, const void *host_a, const int lda,
const float *host_b, const int ldb, const float beta,
float *host_c, const int ldc, const int btype) {
cl_int err = 0;
cl_kernel kernel;
size_t global = n * k, local, size_qb;
@@ -267,13 +389,13 @@ void ggml_cl_sgemm_wrapper(
dequant = true;
kernel = kernel_q4_0;
local = 16;
size_qb = global * (sizeof(float) + local) / 32;
size_qb = global * (sizeof(ggml_fp16_t) + local) / 32;
break;
case GGML_TYPE_Q4_1:
dequant = true;
kernel = kernel_q4_1;
local = 16;
size_qb = global * (sizeof(float) * 2 + local) / 32;
size_qb = global * (sizeof(ggml_fp16_t) * 2 + local) / 32;
break;
case GGML_TYPE_Q5_0:
dequant = true;
@@ -291,7 +413,7 @@ void ggml_cl_sgemm_wrapper(
dequant = true;
kernel = kernel_q8_0;
local = 32;
size_qb = global * (sizeof(float) + local) / 32;
size_qb = global * (sizeof(ggml_fp16_t) + local) / 32;
break;
default:
fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
@@ -313,49 +435,40 @@ void ggml_cl_sgemm_wrapper(
cl_event ev_a, ev_qb, ev_b;
if (dequant) {
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
CL_CHECK(err, "clSetKernelArg");
err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
CL_CHECK(err, "clEnqueueWriteBuffer qb");
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b));
CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb));
} else {
err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
CL_CHECK(err, "clEnqueueWriteBuffer b");
CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b));
}
err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
CL_CHECK(err, "clEnqueueWriteBuffer a");
CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a));
if (dequant) {
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
CL_CHECK(err, "clEnqueueNDRangeKernel");
clReleaseEvent(ev_qb);
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b));
CL_CHECK(clReleaseEvent(ev_qb));
}
clWaitForEvents(1, &ev_a);
clWaitForEvents(1, &ev_b);
clReleaseEvent(ev_a);
clReleaseEvent(ev_b);
CL_CHECK(clWaitForEvents(1, &ev_a));
CL_CHECK(clWaitForEvents(1, &ev_b));
CL_CHECK(clReleaseEvent(ev_a));
CL_CHECK(clReleaseEvent(ev_b));
cl_event ev_sgemm;
CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
(CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
m, n, k,
alpha,
cl_buffer_a, 0, lda,
cl_buffer_b, 0, ldb,
beta,
cl_buffer_c, 0, ldc,
&queue, &ev_sgemm);
if (status != CLBlastSuccess) {
fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
abort();
}
CLBLAST_CHECK(CLBlastSgemm(
(CLBlastLayout)order,
(CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
m, n, k,
alpha,
cl_buffer_a, 0, lda,
cl_buffer_b, 0, ldb,
beta,
cl_buffer_c, 0, ldc,
&queue, &ev_sgemm));
cl_event ev_c;
clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
CL_CHECK(clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c));
// Wait for completion
clWaitForEvents(1, &ev_c);
clReleaseEvent(ev_sgemm);
clReleaseEvent(ev_c);
CL_CHECK(clWaitForEvents(1, &ev_c));
CL_CHECK(clReleaseEvent(ev_sgemm));
CL_CHECK(clReleaseEvent(ev_c));
}

258
ggml.c
View File

@@ -740,19 +740,19 @@ inline static float vaddvq_f32(float32x4_t v) {
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
}
float vminvq_f32(float32x4_t v) {
inline static float vminvq_f32(float32x4_t v) {
return
MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
}
float vmaxvq_f32(float32x4_t v) {
inline static float vmaxvq_f32(float32x4_t v) {
return
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
}
int32x4_t vcvtnq_s32_f32(float32x4_t v) {
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
int32x4_t res;
res[0] = roundf(vgetq_lane_f32(v, 0));
@@ -766,7 +766,6 @@ int32x4_t vcvtnq_s32_f32(float32x4_t v) {
#endif
#endif
#define QK4_0 32
typedef struct {
ggml_fp16_t d; // delta
@@ -1056,6 +1055,39 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
}
}
#elif defined(__wasm_simd128__)
for (int i = 0; i < nb; i++) {
v128_t srcv [8];
v128_t asrcv[8];
v128_t amaxv[8];
for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
wasm_f32x4_extract_lane(amaxv[0], 1)),
MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
wasm_f32x4_extract_lane(amaxv[0], 3)));
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = GGML_FP32_TO_FP16(d);
for (int j = 0; j < 8; j++) {
const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
}
}
#elif defined(__AVX2__) || defined(__AVX__)
for (int i = 0; i < nb; i++) {
// Load elements into 4 AVX vectors
@@ -1224,6 +1256,48 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
y[i].s = d * vaddvq_s32(accv);
}
#elif defined(__wasm_simd128__)
for (int i = 0; i < nb; i++) {
v128_t srcv [8];
v128_t asrcv[8];
v128_t amaxv[8];
for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j);
for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
wasm_f32x4_extract_lane(amaxv[0], 1)),
MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
wasm_f32x4_extract_lane(amaxv[0], 3)));
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = d;
v128_t accv = wasm_i32x4_splat(0);
for (int j = 0; j < 8; j++) {
const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
accv = wasm_i32x4_add(accv, vi);
}
y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) +
wasm_i32x4_extract_lane(accv, 1) +
wasm_i32x4_extract_lane(accv, 2) +
wasm_i32x4_extract_lane(accv, 3));
}
#elif defined(__AVX2__) || defined(__AVX__)
for (int i = 0; i < nb; i++) {
// Load elements into 4 AVX vectors
@@ -2598,7 +2672,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
const block_q8_0 * restrict y0 = &y[i];
const v128_t m4b = wasm_i8x16_splat(0x0F);
const v128_t s16b = wasm_i8x16_splat(0x10);
// extract the 5th bit
memcpy(&qh, x0->qh, sizeof(qh));
@@ -2636,15 +2709,14 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
const float x0d = GGML_FP16_TO_FP32(x0->d);
// dot product
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
wasm_i32x4_add(
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
}
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -2868,8 +2940,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
const v128_t v0l = wasm_v128_and (v0, m4b);
const v128_t v0h = wasm_u8x16_shr(v0, 4);
static bool x = true;
// add high bit
const v128_t v0lf = wasm_v128_or(v0l, qhl);
const v128_t v0hf = wasm_v128_or(v0h, qhh);
@@ -2892,11 +2962,11 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
// dot product
sumv = wasm_f32x4_add(sumv,
wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d));
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
}
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -3472,6 +3542,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
"ROPE",
"ROPE_BACK",
"ALIBI",
"CLAMP",
"CONV_1D_1S",
"CONV_1D_2S",
@@ -3482,7 +3553,8 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
"MAP_BINARY",
};
static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50");
static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -3532,6 +3604,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"rope(x)",
"rope_back(x)",
"alibi(x)",
"clamp(x)",
"conv_1d_1s(x)",
"conv_1d_2s(x)",
@@ -3542,7 +3615,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"f(x,y)",
};
static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50");
static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -6214,7 +6287,8 @@ struct ggml_tensor * ggml_alibi(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_head) {
int n_head,
float bias_max) {
GGML_ASSERT(n_past >= 0);
bool is_node = false;
@@ -6233,6 +6307,8 @@ struct ggml_tensor * ggml_alibi(
((int32_t *) b->data)[0] = n_past;
((int32_t *) b->data)[1] = n_head;
GGML_ASSERT(sizeof(float) == sizeof(int32_t));
(((float *) b->data)[2]) = bias_max;
ggml_scratch_load(ctx);
@@ -6244,6 +6320,40 @@ struct ggml_tensor * ggml_alibi(
return result;
}
// ggml_clamp
struct ggml_tensor * ggml_clamp(
struct ggml_context * ctx,
struct ggml_tensor * a,
float min,
float max) {
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
// TODO: when implement backward, fix this:
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
ggml_scratch_save(ctx);
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
((float *) b->data)[0] = min;
((float *) b->data)[1] = max;
ggml_scratch_load(ctx);
result->op = GGML_OP_CLAMP;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = b;
return result;
}
// ggml_conv_1d_1s
struct ggml_tensor * ggml_conv_1d_1s(
@@ -10553,6 +10663,7 @@ static void ggml_compute_forward_diag_mask_f32(
const int n_past = ((int32_t *) src1->data)[0];
const bool inplace = (bool)((int32_t *) src1->data)[1];
assert(n_past >= 0);
if (!inplace && (params->type == GGML_TASK_INIT)) {
@@ -10723,14 +10834,15 @@ static void ggml_compute_forward_alibi_f32(
struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 2);
assert(ggml_nelements(src1) == 3);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int n_past = ((int32_t *) src1->data)[0];
const int n_head = ((int32_t *) src1->data)[1];
const int n_past = ((int32_t *) src1->data)[0];
const int n_head = ((int32_t *) src1->data)[1];
const float max_bias = ((float *) src1->data)[2];
assert(n_past >= 0);
@@ -10753,8 +10865,8 @@ static void ggml_compute_forward_alibi_f32(
// add alibi to src0 (KQ_scaled)
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
for (int i = 0; i < ne0; i++) {
for (int j = 0; j < ne1; j++) {
@@ -10772,13 +10884,13 @@ static void ggml_compute_forward_alibi_f32(
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}
pdst[0] = i * m_k + src[0];
pdst[0] = (i-ne0+1) * m_k + src[0];
}
}
}
}
static void ggml_compute_forward_alibi_f16(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
@@ -10786,14 +10898,15 @@ static void ggml_compute_forward_alibi_f16(
struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 2);
assert(ggml_nelements(src1) == 3);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int n_past = ((int32_t *) src1->data)[0];
const int n_head = ((int32_t *) src1->data)[1];
const int n_past = ((int32_t *) src1->data)[0];
const int n_head = ((int32_t *) src1->data)[1];
const float max_bias = ((float *) src1->data)[2];
assert(n_past >= 0);
@@ -10816,8 +10929,8 @@ static void ggml_compute_forward_alibi_f16(
// add alibi to src0 (KQ_scaled)
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
for (int i = 0; i < ne0; i++) {
for (int j = 0; j < ne1; j++) {
@@ -10836,7 +10949,7 @@ static void ggml_compute_forward_alibi_f16(
}
// we return F32
pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
pdst[0] = (i-ne0+1) * m_k + GGML_FP16_TO_FP32(src[0]);
}
}
}
@@ -10872,6 +10985,77 @@ static void ggml_compute_forward_alibi(
}
}
// ggml_compute_forward_clamp
static void ggml_compute_forward_clamp_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 2);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int min = ((float *) src1->data)[0];
const int max = ((float *) src1->data)[1];
const int ith = params->ith;
const int nth = params->nth;
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
const size_t nb00 = src0->nb[0];
const size_t nb01 = src0->nb[1];
const size_t nb0 = dst->nb[0];
const size_t nb1 = dst->nb[1];
GGML_ASSERT( nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float));
for (int j = ith; j < n; j += nth) {
float * dst_ptr = (float *) ((char *) dst->data + j*nb1);
float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
for (int i = 0; i < nc; i++) {
dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
}
}
}
static void ggml_compute_forward_clamp(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_clamp_f32(params, src0, src1, dst);
} break;
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_COUNT:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_rope
static void ggml_compute_forward_rope_f32(
@@ -12853,6 +13037,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
} break;
case GGML_OP_CLAMP:
{
ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
} break;
case GGML_OP_CONV_1D_1S:
{
ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
@@ -13160,6 +13348,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_CLAMP:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_SILU:
{
// necessary for llama
@@ -14039,6 +14231,10 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
{
node->n_tasks = 1; //TODO
} break;
case GGML_OP_CLAMP:
{
node->n_tasks = 1; //TODO
} break;
case GGML_OP_CONV_1D_1S:
case GGML_OP_CONV_1D_2S:
{

14
ggml.h
View File

@@ -313,6 +313,7 @@ extern "C" {
GGML_OP_ROPE,
GGML_OP_ROPE_BACK,
GGML_OP_ALIBI,
GGML_OP_CLAMP,
GGML_OP_CONV_1D_1S,
GGML_OP_CONV_1D_2S,
@@ -849,7 +850,7 @@ extern "C" {
int n_past);
// in-place, returns view(a)
GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace(
GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past);
@@ -897,7 +898,16 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_head);
int n_head,
float bias_max);
// clamp
// in-place, returns view(a)
struct ggml_tensor * ggml_clamp(
struct ggml_context * ctx,
struct ggml_tensor * a,
float min,
float max);
// padding = 1
// TODO: we don't support extra parameters for now

View File

@@ -427,26 +427,30 @@ struct llama_file_loader {
}
void read_magic() {
uint32_t magic = file.read_u32();
uint32_t version = 0;
if (magic != 'ggml') {
version = file.read_u32();
}
if (magic == 'ggml' && version == 0) {
if (magic == LLAMA_FILE_MAGIC_GGML) {
file_version = LLAMA_FILE_VERSION_GGML;
} else if (magic == 'ggmf' && version == 1) {
file_version = LLAMA_FILE_VERSION_GGMF_V1;
} else if (magic == 'ggjt' && version == 1) {
file_version = LLAMA_FILE_VERSION_GGJT_V1;
} else if (magic == 'ggjt' && version == 2) {
file_version = LLAMA_FILE_VERSION_GGJT_V2;
} else if (magic == 'ggjt' && version == 3) {
file_version = LLAMA_FILE_VERSION_GGJT_V3;
} else {
throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
magic, version);
return;
}
uint32_t version = file.read_u32();
switch (magic) {
case LLAMA_FILE_MAGIC_GGMF:
switch (version) {
case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
}
break;
case LLAMA_FILE_MAGIC_GGJT:
switch (version) {
case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
}
}
throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
magic, version);
}
void read_hparams() {
hparams.n_vocab = file.read_u32();
@@ -2290,7 +2294,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
{
uint32_t magic;
fin.read((char *) &magic, sizeof(magic));
if (magic != 'ggla') {
if (magic != LLAMA_FILE_MAGIC_GGLA) {
fprintf(stderr, "%s: bad file magic\n", __func__);
return 1;
}

12
llama.h
View File

@@ -19,10 +19,16 @@
# define LLAMA_API
#endif
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
#define LLAMA_FILE_VERSION 3
#define LLAMA_FILE_MAGIC 'ggjt'
#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
#define LLAMA_SESSION_MAGIC 'ggsn'
#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 1
#ifdef __cplusplus