Mirror of https://github.com/ggerganov/llama.cpp.git
Compare commits: master-ec2...master-778 (10 commits)
| SHA1 |
|---|
| 7780e4f479 |
| 265db9834e |
| fab49c685e |
| b8ee340abe |
| 9ecb30f959 |
| 29cf5596fe |
| 3de84b2606 |
| affc76edfd |
| ea600071cb |
| 07e9ace0f9 |
.github/workflows/build.yml (2 changes, vendored)
@@ -165,7 +165,7 @@ jobs:
           - build: 'clblast'
             defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
           - build: 'openblas'
-            defines: '-DLLAMA_OPENBLAS=ON -DBLAS_LIBRARIES="/LIBPATH:$env:RUNNER_TEMP/openblas/lib" -DOPENBLAS_INC="$env:RUNNER_TEMP/openblas/include"'
+            defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include"'

     steps:
       - name: Clone
BLIS.md (67 changes, new file)
@@ -0,0 +1,67 @@
+BLIS Installation Manual
+------------------------
+
+BLIS is a portable software framework for high-performance BLAS-like dense linear algebra libraries. It has received awards and recognition, including the 2023 James H. Wilkinson Prize for Numerical Software and the 2020 SIAM Activity Group on Supercomputing Best Paper Prize. BLIS provides a new BLAS-like API and a compatibility layer for traditional BLAS routine calls. It offers features such as an object-based API, a typed API, and BLAS and CBLAS compatibility layers.
+
+Project URL: https://github.com/flame/blis
+
+### Prepare:
+
+Compile BLIS:
+
+```bash
+git clone https://github.com/flame/blis
+cd blis
+./configure --enable-cblas -t openmp,pthreads auto
+# will install to /usr/local/ by default.
+make -j
+```
+
+Install BLIS:
+
+```bash
+sudo make install
+```
+
+We recommend using openmp since it is easier to control which cores are used.
+
+### llama.cpp compilation
+
+Makefile:
+
+```bash
+make LLAMA_BLIS=1 -j
+# make LLAMA_BLIS=1 benchmark-matmult
+```
+
+CMake:
+
+```bash
+mkdir build
+cd build
+cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
+make -j
+```
+
+### llama.cpp execution
+
+According to the BLIS documentation, the following environment variables can be set to modify the behavior of openmp:
+
+```
+export GOMP_CPU_AFFINITY="0-19"
+export BLIS_NUM_THREADS=14
+```
+
+And then run the binaries as normal.
+
+### Intel specific issue
+
+Some might get an error message saying that `libimf.so` cannot be found.
+Please follow this [stackoverflow page](https://stackoverflow.com/questions/70687930/intel-oneapi-2022-libimf-so-no-such-file-or-directory-during-openmpi-compila).
+
+### Reference:
+
+1. https://github.com/flame/blis#getting-started
+2. https://github.com/flame/blis/blob/master/docs/Multithreading.md
CMakeLists.txt
@@ -65,7 +65,8 @@ endif()

 # 3rd party libs
 option(LLAMA_ACCELERATE    "llama: enable Accelerate framework" ON)
-option(LLAMA_OPENBLAS      "llama: use OpenBLAS" OFF)
+option(LLAMA_BLAS          "llama: use BLAS" OFF)
+option(LLAMA_BLAS_VENDOR   "llama: BLA_VENDOR from https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" Generic)
 option(LLAMA_CUBLAS        "llama: use cuBLAS" OFF)
 option(LLAMA_CLBLAST       "llama: use CLBlast" OFF)

@@ -145,36 +146,28 @@ if (APPLE AND LLAMA_ACCELERATE)
     endif()
 endif()

-if (LLAMA_OPENBLAS)
+if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
     endif()
-
-    set(BLA_VENDOR OpenBLAS)
+    if (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.22)
+        set(BLA_SIZEOF_INTEGER 8)
+    endif()
+    set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
     find_package(BLAS)
     if (BLAS_FOUND)
-        message(STATUS "OpenBLAS found")
+        message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")

+        add_compile_options(${BLAS_LINKER_FLAGS})
         add_compile_definitions(GGML_USE_OPENBLAS)
-        add_link_options(${BLAS_LIBRARIES})
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})

-        # find header file
-        set(OPENBLAS_INCLUDE_SEARCH_PATHS
-            /usr/include
-            /usr/include/openblas
-            /usr/include/openblas-base
-            /usr/local/include
-            /usr/local/include/openblas
-            /usr/local/include/openblas-base
-            /opt/OpenBLAS/include
-            $ENV{OpenBLAS_HOME}
-            $ENV{OpenBLAS_HOME}/include
-            )
-        find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
-        add_compile_options(-I${OPENBLAS_INC})
+        message("${BLAS_LIBRARIES} ${BLAS_INCLUDE_DIRS}")
+        include_directories(${BLAS_INCLUDE_DIRS})
     else()
-        message(WARNING "OpenBLAS not found")
+        message(WARNING "BLAS not found, please refer to "
+        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
+        " to set correct LLAMA_BLAS_VENDOR")
     endif()
 endif()
Makefile (6 changes)
@@ -122,6 +122,10 @@ ifdef LLAMA_OPENBLAS
 	LDFLAGS += -lopenblas
 endif
 endif
+ifdef LLAMA_BLIS
+	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
+	LDFLAGS += -lblis -L/usr/local/lib
+endif
 ifdef LLAMA_CUBLAS
 	CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	CXXFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include

@@ -241,6 +245,6 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

-.PHONY: tests
+.PHONY: tests clean
 tests:
 	bash ./tests/run-tests.sh
README.md (19 changes)
@@ -56,7 +56,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
 - Mixed F16 / F32 precision
 - 4-bit, 5-bit and 8-bit integer quantization support
 - Runs on the CPU
-- OpenBLAS support
+- Supports OpenBLAS/Apple BLAS/ARM Performance Lib/ATLAS/BLIS/Intel MKL/NVHPC/ACML/SCSL/SGIMATH and [more](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) in BLAS
 - cuBLAS and CLBlast support

 The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).

@@ -274,10 +274,25 @@ Building the program with BLAS support may lead to some performance improvements
   ```bash
   mkdir build
   cd build
-  cmake .. -DLLAMA_OPENBLAS=ON
+  cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
   cmake --build . --config Release
   ```

+- BLIS
+
+  Check [BLIS.md](BLIS.md) for more information.
+
+- Intel MKL
+
+  By default, `LLAMA_BLAS_VENDOR` is set to `Generic`. If you have already sourced the Intel environment script and pass `-DLLAMA_BLAS=ON` to cmake, the MKL version of BLAS will be selected automatically. You may also specify it explicitly:
+
+  ```bash
+  mkdir build
+  cd build
+  cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+  cmake --build . --config Release
+  ```
+
 - cuBLAS

   This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
ggml-cuda.cu (123 changes)
@@ -83,9 +83,19 @@ typedef struct {
 } block_q8_0;
 static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");

+#define CUDA_MUL_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 #define CUDA_DMMV_BLOCK_SIZE 32 // dmmv = dequantize_mul_mat_vec

+static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= kx) {
+        return;
+    }
+    dst[i] = x[i] * y[i%ky];
+}
+
 static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
     const block_q4_0 * x = (const block_q4_0 *) vx;

@@ -228,6 +238,11 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
     }
 }

+static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
+    const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
+    mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
+}
+
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
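For reference, the broadcast rule used by the new `mul_f32` kernel is a flat modulo: every element of `x` is multiplied by `y[i % ky]`, so a shorter row of `y` is repeated across `x`. A minimal host-side C sketch of the same indexing (an illustration, not part of the commit):

```c
#include <stdio.h>

// Same element mapping as the mul_f32 CUDA kernel: dst[i] = x[i] * y[i % ky].
static void mul_f32_ref(const float *x, const float *y, float *dst, int kx, int ky) {
    for (int i = 0; i < kx; i++) {
        dst[i] = x[i] * y[i % ky];
    }
}

int main(void) {
    const float x[6] = {1, 2, 3, 4, 5, 6}; // two rows of three elements
    const float y[3] = {10, 20, 30};       // one row, broadcast over both rows of x
    float d[6];
    mul_f32_ref(x, y, d, 6, 3);
    for (int i = 0; i < 6; i++) {
        printf("%g ", d[i]); // prints: 10 40 90 40 100 180
    }
    printf("\n");
    return 0;
}
```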
@@ -467,6 +482,67 @@ static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor
     }
 }

+static void ggml_cuda_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src1->backend == GGML_BACKEND_CUDA);
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[2];
+    const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+    size_t x_size, d_size;
+
+    float * d_X = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &x_size); // src0
+    float * d_Y = (float *) src1->data; // src1 is already on device, broadcasted.
+    float * d_D = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &d_size); // dst
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            const int i0 = i03*ne02 + i02;
+            float * c_X2 = d_X + i0*ne01*ne00;
+            float * c_D2 = d_D + i0*ne01*ne00;
+
+            cudaStream_t cudaStream  = g_cudaStreams[i0 % GGML_CUDA_MAX_STREAMS];
+            cudaStream_t cudaStream2 = g_cudaStreams2[i0 % GGML_CUDA_MAX_STREAMS];
+            cudaEvent_t  cudaEvent   = g_cudaEvents[i0 % GGML_CUDA_MAX_EVENTS];
+
+            // copy src0 to device
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X2, src0, i03, i02, cudaStream2));
+            CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
+
+            // wait for data
+            CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
+
+            for (int64_t i01 = 0; i01 < ne01; i01++) {
+                const int64_t i13 = i03%ne13;
+                const int64_t i12 = i02%ne12;
+                const int64_t i11 = i01%ne11;
+                const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
+
+                float * c_X1 = c_X2 + i01*ne00;
+                float * c_Y  = d_Y + i1*ne10;
+                float * c_D1 = c_D2 + i01*ne00;
+
+                // compute
+                mul_f32_cuda(c_X1, c_Y, c_D1, ne00, ne10, cudaStream);
+                CUDA_CHECK(cudaGetLastError());
+            }
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CUDA_CHECK(cudaMemcpyAsync(d, c_D2, sizeof(float)*ne00*ne01, cudaMemcpyDeviceToHost, cudaStream));
+        }
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+    ggml_cuda_pool_free(d_X, x_size);
+    ggml_cuda_pool_free(d_D, d_size);
+}
+
 static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];

@@ -724,6 +800,11 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
     ggml_cuda_pool_free(d_Q, q_size);
 }

+void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cuda_mul_f32(src0, src1, dst);
+}
+
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
     const int64_t ne10 = src1->ne[0];

@@ -797,14 +878,48 @@ void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
     const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);

     size_t q_size;
-    char * d_Q = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);
+    char * dst = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);

     cudaStream_t cudaStream2 = g_cudaStreams2[0];

     // copy tensor to device
-    CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, tensor, 0, 0, cudaStream2));
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        for (int64_t i2 = 0; i2 < ne2; i2++) {
+            int i = i3*ne2 + i2;
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(dst + i*ne0*ne1, tensor, i3, i2, cudaStream2));
+        }
+    }
     CUDA_CHECK(cudaDeviceSynchronize());

-    tensor->data = d_Q;
+    tensor->data = dst;
     tensor->backend = GGML_BACKEND_CUDA;
 }

+void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
+    FILE * fp = fopen(fname, "rb");
+
+    const size_t size = ggml_nbytes(tensor);
+
+    void * buf;
+    CUDA_CHECK(cudaMalloc(&buf, size));
+    void * buf_host = malloc(size);
+
+#ifdef _WIN32
+    int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
+#else
+    int ret = fseek(fp, (long) offset, SEEK_SET);
+#endif
+    GGML_ASSERT(ret == 0); // same
+
+    size_t ret2 = fread(buf_host, size, 1, fp);
+    if (ret2 != 1) {
+        fprintf(stderr, "unexpectedly reached end of file");
+        exit(1);
+    }
+
+    cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
+    cudaDeviceSynchronize();
+
+    tensor->data = buf;
+    free(buf_host);
+    fclose(fp);
+}
ggml-cuda.h
@@ -6,6 +6,7 @@ extern "C" {

 void   ggml_init_cublas(void);

+void   ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 void   ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

@@ -15,6 +16,7 @@ void * ggml_cuda_host_malloc(size_t size);
 void   ggml_cuda_host_free(void * ptr);

 void   ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
+void   ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);

 #ifdef __cplusplus
 }
ggml-opencl.c (375 changes)
@@ -10,87 +10,77 @@
 #include "ggml.h"

 #define MULTILINE_QUOTE(...) #__VA_ARGS__
-const char * clblast_dequant = MULTILINE_QUOTE(
+static const char * program_source = MULTILINE_QUOTE(

+typedef char int8_t;
 typedef uchar uint8_t;
 typedef int int32_t;
 typedef uint uint32_t;

-constant uint QK4_0 = 32;
-struct block_q4_0
+struct __attribute__ ((packed)) block_q4_0
 {
-    float d;
-    uint8_t qs[QK4_0 / 2];
+    half d;
+    uint8_t qs[16]; /* QK4_0 / 2 */
 };

-constant uint QK4_1 = 32;
-struct block_q4_1
+struct __attribute__ ((packed)) block_q4_1
 {
-    float d;
-    float m;
-    uint8_t qs[QK4_1 / 2];
+    half d;
+    half m;
+    uint8_t qs[16]; /* QK4_1 / 2 */
 };

-constant uint QK5_0 = 32;
 struct __attribute__ ((packed)) block_q5_0
 {
     half d;
     uint32_t qh;
-    uint8_t qs[QK5_0 / 2];
+    uint8_t qs[16]; /* QK5_0 / 2 */
 };

-constant uint QK5_1 = 32;
-struct block_q5_1
+struct __attribute__ ((packed)) block_q5_1
 {
     half d;
     half m;
     uint32_t qh;
-    uint8_t qs[QK5_1 / 2];
+    uint8_t qs[16]; /* QK5_1 / 2 */
 };

-constant uint QK8_0 = 32;
-struct block_q8_0
+struct __attribute__ ((packed)) block_q8_0
 {
-    float d;
-    uint8_t qs[QK8_0];
+    half d;
+    int8_t qs[32]; /* QK8_0 */
 };


 __kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
-    constant uint qk = QK4_0;
-
-    const uint i = get_global_id(0) / qk;
+    const uint i = get_global_id(0) / 32; /* QK4_0 */
     const uint j = get_local_id(0);

-    const float d = x[i].d;
+    const float d = vload_half(0, (__global half*) &x[i].d);

     const int x0 = (x[i].qs[j] & 0xf) - 8;
     const int x1 = (x[i].qs[j] >> 4) - 8;

-    y[i*qk + j + 0   ] = x0*d;
-    y[i*qk + j + qk/2] = x1*d;
+    y[i*32 + j + 0 ] = x0*d;
+    y[i*32 + j + 16] = x1*d;
 }

 __kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
-    constant uint qk = QK4_1;
-
-    const uint i = get_global_id(0) / qk;
+    const uint i = get_global_id(0) / 32; /* QK4_1 */
     const uint j = get_local_id(0);

-    const float d = x[i].d;
-    const float m = x[i].m;
+    const float d = vload_half(0, (__global half*) &x[i].d);
+    const float m = vload_half(0, (__global half*) &x[i].m);

     const int x0 = (x[i].qs[j] & 0xf);
     const int x1 = (x[i].qs[j] >> 4);

-    y[i*qk + j + 0   ] = x0*d + m;
-    y[i*qk + j + qk/2] = x1*d + m;
+    y[i*32 + j + 0 ] = x0*d + m;
+    y[i*32 + j + 16] = x1*d + m;
 }

 __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
-    constant uint qk = QK5_0;
-
-    const uint i = get_global_id(0) / qk;
+    const uint i = get_global_id(0) / 32; /* QK5_0 */
     const uint j = get_local_id(0);

     const float d = vload_half(0, (__global half*) &x[i].d);

@@ -103,14 +93,12 @@ __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float*
     const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
     const int32_t x1 = ((x[i].qs[j] >>  4) | xh_1) - 16;

-    y[i*qk + j + 0   ] = x0*d;
-    y[i*qk + j + qk/2] = x1*d;
+    y[i*32 + j + 0 ] = x0*d;
+    y[i*32 + j + 16] = x1*d;
 }

 __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
-    constant uint qk = QK5_1;
-
-    const uint i = get_global_id(0) / qk;
+    const uint i = get_global_id(0) / 32; /* QK5_1 */
     const uint j = get_local_id(0);

     const float d = vload_half(0, (__global half*) &x[i].d);

@@ -124,28 +112,38 @@ __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float*
     const int x0 = (x[i].qs[j] & 0xf) | xh_0;
     const int x1 = (x[i].qs[j] >> 4) | xh_1;

-    y[i*qk + j + 0   ] = x0*d + m;
-    y[i*qk + j + qk/2] = x1*d + m;
+    y[i*32 + j + 0 ] = x0*d + m;
+    y[i*32 + j + 16] = x1*d + m;
 }

 __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
-    constant uint qk = QK8_0;
-    const uint i = get_global_id(0) / qk;
+    const uint i = get_global_id(0) / 32; /* QK8_0 */
     const uint j = get_local_id(0);

-    const float d = x[i].d;
-    y[i*qk + j] = x[i].qs[j]*d;
+    const float d = vload_half(0, (__global half*) &x[i].d);
+    y[i*32 + j] = x[i].qs[j]*d;
 }

 );
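The kernels above now read the scale as a packed `half` via `vload_half` and use literal block sizes in place of `constant uint` globals. A host-side C sketch of the same q4_0 dequantization, for reference (the `float d` field stands in for the device-side `half`; illustration only, not part of the commit):

```c
#include <stdint.h>

typedef struct {
    float   d;      // scale; stored as half (2 bytes) in the packed device struct
    uint8_t qs[16]; // 32 4-bit quants, two per byte (QK4_0 / 2)
} block_q4_0_ref;

// Mirrors dequantize_row_q4_0: one block expands to 32 floats.
static void dequantize_q4_0_ref(const block_q4_0_ref *x, float *y) {
    for (int j = 0; j < 16; j++) {
        const int x0 = (x->qs[j] & 0x0f) - 8; // low nibble, offset by 8
        const int x1 = (x->qs[j] >>   4) - 8; // high nibble, offset by 8
        y[j +  0] = x0 * x->d; // first half of the block
        y[j + 16] = x1 * x->d; // second half of the block
    }
}
```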
-#define CL_CHECK(err, name)                                                    \
-    do {                                                                       \
-        cl_int err_ = (err);                                                   \
-        if (err_ != CL_SUCCESS) {                                              \
-            fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
-            exit(1);                                                           \
-        }                                                                      \
+#define CL_CHECK(err)                                               \
+    do {                                                            \
+        cl_int err_ = (err);                                        \
+        if (err_ != CL_SUCCESS) {                                   \
+            fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n",  \
+                #err, err_, __FILE__, __LINE__);                    \
+            exit(1);                                                \
+        }                                                           \
     } while (0)
+
+#define CLBLAST_CHECK(err)                                          \
+    do {                                                            \
+        CLBlastStatusCode err_ = (err);                             \
+        if (err_ != CLBlastSuccess) {                               \
+            fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n",  \
+                #err, err_, __FILE__, __LINE__);                    \
+            exit(1);                                                \
+        }                                                           \
+    } while (0)
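The reworked `CL_CHECK` takes a single expression and stringifies it with `#err`, so API calls that return a handle and report failure through an out-parameter are wrapped in a comma expression later in this file, e.g. `CL_CHECK((context = clCreateContext(..., &err), err))`. A standalone C sketch of that pattern with a stand-in constructor (illustration only):

```c
#include <stdio.h>
#include <stdlib.h>

#define CHECK(err)                                    \
    do {                                              \
        int err_ = (err);                             \
        if (err_ != 0) {                              \
            fprintf(stderr, "%s error %d at %s:%d\n", \
                #err, err_, __FILE__, __LINE__);      \
            exit(1);                                  \
        }                                             \
    } while (0)

// Stand-in for an OpenCL-style constructor: returns a handle, sets *err.
static void * make_handle(int *err) { *err = 0; return (void *) 0x1; }

int main(void) {
    int err = 0;
    void * h;
    // Comma expression: run the call, then hand the error code to the macro.
    CHECK((h = make_handle(&err), err));
    printf("handle ok: %p\n", h);
    return 0;
}
```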
 static cl_platform_id platform;

@@ -188,48 +186,174 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co

 void ggml_cl_init(void) {
     cl_int err = 0;
-    char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
-    char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
-    int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
-    int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
-    printf("\nInitializing CLBlast (First Run)...");
-    printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
-    cl_uint num_platforms;
-    clGetPlatformIDs(0, NULL, &num_platforms);
-    cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
-    clGetPlatformIDs(num_platforms, platforms, NULL);
-    platform = platforms[plat_num];
-    char platform_buffer[1024];
-    clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL);
-    cl_uint num_devices;
-    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
-    cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
-    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
-    device = devices[dev_num];
-    char device_buffer[1024];
-    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL);
-    printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
-    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
-    CL_CHECK(err, "clCreateContext");
-    queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
-    CL_CHECK(err, "clCreateCommandQueue");
-
-    free(platforms);
-    free(devices);
-
-    program = build_program_from_source(context, device, clblast_dequant);
+    struct cl_device;
+    struct cl_platform {
+        cl_platform_id id;
+        unsigned number;
+        char name[128];
+        char vendor[128];
+        struct cl_device * devices;
+        unsigned n_devices;
+        struct cl_device * default_device;
+    };
+
+    struct cl_device {
+        struct cl_platform * platform;
+        cl_device_id id;
+        unsigned number;
+        cl_device_type type;
+        char name[128];
+    };
+
+    enum { NPLAT = 16, NDEV = 16 };
+
+    struct cl_platform platforms[NPLAT];
+    unsigned n_platforms = 0;
+    struct cl_device devices[NDEV];
+    unsigned n_devices = 0;
+    struct cl_device * default_device = NULL;
+
+    platform = NULL;
+    device = NULL;
+
+    cl_platform_id platform_ids[NPLAT];
+    CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms));
+
+    for (unsigned i = 0; i < n_platforms; i++) {
+        struct cl_platform * p = &platforms[i];
+        p->number = i;
+        p->id = platform_ids[i];
+        CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL));
+        CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL));
+
+        cl_device_id device_ids[NDEV];
+        cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices);
+        if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
+            p->n_devices = 0;
+        } else {
+            CL_CHECK(clGetDeviceIDsError);
+        }
+        p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
+        p->default_device = NULL;
+
+        for (unsigned j = 0; j < p->n_devices; j++) {
+            struct cl_device * d = &devices[n_devices];
+            d->number = n_devices++;
+            d->id = device_ids[j];
+            d->platform = p;
+            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
+            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
+
+            if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
+                p->default_device = d;
+            }
+        }
+
+        if (default_device == NULL && p->default_device != NULL) {
+            default_device = p->default_device;
+        }
+    }
+
+    if (n_devices == 0) {
+        fprintf(stderr, "ggml_opencl: could find any OpenCL devices.\n");
+        exit(1);
+    }
+
+    char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
+    char * user_device_string = getenv("GGML_OPENCL_DEVICE");
+    int user_platform_number = -1;
+    int user_device_number = -1;
+
+    unsigned n;
+    if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
+        user_platform_number = (int)n;
+    }
+    if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) {
+        user_device_number = (int)n;
+    }
+
+    struct cl_device * selected_devices = devices;
+    unsigned n_selected_devices = n_devices;
+
+    if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
+        for (unsigned i = 0; i < n_platforms; i++) {
+            struct cl_platform * p = &platforms[i];
+            if (strstr(p->name, user_platform_string) != NULL ||
+                strstr(p->vendor, user_platform_string) != NULL) {
+                user_platform_number = (int)i;
+                break;
+            }
+        }
+        if (user_platform_number == -1) {
+            fprintf(stderr, "ggml_opencl: no platform matching '%s' was found.\n", user_platform_string);
+            exit(1);
+        }
+    }
+    if (user_platform_number != -1) {
+        struct cl_platform * p = &platforms[user_platform_number];
+        selected_devices = p->devices;
+        n_selected_devices = p->n_devices;
+        default_device = p->default_device;
+        if (n_selected_devices == 0) {
+            fprintf(stderr, "ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
+            exit(1);
+        }
+    }
+
+    if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
+        for (unsigned i = 0; i < n_selected_devices; i++) {
+            struct cl_device * d = &selected_devices[i];
+            if (strstr(d->name, user_device_string) != NULL) {
+                user_device_number = d->number;
+                break;
+            }
+        }
+        if (user_device_number == -1) {
+            fprintf(stderr, "ggml_opencl: no device matching '%s' was found.\n", user_device_string);
+            exit(1);
+        }
+    }
+    if (user_device_number != -1) {
+        selected_devices = &devices[user_device_number];
+        n_selected_devices = 1;
+        default_device = &selected_devices[0];
+    }
+
+    GGML_ASSERT(n_selected_devices > 0);
+
+    if (default_device == NULL) {
+        default_device = &selected_devices[0];
+    }
+
+    fprintf(stderr, "ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
+    fprintf(stderr, "ggml_opencl: selecting device: '%s'\n", default_device->name);
+    if (default_device->type != CL_DEVICE_TYPE_GPU) {
+        fprintf(stderr, "ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
+    }
+
+    platform = default_device->platform->id;
+    device = default_device->id;
+
+    cl_context_properties properties[] = {
+        (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
+    };
+
+    CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
+
+    CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
+        (err != CL_INVALID_PROPERTY && err != CL_INVALID_VALUE ? err :
+            (queue = clCreateCommandQueue(context, device, 0, &err), err)
+        )));
+
+    program = build_program_from_source(context, device, program_source);

     // Prepare dequantize kernels
-    kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
-    CL_CHECK(err, "clCreateKernel");
-    kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
-    CL_CHECK(err, "clCreateKernel");
-    kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
-    CL_CHECK(err, "clCreateKernel");
-    kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
-    CL_CHECK(err, "clCreateKernel");
-    kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
-    CL_CHECK(err, "clCreateKernel");
+    CL_CHECK((kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err), err));
+    CL_CHECK((kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err), err));
+    CL_CHECK((kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err), err));
+    CL_CHECK((kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err), err));
+    CL_CHECK((kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err), err));
 }

 static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
@@ -242,9 +366,8 @@ static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags
         clReleaseMemObject(*buf);
     }
     cl_int err;
-    *buf = clCreateBuffer(context, flags, req_size, NULL, &err);
+    CL_CHECK((*buf = clCreateBuffer(context, flags, req_size, NULL, &err), err));
     *cur_size = req_size;
-    CL_CHECK(err, "clCreateBuffer");
 }

 void ggml_cl_sgemm_wrapper(

@@ -253,7 +376,6 @@ void ggml_cl_sgemm_wrapper(
         const float alpha, const void *host_a, const int lda,
         const float *host_b, const int ldb, const float beta,
         float *host_c, const int ldc, const int btype) {
-    cl_int err = 0;

     cl_kernel kernel;
     size_t global = n * k, local, size_qb;

@@ -267,13 +389,13 @@ void ggml_cl_sgemm_wrapper(
            dequant = true;
            kernel = kernel_q4_0;
            local = 16;
-            size_qb = global * (sizeof(float) + local) / 32;
+            size_qb = global * (sizeof(ggml_fp16_t) + local) / 32;
            break;
        case GGML_TYPE_Q4_1:
            dequant = true;
            kernel = kernel_q4_1;
            local = 16;
-            size_qb = global * (sizeof(float) * 2 + local) / 32;
+            size_qb = global * (sizeof(ggml_fp16_t) * 2 + local) / 32;
            break;
        case GGML_TYPE_Q5_0:
            dequant = true;

@@ -291,7 +413,7 @@ void ggml_cl_sgemm_wrapper(
            dequant = true;
            kernel = kernel_q8_0;
            local = 32;
-            size_qb = global * (sizeof(float) + local) / 32;
+            size_qb = global * (sizeof(ggml_fp16_t) + local) / 32;
            break;
        default:
            fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);

@@ -313,49 +435,40 @@ void ggml_cl_sgemm_wrapper(
     cl_event ev_a, ev_qb, ev_b;

     if (dequant) {
-        err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
-        err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
-        CL_CHECK(err, "clSetKernelArg");
-        err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
-        CL_CHECK(err, "clEnqueueWriteBuffer qb");
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b));
+        CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb));
     } else {
-        err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
-        CL_CHECK(err, "clEnqueueWriteBuffer b");
+        CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b));
     }

-    err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
-    CL_CHECK(err, "clEnqueueWriteBuffer a");
+    CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a));
     if (dequant) {
-        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
-        CL_CHECK(err, "clEnqueueNDRangeKernel");
-        clReleaseEvent(ev_qb);
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b));
+        CL_CHECK(clReleaseEvent(ev_qb));
     }
-    clWaitForEvents(1, &ev_a);
-    clWaitForEvents(1, &ev_b);
-    clReleaseEvent(ev_a);
-    clReleaseEvent(ev_b);
+    CL_CHECK(clWaitForEvents(1, &ev_a));
+    CL_CHECK(clWaitForEvents(1, &ev_b));
+    CL_CHECK(clReleaseEvent(ev_a));
+    CL_CHECK(clReleaseEvent(ev_b));

     cl_event ev_sgemm;
-    CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
-                                            (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
-                                            m, n, k,
-                                            alpha,
-                                            cl_buffer_a, 0, lda,
-                                            cl_buffer_b, 0, ldb,
-                                            beta,
-                                            cl_buffer_c, 0, ldc,
-                                            &queue, &ev_sgemm);
-
-    if (status != CLBlastSuccess) {
-        fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
-        abort();
-    }
+    CLBLAST_CHECK(CLBlastSgemm(
+        (CLBlastLayout)order,
+        (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
+        m, n, k,
+        alpha,
+        cl_buffer_a, 0, lda,
+        cl_buffer_b, 0, ldb,
+        beta,
+        cl_buffer_c, 0, ldc,
+        &queue, &ev_sgemm));

     cl_event ev_c;
-    clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
+    CL_CHECK(clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c));

     // Wait for completion
-    clWaitForEvents(1, &ev_c);
-    clReleaseEvent(ev_sgemm);
-    clReleaseEvent(ev_c);
+    CL_CHECK(clWaitForEvents(1, &ev_c));
+    CL_CHECK(clReleaseEvent(ev_sgemm));
+    CL_CHECK(clReleaseEvent(ev_c));
 }
ggml.c (357 changes)
@@ -740,19 +740,19 @@ inline static float vaddvq_f32(float32x4_t v) {
     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
 }

-float vminvq_f32(float32x4_t v) {
+inline static float vminvq_f32(float32x4_t v) {
     return
         MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
             MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
 }

-float vmaxvq_f32(float32x4_t v) {
+inline static float vmaxvq_f32(float32x4_t v) {
     return
         MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
             MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
 }

-int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
     int32x4_t res;

     res[0] = roundf(vgetq_lane_f32(v, 0));

@@ -766,7 +766,6 @@ int32x4_t vcvtnq_s32_f32(float32x4_t v) {
 #endif
 #endif

-
 #define QK4_0 32
 typedef struct {
     ggml_fp16_t d; // delta
@@ -1056,6 +1055,39 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
             y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
         }
     }
+#elif defined(__wasm_simd128__)
+    for (int i = 0; i < nb; i++) {
+        v128_t srcv [8];
+        v128_t asrcv[8];
+        v128_t amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = wasm_v128_load(x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
+                                   wasm_f32x4_extract_lane(amaxv[0], 1)),
+                               MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
+                                   wasm_f32x4_extract_lane(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        for (int j = 0; j < 8; j++) {
+            const v128_t v  = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
+            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
+
+            y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
+            y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
+            y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
+            y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
+        }
+    }
 #elif defined(__AVX2__) || defined(__AVX__)
     for (int i = 0; i < nb; i++) {
         // Load elements into 4 AVX vectors
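For orientation, a scalar C reference of the q8_0 quantization implemented by the SIMD path added above (illustrative only; the real code stores `d` as fp16 via `GGML_FP32_TO_FP16`):

```c
#include <math.h>
#include <stdint.h>

// One q8_0 block: 32 floats -> a scale d plus 32 signed 8-bit quants.
static void quantize_row_q8_0_ref(const float *x, float *d_out, int8_t qs[32]) {
    float amax = 0.0f; // absolute max of the block
    for (int j = 0; j < 32; j++) {
        const float v = fabsf(x[j]);
        if (v > amax) amax = v;
    }
    const float d  = amax / ((1 << 7) - 1); // map amax to 127
    const float id = d ? 1.0f / d : 0.0f;   // guard against an all-zero block
    *d_out = d;
    for (int j = 0; j < 32; j++) {
        qs[j] = (int8_t) truncf(x[j] * id); // truncating, like i32x4_trunc_sat
    }
}
```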
@@ -1224,6 +1256,48 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int

         y[i].s = d * vaddvq_s32(accv);
     }
+#elif defined(__wasm_simd128__)
+    for (int i = 0; i < nb; i++) {
+        v128_t srcv [8];
+        v128_t asrcv[8];
+        v128_t amaxv[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = wasm_v128_load(x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
+                                   wasm_f32x4_extract_lane(amaxv[0], 1)),
+                               MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
+                                   wasm_f32x4_extract_lane(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = d;
+
+        v128_t accv = wasm_i32x4_splat(0);
+
+        for (int j = 0; j < 8; j++) {
+            const v128_t v  = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
+            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
+
+            y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
+            y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
+            y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
+            y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
+
+            accv = wasm_i32x4_add(accv, vi);
+        }
+
+        y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) +
+                      wasm_i32x4_extract_lane(accv, 1) +
+                      wasm_i32x4_extract_lane(accv, 2) +
+                      wasm_i32x4_extract_lane(accv, 3));
+    }
 #elif defined(__AVX2__) || defined(__AVX__)
     for (int i = 0; i < nb; i++) {
         // Load elements into 4 AVX vectors
@@ -2598,7 +2672,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         const block_q8_0 * restrict y0 = &y[i];

         const v128_t m4b  = wasm_i8x16_splat(0x0F);
-        const v128_t s16b = wasm_i8x16_splat(0x10);

         // extract the 5th bit
         memcpy(&qh, x0->qh, sizeof(qh));

@@ -2636,15 +2709,14 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
         const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);

-        const float x0d = GGML_FP16_TO_FP32(x0->d);
-
         // dot product
         sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
                         wasm_i32x4_add(
                             wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
                                            wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
                             wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
-                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
+                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
+                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
     }

     *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +

@@ -2868,8 +2940,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         const v128_t v0l = wasm_v128_and (v0, m4b);
         const v128_t v0h = wasm_u8x16_shr(v0, 4);

-        static bool x = true;
-
         // add high bit
         const v128_t v0lf = wasm_v128_or(v0l, qhl);
         const v128_t v0hf = wasm_v128_or(v0h, qhh);

@@ -2892,11 +2962,11 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         // dot product
         sumv = wasm_f32x4_add(sumv,
             wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
-                wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
-                               wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
-                wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
-                               wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
-            wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d));
+                    wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
+                                   wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
+                    wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
+                                   wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
+                wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
     }

     *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -3472,6 +3542,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "ROPE",
     "ROPE_BACK",
     "ALIBI",
+    "CLAMP",
     "CONV_1D_1S",
     "CONV_1D_2S",

@@ -3482,7 +3553,8 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "MAP_BINARY",
 };

-static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50");
+static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");


 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",

@@ -3532,6 +3604,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "rope(x)",
     "rope_back(x)",
     "alibi(x)",
+    "clamp(x)",
     "conv_1d_1s(x)",
     "conv_1d_2s(x)",

@@ -3542,7 +3615,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "f(x,y)",
 };

-static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50");
+static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");

 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -3776,6 +3849,12 @@ static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct g
         (t1->ne[3]%t0->ne[3] == 0);
 }

+static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1);
+}
+
 static inline int ggml_up32(int n) {
     return (n + 31) & ~31;
 }

@@ -4658,11 +4737,15 @@ struct ggml_tensor * ggml_mul_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
+    // TODO: support less-strict constraint
+    //       GGML_ASSERT(ggml_can_repeat(b, a));
+    GGML_ASSERT(ggml_can_repeat_rows(b, a));

     bool is_node = false;

     if (!inplace && (a->grad || b->grad)) {
+        // TODO: support backward pass for broadcasting
+        GGML_ASSERT(ggml_are_same_shape(a, b));
         is_node = true;
     }
@@ -6204,7 +6287,8 @@ struct ggml_tensor * ggml_alibi(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         int                   n_past,
-        int                   n_head) {
+        int                   n_head,
+        float                 bias_max) {
     GGML_ASSERT(n_past >= 0);
     bool is_node = false;

@@ -6223,6 +6307,8 @@ struct ggml_tensor * ggml_alibi(

     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_head;
+    GGML_ASSERT(sizeof(float) == sizeof(int32_t));
+    (((float *) b->data)[2]) = bias_max;

     ggml_scratch_load(ctx);

@@ -6234,6 +6320,40 @@ struct ggml_tensor * ggml_alibi(
     return result;
 }

+// ggml_clamp
+
+struct ggml_tensor * ggml_clamp(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 min,
+        float                 max) {
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    // TODO: when implement backward, fix this:
+    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+
+    ((float *) b->data)[0] = min;
+    ((float *) b->data)[1] = max;
+
+    ggml_scratch_load(ctx);
+
+    result->op   = GGML_OP_CLAMP;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+
+    return result;
+}
+
 // ggml_conv_1d_1s

 struct ggml_tensor * ggml_conv_1d_1s(
@@ -7960,7 +8080,7 @@ static void ggml_compute_forward_mul_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;

@@ -7968,10 +8088,25 @@ static void ggml_compute_forward_mul_f32(
     const int ith = params->ith;
     const int nth = params->nth;

-    const int nr  = ggml_nrows(src0);
-    const int64_t ne0 = src0->ne[0];
-    const int64_t ne1 = src0->ne[1];
-    const int64_t ne2 = src0->ne[2];
+#ifdef GGML_USE_CUBLAS
+    if (src1->backend == GGML_BACKEND_CUDA) {
+        if (ith == 0) {
+            ggml_cuda_mul(src0, src1, dst);
+        }
+        return;
+    }
+#endif
+
+    const int64_t nr = ggml_nrows(src0);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];

     const size_t nb00 = src0->nb[0];
     const size_t nb01 = src0->nb[1];

@@ -7990,44 +8125,51 @@ static void ggml_compute_forward_mul_f32(

     GGML_ASSERT( nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(ne00 == ne10);

     if (nb10 == sizeof(float)) {
-        for (int ir = ith; ir < nr; ir += nth) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);

 #ifdef GGML_USE_ACCELERATE
             UNUSED(ggml_vec_mul_f32);

-            vDSP_vmul(
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ), 1,
-                    ne0);
+            vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
 #else
-            ggml_vec_mul_f32(ne0,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+            ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
 #endif
+                // }
+            // }
         }
     } else {
         // src1 is not contiguous
-        for (int ir = ith; ir < nr; ir += nth) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);

-            float * dst_ptr  = (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-            for (int i0 = 0; i0 < ne0; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+            for (int64_t i0 = 0; i0 < ne00; i0++) {
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);

                 dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
             }
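The rewritten loops map each row of `src0`/`dst` onto a row of `src1` by reducing every higher-dimension index modulo `src1`'s extent, which is exactly what `ggml_can_repeat_rows` licenses (equal `ne[0]`, remaining dims divisible). A small C sketch of that index arithmetic (illustration only):

```c
#include <stdio.h>

// Decompose flat row index ir into (i01, i02, i03) over src0's shape, then
// wrap each coordinate into src1's (possibly smaller) shape, as above.
static void broadcast_row(long ir, long ne01, long ne02,
                          long ne11, long ne12, long ne13,
                          long *i11, long *i12, long *i13) {
    const long i03 = ir / (ne02 * ne01);
    const long i02 = (ir - i03 * ne02 * ne01) / ne01;
    const long i01 = (ir - i03 * ne02 * ne01 - i02 * ne01);
    *i13 = i03 % ne13;
    *i12 = i02 % ne12;
    *i11 = i01 % ne11;
}

int main(void) {
    long i11, i12, i13;
    // src0 rows: 4 x 2 x 1; src1 rows: 1 x 2 x 1 -> one src1 row per matrix
    for (long ir = 0; ir < 8; ir++) {
        broadcast_row(ir, 4, 2, 1, 2, 1, &i11, &i12, &i13);
        printf("ir=%ld -> src1 row (%ld,%ld,%ld)\n", ir, i11, i12, i13);
    }
    return 0;
}
```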
@@ -10521,6 +10663,7 @@ static void ggml_compute_forward_diag_mask_f32(

     const int  n_past  = ((int32_t *) src1->data)[0];
+    const bool inplace = (bool)((int32_t *) src1->data)[1];

     assert(n_past >= 0);

     if (!inplace && (params->type == GGML_TASK_INIT)) {
@@ -10691,14 +10834,15 @@ static void ggml_compute_forward_alibi_f32(
         struct ggml_tensor * dst) {
     assert(params->ith == 0);
     assert(src1->type == GGML_TYPE_I32);
-    assert(ggml_nelements(src1) == 2);
+    assert(ggml_nelements(src1) == 3);

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }

-    const int n_past = ((int32_t *) src1->data)[0];
-    const int n_head = ((int32_t *) src1->data)[1];
+    const int   n_past   = ((int32_t *) src1->data)[0];
+    const int   n_head   = ((int32_t *) src1->data)[1];
+    const float max_bias = ((float *) src1->data)[2];

     assert(n_past >= 0);

@@ -10721,8 +10865,8 @@ static void ggml_compute_forward_alibi_f32(
     // add alibi to src0 (KQ_scaled)
     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));

-    const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
+    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

     for (int i = 0; i < ne0; i++) {
         for (int j = 0; j < ne1; j++) {

@@ -10740,13 +10884,13 @@ static void ggml_compute_forward_alibi_f32(
                 m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
             }

-            pdst[0] = i * m_k + src[0];
+            pdst[0] = (i-ne0+1) * m_k + src[0];
         }
     }
 }

 static void ggml_compute_forward_alibi_f16(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,

@@ -10754,14 +10898,15 @@ static void ggml_compute_forward_alibi_f16(
         struct ggml_tensor * dst) {
     assert(params->ith == 0);
     assert(src1->type == GGML_TYPE_I32);
-    assert(ggml_nelements(src1) == 2);
+    assert(ggml_nelements(src1) == 3);

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }

-    const int n_past = ((int32_t *) src1->data)[0];
-    const int n_head = ((int32_t *) src1->data)[1];
+    const int   n_past   = ((int32_t *) src1->data)[0];
+    const int   n_head   = ((int32_t *) src1->data)[1];
+    const float max_bias = ((float *) src1->data)[2];

     assert(n_past >= 0);

@@ -10784,8 +10929,8 @@ static void ggml_compute_forward_alibi_f16(
     // add alibi to src0 (KQ_scaled)
     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));

-    const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
+    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

     for (int i = 0; i < ne0; i++) {
         for (int j = 0; j < ne1; j++) {

@@ -10804,7 +10949,7 @@ static void ggml_compute_forward_alibi_f16(
             }

             // we return F32
-            pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
+            pdst[0] = (i-ne0+1) * m_k + GGML_FP16_TO_FP32(src[0]);
         }
     }
 }
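A worked reference for the generalized slopes above: with `max_bias = 8` (the previously hard-coded value) and a power-of-two `n_head`, head `k` gets slope 2^(-8·(k+1)/n_head), while the `m1` branch interpolates extra slopes when `n_head` is not a power of two. A small C sketch combining both branches (the `powf(m0, k + 1)` branch is inferred from the surrounding code):

```c
#include <math.h>
#include <stdio.h>

static float alibi_slope(int k, int n_head, float max_bias) {
    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
    const float m0 = powf(2.0f, -(max_bias)        / n_heads_log2_floor);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
    return k < n_heads_log2_floor
        ? powf(m0, k + 1)                             // power-of-two heads
        : powf(m1, 2 * (k - n_heads_log2_floor) + 1); // interpolated heads
}

int main(void) {
    for (int k = 0; k < 8; k++) {
        // n_head = 8, max_bias = 8 -> slopes 0.5, 0.25, ..., 2^-8
        printf("head %d: slope %g\n", k, alibi_slope(k, 8, 8.0f));
    }
    return 0;
}
```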
@@ -10840,6 +10985,77 @@ static void ggml_compute_forward_alibi(
     }
 }
 
+// ggml_compute_forward_clamp
+
+static void ggml_compute_forward_clamp_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    assert(params->ith == 0);
+    assert(src1->type == GGML_TYPE_I32);
+    assert(ggml_nelements(src1) == 2);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int min = ((float *) src1->data)[0];
+    const int max = ((float *) src1->data)[1];
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+
+    GGML_ASSERT( nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    for (int j = ith; j < n; j += nth) {
+        float * dst_ptr  = (float *) ((char *)  dst->data + j*nb1);
+        float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
+
+        for (int i = 0; i < nc; i++) {
+            dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
+        }
+    }
+}
+
+static void ggml_compute_forward_clamp(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_clamp_f32(params, src0, src1, dst);
+            } break;
+        case GGML_TYPE_F16:
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
+        case GGML_TYPE_I8:
+        case GGML_TYPE_I16:
+        case GGML_TYPE_I32:
+        case GGML_TYPE_COUNT:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_rope
 
 static void ggml_compute_forward_rope_f32(
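The new kernel walks rows round-robin (`for (int j = ith; j < n; j += nth)`), the usual ggml pattern for splitting work across threads, even though the scheduler hunk below assigns `GGML_OP_CLAMP` a single task for now. A small sketch of which rows each thread would touch under that stride (row and thread counts are made up):

```cpp
#include <cstdio>

int main() {
    const int n   = 10; // hypothetical number of rows
    const int nth = 4;  // hypothetical number of threads

    for (int ith = 0; ith < nth; ith++) {
        printf("thread %d handles rows:", ith);
        for (int j = ith; j < n; j += nth) { // same stride pattern as the kernel
            printf(" %d", j);
        }
        printf("\n");
    }
}
```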
@@ -12821,6 +13037,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
             } break;
+        case GGML_OP_CLAMP:
+            {
+                ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
+            } break;
         case GGML_OP_CONV_1D_1S:
             {
                 ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
@@ -13128,6 +13348,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_CLAMP:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_SILU:
             {
                 // necessary for llama
@@ -14007,6 +14231,10 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 {
                     node->n_tasks = 1; //TODO
                 } break;
+            case GGML_OP_CLAMP:
+                {
+                    node->n_tasks = 1; //TODO
+                } break;
             case GGML_OP_CONV_1D_1S:
             case GGML_OP_CONV_1D_2S:
                 {
@@ -14403,9 +14631,12 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             fprintf(fp, "%s |", node->name);
         }
 
-        fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s",
-                i, node->ne[0], node->ne[1],
-                GGML_OP_SYMBOL[node->op]);
+        if (node->n_dims == 2) {
+            fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], GGML_OP_SYMBOL[node->op]);
+        } else {
+            fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
+        }
 
         if (node->grad) {
             fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
14
ggml.h
@@ -313,6 +313,7 @@ extern "C" {
         GGML_OP_ROPE,
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
+        GGML_OP_CLAMP,
         GGML_OP_CONV_1D_1S,
         GGML_OP_CONV_1D_2S,
 
@@ -849,7 +850,7 @@ extern "C" {
             int                   n_past);
 
     // in-place, returns view(a)
-    GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace(
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past);
@@ -897,7 +898,16 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past,
-           int                   n_head);
+           int                   n_head,
+           float                 bias_max);
+
+    // clamp
+    // in-place, returns view(a)
+    struct ggml_tensor * ggml_clamp(
+           struct ggml_context * ctx,
+           struct ggml_tensor  * a,
+           float                 min,
+           float                 max);
 
     // padding = 1
     // TODO: we don't support extra parameters for now
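Taken together, the header now exposes clamping and a parametric ALiBi. A hedged sketch of how a graph-building caller might use the two (tensor shapes and the constants are illustrative, not from this commit):

```cpp
#include "ggml.h"

// Hypothetical graph fragment: apply ALiBi with an explicit bias_max, then
// clamp the result. kq_scaled stands in for the usual attention scores.
struct ggml_tensor * alibi_then_clamp(struct ggml_context * ctx, struct ggml_tensor * kq_scaled) {
    struct ggml_tensor * cur = ggml_alibi(ctx, kq_scaled, /*n_past =*/ 0, /*n_head =*/ 8, /*bias_max =*/ 8.0f);
    return ggml_clamp(ctx, cur, /*min =*/ -30.0f, /*max =*/ 30.0f);
}
```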
llama-util.h
@@ -172,7 +172,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -184,9 +184,9 @@ struct llama_mmap {
             throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }
 
-        if (prefetch) {
+        if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
+            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
                 fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
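The constructor change turns the prefetch flag into a byte budget: `(size_t) -1` keeps the old preload-everything behavior, `0` disables the `madvise` call entirely, and anything in between caps the preload. A minimal POSIX sketch of the same pattern, independent of the llama.cpp types:

```cpp
#include <sys/mman.h>
#include <algorithm>
#include <cstddef>
#include <cstdio>

// addr/size would come from a successful mmap() of the model file.
void advise_prefetch(void * addr, size_t size, size_t prefetch) {
    if (prefetch > 0) {
        // advise the kernel to preload at most `prefetch` bytes of the mapping
        if (madvise(addr, std::min(size, prefetch), MADV_WILLNEED)) {
            perror("madvise(MADV_WILLNEED)");
        }
    }
}
```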
239
llama.cpp
@@ -1,6 +1,7 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
+#include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #endif
@@ -426,26 +427,30 @@ struct llama_file_loader {
     }
     void read_magic() {
         uint32_t magic = file.read_u32();
-        uint32_t version = 0;
-
-        if (magic != 'ggml') {
-            version = file.read_u32();
-        }
-
-        if (magic == 'ggml' && version == 0) {
+        if (magic == LLAMA_FILE_MAGIC_GGML) {
             file_version = LLAMA_FILE_VERSION_GGML;
-        } else if (magic == 'ggmf' && version == 1) {
-            file_version = LLAMA_FILE_VERSION_GGMF_V1;
-        } else if (magic == 'ggjt' && version == 1) {
-            file_version = LLAMA_FILE_VERSION_GGJT_V1;
-        } else if (magic == 'ggjt' && version == 2) {
-            file_version = LLAMA_FILE_VERSION_GGJT_V2;
-        } else if (magic == 'ggjt' && version == 3) {
-            file_version = LLAMA_FILE_VERSION_GGJT_V3;
-        } else {
-            throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
-                         magic, version);
+            return;
         }
+
+        uint32_t version = file.read_u32();
+
+        switch (magic) {
+            case LLAMA_FILE_MAGIC_GGMF:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
+                }
+                break;
+            case LLAMA_FILE_MAGIC_GGJT:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
+                    case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
+                    case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
+                }
+        }
+
+        throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+                     magic, version);
     }
     void read_hparams() {
         hparams.n_vocab = file.read_u32();
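The rewritten reader encodes the on-disk header layout directly: an unversioned 'ggml' file carries only the 4-byte magic, while every later format is magic followed by a version `uint32_t`. A small sketch of that layout (the constants are copied from the llama.h hunk further down; the file contents are hypothetical):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t MAGIC_GGML = 0x67676d6cu; // 'ggml' (no version field follows)
    const uint32_t MAGIC_GGJT = 0x67676a74u; // 'ggjt' (a version u32 follows)

    uint32_t header[2] = { MAGIC_GGJT, 3 }; // hypothetical ggjt v3 header

    if (header[0] == MAGIC_GGML) {
        printf("unversioned ggml file\n");
    } else {
        printf("magic %08x, version %u\n", header[0], header[1]);
    }
}
```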
@@ -645,7 +650,7 @@ struct llama_model_loader {
         }
     }
 
-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
             throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -656,10 +661,10 @@ struct llama_model_loader {
                 name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }
 
-        return get_tensor_for(lt);
+        return get_tensor_for(lt, backend);
     }
 
-    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -669,6 +674,7 @@ struct llama_model_loader {
         }
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+        tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
         return tensor;
@@ -682,12 +688,16 @@ struct llama_model_loader {
 
     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
+        size_t prefetch_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                prefetch_size += lt.size;
+            }
         }
 
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
             if (!lmlock) {
                 // Don't call the callback since the actual loading will be lazy
                 // and we can't measure it.
@@ -700,6 +710,9 @@ struct llama_model_loader {
 
         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+                continue;
+            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
@@ -712,9 +725,6 @@ struct llama_model_loader {
                 lmlock->grow_to(done_size);
             }
         }
-        if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
-        }
     }
 
     void load_data_for(llama_load_tensor & lt) {
@@ -969,27 +979,7 @@ static void llama_model_load_internal(
     size_t ctx_size;
     size_t mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/1024.0/1024.0);
-
-    // print memory requirements
-    {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
-        // this is the total memory required to run the inference
-        const size_t mem_required =
-            ctx_size +
-            mmapped_size +
-            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
-
-        // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
-
-        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-    }
+    fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
 
     // create the ggml context
     {
@@ -1011,7 +1001,14 @@ static void llama_model_load_internal(
         }
     }
 
+#ifdef GGML_USE_CUBLAS
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+#else
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#endif
+
     // prepare memory for the weights
+    size_t vram_total = 0;
     {
         const uint32_t n_embd  = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
@@ -1019,33 +1016,87 @@ static void llama_model_load_internal(
 
         ml->ggml_ctx = ctx;
 
-        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-        model.norm           = ml->get_tensor("norm.weight",           {n_embd});
-        model.output         = ml->get_tensor("output.weight",         {n_embd, n_vocab});
+        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+        model.norm           = ml->get_tensor("norm.weight",           {n_embd},          GGML_BACKEND_CPU);
+
+        // "output" tensor
+        {
+            ggml_backend backend_output;
+            if (n_gpu_layers > int(n_layer)) { // NOLINT
+                backend_output = LLAMA_BACKEND_OFFLOAD;
+            } else {
+                backend_output = GGML_BACKEND_CPU;
+            }
+
+            model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+        }
+
+        const int i_gpu_start = n_layer - n_gpu_layers;
 
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
+            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+
             auto & layer = model.layers[i];
 
             std::string layers_i = "layers." + std::to_string(i);
 
-            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
 
-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
 
-            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
+            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
 
-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff});
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd});
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff});
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff}, backend);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff, n_embd}, backend);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff}, backend);
+
+            if (backend == GGML_BACKEND_CUDA) {
+                vram_total +=
+                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                    ggml_nbytes(layer.wv)             + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.w1)             + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+            }
         }
     }
 
     ml->done_getting_tensors();
 
+    // print memory requirements
+    {
+        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
+
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+            ctx_size +
+            mmapped_size - vram_total + // weights in VRAM not in memory
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
+
+        // this is the memory required by one llama_state
+        const size_t mem_required_state =
+            scale*MEM_REQ_KV_SELF().at(model.type);
+
+        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+
+#ifdef GGML_USE_CUBLAS
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+        }
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+#else
+        (void) n_gpu_layers;
+#endif
+    }
+
     // populate `tensors_by_name`
     for (llama_load_tensor & lt : ml->tensors_map.tensors) {
         model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
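The partition rule introduced here is `i_gpu_start = n_layer - n_gpu_layers`: layers below the split stay on the CPU, the last `n_gpu_layers` are tagged for offload, and the output matrix moves only when `n_gpu_layers` exceeds the layer count. A worked example with made-up sizes:

```cpp
#include <cstdio>

int main() {
    const int n_layer      = 32; // hypothetical 7B-style layer count
    const int n_gpu_layers = 10; // hypothetical user request

    const int i_gpu_start = n_layer - n_gpu_layers; // 22

    printf("CPU layers: 0..%d\n", i_gpu_start - 1);           // 0..21
    printf("GPU layers: %d..%d\n", i_gpu_start, n_layer - 1); // 22..31
    printf("output on GPU: %s\n", n_gpu_layers > n_layer ? "yes" : "no"); // no
}
```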
@@ -1053,36 +1104,34 @@ static void llama_model_load_internal(
 
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
-    model.mapping = std::move(ml->mapping);
 #ifdef GGML_USE_CUBLAS
     {
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
-        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
-
-        size_t vram_total = 0;
-
-        for (int i = 0; i < n_gpu; ++i) {
-            const auto & layer = model.layers[i];
-
-            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        size_t done_size = 0;
+        size_t data_size = 0;
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                done_size += lt.size;
+            }
         }
-        if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
-            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+                continue;
+            }
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+            done_size += lt.size;
         }
-
-        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
     }
-#else
-    (void) n_gpu_layers;
-#endif
+#endif // GGML_USE_CUBLAS
+
+    if (progress_callback) {
+        progress_callback(1.0f, progress_callback_user_data);
+    }
+
+    model.mapping = std::move(ml->mapping);
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
@@ -1181,10 +1230,8 @@ static bool llama_eval_internal(
         {
             cur = ggml_rms_norm(ctx0, inpL);
 
-            // cur = attention_norm*cur
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
-                        cur);
+            // cur = cur*attention_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
         }
 
         // self-attention
@@ -1291,10 +1338,8 @@ static bool llama_eval_internal(
         {
            cur = ggml_rms_norm(ctx0, inpFF);
 
-            // cur = ffn_norm*cur
-            cur = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
-                    cur);
+            // cur = cur*ffn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
         }
 
         struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1331,10 +1376,8 @@ static bool llama_eval_internal(
 
         inpL = ggml_rms_norm(ctx0, inpL);
 
-        // inpL = norm*inpL
-        inpL = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.norm, inpL),
-                    inpL);
+        // inpL = inpL*norm(broadcasted)
+        inpL = ggml_mul(ctx0, inpL, model.norm);
 
         embeddings = inpL;
     }
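These three hunks rely on `ggml_mul` broadcasting its second operand, so the explicit `ggml_repeat` of the 1-D norm weights (and the intermediate tensor it allocated) drops out of the graph. A hedged sketch of the before/after pattern (shapes are illustrative):

```cpp
#include "ggml.h"

// cur: [n_embd, n_tokens] activations, norm: [n_embd] weights
struct ggml_tensor * norm_mul(struct ggml_context * ctx,
                              struct ggml_tensor * cur,
                              struct ggml_tensor * norm) {
    cur = ggml_rms_norm(ctx, cur);
    // old: cur = ggml_mul(ctx, ggml_repeat(ctx, norm, cur), cur);
    return ggml_mul(ctx, cur, norm); // norm is broadcast across n_tokens
}
```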
@@ -2158,7 +2201,7 @@ struct llama_context * llama_init_from_file(
             unsigned * cur_percentage_p = (unsigned *) ctx;
             unsigned percentage = (unsigned) (100 * progress);
             while (percentage > *cur_percentage_p) {
-                ++*cur_percentage_p;
+                *cur_percentage_p = percentage;
                 fprintf(stderr, ".");
                 fflush(stderr);
                 if (percentage >= 100) {
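With the old increment, a callback that jumps several percentage points prints one dot per point; with the new assignment it prints a single dot per jump, which matters now that coarse GPU-loading updates can advance the counter in large steps. A toy comparison for a hypothetical jump from 10% to 15%:

```cpp
#include <cstdio>

int main() {
    const unsigned start = 10, pct = 15;

    unsigned old_dots = 0;
    for (unsigned cur = start; pct > cur; ++cur) { // old: ++*cur_percentage_p
        ++old_dots;
    }

    unsigned new_dots = 0;
    for (unsigned cur = start; pct > cur; cur = pct) { // new: *cur_percentage_p = percentage
        ++new_dots;
    }

    printf("old: %u dots, new: %u dot\n", old_dots, new_dots); // old: 5 dots, new: 1 dot
}
```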
@@ -2251,7 +2294,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
     {
         uint32_t magic;
         fin.read((char *) &magic, sizeof(magic));
-        if (magic != 'ggla') {
+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
             fprintf(stderr, "%s: bad file magic\n", __func__);
             return 1;
         }
@@ -2315,7 +2358,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
     // maybe this should in llama_model_loader
     if (model_loader->use_mmap) {
-        model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+        model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
     }
 }
@@ -2408,7 +2451,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             }
             size_t idx = model_loader->tensors_map.name_to_idx[base_name];
             llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-            base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+            base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
             lt.data = (uint8_t *) lt.ggml_tensor->data;
             model_loader->load_data_for(lt);
             lt.ggml_tensor->data = lt.data;
12
llama.h
@@ -19,10 +19,16 @@
 #    define LLAMA_API
 #endif
 
+#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
+#define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
+#define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
+#define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
+#define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
+
 #define LLAMA_FILE_VERSION           3
-#define LLAMA_FILE_MAGIC             'ggjt'
-#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
-#define LLAMA_SESSION_MAGIC          'ggsn'
+#define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
+#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
+#define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION        1
 
 #ifdef __cplusplus
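The named constants replace multicharacter literals, whose values are implementation-defined in C++; the hex values simply spell the ASCII tag, most significant byte first. A quick sketch that prints the tag back out:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t magic = 0x67676a74u; // LLAMA_FILE_MAGIC_GGJT
    for (int shift = 24; shift >= 0; shift -= 8) {
        putchar((int) ((magic >> shift) & 0xff)); // 0x67 'g', 0x67 'g', 0x6a 'j', 0x74 't'
    }
    putchar('\n'); // prints "ggjt"
}
```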