Compare commits

..

11 Commits

Author SHA1 Message Date
Henri Vasserman
0ecb1bbbeb [CI] Fix openblas (#1613)
* Fix OpenBLAS build

* Fix `LLAMA_BLAS_VENDOR` CMake variable that should be a string and not a boolean.
2023-05-27 17:24:06 +03:00
Georgi Gerganov
93618031c7 ggml : add ggml_tensor_overhead() 2023-05-27 16:19:56 +03:00
Henri Vasserman
83c54e6da5 [CI] CLBlast: Fix directory name (#1606) 2023-05-27 14:18:25 +02:00
Georgi Gerganov
bdbda1b17a ggml : sync ggml core (minor additions, e.g. ggml_get_tensor_by_name()) 2023-05-27 12:23:16 +03:00
Kerfuffle
66874d4fbc Some improvements to loading the session with --prompt-cache (#1550)
Improvements to loading the session with `--prompt-cache` in the `main` example.

1. Fix an issue where the `--seed` parameter was ignored when loading a cached prompt.
2. When loading a cached prompt, you previously had to specify the saved prompt (or a prefix of it) again. This pull changes that behavior to default to the prompt that was cached if a prompt wasn't specified by the user.
2023-05-25 20:18:01 -06:00
Johannes Gäßler
1fcdcc28b1 cuda : performance optimizations (#1530)
* xor hack

* block y dim

* loop unrolling

* Fixed cmake LLAMA_CUDA_BY option

* Removed hipblas compatibility code

* Define GGML_CUDA_DMMV_BLOCK_Y if not defined

* Fewer iters, more ops per iter

* Renamed DMMV X/Y compilation options
2023-05-26 00:07:29 +03:00
Henri Vasserman
ac7876ac20 Update CLBlast to 1.6.0 (#1580)
* Update CLBlast to 1.6.0
2023-05-24 10:30:09 +03:00
Evan Jones
c31bbe934b readme : add docs for chat-persistent.sh (#1568)
* readme : add docs for chat-persistent.sh

* Update README.md
2023-05-24 09:24:01 +03:00
Senemu
1359b6aba5 chat-persistent.sh : use bracket expressions in grep (#1564) 2023-05-24 09:16:22 +03:00
Maarten ter Huurne
7d873811f3 Fix handling of "invalid property" when creating OpenCL command queue (#1565)
The `clCreateCommandQueue()` function will return the code
`CL_INVALID_QUEUE_PROPERTIES` when passed unsupported properties,
not `CL_INVALID_PROPERTY` as the original code was checking for.
2023-05-23 19:01:15 +03:00
0cc4m
2e6cd4b025 OpenCL Token Generation Acceleration (#1459)
* Move back to C++ for OpenCL

* Refactor OpenCL code to work more like the CUDA code, add missing functions

* Deduplicate dequant kernels

* Add OpenCL compile options

* Use compile args for preprocessing constants

* Restore default platform + device selection by id behavior

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Henri Vasserman <henv@hot.ee>
2023-05-23 00:33:24 +03:00
14 changed files with 1326 additions and 624 deletions

View File

@@ -151,7 +151,7 @@ jobs:
env:
OPENBLAS_VERSION: 0.3.23
OPENCL_VERSION: 2023.04.17
CLBLAST_VERSION: 1.5.3
CLBLAST_VERSION: 1.6.0
strategy:
matrix:
@@ -165,7 +165,7 @@ jobs:
- build: 'clblast'
defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas'
defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include"'
defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
steps:
- name: Clone
@@ -184,13 +184,13 @@ jobs:
id: get_clblast
if: ${{ matrix.build == 'clblast' }}
run: |
curl.exe -o $env:RUNNER_TEMP/clblast.zip -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-Windows-x64.zip"
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
mkdir $env:RUNNER_TEMP/clblast
tar.exe -xvf $env:RUNNER_TEMP/clblast.zip -C $env:RUNNER_TEMP/clblast
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
$txt = Get-Content -Path $f -Raw
$txt.Replace('C:/dependencies/opencl/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
$txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
}
- name: Download OpenBLAS
@@ -213,7 +213,6 @@ jobs:
cd build
cmake .. ${{ matrix.defines }}
cmake --build . --config Release
cp ../LICENSE ./bin/Release/llama.cpp.txt
- name: Add clblast.dll
id: add_clblast_dll
@@ -258,6 +257,7 @@ jobs:
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts

View File

@@ -37,42 +37,44 @@ endif()
#
# general
option(LLAMA_STATIC "llama: static link libraries" OFF)
option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
option(LLAMA_LTO "llama: enable link time optimization" OFF)
option(LLAMA_STATIC "llama: static link libraries" OFF)
option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
option(LLAMA_LTO "llama: enable link time optimization" OFF)
# debug
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
option(LLAMA_GPROF "llama: enable gprof" OFF)
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
option(LLAMA_GPROF "llama: enable gprof" OFF)
# sanitizers
option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
# instruction set specific
option(LLAMA_AVX "llama: enable AVX" ON)
option(LLAMA_AVX2 "llama: enable AVX2" ON)
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
option(LLAMA_FMA "llama: enable FMA" ON)
option(LLAMA_AVX "llama: enable AVX" ON)
option(LLAMA_AVX2 "llama: enable AVX2" ON)
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
option(LLAMA_FMA "llama: enable FMA" ON)
# in MSVC F16C is implied with AVX2/AVX512
if (NOT MSVC)
option(LLAMA_F16C "llama: enable F16C" ON)
option(LLAMA_F16C "llama: enable F16C" ON)
endif()
# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
option(LLAMA_BLAS_VENDOR "llama: BLA_VENDOR from https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" Generic)
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" OFF)
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" OFF)
#
# Build info header
@@ -184,6 +186,8 @@ if (LLAMA_CUBLAS)
set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
add_compile_definitions(GGML_USE_CUBLAS)
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
if (LLAMA_STATIC)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -201,7 +205,7 @@ if (LLAMA_CLBLAST)
if (CLBlast_FOUND)
message(STATUS "CLBlast found")
set(GGML_OPENCL_SOURCES ggml-opencl.c ggml-opencl.h)
set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h)
add_compile_definitions(GGML_USE_CLBLAST)

View File

@@ -133,11 +133,22 @@ ifdef LLAMA_CUBLAS
OBJS += ggml-cuda.o
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
ifdef LLAMA_CUDA_DMMV_X
NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
else
NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
endif # LLAMA_CUDA_DMMV_X
ifdef LLAMA_CUDA_DMMV_Y
NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
else
NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
endif # LLAMA_CUDA_DMMV_Y
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
endif
endif # LLAMA_CUBLAS
ifdef LLAMA_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST
CXXFLAGS += -DGGML_USE_CLBLAST
# Mac provides OpenCL as a framework
ifeq ($(UNAME_S),Darwin)
LDFLAGS += -lclblast -framework OpenCL
@@ -145,8 +156,8 @@ ifdef LLAMA_CLBLAST
LDFLAGS += -lclblast -lOpenCL
endif
OBJS += ggml-opencl.o
ggml-opencl.o: ggml-opencl.c ggml-opencl.h
$(CC) $(CFLAGS) -c $< -o $@
ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif
ifneq ($(filter aarch64%,$(UNAME_M)),)
# Apple M1, M2, etc.

View File

@@ -391,6 +391,25 @@ Note the use of `--color` to distinguish between user input and generated text.
![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
### Persistent Interaction
The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
```bash
# Start a new chat
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
# Resume that chat
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
# Start a different chat with the same prompt/model
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh
# Different prompt cache for different prompt/model
PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
```
### Instruction mode with Alpaca
1. First, download the `ggml` Alpaca model into the `./models` folder

View File

@@ -23,8 +23,8 @@ CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
SESSION_SIZE_MSG_PATTERN='main: session file matches \d+ / \d+'
SAMPLE_TIME_MSG_PATTERN='sample time =\s+\d+.\d+ ms /\s+\d+'
SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
CTX_SIZE=2048

View File

@@ -272,7 +272,7 @@ These options help improve the performance and memory usage of the LLaMA models.
### Prompt Caching
- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs.
- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.
### Quantization

View File

@@ -134,8 +134,6 @@ int main(int argc, char ** argv) {
return 0;
}
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');
std::string path_session = params.path_prompt_cache;
std::vector<llama_token> session_tokens;
@@ -155,6 +153,7 @@ int main(int argc, char ** argv) {
return 1;
}
session_tokens.resize(n_token_count_out);
llama_set_rng_seed(ctx, params.seed);
fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
} else {
@@ -163,7 +162,16 @@ int main(int argc, char ** argv) {
}
// tokenize the prompt
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
std::vector<llama_token> embd_inp;
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');
embd_inp = ::llama_tokenize(ctx, params.prompt, true);
} else {
embd_inp = session_tokens;
}
const int n_ctx = llama_n_ctx(ctx);
@@ -181,7 +189,9 @@ int main(int argc, char ** argv) {
}
n_matching_session_tokens++;
}
if (n_matching_session_tokens >= embd_inp.size()) {
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
fprintf(stderr, "%s: using full prompt from session file\n", __func__);
} else if (n_matching_session_tokens >= embd_inp.size()) {
fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",

View File

@@ -83,9 +83,19 @@ typedef struct {
} block_q8_0;
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
#define WARP_SIZE 32
#define CUDA_MUL_BLOCK_SIZE 256
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
#define CUDA_DMMV_BLOCK_SIZE 32 // dmmv = dequantize_mul_mat_vec
// dmmv = dequantize_mul_mat_vec
#ifndef GGML_CUDA_DMMV_X
#define GGML_CUDA_DMMV_X 32
#endif
#ifndef GGML_CUDA_DMMV_Y
#define GGML_CUDA_DMMV_Y 1
#endif
static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -200,41 +210,51 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
dequantize_kernel(vx, ib, iqs, v0, v1);
}
template <int block_size, int qk, int qr, dequantize_kernel_t dequantize_kernel>
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) {
const int row = blockIdx.x;
// qk = quantized weights per x block
// qr = number of quantized weights per data value in x block
const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x;
const int iter_stride = 2*GGML_CUDA_DMMV_X;
const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
const int y_offset = qr == 1 ? 1 : qk/2;
__shared__ float tmp[block_size]; // separate sum for each thread
tmp[tid] = 0;
float tmp = 0; // partial sum for thread in warp
for (int i = 0; i < ncols/block_size; i += 2) {
const int col = i*block_size + 2*tid;
const int ib = (row*ncols + col)/qk; // block index
const int iqs = (col%qk)/qr; // quant index
for (int i = 0; i < ncols; i += iter_stride) {
const int col = i + vals_per_iter*tid;
const int ib = (row*ncols + col)/qk; // x block index
const int iqs = (col%qk)/qr; // x quant index
const int iybs = col - col%qk; // y block start index
// dequantize
float v0, v1;
dequantize_kernel(vx, ib, iqs, v0, v1);
// processing >2 values per i iter is faster for fast GPUs
#pragma unroll
for (int j = 0; j < vals_per_iter; j += 2) {
// process 2 vals per j iter
// matrix multiplication
tmp[tid] += v0 * y[iybs + iqs + 0];
tmp[tid] += v1 * y[iybs + iqs + y_offset];
// dequantize
float v0, v1;
dequantize_kernel(vx, ib, iqs + j/qr, v0, v1);
// for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
// matrix multiplication
tmp += v0 * y[iybs + iqs + j/qr + 0];
tmp += v1 * y[iybs + iqs + j/qr + y_offset];
// for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
}
}
// sum up partial sums and write back result
__syncthreads();
for (int s=block_size/2; s>0; s>>=1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
__syncthreads();
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
if (tid == 0) {
dst[row] = tmp[0];
dst[row] = tmp;
}
}
@@ -269,33 +289,43 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu
}
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_0, QR4_0, dequantize_q4_0>
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
<<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
}
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_1, QR4_1, dequantize_q4_1>
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
<<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
}
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_0, QR5_0, dequantize_q5_0>
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
<<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
}
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_1, QR5_1, dequantize_q5_1>
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
<<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
}
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK8_0, QR8_0, dequantize_q8_0>
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
<<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
}
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -304,9 +334,11 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
}
static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, 32, 1, convert_f16>
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
dequantize_mul_mat_vec<1, 1, convert_f16>
<<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
}
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {

View File

@@ -1,474 +0,0 @@
#include "ggml-opencl.h"
#define CL_TARGET_OPENCL_VERSION 110
#include <clblast_c.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "ggml.h"
#define MULTILINE_QUOTE(...) #__VA_ARGS__
static const char * program_source = MULTILINE_QUOTE(
typedef char int8_t;
typedef uchar uint8_t;
typedef int int32_t;
typedef uint uint32_t;
struct __attribute__ ((packed)) block_q4_0
{
half d;
uint8_t qs[16]; /* QK4_0 / 2 */
};
struct __attribute__ ((packed)) block_q4_1
{
half d;
half m;
uint8_t qs[16]; /* QK4_1 / 2 */
};
struct __attribute__ ((packed)) block_q5_0
{
half d;
uint32_t qh;
uint8_t qs[16]; /* QK5_0 / 2 */
};
struct __attribute__ ((packed)) block_q5_1
{
half d;
half m;
uint32_t qh;
uint8_t qs[16]; /* QK5_1 / 2 */
};
struct __attribute__ ((packed)) block_q8_0
{
half d;
int8_t qs[32]; /* QK8_0 */
};
__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
const uint i = get_global_id(0) / 32; /* QK4_0 */
const uint j = get_local_id(0);
const float d = vload_half(0, (__global half*) &x[i].d);
const int x0 = (x[i].qs[j] & 0xf) - 8;
const int x1 = (x[i].qs[j] >> 4) - 8;
y[i*32 + j + 0 ] = x0*d;
y[i*32 + j + 16] = x1*d;
}
__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
const uint i = get_global_id(0) / 32; /* QK4_1 */
const uint j = get_local_id(0);
const float d = vload_half(0, (__global half*) &x[i].d);
const float m = vload_half(0, (__global half*) &x[i].m);
const int x0 = (x[i].qs[j] & 0xf);
const int x1 = (x[i].qs[j] >> 4);
y[i*32 + j + 0 ] = x0*d + m;
y[i*32 + j + 16] = x1*d + m;
}
__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
const uint i = get_global_id(0) / 32; /* QK5_0 */
const uint j = get_local_id(0);
const float d = vload_half(0, (__global half*) &x[i].d);
uint32_t qh = x[i].qh;
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
y[i*32 + j + 0 ] = x0*d;
y[i*32 + j + 16] = x1*d;
}
__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
const uint i = get_global_id(0) / 32; /* QK5_1 */
const uint j = get_local_id(0);
const float d = vload_half(0, (__global half*) &x[i].d);
const float m = vload_half(0, (__global half*) &x[i].m);
uint32_t qh = x[i].qh;
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
const int x0 = (x[i].qs[j] & 0xf) | xh_0;
const int x1 = (x[i].qs[j] >> 4) | xh_1;
y[i*32 + j + 0 ] = x0*d + m;
y[i*32 + j + 16] = x1*d + m;
}
__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
const uint i = get_global_id(0) / 32; /* QK8_0 */
const uint j = get_local_id(0);
const float d = vload_half(0, (__global half*) &x[i].d);
y[i*32 + j] = x[i].qs[j]*d;
}
);
#define CL_CHECK(err) \
do { \
cl_int err_ = (err); \
if (err_ != CL_SUCCESS) { \
fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \
#err, err_, __FILE__, __LINE__); \
exit(1); \
} \
} while (0)
#define CLBLAST_CHECK(err) \
do { \
CLBlastStatusCode err_ = (err); \
if (err_ != CLBlastSuccess) { \
fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \
#err, err_, __FILE__, __LINE__); \
exit(1); \
} \
} while (0)
static cl_platform_id platform;
static cl_device_id device;
static cl_context context;
static cl_command_queue queue;
static cl_program program;
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
cl_program p;
char *program_log;
size_t program_size, log_size;
int err;
program_size = strlen(program_buffer);
p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
if(err < 0) {
fprintf(stderr, "OpenCL error creating program");
exit(1);
}
err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL);
if(err < 0) {
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
program_log = (char*) malloc(log_size + 1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
exit(1);
}
return p;
}
void ggml_cl_init(void) {
cl_int err = 0;
struct cl_device;
struct cl_platform {
cl_platform_id id;
unsigned number;
char name[128];
char vendor[128];
struct cl_device * devices;
unsigned n_devices;
struct cl_device * default_device;
};
struct cl_device {
struct cl_platform * platform;
cl_device_id id;
unsigned number;
cl_device_type type;
char name[128];
};
enum { NPLAT = 16, NDEV = 16 };
struct cl_platform platforms[NPLAT];
unsigned n_platforms = 0;
struct cl_device devices[NDEV];
unsigned n_devices = 0;
struct cl_device * default_device = NULL;
platform = NULL;
device = NULL;
cl_platform_id platform_ids[NPLAT];
CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms));
for (unsigned i = 0; i < n_platforms; i++) {
struct cl_platform * p = &platforms[i];
p->number = i;
p->id = platform_ids[i];
CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL));
CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL));
cl_device_id device_ids[NDEV];
cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices);
if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
p->n_devices = 0;
} else {
CL_CHECK(clGetDeviceIDsError);
}
p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
p->default_device = NULL;
for (unsigned j = 0; j < p->n_devices; j++) {
struct cl_device * d = &devices[n_devices];
d->number = n_devices++;
d->id = device_ids[j];
d->platform = p;
CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
p->default_device = d;
}
}
if (default_device == NULL && p->default_device != NULL) {
default_device = p->default_device;
}
}
if (n_devices == 0) {
fprintf(stderr, "ggml_opencl: could find any OpenCL devices.\n");
exit(1);
}
char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
char * user_device_string = getenv("GGML_OPENCL_DEVICE");
int user_platform_number = -1;
int user_device_number = -1;
unsigned n;
if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
user_platform_number = (int)n;
}
if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) {
user_device_number = (int)n;
}
struct cl_device * selected_devices = devices;
unsigned n_selected_devices = n_devices;
if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
for (unsigned i = 0; i < n_platforms; i++) {
struct cl_platform * p = &platforms[i];
if (strstr(p->name, user_platform_string) != NULL ||
strstr(p->vendor, user_platform_string) != NULL) {
user_platform_number = (int)i;
break;
}
}
if (user_platform_number == -1) {
fprintf(stderr, "ggml_opencl: no platform matching '%s' was found.\n", user_platform_string);
exit(1);
}
}
if (user_platform_number != -1) {
struct cl_platform * p = &platforms[user_platform_number];
selected_devices = p->devices;
n_selected_devices = p->n_devices;
default_device = p->default_device;
if (n_selected_devices == 0) {
fprintf(stderr, "ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
exit(1);
}
}
if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
for (unsigned i = 0; i < n_selected_devices; i++) {
struct cl_device * d = &selected_devices[i];
if (strstr(d->name, user_device_string) != NULL) {
user_device_number = d->number;
break;
}
}
if (user_device_number == -1) {
fprintf(stderr, "ggml_opencl: no device matching '%s' was found.\n", user_device_string);
exit(1);
}
}
if (user_device_number != -1) {
selected_devices = &devices[user_device_number];
n_selected_devices = 1;
default_device = &selected_devices[0];
}
GGML_ASSERT(n_selected_devices > 0);
if (default_device == NULL) {
default_device = &selected_devices[0];
}
fprintf(stderr, "ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
fprintf(stderr, "ggml_opencl: selecting device: '%s'\n", default_device->name);
if (default_device->type != CL_DEVICE_TYPE_GPU) {
fprintf(stderr, "ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
}
platform = default_device->platform->id;
device = default_device->id;
cl_context_properties properties[] = {
(intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
};
CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
(err != CL_INVALID_PROPERTY && err != CL_INVALID_VALUE ? err :
(queue = clCreateCommandQueue(context, device, 0, &err), err)
)));
program = build_program_from_source(context, device, program_source);
// Prepare dequantize kernels
CL_CHECK((kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err), err));
CL_CHECK((kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err), err));
CL_CHECK((kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err), err));
CL_CHECK((kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err), err));
CL_CHECK((kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err), err));
}
static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
if (req_size <= *cur_size) {
return;
}
// Reallocate buffer with enough space
if (*cur_size > 0) {
clReleaseMemObject(*buf);
}
cl_int err;
CL_CHECK((*buf = clCreateBuffer(context, flags, req_size, NULL, &err), err));
*cur_size = req_size;
}
void ggml_cl_sgemm_wrapper(
const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
const int m, const int n, const int k,
const float alpha, const void *host_a, const int lda,
const float *host_b, const int ldb, const float beta,
float *host_c, const int ldc, const int btype) {
cl_kernel kernel;
size_t global = n * k, local, size_qb;
bool dequant;
switch (btype) {
case GGML_TYPE_F32:
dequant = false;
break;
case GGML_TYPE_Q4_0:
dequant = true;
kernel = kernel_q4_0;
local = 16;
size_qb = global * (sizeof(ggml_fp16_t) + local) / 32;
break;
case GGML_TYPE_Q4_1:
dequant = true;
kernel = kernel_q4_1;
local = 16;
size_qb = global * (sizeof(ggml_fp16_t) * 2 + local) / 32;
break;
case GGML_TYPE_Q5_0:
dequant = true;
kernel = kernel_q5_0;
local = 16;
size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
break;
case GGML_TYPE_Q5_1:
dequant = true;
kernel = kernel_q5_1;
local = 16;
size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
break;
case GGML_TYPE_Q8_0:
dequant = true;
kernel = kernel_q8_0;
local = 32;
size_qb = global * (sizeof(ggml_fp16_t) + local) / 32;
break;
default:
fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
abort();
}
const size_t size_a = m * k * sizeof(float);
const size_t size_b = n * k * sizeof(float);
const size_t size_c = m * n * sizeof(float);
// Prepare buffers
ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
if (dequant) {
ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
}
ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
cl_event ev_a, ev_qb, ev_b;
if (dequant) {
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b));
CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb));
} else {
CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b));
}
CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a));
if (dequant) {
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b));
CL_CHECK(clReleaseEvent(ev_qb));
}
CL_CHECK(clWaitForEvents(1, &ev_a));
CL_CHECK(clWaitForEvents(1, &ev_b));
CL_CHECK(clReleaseEvent(ev_a));
CL_CHECK(clReleaseEvent(ev_b));
cl_event ev_sgemm;
CLBLAST_CHECK(CLBlastSgemm(
(CLBlastLayout)order,
(CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
m, n, k,
alpha,
cl_buffer_a, 0, lda,
cl_buffer_b, 0, ldb,
beta,
cl_buffer_c, 0, ldc,
&queue, &ev_sgemm));
cl_event ev_c;
CL_CHECK(clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c));
// Wait for completion
CL_CHECK(clWaitForEvents(1, &ev_c));
CL_CHECK(clReleaseEvent(ev_sgemm));
CL_CHECK(clReleaseEvent(ev_c));
}

1034
ggml-opencl.cpp Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,23 +1,21 @@
#pragma once
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
#endif
void ggml_cl_init(void);
enum ggml_blas_order {
GGML_BLAS_ORDER_ROW_MAJOR = 101,
GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
};
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
enum ggml_blas_op {
GGML_BLAS_OP_N = 111,
GGML_BLAS_OP_T = 112,
GGML_BLAS_OP_C = 113,
};
void * ggml_cl_host_malloc(size_t size);
void ggml_cl_host_free(void * ptr);
void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
#ifdef __cplusplus
}

141
ggml.c
View File

@@ -3494,7 +3494,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
};
static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"NONE",
"DUP",
@@ -3749,6 +3749,9 @@ const char * ggml_type_name(enum ggml_type type) {
return GGML_TYPE_NAME[type];
}
const char * ggml_op_name(enum ggml_op op) {
return GGML_OP_NAME[op];
}
size_t ggml_element_size(const struct ggml_tensor * tensor) {
return GGML_TYPE_SIZE[tensor->type];
@@ -3805,6 +3808,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
return wtype;
}
size_t ggml_tensor_overhead(void) {
return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
}
static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
return tensor->nb[0] > tensor->nb[1];
}
@@ -4017,6 +4024,10 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
return result;
}
void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
ctx->no_alloc = no_alloc;
}
// IMPORTANT:
// when creating "opt" tensors, always save and load the scratch buffer
// this is an error prone process, but it is necessary to support inplace
@@ -4061,7 +4072,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
if (ctx->scratch.data == NULL || data != NULL) {
size_needed += sizeof(struct ggml_tensor);
size_needed += GGML_TENSOR_SIZE;
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
@@ -4077,14 +4088,15 @@ struct ggml_tensor * ggml_new_tensor_impl(
};
} else {
if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
__func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
assert(false);
return NULL;
}
if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
__func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
__func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
assert(false);
return NULL;
}
@@ -4093,7 +4105,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
*obj_new = (struct ggml_object) {
.offs = cur_end + GGML_OBJECT_SIZE,
.size = sizeof(struct ggml_tensor),
.size = GGML_TENSOR_SIZE,
.next = NULL,
};
@@ -9431,7 +9443,7 @@ static void ggml_compute_forward_rms_norm_back(
// ggml_compute_forward_mul_mat
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
// helper function to determine if it is better to use BLAS or not
// for large matrices, BLAS is faster
static bool ggml_compute_forward_mul_mat_use_blas(
@@ -9472,7 +9484,7 @@ static void ggml_compute_forward_mul_mat_f32(
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
const int64_t ne10 = src1->ne[0];
#endif
const int64_t ne11 = src1->ne[1];
@@ -9536,9 +9548,16 @@ static void ggml_compute_forward_mul_mat_f32(
}
return;
}
#elif defined(GGML_USE_CLBLAST)
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
}
return;
}
#endif
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
if (params->ith != 0) {
return;
@@ -9558,21 +9577,11 @@ static void ggml_compute_forward_mul_mat_f32(
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
#if defined(GGML_USE_CLBLAST)
// zT = y * xT
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
ne11, ne01, ne10,
1.0f, y, ne10,
x, ne10,
0.0f, d, ne01,
GGML_TYPE_F32);
#else
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
ne11, ne01, ne10,
1.0f, y, ne10,
x, ne00,
0.0f, d, ne01);
#endif
}
}
//printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
@@ -9711,9 +9720,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
}
return;
}
#elif defined(GGML_USE_CLBLAST)
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
}
return;
}
#endif
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
GGML_ASSERT(nb10 == sizeof(float));
@@ -9743,20 +9759,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
assert(id*sizeof(float) <= params->wsize);
}
#if defined(GGML_USE_CLBLAST)
const float * x = wdata;
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
// zT = y * xT
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
ne11, ne01, ne10,
1.0f, y, ne10,
x, ne10,
0.0f, d, ne01,
GGML_TYPE_F32);
#else
const float * x = wdata;
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
@@ -9768,7 +9770,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
1.0f, y, ne10,
x, ne00,
0.0f, d, ne01);
#endif
}
}
@@ -9931,9 +9932,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
}
return;
}
#elif defined(GGML_USE_CLBLAST)
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
}
return;
}
#endif
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
if (params->ith != 0) {
return;
@@ -9956,9 +9964,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
#if defined(GGML_USE_CLBLAST)
const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
#else
{
size_t id = 0;
for (int64_t i01 = 0; i01 < ne01; ++i01) {
@@ -9970,23 +9975,12 @@ static void ggml_compute_forward_mul_mat_q_f32(
}
const float * x = wdata;
#endif
#if defined(GGML_USE_CLBLAST)
// zT = y * xT
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
ne11, ne01, ne10,
1.0f, y, ne10,
x, ne10,
0.0f, d, ne01,
type);
#else
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
ne11, ne01, ne10,
1.0f, y, ne10,
x, ne00,
0.0f, d, ne01);
#endif
}
}
@@ -13810,11 +13804,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
// reached a leaf node, not part of the gradient graph (e.g. a constant)
GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
if (strlen(node->name) == 0) {
snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
}
cgraph->leafs[cgraph->n_leafs] = node;
cgraph->n_leafs++;
} else {
GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
if (strlen(node->name) == 0) {
snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
}
cgraph->nodes[cgraph->n_nodes] = node;
cgraph->grads[cgraph->n_nodes] = node->grad;
cgraph->n_nodes++;
@@ -14165,9 +14167,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
}
else
#elif defined(GGML_USE_CLBLAST)
if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
node->n_tasks = 1; // TODO: this actually is doing nothing
// the threads are still spinning
cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
}
else
#endif
if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1; // TODO: this actually is doing nothing
// the threads are still spinning
@@ -14181,13 +14190,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
#endif
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
cur = 0;
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1;
}
#endif
} else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1;
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
@@ -14521,6 +14530,26 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
}
}
struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name) {
for (int i = 0; i < cgraph->n_leafs; i++) {
struct ggml_tensor * leaf = cgraph->leafs[i];
if (strcmp(leaf->name, name) == 0) {
return leaf;
}
}
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];
if (strcmp(node->name, name) == 0) {
return node;
}
}
return NULL;
}
void ggml_graph_print(const struct ggml_cgraph * cgraph) {
int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
@@ -14538,7 +14567,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
i,
node->ne[0], node->ne[1], node->ne[2],
GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
(double) node->perf_cycles / (double) ggml_cycles_per_ms(),
(double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
(double) node->perf_time_us / 1000.0,
@@ -14552,7 +14581,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
i,
node->ne[0], node->ne[1],
GGML_OP_LABEL[node->op]);
GGML_OP_NAME[node->op]);
}
for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -14560,7 +14589,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
continue;
}
GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0);
GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
}
GGML_PRINT("========================================\n");

13
ggml.h
View File

@@ -198,6 +198,7 @@
#define GGML_MAX_PARAMS 256
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_OPT 4
#define GGML_MAX_NAME 32
#define GGML_DEFAULT_N_THREADS 4
#define GGML_ASSERT(x) \
@@ -249,6 +250,7 @@ extern "C" {
enum ggml_backend {
GGML_BACKEND_CPU = 0,
GGML_BACKEND_CUDA = 1,
GGML_BACKEND_CL = 2,
};
// model file types
@@ -371,11 +373,13 @@ extern "C" {
void * data;
char name[32];
char name[GGML_MAX_NAME];
char padding[16];
};
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
// computation graph
struct ggml_cgraph {
int n_nodes;
@@ -428,6 +432,7 @@ extern "C" {
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
GGML_API const char * ggml_type_name(enum ggml_type type);
GGML_API const char * ggml_op_name (enum ggml_op op);
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
@@ -436,6 +441,9 @@ extern "C" {
// TODO: temporary until model loading of ggml examples is refactored
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
// use this to compute the memory overhead of a tensor
GGML_API size_t ggml_tensor_overhead(void);
// main
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@@ -444,6 +452,7 @@ extern "C" {
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
GGML_API struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
@@ -969,6 +978,8 @@ extern "C" {
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
GGML_API struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name);
// print info and performance information for the graph
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);

View File

@@ -12,6 +12,8 @@
#include "ggml.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#endif
#include <array>
@@ -1092,7 +1094,7 @@ static void llama_model_load_internal(
fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
}
fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
#else
#elif !defined(GGML_USE_CLBLAST)
(void) n_gpu_layers;
#endif
}
@@ -1125,7 +1127,33 @@ static void llama_model_load_internal(
done_size += lt.size;
}
}
#endif // GGML_USE_CUBLAS
#elif defined(GGML_USE_CLBLAST)
{
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
size_t vram_total = 0;
for (int i = 0; i < n_gpu; ++i) {
const auto & layer = model.layers[i];
ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
}
if (n_gpu_layers > (int) hparams.n_layer) {
fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
}
fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
}
#endif
if (progress_callback) {
progress_callback(1.0f, progress_callback_user_data);