Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-02-26 14:23:22 +02:00)

Compare commits

11 Commits
| SHA1 |
|---|
| 0d3984424f |
| 3e63a58ef7 |
| 2589ad3704 |
| 482548716f |
| 3ac67535c8 |
| 0b4be4c435 |
| e0e806f52e |
| 7e00e60ef8 |
| ea1431b0fa |
| 71e74a3ac9 |
| bfb1e012a0 |
.github/workflows/build.yml (vendored, 8 changes)
@@ -839,12 +839,12 @@ jobs:
             -DGGML_CUDA=ON
           cmake --build build
 
-  windows-2019-cmake-cuda:
-    runs-on: windows-2019
+  windows-2022-cmake-cuda:
+    runs-on: windows-2022
 
     strategy:
       matrix:
-        cuda: ['12.4', '11.7']
+        cuda: ['12.4']
 
     steps:
       - name: Clone

@@ -878,7 +878,7 @@ jobs:
         env:
           CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
         run: |
-          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
           cmake -S . -B build -G "Ninja Multi-Config" ^
             -DLLAMA_BUILD_SERVER=ON ^
             -DGGML_NATIVE=OFF ^
.github/workflows/release.yml (vendored, 17 changes)
@@ -131,8 +131,9 @@ jobs:
         include:
           - build: 'x64'
             os: ubuntu-22.04
-          - build: 'arm64'
-            os: ubuntu-22.04-arm
+          # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
+          # - build: 'arm64'
+          #   os: ubuntu-22.04-arm
 
     runs-on: ${{ matrix.os }}
 
@@ -159,6 +160,9 @@ jobs:
         id: cmake_build
         run: |
           cmake -B build \
+            -DGGML_BACKEND_DL=ON \
+            -DGGML_NATIVE=OFF \
+            -DGGML_CPU_ALL_VARIANTS=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

@@ -207,6 +211,9 @@ jobs:
         id: cmake_build
         run: |
           cmake -B build \
+            -DGGML_BACKEND_DL=ON \
+            -DGGML_NATIVE=OFF \
+            -DGGML_CPU_ALL_VARIANTS=ON \
            -DGGML_VULKAN=ON \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)

@@ -373,11 +380,11 @@ jobs:
           name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
 
   windows-cuda:
-    runs-on: windows-2019
+    runs-on: windows-2022
 
     strategy:
       matrix:
-        cuda: ['12.4', '11.7']
+        cuda: ['12.4']
 
     steps:
       - name: Clone

@@ -405,7 +412,7 @@ jobs:
       id: cmake_build
       shell: cmd
       run: |
-        call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+        call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
         cmake -S . -B build -G "Ninja Multi-Config" ^
           -DGGML_BACKEND_DL=ON ^
           -DGGML_NATIVE=OFF ^
.github/workflows/server.yml (vendored, 2 changes)
@@ -180,7 +180,7 @@ jobs:
 
 
   server-windows:
-    runs-on: windows-2019
+    runs-on: windows-2022
 
     steps:
       - name: Clone
README.md (41 changes)
@@ -28,6 +28,30 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ----
 
+## Quick start
+
+Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
+
+- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
+- Run with Docker - see our [Docker documentation](docs/docker.md)
+- Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
+- Build from source by cloning this repository - check out [our build guide](docs/build.md)
+
+Once installed, you'll need a model to work with. Head to the [Obtaining and quantizing models](#obtaining-and-quantizing-models) section to learn more.
+
+Example command:
+
+```sh
+# Use a local model file
+llama-cli -m my_model.gguf
+
+# Or download and run a model directly from Hugging Face
+llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
+
+# Launch OpenAI-compatible API server
+llama-server -hf ggml-org/gemma-3-1b-it-GGUF
+```
+
 ## Description
 
 The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide

@@ -230,6 +254,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 
 </details>
 
+
 ## Supported backends
 
 | Backend | Target devices |

@@ -246,16 +271,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
 
-## Building the project
-
-The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
-The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
-
-- Clone this repository and build locally, see [how to build](docs/build.md)
-- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
-- Use a Docker image, see [documentation for Docker](docs/docker.md)
-- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
-
 ## Obtaining and quantizing models
 
 The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
@@ -263,7 +278,11 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
 - [Trending](https://huggingface.co/models?library=gguf&sort=trending)
 - [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
 
-You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
+You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
+
+```sh
+llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
+```
 
 By default, the CLI downloads from Hugging Face; you can switch to other endpoints with the environment variable `MODEL_ENDPOINT`. For example, to download model checkpoints from ModelScope or another model-sharing community, set `MODEL_ENDPOINT=https://www.modelscope.cn/`.
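To make the `MODEL_ENDPOINT` switch described above concrete, a minimal sketch (the model name is only illustrative and its availability on ModelScope is an assumption):

```sh
# Fetch the model through ModelScope instead of the default Hugging Face endpoint
MODEL_ENDPOINT=https://www.modelscope.cn/ llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
```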
docs/build.md

@@ -1,5 +1,9 @@
 # Build llama.cpp locally
 
+The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
+
+The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server.
+
 **To get the Code:**
 
 ```bash
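As a sketch of the from-source path the build guide above points to, the following assumes the ggml-org GitHub URL and reuses the plain CMake invocation seen in the release workflow earlier in this comparison; backend-specific flags are documented in docs/build.md:

```sh
# Clone and build llama.cpp locally (default CPU backend, Release config)
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
cmake -B build
cmake --build build --config Release -j $(nproc)
```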
docs/install.md

@@ -1,28 +1,42 @@
 # Install pre-built version of llama.cpp
 
-## Homebrew
+| Install via | Windows | Mac | Linux |
+|-------------|---------|-----|-------|
+| Winget      | ✅      |     |       |
+| Homebrew    |         | ✅  | ✅    |
+| MacPorts    |         | ✅  |       |
+| Nix         |         | ✅  | ✅    |
 
-On Mac and Linux, the homebrew package manager can be used via
+## Winget (Windows)
 
 ```sh
+winget install llama.cpp
+```
+
+The package is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/issues/8188
+
+## Homebrew (Mac and Linux)
+
+```sh
 brew install llama.cpp
 ```
 
 The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668
 
-## MacPorts
+## MacPorts (Mac)
 
 ```sh
 sudo port install llama.cpp
 ```
-see also: https://ports.macports.org/port/llama.cpp/details/
+
+See also: https://ports.macports.org/port/llama.cpp/details/
 
-## Nix
-
-On Mac and Linux, the Nix package manager can be used via
+## Nix (Mac and Linux)
 
 ```sh
 nix profile install nixpkgs#llama-cpp
 ```
 
 For flake enabled installs.
 
 Or

@@ -34,13 +48,3 @@ nix-env --file '<nixpkgs>' --install --attr llama-cpp
 For non-flake enabled installs.
 
 This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
-
-## Flox
-
-On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
-
-```sh
-flox install llama-cpp
-```
-
-Flox follows the nixpkgs build of llama.cpp.
@@ -8132,8 +8132,8 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
         #define WKV_VECTOR_SIZE 4
     #endif
 
-    int wkv_vector_size;
     #ifdef WKV_VECTOR_SIZE
+        int wkv_vector_size;
         #if defined(__ARM_FEATURE_SVE)
             wkv_vector_size = svcntw();
         #else

@@ -8348,8 +8348,8 @@ static void ggml_compute_forward_gla_f32(
         #define GLA_VECTOR_SIZE 4
     #endif
 
-    int gla_vector_size;
     #ifdef GLA_VECTOR_SIZE
+        int gla_vector_size;
         #if defined(__ARM_FEATURE_SVE)
             gla_vector_size = svcntw();
         #else
@@ -652,9 +652,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
         float KQ_max_scale[cols_per_thread];
 #pragma unroll
         for (int col = 0; col < cols_per_thread; ++col) {
-            KQ_max_scale[col] = expf(KQ_max[col] - KQ_max_new[col]);
+            const float KQ_max_diff = KQ_max[col] - KQ_max_new[col];
+            KQ_max_scale[col] = expf(KQ_max_diff);
             KQ_max[col] = KQ_max_new[col];
 
+            *((uint32_t *) &KQ_max_scale[col]) *= KQ_max_diff >= SOFTMAX_FTZ_THRESHOLD;
+
             // Scale previous KQ_rowsum to account for a potential increase in KQ_max:
             KQ_rowsum[col] = KQ_max_scale[col]*KQ_rowsum[col] + KQ_rowsum_add[col];
         }
ggml/src/ggml-opencl/CMakeLists.txt

@@ -95,6 +95,12 @@ set(GGML_OPENCL_KERNELS
     sub
     sum_rows
     transpose
+    concat
+    tsembd
+    upscale
+    tanh
+    pad
+    repeat
 )
 
 foreach (K ${GGML_OPENCL_KERNELS})
ggml/src/ggml-opencl/ggml-opencl.cpp

@@ -315,6 +315,12 @@ struct ggml_backend_opencl_context {
     cl_program program_softmax_4_f16;
     cl_program program_argsort_f32_i32;
     cl_program program_sum_rows_f32;
+    cl_program program_repeat;
+    cl_program program_pad;
+    cl_program program_tanh;
+    cl_program program_upscale;
+    cl_program program_concat;
+    cl_program program_tsembd;
 
     cl_kernel kernel_add, kernel_add_row;
     cl_kernel kernel_mul, kernel_mul_row;

@@ -351,6 +357,15 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_im2col_f32, kernel_im2col_f16;
     cl_kernel kernel_argsort_f32_i32;
     cl_kernel kernel_sum_rows_f32;
+    cl_kernel kernel_repeat;
+    cl_kernel kernel_pad;
+    cl_kernel kernel_tanh_f32_nd;
+    cl_kernel kernel_tanh_f16_nd;
+    cl_kernel kernel_upscale;
+    cl_kernel kernel_upscale_bilinear;
+    cl_kernel kernel_concat_f32_contiguous;
+    cl_kernel kernel_concat_f32_non_contiguous;
+    cl_kernel kernel_timestep_embedding;
 
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // Transpose kernels
@@ -1097,6 +1112,150 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// repeat
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "repeat.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("repeat.cl");
|
||||
#endif
|
||||
if (!kernel_src.empty()) {
|
||||
backend_ctx->program_repeat =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_repeat = clCreateKernel(backend_ctx->program_repeat, "kernel_repeat", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
} else {
|
||||
GGML_LOG_WARN("ggml_opencl: repeat kernel source not found or empty. Repeat operations will not be available.\n");
|
||||
backend_ctx->program_repeat = nullptr;
|
||||
backend_ctx->kernel_repeat = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// pad
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "pad.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("pad.cl");
|
||||
#endif
|
||||
if (!kernel_src.empty()) {
|
||||
backend_ctx->program_pad =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_pad = clCreateKernel(backend_ctx->program_pad, "kernel_pad", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
} else {
|
||||
GGML_LOG_WARN("ggml_opencl: pad kernel source not found or empty. Pad operations will not be available.\n");
|
||||
backend_ctx->program_pad = nullptr;
|
||||
backend_ctx->kernel_pad = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// tanh
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "tanh.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("tanh.cl");
|
||||
#endif
|
||||
if (!kernel_src.empty()) {
|
||||
backend_ctx->program_tanh =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_tanh_f32_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f32_nd", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_tanh_f16_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f16_nd", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
} else {
|
||||
GGML_LOG_WARN("ggml_opencl: tanh kernel source not found or empty. Tanh operation will not be available.\n");
|
||||
backend_ctx->program_tanh = nullptr;
|
||||
backend_ctx->kernel_tanh_f32_nd = nullptr;
|
||||
backend_ctx->kernel_tanh_f16_nd = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// upscale
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "upscale.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("upscale.cl");
|
||||
#endif
|
||||
if (!kernel_src.empty()) {
|
||||
backend_ctx->program_upscale =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err));
|
||||
if (backend_ctx->program_upscale) {
|
||||
cl_int err_bilinear;
|
||||
backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear);
|
||||
if (err_bilinear != CL_SUCCESS) {
|
||||
GGML_LOG_WARN("ggml_opencl: kernel_upscale_bilinear not found in upscale.cl. Bilinear upscale will not be available. Error: %d\n", err_bilinear);
|
||||
backend_ctx->kernel_upscale_bilinear = nullptr;
|
||||
}
|
||||
} else {
|
||||
backend_ctx->kernel_upscale_bilinear = nullptr;
|
||||
}
|
||||
GGML_LOG_CONT(".");
|
||||
} else {
|
||||
GGML_LOG_WARN("ggml_opencl: upscale kernel source not found or empty. Upscale operations will not be available.\n");
|
||||
backend_ctx->program_upscale = nullptr;
|
||||
backend_ctx->kernel_upscale = nullptr;
|
||||
backend_ctx->kernel_upscale_bilinear = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// concat
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "concat.cl.h"
|
||||
};
|
||||
#else
|
||||
|
||||
const std::string kernel_src = read_file("concat.cl");
|
||||
#endif
|
||||
if (!kernel_src.empty()) {
|
||||
backend_ctx->program_concat =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_concat_f32_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_contiguous", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_concat_f32_non_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_non_contiguous", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
} else {
|
||||
GGML_LOG_WARN("ggml_opencl: concat kernel source not found or empty. Concat operations will not be available.\n");
|
||||
backend_ctx->program_concat = nullptr;
|
||||
backend_ctx->kernel_concat_f32_contiguous = nullptr;
|
||||
backend_ctx->kernel_concat_f32_non_contiguous = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// timestep_embedding
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "tsembd.cl.h"
|
||||
};
|
||||
#else
|
||||
|
||||
const std::string kernel_src = read_file("tsembd.cl");
|
||||
#endif
|
||||
if (!kernel_src.empty()) {
|
||||
backend_ctx->program_tsembd =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_timestep_embedding = clCreateKernel(backend_ctx->program_tsembd, "kernel_timestep_embedding", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
} else {
|
||||
GGML_LOG_WARN("ggml_opencl: timestep_embedding kernel source not found or empty. This op will not be available.\n");
|
||||
backend_ctx->program_tsembd = nullptr;
|
||||
backend_ctx->kernel_timestep_embedding = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// Adreno kernels
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// transpose
|
||||
@@ -1863,7 +2022,12 @@ static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t backend, const g
 }
 
 static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
-    GGML_UNUSED(backend);
+    auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
+
+    cl_event evt;
+    CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, 0, nullptr, &evt));
+    CL_CHECK(clWaitForEvents(1, &evt));
+    CL_CHECK(clReleaseEvent(evt));
 }
 
 // Syncronizes the 'backend_ctx's device with others so that commands

@@ -1976,9 +2140,12 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             case GGML_UNARY_OP_SILU:
             case GGML_UNARY_OP_RELU:
             case GGML_UNARY_OP_GELU_QUICK:
                 return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
             case GGML_UNARY_OP_SIGMOID:
                 return ggml_is_contiguous(op->src[0]);
+            case GGML_UNARY_OP_TANH:
+                return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
+                       (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
             default:
                 return false;
         }

@@ -1988,6 +2155,17 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
             return true;
+        case GGML_OP_REPEAT:
+            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
+        case GGML_OP_PAD:
+            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
+                   op->src[0]->ne[3] == 1 && op->ne[3] == 1;
+        case GGML_OP_UPSCALE:
+            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
+        case GGML_OP_CONCAT:
+            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
         case GGML_OP_GROUP_NORM:
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_MUL_MAT:

@@ -2052,7 +2230,7 @@ static ggml_backend_i ggml_backend_opencl_i = {
     /* .set_tensor_async        = */ NULL,  /* ggml_backend_opencl_set_tensor_async */
     /* .get_tensor_async        = */ NULL,  /* ggml_backend_opencl_get_tensor_async */
     /* .cpy_tensor_async        = */ NULL,  /* ggml_backend_opencl_cpy_tensor_async */
-    /* .synchronize             = */ NULL,  /* ggml_backend_opencl_synchronize */
+    /* .synchronize             = */ ggml_backend_opencl_synchronize,
     /* .graph_plan_create       = */ NULL,
     /* .graph_plan_free         = */ NULL,
     /* .graph_plan_update       = */ NULL,
@@ -4108,6 +4286,536 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
UNUSED(src1);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0_abs = extra0->offset + src0->view_offs;
|
||||
cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
|
||||
|
||||
cl_kernel kernel;
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_tanh_f32_nd;
|
||||
} else if (dst->type == GGML_TYPE_F16) {
|
||||
kernel = backend_ctx->kernel_tanh_f16_nd;
|
||||
} else {
|
||||
GGML_ASSERT(false && "Unsupported type for ggml_cl_tanh");
|
||||
}
|
||||
GGML_ASSERT(kernel != nullptr);
|
||||
|
||||
const int ne00 = src0->ne[0]; const int ne01 = src0->ne[1]; const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3];
|
||||
const cl_ulong nb00 = src0->nb[0]; const cl_ulong nb01 = src0->nb[1]; const cl_ulong nb02 = src0->nb[2]; const cl_ulong nb03 = src0->nb[3];
|
||||
|
||||
const int ne10 = dst->ne[0]; const int ne11 = dst->ne[1]; const int ne12 = dst->ne[2]; const int ne13 = dst->ne[3];
|
||||
const cl_ulong nb10 = dst->nb[0]; const cl_ulong nb11 = dst->nb[1]; const cl_ulong nb12 = dst->nb[2]; const cl_ulong nb13 = dst->nb[3];
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
|
||||
|
||||
size_t global_work_size[3];
|
||||
if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
|
||||
return;
|
||||
}
|
||||
global_work_size[0] = (size_t)ne10;
|
||||
global_work_size[1] = (size_t)ne11;
|
||||
global_work_size[2] = (size_t)ne12;
|
||||
|
||||
size_t lws0 = 16, lws1 = 4, lws2 = 1;
|
||||
if (ne10 < 16) lws0 = ne10;
|
||||
if (ne11 < 4) lws1 = ne11;
|
||||
if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
|
||||
|
||||
while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
|
||||
while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
|
||||
while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
|
||||
|
||||
|
||||
size_t local_work_size[] = {lws0, lws1, lws2};
|
||||
|
||||
size_t* local_work_size_ptr = local_work_size;
|
||||
if (!backend_ctx->non_uniform_workgroups) {
|
||||
if (global_work_size[0] % local_work_size[0] != 0 ||
|
||||
global_work_size[1] % local_work_size[1] != 0 ||
|
||||
global_work_size[2] % local_work_size[2] != 0) {
|
||||
local_work_size_ptr = NULL;
|
||||
}
|
||||
}
|
||||
if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
|
||||
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
GGML_ASSERT(dst->type == src0->type);
|
||||
|
||||
UNUSED(src1_shape_def);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
if (backend_ctx->kernel_repeat == nullptr) {
|
||||
GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
|
||||
cl_ulong off_dst = extra_dst->offset + dst->view_offs;
|
||||
|
||||
const int src0_ne0 = src0->ne[0]; const int src0_ne1 = src0->ne[1]; const int src0_ne2 = src0->ne[2]; const int src0_ne3 = src0->ne[3];
|
||||
const cl_ulong src0_nb0 = src0->nb[0]; const cl_ulong src0_nb1 = src0->nb[1]; const cl_ulong src0_nb2 = src0->nb[2]; const cl_ulong src0_nb3 = src0->nb[3];
|
||||
|
||||
const int dst_ne0 = dst->ne[0]; const int dst_ne1 = dst->ne[1]; const int dst_ne2 = dst->ne[2]; const int dst_ne3 = dst->ne[3];
|
||||
const cl_ulong dst_nb0 = dst->nb[0]; const cl_ulong dst_nb1 = dst->nb[1]; const cl_ulong dst_nb2 = dst->nb[2]; const cl_ulong dst_nb3 = dst->nb[3];
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_repeat;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra_dst->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &off_src0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &src0_ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &src0_ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &src0_ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &src0_ne3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &src0_nb0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &src0_nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &src0_nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &src0_nb3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &dst_ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &dst_ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &dst_ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dst_ne3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &dst_nb0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &dst_nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &dst_nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &dst_nb3));
|
||||
|
||||
size_t gws0 = dst_ne1 > 0 ? (size_t)dst_ne1 : 1;
|
||||
size_t gws1 = dst_ne2 > 0 ? (size_t)dst_ne2 : 1;
|
||||
size_t gws2 = dst_ne3 > 0 ? (size_t)dst_ne3 : 1;
|
||||
|
||||
size_t global_work_size[] = { gws0, gws1, gws2 };
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, (size_t[3]){0,0,0}, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
if (backend_ctx->kernel_pad == nullptr) {
|
||||
GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
|
||||
cl_ulong off_dst = extra_dst->offset + dst->view_offs;
|
||||
|
||||
const int s_ne0 = src0->ne[0];
|
||||
const int s_ne1 = src0->ne[1];
|
||||
const int s_ne2 = src0->ne[2];
|
||||
|
||||
const int d_ne0 = dst->ne[0];
|
||||
const int d_ne1 = dst->ne[1];
|
||||
const int d_ne2 = dst->ne[2];
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_pad;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &s_ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &s_ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &s_ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne2));
|
||||
|
||||
size_t lws0 = 64;
|
||||
size_t gws0 = (( (size_t)d_ne0 + lws0 - 1 ) / lws0) * lws0;
|
||||
|
||||
size_t global_work_size[] = { gws0, (size_t)d_ne1, (size_t)d_ne2 };
|
||||
size_t local_work_size[] = { lws0, 1, 1 };
|
||||
|
||||
size_t * local_work_size_ptr = local_work_size;
|
||||
if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
local_work_size_ptr = nullptr;
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
|
||||
cl_kernel kernel = nullptr;
|
||||
|
||||
if (mode == GGML_SCALE_MODE_NEAREST) {
|
||||
kernel = backend_ctx->kernel_upscale;
|
||||
if (kernel == nullptr) {
|
||||
GGML_LOG_WARN("%s: nearest upscale kernel not available, skipping OpenCL execution.\n", __func__);
|
||||
return;
|
||||
}
|
||||
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
|
||||
kernel = backend_ctx->kernel_upscale_bilinear;
|
||||
if (kernel == nullptr) {
|
||||
GGML_LOG_WARN("%s: bilinear upscale kernel not available, skipping OpenCL execution.\n", __func__);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
GGML_LOG_WARN("%s: unsupported upscale mode %d, skipping OpenCL execution.\n", __func__, mode);
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
|
||||
cl_ulong off_dst = extra_dst->offset + dst->view_offs;
|
||||
|
||||
const cl_ulong nb00 = src0->nb[0];
|
||||
const cl_ulong nb01 = src0->nb[1];
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
const cl_ulong nb03 = src0->nb[3];
|
||||
|
||||
const int ne00_src = src0->ne[0];
|
||||
const int ne01_src = src0->ne[1];
|
||||
|
||||
const int ne10_dst = dst->ne[0];
|
||||
const int ne11_dst = dst->ne[1];
|
||||
const int ne12_dst = dst->ne[2];
|
||||
const int ne13_dst = dst->ne[3];
|
||||
|
||||
const float sf0 = (float)dst->ne[0] / src0->ne[0];
|
||||
const float sf1 = (float)dst->ne[1] / src0->ne[1];
|
||||
const float sf2 = (float)dst->ne[2] / src0->ne[2];
|
||||
const float sf3 = (float)dst->ne[3] / src0->ne[3];
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb03));
|
||||
|
||||
if (mode == GGML_SCALE_MODE_NEAREST) {
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne10_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &sf0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &sf1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3));
|
||||
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00_src));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01_src));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(float), &sf2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(float), &sf3));
|
||||
}
|
||||
|
||||
|
||||
size_t dst_total_elements = (size_t)ne10_dst * ne11_dst * ne12_dst * ne13_dst;
|
||||
if (dst_total_elements == 0) {
|
||||
return;
|
||||
}
|
||||
size_t global_work_size[] = { dst_total_elements, 1, 1 };
|
||||
size_t local_work_size_pref = 256;
|
||||
size_t local_work_size[] = { MIN(local_work_size_pref, dst_total_elements), 1, 1};
|
||||
|
||||
size_t * local_work_size_ptr = local_work_size;
|
||||
if (dst_total_elements % local_work_size[0] != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
local_work_size_ptr = nullptr;
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
size_t profiling_gws[3] = {global_work_size[0], 1, 1};
|
||||
size_t profiling_lws[3] = {local_work_size_ptr ? local_work_size[0] : 0, 1, 1};
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
if (backend_ctx->kernel_concat_f32_contiguous == nullptr || backend_ctx->kernel_concat_f32_non_contiguous == nullptr) {
|
||||
GGML_LOG_WARN("%s: concat kernels not available, skipping OpenCL execution.\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_cl * extra0_cl = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra1_cl = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad_cl = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong off_src0 = extra0_cl->offset + src0->view_offs;
|
||||
cl_ulong off_src1 = extra1_cl->offset + src1->view_offs;
|
||||
cl_ulong off_dst = extrad_cl->offset + dst->view_offs;
|
||||
|
||||
const int32_t dim = ((const int32_t *) dst->op_params)[0];
|
||||
GGML_ASSERT(dim >= 0 && dim <= 3);
|
||||
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
|
||||
if (dim == 3) {
|
||||
|
||||
size_t nbytes_src0 = ggml_nbytes(src0);
|
||||
size_t nbytes_src1 = ggml_nbytes(src1);
|
||||
|
||||
CL_CHECK(clEnqueueCopyBuffer(queue, extra0_cl->data_device, extrad_cl->data_device,
|
||||
off_src0, off_dst, nbytes_src0, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueCopyBuffer(queue, extra1_cl->data_device, extrad_cl->data_device,
|
||||
off_src1, off_dst + nbytes_src0, nbytes_src1, 0, NULL, NULL));
|
||||
} else {
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_concat_f32_contiguous;
|
||||
size_t global_work_size[3];
|
||||
|
||||
for (int i3 = 0; i3 < dst->ne[3]; ++i3) {
|
||||
cl_ulong current_off_src0 = off_src0 + (i3 * src0->nb[3]);
|
||||
cl_ulong current_off_src1 = off_src1 + (i3 * src1->nb[3]);
|
||||
cl_ulong current_off_dst = off_dst + (i3 * dst->nb[3]);
|
||||
|
||||
int d_ne00 = src0->ne[0]; int d_ne01 = src0->ne[1]; int d_ne02 = src0->ne[2];
|
||||
int d_ne10 = src1->ne[0]; int d_ne11 = src1->ne[1]; int d_ne12 = src1->ne[2];
|
||||
int d_ne0 = dst->ne[0]; int d_ne1 = dst->ne[1]; int d_ne2 = dst->ne[2];
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), ¤t_off_src0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), ¤t_off_src1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), ¤t_off_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &d_ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &d_ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &d_ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &d_ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &d_ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &d_ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dim));
|
||||
|
||||
global_work_size[0] = d_ne0;
|
||||
global_work_size[1] = d_ne1;
|
||||
global_work_size[2] = d_ne2;
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous;
|
||||
|
||||
long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
|
||||
cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3];
|
||||
|
||||
cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3];
|
||||
|
||||
long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3];
|
||||
cl_ulong d_nb0 = dst->nb[0], d_nb1 = dst->nb[1], d_nb2 = dst->nb[2], d_nb3 = dst->nb[3];
|
||||
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_src1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &off_dst));
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(long), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(long), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(long), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(long), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(long), &d_ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(long), &d_ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(long), &d_ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(long), &d_ne3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &d_nb0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &d_nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong), &d_nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(cl_ulong), &d_nb3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &dim));
|
||||
|
||||
size_t global_work_size_nc[] = { d_ne1 > 0 ? (size_t)d_ne1 : 1,
|
||||
d_ne2 > 0 ? (size_t)d_ne2 : 1,
|
||||
d_ne3 > 0 ? (size_t)d_ne3 : 1 };
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size_nc, NULL, 0, NULL, NULL));
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
if (backend_ctx->kernel_timestep_embedding == nullptr) {
|
||||
GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
|
||||
cl_ulong off_dst = extra_dst->offset + dst->view_offs;
|
||||
|
||||
const int logical_dim = dst->op_params[0];
|
||||
const int max_period = dst->op_params[1];
|
||||
const int dst_nb1_bytes = dst->nb[1];
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_timestep_embedding;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &dst_nb1_bytes));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &logical_dim));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &max_period));
|
||||
|
||||
size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1);
|
||||
|
||||
size_t gws1 = (size_t)src0->ne[0];
|
||||
|
||||
size_t global_work_size[] = {gws0, gws1, 1};
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, &evt)); // Pass 2 for 2D problem
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
size_t profiling_gws[3] = {global_work_size[0], global_work_size[1], 1};
|
||||
size_t profiling_lws[3] = {0,0,0}; // Reflects NULL LWS
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL)); // Pass 2 for 2D problem
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -5667,6 +6375,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
                 }
                 func = ggml_cl_sigmoid;
                 break;
+            case GGML_UNARY_OP_TANH:
+                if (!any_on_device) {
+                    return false;
+                }
+                func = ggml_cl_tanh;
+                break;
             default:
                 return false;
         } break;

@@ -5694,6 +6408,36 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_group_norm;
             break;
+        case GGML_OP_REPEAT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_repeat;
+            break;
+        case GGML_OP_PAD:
+            if (!any_on_device) {
+                return false;
+            }
+            ggml_cl_pad(backend, tensor->src[0], tensor);
+            return true;
+        case GGML_OP_UPSCALE:
+            if (!any_on_device) {
+                return false;
+            }
+            ggml_cl_upscale(backend, tensor->src[0], tensor);
+            return true;
+        case GGML_OP_CONCAT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_concat;
+            break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            if (!any_on_device) {
+                return false;
+            }
+            ggml_cl_timestep_embedding(backend, tensor->src[0], tensor);
+            return true;
         case GGML_OP_MUL_MAT:
             if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
                 return false;
ggml/src/ggml-opencl/kernels/concat.cl (new file, 109 lines)
@@ -0,0 +1,109 @@
|
||||
kernel void kernel_concat_f32_contiguous(
|
||||
global const char * p_src0, ulong off_src0,
|
||||
global const char * p_src1, ulong off_src1,
|
||||
global char * p_dst, ulong off_dst,
|
||||
int d_ne00, int d_ne01, int d_ne02, // src0->ne[0..2] for the slice
|
||||
int d_ne10, int d_ne11, int d_ne12, // src1->ne[0..2] for the slice (d_ne1X must match d_ne0X on non-concat axes)
|
||||
int d_ne0, int d_ne1, int d_ne2, // dst->ne[0..2] for the slice
|
||||
int dim
|
||||
) {
|
||||
global const float * src0 = (global const float*)((global char*)p_src0 + off_src0);
|
||||
global const float * src1 = (global const float*)((global char*)p_src1 + off_src1);
|
||||
global float * dst = (global float*)((global char*)p_dst + off_dst);
|
||||
|
||||
int i0 = get_global_id(0); // Index along dst's 0th dimension
|
||||
int i1 = get_global_id(1); // Index along dst's 1st dimension
|
||||
int i2 = get_global_id(2); // Index along dst's 2nd dimension
|
||||
|
||||
if (i0 >= d_ne0 || i1 >= d_ne1 || i2 >= d_ne2) {
|
||||
return;
|
||||
}
|
||||
|
||||
ulong dst_idx = (ulong)i2 * d_ne0 * d_ne1 + (ulong)i1 * d_ne0 + i0;
|
||||
ulong src_idx;
|
||||
|
||||
if (dim == 0) {
|
||||
if (i0 < d_ne00) { // Data from src0
|
||||
src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
|
||||
dst[dst_idx] = src0[src_idx];
|
||||
} else { // Data from src1
|
||||
src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + (i0 - d_ne00);
|
||||
dst[dst_idx] = src1[src_idx];
|
||||
}
|
||||
} else if (dim == 1) {
|
||||
if (i1 < d_ne01) { // Data from src0
|
||||
src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
|
||||
dst[dst_idx] = src0[src_idx];
|
||||
} else { // Data from src1
|
||||
src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)(i1 - d_ne01) * d_ne10 + i0;
|
||||
dst[dst_idx] = src1[src_idx];
|
||||
}
|
||||
} else if (dim == 2) {
|
||||
if (i2 < d_ne02) { // Data from src0
|
||||
src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
|
||||
dst[dst_idx] = src0[src_idx];
|
||||
} else { // Data from src1
|
||||
|
||||
src_idx = (ulong)(i2 - d_ne02) * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + i0;
|
||||
dst[dst_idx] = src1[src_idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_concat_f32_non_contiguous(
|
||||
global const char * p_src0, ulong off_src0,
|
||||
global const char * p_src1, ulong off_src1,
|
||||
global char * p_dst, ulong off_dst,
|
||||
|
||||
long ne00, long ne01, long ne02, long ne03,
|
||||
ulong nb00, ulong nb01, ulong nb02, ulong nb03,
|
||||
|
||||
ulong nb10, ulong nb11, ulong nb12, ulong nb13, // Strides for src1
|
||||
|
||||
long d_ne0, long d_ne1, long d_ne2, long d_ne3,
|
||||
ulong d_nb0, ulong d_nb1, ulong d_nb2, ulong d_nb3,
|
||||
int dim
|
||||
) {
|
||||
global const char * src0_base = p_src0 + off_src0;
|
||||
global const char * src1_base = p_src1 + off_src1;
|
||||
global char * dst_base = p_dst + off_dst;
|
||||
|
||||
long current_i1 = get_global_id(0); // Index for dst_dim_1
|
||||
long current_i2 = get_global_id(1); // Index for dst_dim_2
|
||||
long current_i3 = get_global_id(2); // Index for dst_dim_3
|
||||
|
||||
if (current_i1 >= d_ne1 || current_i2 >= d_ne2 || current_i3 >= d_ne3) {
|
||||
return;
|
||||
}
|
||||
|
||||
global const float * x_val_ptr;
|
||||
global float * y_val_ptr;
|
||||
|
||||
for (long current_i0 = 0; current_i0 < d_ne0; ++current_i0) {
|
||||
bool use_src0;
|
||||
long s_i0 = current_i0, s_i1 = current_i1, s_i2 = current_i2, s_i3 = current_i3;
|
||||
|
||||
if (dim == 0) {
|
||||
use_src0 = (current_i0 < ne00);
|
||||
if (!use_src0) { s_i0 = current_i0 - ne00; }
|
||||
} else if (dim == 1) {
|
||||
use_src0 = (current_i1 < ne01);
|
||||
if (!use_src0) { s_i1 = current_i1 - ne01; }
|
||||
} else if (dim == 2) {
|
||||
use_src0 = (current_i2 < ne02);
|
||||
if (!use_src0) { s_i2 = current_i2 - ne02; }
|
||||
} else { // dim == 3
|
||||
use_src0 = (current_i3 < ne03);
|
||||
if (!use_src0) { s_i3 = current_i3 - ne03; }
|
||||
}
|
||||
|
||||
if (use_src0) {
|
||||
x_val_ptr = (global const float *)(src0_base + (ulong)s_i3*nb03 + (ulong)s_i2*nb02 + (ulong)s_i1*nb01 + (ulong)s_i0*nb00);
|
||||
} else {
|
||||
x_val_ptr = (global const float *)(src1_base + (ulong)s_i3*nb13 + (ulong)s_i2*nb12 + (ulong)s_i1*nb11 + (ulong)s_i0*nb10);
|
||||
}
|
||||
|
||||
y_val_ptr = (global float *)(dst_base + (ulong)current_i3*d_nb3 + (ulong)current_i2*d_nb2 + (ulong)current_i1*d_nb1 + (ulong)current_i0*d_nb0);
|
||||
*y_val_ptr = *x_val_ptr;
|
||||
}
|
||||
}
|
||||
ggml/src/ggml-opencl/kernels/pad.cl (new file, 30 lines)
@@ -0,0 +1,30 @@
kernel void kernel_pad(
        global const void * src0_ptr,
        ulong src0_offset,
        global void * dst_ptr,
        ulong dst_offset,
        int s_ne0, int s_ne1, int s_ne2,
        int d_ne0, int d_ne1, int d_ne2
) {
    global const float * src0 = (global const float *)((global const char *)src0_ptr + src0_offset);
    global float * dst = (global float *)((global char *)dst_ptr + dst_offset);

    int nidx = get_global_id(0);
    int idx_d1 = get_group_id(1);
    int idx_d2 = get_group_id(2);

    if (nidx >= d_ne0) {
        return;
    }

    int dst_el_offset = nidx + idx_d1 * d_ne0 + idx_d2 * d_ne0 * d_ne1;

    bool in_src_bounds = (nidx < s_ne0) && (idx_d1 < s_ne1) && (idx_d2 < s_ne2);

    if (in_src_bounds) {
        int src_el_offset = nidx + idx_d1 * s_ne0 + idx_d2 * s_ne0 * s_ne1;
        dst[dst_el_offset] = src0[src_el_offset];
    } else {
        dst[dst_el_offset] = 0.0f;
    }
}
ggml/src/ggml-opencl/kernels/repeat.cl (new file, 39 lines)
@@ -0,0 +1,39 @@
kernel void kernel_repeat(
    global const char * src0_data_in,
    global char * dst_data_in,
    ulong src0_offset,
    ulong dst_offset,
    int src0_ne0, int src0_ne1, int src0_ne2, int src0_ne3,
    ulong src0_nb0, ulong src0_nb1, ulong src0_nb2, ulong src0_nb3,
    int dst_ne0, int dst_ne1, int dst_ne2, int dst_ne3,
    ulong dst_nb0, ulong dst_nb1, ulong dst_nb2, ulong dst_nb3
) {
    global const char * src0_data = src0_data_in + src0_offset;
    global char * dst_data = dst_data_in + dst_offset;

    const int d3 = get_global_id(2);
    const int d2 = get_global_id(1);
    const int d1 = get_global_id(0);

    if (d3 >= dst_ne3 || d2 >= dst_ne2 || d1 >= dst_ne1) {
        return;
    }

    const int s3 = d3 % src0_ne3;
    const int s2 = d2 % src0_ne2;
    const int s1 = d1 % src0_ne1;

    const global char * p_src0_slice = src0_data + (ulong)s3*src0_nb3 + (ulong)s2*src0_nb2 + (ulong)s1*src0_nb1;
    global char * p_dst_slice = dst_data + (ulong)d3*dst_nb3 + (ulong)d2*dst_nb2 + (ulong)d1*dst_nb1;

    for (int d0 = 0; d0 < dst_ne0; ++d0) {
        // Determine source index for dimension 0 based on tiling/broadcasting.
        const int s0 = d0 % src0_ne0;

        const global char * restrict current_src_el_ptr = p_src0_slice + (ulong)s0*src0_nb0;
        global char * restrict current_dst_el_ptr = p_dst_slice + (ulong)d0*dst_nb0;
        for (int k = 0; k < src0_nb0; ++k) {
            current_dst_el_ptr[k] = current_src_el_ptr[k];
        }
    }
}
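The repeat kernel maps every destination coordinate back to a source coordinate with a modulo, which covers both tiling and broadcasting in one expression. A minimal scalar sketch of that mapping, assuming contiguous row-major float storage (names are illustrative, not part of the patch):

// Element-wise semantics of kernel_repeat for contiguous float tensors.
static float repeat_at(const float * src,
                       int src_ne0, int src_ne1, int src_ne2, int src_ne3,
                       int d0, int d1, int d2, int d3) {
    const int s0 = d0 % src_ne0; // wraps for tiling, collapses to 0 when src_ne0 == 1
    const int s1 = d1 % src_ne1;
    const int s2 = d2 % src_ne2;
    const int s3 = d3 % src_ne3;
    return src[((s3*src_ne2 + s2)*src_ne1 + s1)*src_ne0 + s0];
}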
ggml/src/ggml-opencl/kernels/tanh.cl (new file, 63 lines)
@@ -0,0 +1,63 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif

kernel void kernel_tanh_f32_nd(
    global void * p_src0_base, ulong off_src0_abs,
    global void * p_dst_base, ulong off_dst_abs,
    int ne00, int ne01, int ne02, int ne03,
    ulong nb00, ulong nb01, ulong nb02, ulong nb03,
    int ne10, int ne11, int ne12, int ne13,
    ulong nb10, ulong nb11, ulong nb12, ulong nb13
) {
    int i0 = get_global_id(0);
    int i1 = get_global_id(1);
    int i2 = get_global_id(2);

    if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
        for (int i3 = 0; i3 < ne13; ++i3) {
            ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
            global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);

            ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
            global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);

            *dst_val_ptr = tanh(*src_val_ptr);
        }
    }
}

kernel void kernel_tanh_f16_nd(
    global void * p_src0_base, ulong off_src0_abs,
    global void * p_dst_base, ulong off_dst_abs,
    int ne00, int ne01, int ne02, int ne03,
    ulong nb00, ulong nb01, ulong nb02, ulong nb03,
    int ne10, int ne11, int ne12, int ne13,
    ulong nb10, ulong nb11, ulong nb12, ulong nb13
) {
    int i0 = get_global_id(0);
    int i1 = get_global_id(1);
    int i2 = get_global_id(2);

    if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
        for (int i3 = 0; i3 < ne13; ++i3) {
            ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
            global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);

            ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
            global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);

            *dst_val_ptr = tanh(*src_val_ptr);
        }
    }
}
ggml/src/ggml-opencl/kernels/tsembd.cl (new file, 48 lines)
@@ -0,0 +1,48 @@
kernel void kernel_timestep_embedding(
    global const void * p_timesteps,
    ulong off_timesteps,
    global void * p_dst,
    ulong off_dst,
    int dst_nb1_bytes,
    int logical_dim,
    int max_period
) {
    int local_i;
    int local_j;
    int local_half_dim;
    float local_timestep_val;
    float local_freq;
    float local_arg;
    global float * local_embed_data_ptr;
    global const float * local_timesteps_input_ptr;
    global float * local_dst_output_base_ptr;

    local_timesteps_input_ptr = (global const float *)((global char *)p_timesteps + off_timesteps);
    local_dst_output_base_ptr = (global float *)((global char *)p_dst + off_dst);

    local_i = get_global_id(1);
    local_j = get_global_id(0);

    local_half_dim = logical_dim / 2;
    local_embed_data_ptr = (global float *)((global char *)local_dst_output_base_ptr + local_i * dst_nb1_bytes);

    if (logical_dim % 2 != 0 && local_j == ((logical_dim + 1) / 2)) {
        local_embed_data_ptr[logical_dim] = 0.0f;
    }

    if (local_j >= local_half_dim) {
        return;
    }

    local_timestep_val = local_timesteps_input_ptr[local_i];

    if (local_half_dim == 0) {
        local_freq = 1.0f;
    } else {
        local_freq = exp(-log((float)max_period) * (float)local_j / (float)local_half_dim);
    }

    local_arg = local_timestep_val * local_freq;
    local_embed_data_ptr[local_j] = cos(local_arg);
    local_embed_data_ptr[local_j + local_half_dim] = sin(local_arg);
}
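For reference, the embedding written by this kernel follows the usual sinusoidal timestep schedule: column j uses frequency exp(-log(max_period) * j / half_dim), and the cos/sin pair is stored half_dim entries apart. A scalar sketch of one output row (illustrative only, not part of the patch):

#include <cmath>

// One row of the timestep embedding. With max_period = 10000, dim = 4 and t = 1:
// freq_0 = 1.0, freq_1 = 0.01, so out = { cos(1), cos(0.01), sin(1), sin(0.01) }.
static void timestep_embedding_row(float t, int dim, int max_period, float * out) {
    const int half = dim / 2;
    for (int j = 0; j < half; ++j) {
        const float freq = std::exp(-std::log((float)max_period) * j / half);
        out[j]        = std::cos(t * freq);
        out[j + half] = std::sin(t * freq);
    }
    if (dim % 2 != 0) {
        out[dim] = 0.0f; // odd dims get a trailing zero, mirroring the kernel
    }
}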
ggml/src/ggml-opencl/kernels/upscale.cl (new file, 121 lines)
@@ -0,0 +1,121 @@
kernel void kernel_upscale(
    global const void * p_src0,
    ulong off_src0,
    global void * p_dst,
    ulong off_dst,
    ulong nb00,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    int ne10,
    int ne11,
    int ne12,
    int ne13,
    float sf0,
    float sf1,
    float sf2,
    float sf3
) {
    global const char * src_base = (global const char *)p_src0 + off_src0;
    global float * dst_base = (global float *)((global char *)p_dst + off_dst);

    int index = get_global_id(0);
    int dst_total_elements = ne10 * ne11 * ne12 * ne13;

    if (index >= dst_total_elements) {
        return;
    }

    int i10 = index % ne10;
    int i11 = (index / ne10) % ne11;
    int i12 = (index / (ne10 * ne11)) % ne12;
    int i13 = index / (ne10 * ne11 * ne12);

    int i00 = (int)(i10 / sf0);
    int i01 = (int)(i11 / sf1);
    int i02 = (int)(i12 / sf2);
    int i03 = (int)(i13 / sf3);

    ulong offset_src_element = (ulong)i03 * nb03 + (ulong)i02 * nb02 + (ulong)i01 * nb01 + (ulong)i00 * nb00;
    global const float * src_element_ptr = (global const float *)(src_base + offset_src_element);

    dst_base[index] = *src_element_ptr;
}

kernel void kernel_upscale_bilinear(
    global const void * p_src0,
    ulong off_src0,
    global void * p_dst,
    ulong off_dst,
    ulong nb00,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    int ne00_src,
    int ne01_src,
    int ne10_dst,
    int ne11_dst,
    int ne12_dst,
    int ne13_dst,
    float sf0,
    float sf1,
    float sf2,
    float sf3
) {
    global const char * src_base = (global const char *)p_src0 + off_src0;
    global float * dst_base = (global float *)((global char *)p_dst + off_dst);

    int index = get_global_id(0);
    int dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst;

    if (index >= dst_total_elements) {
        return;
    }

    int i10_dst = index % ne10_dst;
    int i11_dst = (index / ne10_dst) % ne11_dst;
    int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst;
    int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst);

    int i02_src = (int)(i12_dst / sf2);
    int i03_src = (int)(i13_dst / sf3);

    const float pixel_offset = 0.5f;

    float y_src_f = ((float)i11_dst + pixel_offset) / sf1 - pixel_offset;
    long y0_src = (long)floor(y_src_f);
    long y1_src = y0_src + 1;

    y0_src = max(0L, min(y0_src, (long)ne01_src - 1));
    y1_src = max(0L, min(y1_src, (long)ne01_src - 1));

    float dy = y_src_f - (float)y0_src;
    dy = max(0.0f, min(dy, 1.0f));

    float x_src_f = ((float)i10_dst + pixel_offset) / sf0 - pixel_offset;
    long x0_src = (long)floor(x_src_f);
    long x1_src = x0_src + 1;

    x0_src = max(0L, min(x0_src, (long)ne00_src - 1));
    x1_src = max(0L, min(x1_src, (long)ne00_src - 1));

    float dx = x_src_f - (float)x0_src;
    dx = max(0.0f, min(dx, 1.0f));

    global const float * p_a = (global const float *)(src_base + (ulong)x0_src * nb00 + (ulong)y0_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03);
    global const float * p_b = (global const float *)(src_base + (ulong)x1_src * nb00 + (ulong)y0_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03);
    global const float * p_c = (global const float *)(src_base + (ulong)x0_src * nb00 + (ulong)y1_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03);
    global const float * p_d = (global const float *)(src_base + (ulong)x1_src * nb00 + (ulong)y1_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03);

    const float val_a = *p_a;
    const float val_b = *p_b;
    const float val_c = *p_c;
    const float val_d = *p_d;

    float result = val_a * (1.0f - dx) * (1.0f - dy) +
                   val_b * dx * (1.0f - dy) +
                   val_c * (1.0f - dx) * dy +
                   val_d * dx * dy;

    dst_base[index] = result;
}
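The bilinear path samples the four neighbouring source texels and blends them with weights derived from the fractional offsets dx and dy (the half-texel pixel_offset keeps sampling centred). A small reference of the blend itself (illustrative, not part of the patch):

// Bilinear blend used by kernel_upscale_bilinear. For dx = 0.25, dy = 0.5 the
// weights are a:0.375, b:0.125, c:0.375, d:0.125 and always sum to 1.
static float bilinear_blend(float a, float b, float c, float d, float dx, float dy) {
    return a * (1.0f - dx) * (1.0f - dy)
         + b * dx          * (1.0f - dy)
         + c * (1.0f - dx) * dy
         + d * dx          * dy;
}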
@@ -396,6 +396,7 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_count_equal_i32;
|
||||
vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
|
||||
vk_pipeline pipeline_timestep_embedding_f32;
|
||||
vk_pipeline pipeline_conv_transpose_1d_f32;
|
||||
vk_pipeline pipeline_pool2d_f32;
|
||||
vk_pipeline pipeline_rwkv_wkv6_f32;
|
||||
vk_pipeline pipeline_rwkv_wkv7_f32;
|
||||
@@ -444,7 +445,7 @@ struct vk_device_struct {
|
||||
// for GGML_VK_PERF_LOGGER
|
||||
std::unique_ptr<vk_perf_logger> perf_logger;
|
||||
vk::QueryPool query_pool;
|
||||
uint32_t num_queries;
|
||||
int32_t num_queries;
|
||||
|
||||
~vk_device_struct() {
|
||||
VK_LOG_DEBUG("destroy device " << name);
|
||||
@@ -706,6 +707,21 @@ struct vk_op_timestep_embedding_push_constants {
|
||||
uint32_t max_period;
|
||||
};
|
||||
|
||||
struct vk_op_conv_transpose_1d_push_constants {
|
||||
uint32_t Cout;
|
||||
uint32_t Cin;
|
||||
uint32_t K;
|
||||
uint32_t L;
|
||||
uint32_t KL;
|
||||
|
||||
uint32_t nb01;
|
||||
uint32_t nb02;
|
||||
uint32_t nb11;
|
||||
uint32_t nb1;
|
||||
|
||||
int32_t s0;
|
||||
};
|
||||
|
||||
struct vk_op_pool2d_push_constants {
|
||||
uint32_t IW; uint32_t IH;
|
||||
uint32_t OW; uint32_t OH;
|
||||
@@ -2726,6 +2742,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
||||
@@ -6392,6 +6410,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_timestep_embedding_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_conv_transpose_1d_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_POOL_2D:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_pool2d_f32;
|
||||
@@ -6726,6 +6749,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
uint32_t half_ceil = (dim + 1) / 2;
|
||||
elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
|
||||
} break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
{
|
||||
elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1}
|
||||
} break;
|
||||
case GGML_OP_POOL_2D:
|
||||
{
|
||||
const uint32_t N = dst->ne[3];
|
||||
@@ -7529,6 +7556,37 @@ static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context
|
||||
}, dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
||||
// src0: (K, Cout, Cin, 1) -- kernel
|
||||
// src1: (L, Cin, 1, 1) -- input
|
||||
// dst: (*, Cout, 1, 1)
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
GGML_ASSERT(nb00 == sizeof(float));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
const int32_t s0 = dst->op_params[0];
|
||||
|
||||
vk_op_conv_transpose_1d_push_constants p{};
|
||||
p.Cout = static_cast<uint32_t>(ne01);
|
||||
p.Cin = static_cast<uint32_t>(ne02);
|
||||
p.K = static_cast<uint32_t>(ne00);
|
||||
p.L = static_cast<uint32_t>(ne10);
|
||||
p.KL = static_cast<uint32_t>(ne0);
|
||||
p.nb01 = static_cast<uint32_t>(nb01 / nb00);
|
||||
p.nb02 = static_cast<uint32_t>(nb02 / nb00);
|
||||
p.nb11 = static_cast<uint32_t>(nb11 / nb10);
|
||||
p.nb1 = static_cast<uint32_t>(nb1 / nb0);
|
||||
p.s0 = static_cast<uint32_t>(s0);
|
||||
|
||||
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p), dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
||||
uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
|
||||
const int32_t k1 = dst->op_params[1];
|
||||
@@ -8600,6 +8658,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
@@ -8664,6 +8723,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
@@ -8835,6 +8895,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);
|
||||
|
||||
break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node, dryrun);
|
||||
|
||||
break;
|
||||
case GGML_OP_POOL_2D:
|
||||
ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
|
||||
@@ -8963,6 +9027,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
@@ -9513,8 +9578,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
if (ctx->device->query_pool) {
|
||||
ctx->device->device.destroyQueryPool(ctx->device->query_pool);
|
||||
}
|
||||
VkQueryPoolCreateInfo query_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
|
||||
query_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
|
||||
vk::QueryPoolCreateInfo query_create_info;
|
||||
query_create_info.queryType = vk::QueryType::eTimestamp;
|
||||
query_create_info.queryCount = cgraph->n_nodes + 100;
|
||||
ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
|
||||
ctx->device->num_queries = query_create_info.queryCount;
|
||||
@@ -9600,7 +9665,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
|
||||
// Get the results and pass them to the logger
|
||||
std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
|
||||
ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
|
||||
VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
if (!ggml_vk_is_empty(cgraph->nodes[i])) {
|
||||
ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
|
||||
@@ -10024,6 +10089,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
return true;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -10515,6 +10582,11 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
|
||||
const int32_t dim = tensor->op_params[0];
|
||||
const int32_t max_period = tensor->op_params[1];
|
||||
tensor_clone = ggml_timestep_embedding(ggml_ctx, src_clone[0], dim, max_period);
|
||||
} else if (tensor->op == GGML_OP_CONV_TRANSPOSE_1D){
|
||||
const int32_t s0 = tensor->op_params[0];
|
||||
const int32_t p0 = tensor->op_params[1];
|
||||
const int32_t d0 = tensor->op_params[2];
|
||||
tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0);
|
||||
} else if (tensor->op == GGML_OP_POOL_2D) {
|
||||
enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
|
||||
const int32_t k0 = tensor->op_params[1];
|
||||
|
||||
ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp (new file, 98 lines)
@@ -0,0 +1,98 @@
|
||||
#version 450
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; // src0 - kernel: [K, Cout, Cin]
|
||||
layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; // src1 - input: [L, Cin]
|
||||
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; // dst - result [KL, Cout]
|
||||
|
||||
layout(local_size_x = 128 , local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint32_t Cout;
|
||||
uint32_t Cin;
|
||||
uint32_t K;
|
||||
uint32_t L;
|
||||
uint32_t KL;
|
||||
|
||||
uint32_t nb01;
|
||||
uint32_t nb02;
|
||||
uint32_t nb11;
|
||||
uint32_t nb1;
|
||||
|
||||
int32_t s0;
|
||||
} p;
|
||||
|
||||
|
||||
uint32_t Cout_idx = gl_WorkGroupID.x;
|
||||
const uint32_t bs = gl_WorkGroupSize.x;
|
||||
uint32_t tid = gl_LocalInvocationID.x;
|
||||
// Code is more straightforward if we assume it is bs*s0+K instead of (bs-1)*s0+K.
|
||||
uint32_t tmp_len = bs*p.s0+p.K;
|
||||
shared D_TYPE tmp[4096];
|
||||
|
||||
uint splitWork(uint workSize){
|
||||
return (bs + workSize -1) / bs;
|
||||
}
|
||||
|
||||
void main(){
|
||||
for(uint32_t i = 0; i < splitWork(tmp_len); i++){
|
||||
uint32_t idx = i*bs+tid;
|
||||
if(idx < tmp_len){
|
||||
tmp[idx] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t L_blocks = splitWork(p.L);
|
||||
for(uint32_t L_block_id = 0; L_block_id < L_blocks; L_block_id++){
|
||||
if(L_block_id > 0){
|
||||
barrier();
|
||||
// Shift values in tmp to the current processing window
|
||||
for(int i = 0; i < splitWork(tmp_len); i++){
|
||||
uint32_t idx = i*bs+tid;
|
||||
if(idx >= bs*p.s0 && idx < tmp_len){
|
||||
tmp[idx-bs*p.s0] = tmp[idx];
|
||||
tmp[idx] = 0.0;
|
||||
}else if(idx >= p.K && idx < bs*p.s0){
|
||||
tmp[idx] = 0.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
|
||||
// Save contributions of the block to tmp
|
||||
uint32_t L_idx = L_block_id*bs + tid;
|
||||
for(uint32_t K_idx = 0; K_idx < p.K; K_idx++){
|
||||
D_TYPE dp = 0.0;
|
||||
for(uint32_t Cin_idx = 0; Cin_idx < p.Cin; Cin_idx++){
|
||||
A_TYPE elemKrn = data_a[K_idx + Cout_idx * p.nb01 + Cin_idx * p.nb02];
|
||||
if(L_idx < p.L){
|
||||
B_TYPE elemInp = data_b[L_idx + Cin_idx*p.nb11];
|
||||
dp = fma(elemKrn, elemInp, dp);
|
||||
}
|
||||
}
|
||||
tmp[tid*p.s0 + K_idx] += dp;
|
||||
barrier();
|
||||
}
|
||||
|
||||
// Save the computed values except the last block that can have different size
|
||||
uint32_t KLb_idx = L_block_id*bs*p.s0;
|
||||
if(L_block_id < L_blocks-1){
|
||||
for(uint32_t s0_idx = 0; s0_idx < p.s0; s0_idx++){
|
||||
uint32_t sh_idx = p.s0*tid+s0_idx;
|
||||
uint32_t KL_idx = KLb_idx+sh_idx;
|
||||
if(KL_idx < p.KL){
|
||||
data_d[KL_idx + Cout_idx*p.nb1] = tmp[sh_idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for(uint32_t i = 0; i < splitWork(tmp_len); i++){
|
||||
uint32_t idx = i*bs+tid;
|
||||
uint32_t KL_idx = (L_blocks-1)*bs*p.s0+idx;
|
||||
if(KL_idx < p.KL){
|
||||
data_d[KL_idx + Cout_idx*p.nb1] = tmp[idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
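The shared-memory shader above accumulates each input block's contributions into a sliding window of the output, one workgroup per output channel. For the configurations exercised here (padding 0, dilation 1), the per-channel output length KL carried in the push constants is the usual transposed-convolution size; a small helper for reference (assumed formula, not part of the patch):

// Per-channel output length of a 1D transposed convolution with padding 0 and
// dilation 1: e.g. L = 197, K = 16, s0 = 1 gives 212 samples.
static int conv_transpose_1d_out_len(int L, int K, int s0) {
    return (L - 1)*s0 + K;
}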
@@ -622,6 +622,8 @@ void process_shaders() {
|
||||
|
||||
string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
|
||||
|
||||
@@ -429,22 +429,54 @@ const llama_kv_cache * llama_context::get_kv_self() const {
|
||||
return kv_self;
|
||||
}
|
||||
|
||||
bool llama_context::kv_self_update() {
|
||||
void llama_context::kv_self_defrag_sched() {
|
||||
if (!memory) {
|
||||
return;
|
||||
}
|
||||
|
||||
memory_force_optimize = true;
|
||||
}
|
||||
|
||||
bool llama_context::kv_self_update(bool optimize) {
|
||||
if (!memory) {
|
||||
return false;
|
||||
}
|
||||
|
||||
llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
|
||||
|
||||
if (!kv_self->update(*this)) {
|
||||
// no updates have been performed
|
||||
return false;
|
||||
{
|
||||
// TODO: remove in the future
|
||||
optimize |= memory_force_optimize;
|
||||
memory_force_optimize = false;
|
||||
|
||||
const auto kv_state = kv_self->init_update(this, optimize);
|
||||
switch (kv_state->get_status()) {
|
||||
case LLAMA_MEMORY_STATUS_SUCCESS:
|
||||
{
|
||||
// noop
|
||||
} break;
|
||||
case LLAMA_MEMORY_STATUS_NO_UPDATE:
|
||||
{
|
||||
// no updates need to be performed
|
||||
return false;
|
||||
}
|
||||
case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
|
||||
case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
|
||||
{
|
||||
LLAMA_LOG_ERROR("%s: failed to prepare memory update\n", __func__);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!kv_state->apply()) {
|
||||
LLAMA_LOG_ERROR("%s: failed to apply memory update\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
// if the KV cache did any computation, we have to reserve a new worst-case graph
|
||||
const auto kv_state = kv_self->init_full();
|
||||
if (!kv_state) {
|
||||
throw std::runtime_error("failed to initialize KV cache");
|
||||
throw std::runtime_error("failed to initialize memory state");
|
||||
}
|
||||
|
||||
const uint32_t n_seqs = cparams.n_seq_max;
|
||||
@@ -452,7 +484,7 @@ bool llama_context::kv_self_update() {
|
||||
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, kv_state.get());
|
||||
if (!gf) {
|
||||
LLAMA_LOG_ERROR("%s: failed to reserve graph after the KV cache update\n", __func__);
|
||||
LLAMA_LOG_ERROR("%s: failed to reserve graph after the memory update\n", __func__);
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -940,13 +972,13 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
n_outputs_all = 1;
|
||||
}
|
||||
|
||||
bool did_optimize = false;
|
||||
|
||||
// handle any pending defrags/shifts
|
||||
kv_self_update();
|
||||
kv_self_update(false);
|
||||
|
||||
llama_memory_state_ptr kv_state;
|
||||
|
||||
bool did_defrag = false;
|
||||
|
||||
while (true) {
|
||||
kv_state = kv_self->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all);
|
||||
if (!kv_state) {
|
||||
@@ -957,25 +989,32 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
case LLAMA_MEMORY_STATUS_SUCCESS:
|
||||
{
|
||||
} break;
|
||||
case LLAMA_MEMORY_STATUS_NO_UPDATE:
|
||||
{
|
||||
LLAMA_LOG_ERROR("%s: unexpected memory state status: %d\n", __func__, kv_state->get_status());
|
||||
|
||||
return -2;
|
||||
}
|
||||
case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
|
||||
{
|
||||
if (!did_defrag) {
|
||||
did_defrag = true;
|
||||
if (!did_optimize) {
|
||||
did_optimize = true;
|
||||
|
||||
kv_self->defrag_sched(-1.0f);
|
||||
if (kv_self_update()) {
|
||||
LLAMA_LOG_DEBUG("%s: failed to init batch of size %d, retrying after defrag\n", __func__, batch.n_tokens);
|
||||
if (kv_self_update(true)) {
|
||||
LLAMA_LOG_DEBUG("%s: retrying batch size %d after cache optimization\n", __func__, batch.n_tokens);
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
|
||||
LLAMA_LOG_WARN("%s: failed to find a memory slot for batch of size %d\n", __func__, batch.n_tokens);
|
||||
|
||||
return 1;
|
||||
}
|
||||
case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
|
||||
{
|
||||
LLAMA_LOG_ERROR("%s: compute failed while preparing batch of size %d\n", __func__, batch.n_tokens);
|
||||
|
||||
return -2;
|
||||
}
|
||||
}
|
||||
@@ -1189,11 +1228,6 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
// wait for the computation to finish (automatically done when obtaining the model output)
|
||||
//synchronize();
|
||||
|
||||
// decide if we need to defrag the kv cache
|
||||
if (cparams.defrag_thold > 0.0f) {
|
||||
kv_self->defrag_sched(cparams.defrag_thold);
|
||||
}
|
||||
|
||||
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
|
||||
// overlap with device computation.
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
@@ -2283,7 +2317,7 @@ llama_kv_cache * llama_get_kv_self(llama_context * ctx) {
|
||||
|
||||
// deprecated
|
||||
void llama_kv_self_update(llama_context * ctx) {
|
||||
ctx->kv_self_update();
|
||||
ctx->kv_self_update(false);
|
||||
}
|
||||
|
||||
enum llama_pooling_type llama_pooling_type(const llama_context * ctx) {
|
||||
@@ -2538,13 +2572,8 @@ llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
|
||||
|
||||
// deprecated
|
||||
void llama_kv_self_defrag(llama_context * ctx) {
|
||||
auto * kv = ctx->get_kv_self();
|
||||
if (!kv) {
|
||||
return;
|
||||
}
|
||||
|
||||
// force defrag
|
||||
kv->defrag_sched(-1.0f);
|
||||
ctx->kv_self_defrag_sched();
|
||||
}
|
||||
|
||||
bool llama_kv_self_can_shift(const llama_context * ctx) {
|
||||
|
||||
@@ -52,7 +52,8 @@ struct llama_context {

    // return true if the KV cache was updated
    // TODO: remove
-   bool kv_self_update();
+   bool kv_self_update(bool optimize);
+   void kv_self_defrag_sched();

    enum llama_pooling_type pooling_type() const;

@@ -231,6 +232,9 @@ private:

    std::unique_ptr<llama_memory_i> memory;

+   // TODO: temporary, until the llama_kv_self_defrag() API is removed
+   bool memory_force_optimize = false;

    // decode output (2-dimensional array: [n_outputs][n_vocab])
    size_t logits_size = 0; // capacity (of floats) for logits
    float * logits = nullptr;
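The header change above splits the old kv_self_update() into an explicit optimize flag plus a separate defrag scheduler: the decode path calls the cheap variant up front and retries once with optimize = true when no memory slot can be found. A self-contained sketch of that retry pattern (hypothetical types and names, not llama.cpp API):

#include <cstdio>

struct fake_cache {
    bool fragmented = true;
    bool find_slot() const { return !fragmented; }   // fails while fragmented
    bool update(bool optimize) {                     // true if any work was done
        if (optimize && fragmented) { fragmented = false; return true; }
        return false;
    }
};

int main() {
    fake_cache kv;
    kv.update(false);                  // cheap pass: pending shifts only
    bool ok = kv.find_slot();
    if (!ok && kv.update(true)) {      // optimize (defrag) once, then retry
        ok = kv.find_slot();
    }
    std::printf("slot found: %s\n", ok ? "yes" : "no");
    return 0;
}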
@@ -769,9 +769,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||
cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
|
||||
|
||||
if (weight_before_ffn) {
|
||||
// TODO: this is a workaround as we don't yet have a repeat op that takes custom dim (ggml_repeat_4d)
|
||||
ggml_tensor * repeated = ggml_new_tensor_3d(ctx0, cur->type, n_embd, n_expert_used, n_tokens);
|
||||
repeated = ggml_repeat(ctx0, cur, repeated); // [n_embd, n_expert_used, n_tokens]
|
||||
// repeat cur to [n_embd, n_expert_used, n_tokens]
|
||||
ggml_tensor * repeated = ggml_repeat_4d(ctx0, cur, n_embd, n_expert_used, n_tokens, 1);
|
||||
cur = ggml_mul(ctx0, repeated, weights);
|
||||
cb(cur, "ffn_moe_weighted", il);
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "llama-kv-cache-recurrent.h"
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-io.h"
|
||||
#include "llama-batch.h"
|
||||
#include "llama-model.h"
|
||||
|
||||
@@ -386,6 +387,13 @@ llama_memory_state_ptr llama_kv_cache_recurrent::init_full() {
|
||||
return std::make_unique<llama_kv_cache_recurrent_state>(LLAMA_MEMORY_STATUS_SUCCESS, this);
|
||||
}
|
||||
|
||||
llama_memory_state_ptr llama_kv_cache_recurrent::init_update(llama_context * lctx, bool optimize) {
|
||||
GGML_UNUSED(lctx);
|
||||
GGML_UNUSED(optimize);
|
||||
|
||||
return std::make_unique<llama_kv_cache_recurrent_state>(LLAMA_MEMORY_STATUS_NO_UPDATE);
|
||||
}
|
||||
|
||||
bool llama_kv_cache_recurrent::prepare(const std::vector<llama_ubatch> & ubatches) {
|
||||
// simply remember the full state because it is very small for this type of cache
|
||||
// TODO: optimize
|
||||
@@ -419,17 +427,6 @@ bool llama_kv_cache_recurrent::prepare(const std::vector<llama_ubatch> & ubatche
|
||||
return success;
|
||||
}
|
||||
|
||||
bool llama_kv_cache_recurrent::update(llama_context & lctx) {
|
||||
GGML_UNUSED(lctx);
|
||||
// noop
|
||||
return false;
|
||||
}
|
||||
|
||||
void llama_kv_cache_recurrent::defrag_sched(float thold) {
|
||||
GGML_UNUSED(thold);
|
||||
// noop
|
||||
}
|
||||
|
||||
bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
|
||||
const uint32_t n_tokens = ubatch.n_tokens;
|
||||
const uint32_t n_seqs = ubatch.n_seqs;
|
||||
|
||||
@@ -52,9 +52,7 @@ public:
|
||||
|
||||
llama_memory_state_ptr init_full() override;
|
||||
|
||||
bool update(llama_context & lctx) override;
|
||||
|
||||
void defrag_sched(float thold) override;
|
||||
llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override;
|
||||
|
||||
bool prepare(const std::vector<llama_ubatch> & ubatches);
|
||||
|
||||
|
||||
@@ -123,26 +123,16 @@ llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch
|
||||
|
||||
assert(heads_base.size() == heads_swa.size());
|
||||
|
||||
return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_SUCCESS,
|
||||
return std::make_unique<llama_kv_cache_unified_iswa_state>(
|
||||
this, std::move(sbatch), std::move(heads_base), std::move(heads_swa), std::move(ubatches));
|
||||
}
|
||||
|
||||
llama_memory_state_ptr llama_kv_cache_unified_iswa::init_full() {
|
||||
return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_SUCCESS, this);
|
||||
return std::make_unique<llama_kv_cache_unified_iswa_state>(this);
|
||||
}
|
||||
|
||||
bool llama_kv_cache_unified_iswa::update(llama_context & lctx) {
|
||||
bool res = false;
|
||||
|
||||
res = res | kv_base->update(lctx);
|
||||
res = res | kv_swa ->update(lctx);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified_iswa::defrag_sched(float thold) {
|
||||
kv_base->defrag_sched(thold);
|
||||
kv_swa ->defrag_sched(thold);
|
||||
llama_memory_state_ptr llama_kv_cache_unified_iswa::init_update(llama_context * lctx, bool optimize) {
|
||||
return std::make_unique<llama_kv_cache_unified_iswa_state>(this, lctx, optimize);
|
||||
}
|
||||
|
||||
bool llama_kv_cache_unified_iswa::get_can_shift() const {
|
||||
@@ -174,26 +164,38 @@ llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_swa() const {
|
||||
llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(llama_memory_status status) : status(status) {}
|
||||
|
||||
llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
|
||||
llama_memory_status status,
|
||||
llama_kv_cache_unified_iswa * kv) : status(status) {
|
||||
state_base.reset(new llama_kv_cache_unified_state(status, kv->get_base()));
|
||||
state_swa .reset(new llama_kv_cache_unified_state(status, kv->get_swa ()));
|
||||
llama_kv_cache_unified_iswa * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS) {
|
||||
state_base = kv->get_base()->init_full();
|
||||
state_swa = kv->get_swa ()->init_full();
|
||||
|
||||
status = llama_memory_status_combine(state_base->get_status(), state_swa->get_status());
|
||||
}
|
||||
|
||||
llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
|
||||
llama_kv_cache_unified_iswa * kv,
|
||||
llama_context * lctx,
|
||||
bool optimize) : status(LLAMA_MEMORY_STATUS_SUCCESS) {
|
||||
state_base = kv->get_base()->init_update(lctx, optimize);
|
||||
state_swa = kv->get_swa ()->init_update(lctx, optimize);
|
||||
|
||||
status = llama_memory_status_combine(state_base->get_status(), state_swa->get_status());
|
||||
}
|
||||
|
||||
llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
|
||||
llama_memory_status status,
|
||||
llama_kv_cache_unified_iswa * kv,
|
||||
llama_sbatch sbatch,
|
||||
std::vector<uint32_t> heads_base,
|
||||
std::vector<uint32_t> heads_swa,
|
||||
std::vector<llama_ubatch> ubatches)
|
||||
: status(status),
|
||||
sbatch(std::move(sbatch)),
|
||||
ubatches(std::move(ubatches)) {
|
||||
// note: here we copy the ubatches. not sure if this is ideal
|
||||
state_base.reset(new llama_kv_cache_unified_state(status, kv->get_base(), {}, std::move(heads_base), this->ubatches));
|
||||
state_swa .reset(new llama_kv_cache_unified_state(status, kv->get_swa (), {}, std::move(heads_swa), this->ubatches));
|
||||
}
|
||||
: status(LLAMA_MEMORY_STATUS_SUCCESS),
|
||||
sbatch(std::move(sbatch)),
|
||||
ubatches(std::move(ubatches)) {
|
||||
// note: here we copy the ubatches. not sure if this is ideal
|
||||
state_base.reset(new llama_kv_cache_unified_state(kv->get_base(), {}, std::move(heads_base), this->ubatches));
|
||||
state_swa .reset(new llama_kv_cache_unified_state(kv->get_swa (), {}, std::move(heads_swa), this->ubatches));
|
||||
|
||||
status = llama_memory_status_combine(state_base->get_status(), state_swa->get_status());
|
||||
}
|
||||
|
||||
llama_kv_cache_unified_iswa_state:: ~llama_kv_cache_unified_iswa_state() = default;
|
||||
|
||||
@@ -233,17 +235,18 @@ llama_memory_status llama_kv_cache_unified_iswa_state::get_status() const {
|
||||
|
||||
const llama_ubatch & llama_kv_cache_unified_iswa_state::get_ubatch() const {
|
||||
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
|
||||
|
||||
return ubatches[i_next];
|
||||
}
|
||||
|
||||
const llama_kv_cache_unified_state * llama_kv_cache_unified_iswa_state::get_base() const {
|
||||
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
|
||||
|
||||
return state_base.get();
|
||||
return static_cast<const llama_kv_cache_unified_state *>(state_base.get());
|
||||
}
|
||||
|
||||
const llama_kv_cache_unified_state * llama_kv_cache_unified_iswa_state::get_swa() const {
|
||||
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
|
||||
|
||||
return state_swa.get();
|
||||
return static_cast<const llama_kv_cache_unified_state *>(state_swa.get());
|
||||
}
|
||||
|
||||
@@ -54,9 +54,7 @@ public:
|
||||
|
||||
llama_memory_state_ptr init_full() override;
|
||||
|
||||
bool update(llama_context & lctx) override;
|
||||
|
||||
void defrag_sched(float thold) override;
|
||||
llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override;
|
||||
|
||||
bool get_can_shift() const override;
|
||||
|
||||
@@ -86,12 +84,16 @@ public:
|
||||
|
||||
// used to create a full-cache state
|
||||
llama_kv_cache_unified_iswa_state(
|
||||
llama_memory_status status,
|
||||
llama_kv_cache_unified_iswa * kv);
|
||||
|
||||
// used to create an update state
|
||||
llama_kv_cache_unified_iswa_state(
|
||||
llama_kv_cache_unified_iswa * kv,
|
||||
llama_context * lctx,
|
||||
bool optimize);
|
||||
|
||||
// used to create a state from a batch
|
||||
llama_kv_cache_unified_iswa_state(
|
||||
llama_memory_status status,
|
||||
llama_kv_cache_unified_iswa * kv,
|
||||
llama_sbatch sbatch,
|
||||
std::vector<uint32_t> heads_base,
|
||||
@@ -120,7 +122,7 @@ public:
|
||||
const llama_kv_cache_unified_state * get_swa() const;
|
||||
|
||||
private:
|
||||
const llama_memory_status status;
|
||||
llama_memory_status status;
|
||||
|
||||
//llama_kv_cache_unified_iswa * kv;
|
||||
|
||||
@@ -131,6 +133,6 @@ private:
|
||||
|
||||
std::vector<llama_ubatch> ubatches;
|
||||
|
||||
std::unique_ptr<llama_kv_cache_unified_state> state_base;
|
||||
std::unique_ptr<llama_kv_cache_unified_state> state_swa;
|
||||
llama_memory_state_ptr state_base;
|
||||
llama_memory_state_ptr state_swa;
|
||||
};
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "llama-kv-cache-unified.h"
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-io.h"
|
||||
#include "llama-model.h"
|
||||
#include "llama-context.h"
|
||||
|
||||
@@ -149,12 +150,27 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
|
||||
p1 = std::numeric_limits<llama_pos>::max();
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
if (!cells.pos_in(i, p0, p1)) {
|
||||
continue;
|
||||
}
|
||||
if (seq_id >= 0) {
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
if (!cells.pos_in(i, p0, p1)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (cells.seq_has(i, seq_id) && cells.seq_rm(i, seq_id)) {
|
||||
if (new_head == cells.size()) {
|
||||
new_head = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// match any sequence
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
if (!cells.pos_in(i, p0, p1)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
cells.rm(i);
|
||||
|
||||
if (cells.seq_has(i, seq_id) && cells.seq_rm(i, seq_id)) {
|
||||
if (new_head == cells.size()) {
|
||||
new_head = i;
|
||||
}
|
||||
@@ -305,16 +321,49 @@ llama_memory_state_ptr llama_kv_cache_unified::init_batch(
|
||||
return std::make_unique<llama_kv_cache_unified_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
|
||||
}
|
||||
|
||||
return std::make_unique<llama_kv_cache_unified_state>(LLAMA_MEMORY_STATUS_SUCCESS,
|
||||
return std::make_unique<llama_kv_cache_unified_state>(
|
||||
this, std::move(sbatch), std::move(heads), std::move(ubatches));
|
||||
}
|
||||
|
||||
llama_memory_state_ptr llama_kv_cache_unified::init_full() {
|
||||
return std::make_unique<llama_kv_cache_unified_state>(LLAMA_MEMORY_STATUS_SUCCESS, this);
|
||||
return std::make_unique<llama_kv_cache_unified_state>(this);
|
||||
}
|
||||
|
||||
std::vector<uint32_t> llama_kv_cache_unified::prepare(const std::vector<llama_ubatch> & ubatches) {
|
||||
std::vector<uint32_t> res;
|
||||
llama_memory_state_ptr llama_kv_cache_unified::init_update(llama_context * lctx, bool optimize) {
|
||||
bool do_shift = get_has_shift();
|
||||
|
||||
defrag_info dinfo;
|
||||
|
||||
// see if we need to defrag
|
||||
{
|
||||
bool do_defrag = optimize;
|
||||
|
||||
const auto thold = lctx->get_cparams().defrag_thold;
|
||||
|
||||
if (!do_defrag && thold > 0.0f) {
|
||||
const auto n_kv = cells.used_max_p1();
|
||||
|
||||
// - do not defrag small contexts (i.e. < 2048 tokens)
|
||||
// - count the padding towards the number of used tokens
|
||||
const float fragmentation = n_kv >= 2048 ? std::max(0.0f, 1.0f - (float(cells.get_used() + n_pad)/n_kv)) : 0.0f;
|
||||
|
||||
if (fragmentation > thold) {
|
||||
LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
|
||||
|
||||
do_defrag = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (do_defrag) {
|
||||
dinfo = defrag_prepare(lctx->graph_max_nodes());
|
||||
}
|
||||
}
|
||||
|
||||
return std::make_unique<llama_kv_cache_unified_state>(this, lctx, do_shift, std::move(dinfo));
|
||||
}
|
||||
|
||||
llama_kv_cache_unified::ubatch_heads llama_kv_cache_unified::prepare(const std::vector<llama_ubatch> & ubatches) {
|
||||
llama_kv_cache_unified::ubatch_heads res;
|
||||
|
||||
struct state {
|
||||
uint32_t head_old; // old position of the head, before placing the ubatch
|
||||
@@ -359,12 +408,12 @@ std::vector<uint32_t> llama_kv_cache_unified::prepare(const std::vector<llama_ub
|
||||
return res;
|
||||
}
|
||||
|
||||
bool llama_kv_cache_unified::update(llama_context & lctx) {
|
||||
bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const defrag_info & dinfo) {
|
||||
bool updated = false;
|
||||
|
||||
auto * sched = lctx.get_sched();
|
||||
auto * sched = lctx->get_sched();
|
||||
|
||||
if (cells.get_has_shift()) {
|
||||
if (do_shift) {
|
||||
if (!get_can_shift()) {
|
||||
GGML_ABORT("The current KV cache / model configuration does not support K-shift");
|
||||
}
|
||||
@@ -375,9 +424,9 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
|
||||
if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
auto * gf = lctx.graph_init();
|
||||
auto * gf = lctx->graph_init();
|
||||
|
||||
auto res = build_graph_shift(lctx.get_cparams(), lctx.get_ctx_compute(), gf);
|
||||
auto res = build_graph_shift(lctx->get_cparams(), lctx->get_ctx_compute(), gf);
|
||||
if (!res) {
|
||||
LLAMA_LOG_ERROR("%s: failed to build graph for K-shift\n", __func__);
|
||||
return updated;
|
||||
@@ -390,7 +439,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
|
||||
|
||||
res->set_inputs(nullptr);
|
||||
|
||||
if (lctx.graph_compute(gf, false) != GGML_STATUS_SUCCESS) {
|
||||
if (lctx->graph_compute(gf, false) != GGML_STATUS_SUCCESS) {
|
||||
LLAMA_LOG_ERROR("%s: failed to compute K-shift\n", __func__);
|
||||
return updated;
|
||||
}
|
||||
@@ -401,56 +450,55 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
|
||||
cells.reset_shift();
|
||||
}
|
||||
|
||||
if (do_defrag) {
|
||||
if (!dinfo.empty()) {
|
||||
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
|
||||
|
||||
if (defrag_prepare(lctx.graph_max_nodes())) {
|
||||
ggml_backend_sched_reset(sched);
|
||||
// apply moves:
|
||||
{
|
||||
const auto n_kv = dinfo.ids.size();
|
||||
|
||||
auto * gf = lctx.graph_init();
|
||||
for (uint32_t i = 0; i < n_kv; ++i) {
|
||||
assert(dinfo.ids[i] <= n_kv);
|
||||
|
||||
auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf);
|
||||
if (!res) {
|
||||
LLAMA_LOG_ERROR("%s: failed to build graph for defrag\n", __func__);
|
||||
return updated;
|
||||
if (dinfo.ids[i] == n_kv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
cells.mv(i, dinfo.ids[i]);
|
||||
}
|
||||
|
||||
if (!ggml_backend_sched_alloc_graph(sched, gf)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__);
|
||||
return updated;
|
||||
}
|
||||
|
||||
res->set_inputs(nullptr);
|
||||
|
||||
if (lctx.graph_compute(gf, false) != GGML_STATUS_SUCCESS) {
|
||||
LLAMA_LOG_ERROR("%s: failed to compute defrag\n", __func__);
|
||||
return updated;
|
||||
}
|
||||
|
||||
updated = true;
|
||||
// reset the head so we can find the first free slot during the next ubatch
|
||||
head = 0;
|
||||
}
|
||||
|
||||
do_defrag = false;
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
auto * gf = lctx->graph_init();
|
||||
|
||||
auto res = build_graph_defrag(lctx->get_cparams(), lctx->get_ctx_compute(), gf, dinfo);
|
||||
if (!res) {
|
||||
LLAMA_LOG_ERROR("%s: failed to build graph for defrag\n", __func__);
|
||||
return updated;
|
||||
}
|
||||
|
||||
if (!ggml_backend_sched_alloc_graph(sched, gf)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__);
|
||||
return updated;
|
||||
}
|
||||
|
||||
res->set_inputs(nullptr);
|
||||
|
||||
if (lctx->graph_compute(gf, false) != GGML_STATUS_SUCCESS) {
|
||||
LLAMA_LOG_ERROR("%s: failed to compute defrag\n", __func__);
|
||||
return updated;
|
||||
}
|
||||
|
||||
updated = true;
|
||||
}
|
||||
|
||||
return updated;
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::defrag_sched(float thold) {
|
||||
const auto n_kv = cells.used_max_p1();
|
||||
|
||||
// - do not defrag small contexts (i.e. < 2048 tokens)
|
||||
// - count the padding towards the number of used tokens
|
||||
const float fragmentation = n_kv >= 2048 ? std::max(0.0f, 1.0f - (float(cells.get_used() + n_pad)/n_kv)) : 0.0f;
|
||||
|
||||
// queue defragmentation for next llama_kv_cache_update
|
||||
if (fragmentation > thold) {
|
||||
LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
|
||||
|
||||
do_defrag = true;
|
||||
}
|
||||
}
|
||||
|
||||
int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
|
||||
const uint32_t n_tokens = ubatch.n_tokens;
|
||||
|
||||
@@ -597,6 +645,10 @@ uint32_t llama_kv_cache_unified::get_size() const {
|
||||
return cells.size();
|
||||
}
|
||||
|
||||
bool llama_kv_cache_unified::get_has_shift() const {
|
||||
return cells.get_has_shift();
|
||||
}
|
||||
|
||||
uint32_t llama_kv_cache_unified::get_n_kv() const {
|
||||
return std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad)));
|
||||
}
|
||||
@@ -926,12 +978,13 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
|
||||
}
|
||||
|
||||
llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
|
||||
const llama_cparams & cparams,
|
||||
ggml_context * ctx,
|
||||
ggml_cgraph * gf) const {
|
||||
const llama_cparams & cparams,
|
||||
ggml_context * ctx,
|
||||
ggml_cgraph * gf,
|
||||
const defrag_info & dinfo) const {
|
||||
auto res = std::make_unique<llm_graph_result>();
|
||||
|
||||
const auto & ids = defrag_info.ids;
|
||||
const auto & ids = dinfo.ids;
|
||||
|
||||
#if 0
|
||||
// CPU defrag
|
||||
@@ -1072,7 +1125,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
|
||||
return res;
|
||||
}
|
||||
|
||||
bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
llama_kv_cache_unified::defrag_info llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) const {
|
||||
const uint32_t n_layer = layers.size();
|
||||
|
||||
const uint32_t n_kv = cells.used_max_p1();
|
||||
@@ -1093,14 +1146,9 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
|
||||
|
||||
// determine which KV cells to move where
|
||||
//
|
||||
// cell i moves to ids[i]
|
||||
//
|
||||
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved
|
||||
//
|
||||
auto & ids = defrag_info.ids;
|
||||
defrag_info res;
|
||||
auto & ids = res.ids;
|
||||
|
||||
ids.clear();
|
||||
ids.resize(n_kv, n_kv);
|
||||
|
||||
for (uint32_t i0 = 0; i0 < n_used; ++i0) {
|
||||
@@ -1164,11 +1212,6 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
// this cell goes to (i0 + nf)
|
||||
ids[i1] = i0 + nf;
|
||||
|
||||
// move the cell meta data
|
||||
cells.mv(i1, i0 + nf);
|
||||
|
||||
head = n_used;
|
||||
|
||||
if (!cont) {
|
||||
n_moves++;
|
||||
cont = true;
|
||||
@@ -1191,14 +1234,14 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
|
||||
}
|
||||
|
||||
if (n_moves == 0) {
|
||||
return false;
|
||||
return {};
|
||||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves);
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer);
|
||||
|
||||
return true;
|
||||
return res;
|
||||
}
|
||||
|
||||
bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
|
||||
@@ -1621,24 +1664,27 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
|
||||
llama_kv_cache_unified_state::llama_kv_cache_unified_state(llama_memory_status status) : status(status) {}
|
||||
|
||||
llama_kv_cache_unified_state::llama_kv_cache_unified_state(
|
||||
llama_memory_status status,
|
||||
llama_kv_cache_unified * kv) : status(status), kv(kv) {
|
||||
n_kv = kv->get_size();
|
||||
head = 0;
|
||||
}
|
||||
llama_kv_cache_unified * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) {
|
||||
n_kv = kv->get_size();
|
||||
head = 0;
|
||||
}
|
||||
|
||||
llama_kv_cache_unified_state::llama_kv_cache_unified_state(
|
||||
llama_memory_status status,
|
||||
llama_kv_cache_unified * kv,
|
||||
llama_sbatch sbatch,
|
||||
std::vector<uint32_t> heads,
|
||||
std::vector<llama_ubatch> ubatches)
|
||||
: status(status),
|
||||
kv(kv),
|
||||
sbatch(std::move(sbatch)),
|
||||
heads(std::move(heads)),
|
||||
ubatches(std::move(ubatches)) {
|
||||
llama_kv_cache_unified * kv,
|
||||
llama_context * lctx,
|
||||
bool do_shift,
|
||||
defrag_info dinfo) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), dinfo(std::move(dinfo)) {
|
||||
if (!do_shift && dinfo.empty()) {
|
||||
status = LLAMA_MEMORY_STATUS_NO_UPDATE;
|
||||
}
|
||||
}
|
||||
|
||||
llama_kv_cache_unified_state::llama_kv_cache_unified_state(
|
||||
llama_kv_cache_unified * kv,
|
||||
llama_sbatch sbatch,
|
||||
llama_kv_cache_unified::ubatch_heads heads,
|
||||
std::vector<llama_ubatch> ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), sbatch(std::move(sbatch)), heads(std::move(heads)), ubatches(std::move(ubatches)) {
|
||||
}
|
||||
|
||||
llama_kv_cache_unified_state::~llama_kv_cache_unified_state() = default;
|
||||
|
||||
@@ -1655,6 +1701,13 @@ bool llama_kv_cache_unified_state::next() {
|
||||
bool llama_kv_cache_unified_state::apply() {
|
||||
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
|
||||
|
||||
// no ubatches -> this is a KV cache update
|
||||
if (ubatches.empty()) {
|
||||
kv->update(lctx, do_shift, dinfo);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
kv->apply_ubatch(heads[i_next], ubatches[i_next]);
|
||||
|
||||
n_kv = kv->get_n_kv();
|
||||
|
||||
@@ -24,6 +24,19 @@ public:
    // this callback is used to filter out layers that should not be included in the cache
    using layer_filter_cb = std::function<bool(int32_t il)>;

    using ubatch_heads = std::vector<uint32_t>;

    struct defrag_info {
        bool empty() const {
            return ids.empty();
        }

        // contains information about which cell moves where:
        //  - cell i moves to ids[i]
        //  - if ids[i] == i || ids[i] == ids.size(), then cell i is not moved
        std::vector<uint32_t> ids;
    };

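    // Illustrative example (not part of the patch): with 4 cells where only
    // cells 0 and 2 hold data, defrag_prepare() could return ids = { 0, 4, 1, 4 }:
    // cell 0 stays in place (ids[0] == 0), cell 2 is moved into slot 1, and
    // cells 1 and 3 are left untouched (ids[i] == ids.size()).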
llama_kv_cache_unified(
|
||||
const llama_model & model,
|
||||
layer_filter_cb && filter,
|
||||
@@ -66,9 +79,7 @@ public:
|
||||
|
||||
llama_memory_state_ptr init_full() override;
|
||||
|
||||
bool update(llama_context & lctx) override;
|
||||
|
||||
void defrag_sched(float thold) override;
|
||||
llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override;
|
||||
|
||||
bool get_can_shift() const override;
|
||||
|
||||
@@ -83,6 +94,8 @@ public:
|
||||
|
||||
uint32_t get_size() const;
|
||||
|
||||
bool get_has_shift() const;
|
||||
|
||||
//
|
||||
// graph_build API
|
||||
//
|
||||
@@ -103,7 +116,9 @@ public:
|
||||
|
||||
// find places for the provided ubatches in the cache, returns the head locations
|
||||
// return empty vector on failure
|
||||
std::vector<uint32_t> prepare(const std::vector<llama_ubatch> & ubatches);
|
||||
ubatch_heads prepare(const std::vector<llama_ubatch> & ubatches);
|
||||
|
||||
bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo);
|
||||
|
||||
// return the cell position where we can insert the ubatch
|
||||
// return -1 on failure to find a contiguous slot of kv cells
|
||||
@@ -133,8 +148,7 @@ private:
|
||||
ggml_tensor * v;
|
||||
};
|
||||
|
||||
bool do_defrag = false;
|
||||
bool v_trans = true; // the value tensor is transposed
|
||||
bool v_trans = true; // the value tensor is transposed
|
||||
|
||||
// the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
|
||||
// note: this is not part of the KV state and it's only used to speed-up the find_slot() method
|
||||
@@ -160,13 +174,8 @@ private:
|
||||
// model layer id -> KV cache layer id
|
||||
std::unordered_map<int32_t, int32_t> map_layer_ids;
|
||||
|
||||
// defrag
|
||||
struct {
|
||||
std::vector<uint32_t> ids;
|
||||
} defrag_info;
|
||||
|
||||
// return true if cells have been moved
|
||||
bool defrag_prepare(int32_t n_max_nodes);
|
||||
// return non-empty vector if cells have been moved
|
||||
defrag_info defrag_prepare(int32_t n_max_nodes) const;
|
||||
|
||||
size_t total_size() const;
|
||||
|
||||
@@ -192,7 +201,8 @@ private:
|
||||
llm_graph_result_ptr build_graph_defrag(
|
||||
const llama_cparams & cparams,
|
||||
ggml_context * ctx,
|
||||
ggml_cgraph * gf) const;
|
||||
ggml_cgraph * gf,
|
||||
const defrag_info & dinfo) const;
|
||||
|
||||
void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
|
||||
void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
|
||||
@@ -203,20 +213,29 @@ private:
|
||||
|
||||
class llama_kv_cache_unified_state : public llama_memory_state_i {
|
||||
public:
|
||||
// some shorthands
|
||||
using ubatch_heads = llama_kv_cache_unified::ubatch_heads;
|
||||
using defrag_info = llama_kv_cache_unified::defrag_info;
|
||||
|
||||
// used for errors
|
||||
llama_kv_cache_unified_state(llama_memory_status status);
|
||||
|
||||
// used to create a full-cache state
|
||||
llama_kv_cache_unified_state(
|
||||
llama_memory_status status,
|
||||
llama_kv_cache_unified * kv);
|
||||
|
||||
// used to create a state from a batch
|
||||
// used to create an update state
|
||||
llama_kv_cache_unified_state(
|
||||
llama_kv_cache_unified * kv,
|
||||
llama_context * lctx,
|
||||
bool do_shift,
|
||||
defrag_info dinfo);
|
||||
|
||||
// used to create a decode state from a batch
|
||||
llama_kv_cache_unified_state(
|
||||
llama_memory_status status,
|
||||
llama_kv_cache_unified * kv,
|
||||
llama_sbatch sbatch,
|
||||
std::vector<uint32_t> heads,
|
||||
ubatch_heads heads,
|
||||
std::vector<llama_ubatch> ubatches);
|
||||
|
||||
virtual ~llama_kv_cache_unified_state();
|
||||
@@ -253,16 +272,30 @@ public:
|
||||
void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
|
||||
|
||||
private:
|
||||
const llama_memory_status status;
|
||||
llama_memory_status status;
|
||||
|
||||
llama_kv_cache_unified * kv;
|
||||
llama_context * lctx;
|
||||
|
||||
//
|
||||
// update state
|
||||
//
|
||||
|
||||
bool do_shift = false;
|
||||
|
||||
defrag_info dinfo;
|
||||
|
||||
//
|
||||
// batch processing state
|
||||
//
|
||||
|
||||
llama_sbatch sbatch;
|
||||
|
||||
// the index of the next ubatch to process
|
||||
size_t i_next = 0;
|
||||
|
||||
std::vector<uint32_t> heads;
|
||||
ubatch_heads heads;
|
||||
|
||||
std::vector<llama_ubatch> ubatches;
|
||||
|
||||
//
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
#include "llama-io.h"
|
||||
#include "llama-memory.h"
|
||||
|
||||
class llama_io_write_i;
|
||||
class llama_io_read_i;
|
||||
|
||||
struct llama_kv_cache : public llama_memory_i {
|
||||
virtual ~llama_kv_cache() = default;
|
||||
|
||||
// TODO: move the init_ interfaces to llama_memory_i
|
||||
|
||||
// split the input batch into a set of ubatches and verify that they can fit into the cache
|
||||
// return a state object containing the ubatches and KV cache state required to process them
|
||||
// check the llama_memory_state_i::get_status() for the result
|
||||
@@ -19,16 +23,9 @@ struct llama_kv_cache : public llama_memory_i {
    // simulate full cache, used for allocating worst-case compute buffers
    virtual llama_memory_state_ptr init_full() = 0;

    // process any pending defrag/shift/etc. operations
    // optionally call once before processing a new batch
    // return true if any operations were performed
    virtual bool update(llama_context & lctx) = 0;

    // schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing
    // TODO: change to
    // llama_memory_state_ptr init_defrag(float thold) = 0;
    //
    virtual void defrag_sched(float thold) = 0;
    // prepare for any pending memory updates, such as shifts, defrags, etc.
    // status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update
    virtual llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) = 0;

    // getters
    virtual bool get_can_shift() const = 0;
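The old `update()` / `defrag_sched()` pair is folded into a single `init_update()` that returns a memory-state object; the caller inspects `get_status()` to learn whether any shift/defrag work is pending. A minimal caller-side sketch, using only names visible in this hunk (the helper itself is hypothetical):

```cpp
// Hypothetical helper, not part of this change: ask the cache to prepare any
// pending shifts/defrags before decoding a batch and react to the status.
static bool kv_prepare_update(llama_kv_cache & kv, llama_context * lctx, bool optimize) {
    llama_memory_state_ptr mstate = kv.init_update(lctx, optimize);

    switch (mstate->get_status()) {
        case LLAMA_MEMORY_STATUS_NO_UPDATE:
            return true;  // nothing pending - proceed with the batch as-is
        case LLAMA_MEMORY_STATUS_SUCCESS:
            return true;  // the returned state carries the shift/defrag work to run
        case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
        case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
            return false; // surface the error to the caller
    }

    return false;
}
```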
@@ -1 +1,42 @@
#include "llama-memory.h"

llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1) {
    bool has_update = false;

    switch (s0) {
        case LLAMA_MEMORY_STATUS_SUCCESS:
            {
                has_update = true;
                break;
            }
        case LLAMA_MEMORY_STATUS_NO_UPDATE:
            {
                break;
            }
        case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
        case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
            {
                return s0;
            }
    }

    switch (s1) {
        case LLAMA_MEMORY_STATUS_SUCCESS:
            {
                has_update = true;
                break;
            }
        case LLAMA_MEMORY_STATUS_NO_UPDATE:
            {
                break;
            }
        case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
        case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
            {
                return s1;
            }
    }

    // if either status has an update, then the combined status has an update
    return has_update ? LLAMA_MEMORY_STATUS_SUCCESS : LLAMA_MEMORY_STATUS_NO_UPDATE;
}
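Failures take precedence in the combined status; otherwise the result is SUCCESS if either input reports pending work, and NO_UPDATE only when both sides are idle. A hedged sketch of how a hybrid memory type (e.g. an iSWA-style cache with a base part and an SWA part) might use it; the `state_base` / `state_swa` names are hypothetical:

```cpp
// Hedged sketch: only llama_memory_status_combine() and the status values
// come from this change; the two sub-state names are illustrative.
static llama_memory_status combine_example(const llama_memory_state_i & state_base,
                                           const llama_memory_state_i & state_swa) {
    const llama_memory_status s0 = state_base.get_status();
    const llama_memory_status s1 = state_swa.get_status();

    // any FAILED_* wins; otherwise SUCCESS if either side has pending work, else NO_UPDATE
    return llama_memory_status_combine(s0, s1);
}
```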
@@ -36,12 +36,19 @@ public:
    virtual bool get_can_edit() const = 0;
};

using llama_memory_ptr = std::unique_ptr<llama_memory_i>;

enum llama_memory_status {
    LLAMA_MEMORY_STATUS_SUCCESS = 0,
    LLAMA_MEMORY_STATUS_NO_UPDATE,
    LLAMA_MEMORY_STATUS_FAILED_PREPARE,
    LLAMA_MEMORY_STATUS_FAILED_COMPUTE,
};

// helper function for combining the status of two memory states
// useful for implementing hybrid memory types (e.g. iSWA)
llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1);

// the interface for managing the memory state during batch processing
// this interface is implemented per memory type. see:
// - llama_kv_cache_unified_state
@@ -69,7 +76,7 @@ public:
    // get the current ubatch
    virtual const llama_ubatch & get_ubatch() const = 0;

    // get the status of the memory state
    // get the status of the memory state - used for error handling and checking if any updates would be applied
    virtual llama_memory_status get_status() const = 0;
};
@@ -2706,8 +2706,8 @@ struct test_conv_transpose_1d : public test_case {
        return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0);
    }

    test_conv_transpose_1d(std::array<int64_t, 4> ne_input = {197, 32, 1, 1}, // [input_width, input_height, input_channels, 1]
                           std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, kernel_height, input_channels, 1]
    test_conv_transpose_1d(std::array<int64_t, 4> ne_input = {197, 32, 1, 1}, // [input_width, input_channels, 1 /* assert in cpu kernel*/, 1 (should be batch)]
                           std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, output_channels, input_channels, 1 (should be batch)]
                           int s0 = 1, int p0 = 0, int d0 = 1)
        : ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {}
@@ -4029,6 +4029,18 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, false));
    test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, true));

    for(uint32_t Cout : {1, 9}){
        for(uint32_t Cin : {1, 7}){
            for(uint32_t K : {1, 3, 1337}){
                for(uint32_t L : {1, 2, 13}){
                    for(uint32_t s0: {1, 2, 3}){
                        test_cases.emplace_back(new test_conv_transpose_1d({L,Cin,1,1}, {K,Cout,Cin,1}, s0, 0, 1));
                    }
                }
            }
        }
    }

    test_cases.emplace_back(new test_conv_transpose_1d());
    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
    test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));
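The revised comments pin down the tensor layout: the input is `[width, channels, 1, 1]` and the kernel is `[kernel_width, output_channels, input_channels, 1]`. A hedged standalone sketch of how one shape combination from the new loop maps onto a `ggml_conv_transpose_1d` call (the output-width formula assumes `p0 = 0` and `d0 = 1`, as in the tests):

```cpp
// Hedged sketch of the shape convention exercised by the new test loop above.
#include "ggml.h"

static ggml_tensor * conv_transpose_1d_example(ggml_context * ctx) {
    const int L = 13, Cin = 7, Cout = 9, K = 3, s0 = 2;

    // input:  [input_width, input_channels, 1, 1]
    ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, L, Cin, 1, 1);
    // kernel: [kernel_width, output_channels, input_channels, 1]
    ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, K, Cout, Cin, 1);

    // expected output width: (L - 1)*s0 + K  ->  result is [output_width, output_channels, 1, 1]
    return ggml_conv_transpose_1d(ctx, kernel, input, s0, /*p0=*/0, /*d0=*/1);
}
```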