mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-05 13:53:23 +02:00
Compare commits
35 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4762ad7316 | ||
|
|
1dc9614e06 | ||
|
|
446595b9b3 | ||
|
|
66906cd82a | ||
|
|
11dd5a44eb | ||
|
|
9b8f3c6c77 | ||
|
|
c7f3169cd5 | ||
|
|
793c0d7f46 | ||
|
|
ce111d39d6 | ||
|
|
e7fecba934 | ||
|
|
e2b7621e7c | ||
|
|
c1dbea752a | ||
|
|
749e0d27f0 | ||
|
|
64bf1c3744 | ||
|
|
c12bbde372 | ||
|
|
3f4fc97f1d | ||
|
|
2df255da3c | ||
|
|
60f816a79d | ||
|
|
5592f278b6 | ||
|
|
e4868d16d2 | ||
|
|
820de57d4f | ||
|
|
cb4a63aad6 | ||
|
|
86f5623d90 | ||
|
|
39cffdf188 | ||
|
|
065908cb09 | ||
|
|
4ec6291a24 | ||
|
|
a12363bbf0 | ||
|
|
a86f52b285 | ||
|
|
b284197df4 | ||
|
|
221c0e0c58 | ||
|
|
07a19e27a2 | ||
|
|
18f3b5ff9e | ||
|
|
7233358d29 | ||
|
|
6c88b3bb25 | ||
|
|
14c28dfc50 |
@@ -1,10 +1,10 @@
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG MUSA_VERSION=rc4.0.1
|
||||
ARG MUSA_VERSION=rc4.2.0
|
||||
# Target the MUSA build image
|
||||
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
|
||||
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
|
||||
|
||||
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
|
||||
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
|
||||
|
||||
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG ROCM_VERSION=6.3
|
||||
ARG AMDGPU_VERSION=6.3
|
||||
ARG ROCM_VERSION=6.4
|
||||
ARG AMDGPU_VERSION=6.4
|
||||
|
||||
# Target the CUDA build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
||||
2
.github/workflows/build.yml
vendored
2
.github/workflows/build.yml
vendored
@@ -515,7 +515,7 @@ jobs:
|
||||
|
||||
ubuntu-22-cmake-musa:
|
||||
runs-on: ubuntu-22.04
|
||||
container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
|
||||
container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
||||
2
.github/workflows/close-issue.yml
vendored
2
.github/workflows/close-issue.yml
vendored
@@ -17,7 +17,7 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/stale@v5
|
||||
with:
|
||||
exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
|
||||
exempt-issue-labels: "refactoring,help wanted,good first issue,research,bug,roadmap"
|
||||
days-before-issue-stale: 30
|
||||
days-before-issue-close: 14
|
||||
stale-issue-label: "stale"
|
||||
|
||||
@@ -54,7 +54,7 @@ docker run --privileged -it \
|
||||
-v $HOME/llama.cpp/ci-cache:/ci-cache \
|
||||
-v $HOME/llama.cpp/ci-results:/ci-results \
|
||||
-v $PWD:/ws -w /ws \
|
||||
mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
|
||||
mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
|
||||
```
|
||||
|
||||
Inside the container, execute the following commands:
|
||||
|
||||
@@ -3791,7 +3791,7 @@ class Plamo2Model(TextModel):
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
|
||||
self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
|
||||
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1000000.0))
|
||||
self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))
|
||||
|
||||
# Mamba parameters
|
||||
self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
|
||||
@@ -3802,7 +3802,7 @@ class Plamo2Model(TextModel):
|
||||
self.gguf_writer.add_ssm_group_count(0)
|
||||
|
||||
# MLP feed forward parameters (for attention layers)
|
||||
self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 16384))
|
||||
self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 13312))
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
@@ -6486,7 +6486,7 @@ class JaisModel(TextModel):
|
||||
self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
|
||||
|
||||
|
||||
@ModelBase.register("Glm4ForCausalLM")
|
||||
@ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration")
|
||||
class Glm4Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.GLM4
|
||||
|
||||
@@ -6508,7 +6508,8 @@ class Glm4Model(TextModel):
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
rope_dim = self.hparams["head_dim"]
|
||||
if (rope_dim := self.hparams.get("head_dim")) is None:
|
||||
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
@@ -6516,6 +6517,13 @@ class Glm4Model(TextModel):
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name.startswith("model.visual."): # ignore visual part of Glm4v
|
||||
return []
|
||||
elif name.startswith("model.language_model."):
|
||||
name = name.replace("language_model.", "") # for Glm4v
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
|
||||
class ChatGLMModel(TextModel):
|
||||
|
||||
@@ -42,14 +42,14 @@ cmake --build build --config Release -j $(nproc)
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
|
||||
- By default, NNPA is enabled when available. To disable it (not recommended):
|
||||
- By default, NNPA is disabled by default. To enable it:
|
||||
|
||||
```bash
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_BLAS=ON \
|
||||
-DGGML_BLAS_VENDOR=OpenBLAS \
|
||||
-DGGML_NNPA=OFF
|
||||
-DGGML_NNPA=ON
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
@@ -84,9 +84,9 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
|
||||
|
||||

|
||||
|
||||
You can find popular models pre-converted and verified at [s390x Ready Models](https://huggingface.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08).
|
||||
You can find popular models pre-converted and verified at [s390x Verified Models](https://huggingface.co/collections/taronaeo/s390x-verified-models-672765393af438d0ccb72a08) or [s390x Runnable Models](https://huggingface.co/collections/taronaeo/s390x-runnable-models-686e951824198df12416017e).
|
||||
|
||||
These models have already been converted from `safetensors` to `GGUF Big-Endian` and their respective tokenizers verified to run correctly on IBM z15 and later system.
|
||||
These models have already been converted from `safetensors` to `GGUF` Big-Endian and their respective tokenizers verified to run correctly on IBM z15 and later system.
|
||||
|
||||
2. **Convert safetensors model to GGUF Big-Endian directly (recommended)**
|
||||
|
||||
@@ -94,6 +94,14 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
|
||||
|
||||
The model you are trying to convert must be in `safetensors` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct)). Make sure you have downloaded the model repository for this case.
|
||||
|
||||
Ensure that you have installed the required packages in advance
|
||||
|
||||
```bash
|
||||
pip3 install -r requirements.txt
|
||||
```
|
||||
|
||||
Convert the `safetensors` model to `GGUF`
|
||||
|
||||
```bash
|
||||
python3 convert_hf_to_gguf.py \
|
||||
--outfile model-name-be.f16.gguf \
|
||||
@@ -116,7 +124,7 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
|
||||
|
||||

|
||||
|
||||
The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.
|
||||
The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B GGUF](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.
|
||||
|
||||
```bash
|
||||
python3 gguf-py/gguf/scripts/gguf_convert_endian.py model-name.f16.gguf BIG
|
||||
@@ -141,15 +149,15 @@ Only available in IBM z15 or later system with the `-DGGML_VXE=ON` (turned on by
|
||||
|
||||
### 2. NNPA Vector Intrinsics Acceleration
|
||||
|
||||
Only available in IBM z16 or later system with the `-DGGML_NNPA=ON` (turned on when available) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
|
||||
Only available in IBM z16 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
|
||||
|
||||
### 3. zDNN Accelerator
|
||||
|
||||
_Only available in IBM z16 or later system. No direction at the moment._
|
||||
_Only available in IBM z16 / LinuxONE 4 or later system. No support currently available._
|
||||
|
||||
### 4. Spyre Accelerator
|
||||
|
||||
_No direction at the moment._
|
||||
_Only available with IBM z17 / LinuxONE 5 or later system. No support currently available._
|
||||
|
||||
## Performance Tuning
|
||||
|
||||
@@ -189,6 +197,26 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
|
||||
|
||||
Answer: Please ensure that your GCC compiler is of minimum GCC 15.1.0 version, and have `binutils` updated to the latest version. If this does not fix the problem, kindly open an issue.
|
||||
|
||||
4. Failing to install the `sentencepiece` package using GCC 15+
|
||||
|
||||
Answer: The `sentencepiece` team are aware of this as seen in [this issue](https://github.com/google/sentencepiece/issues/1108).
|
||||
|
||||
As a temporary workaround, please run the installation command with the following environment variables.
|
||||
|
||||
```bash
|
||||
export CXXFLAGS="-include cstdint"
|
||||
```
|
||||
|
||||
For example,
|
||||
|
||||
```bash
|
||||
CXXFLAGS="-include cstdint" pip3 install -r requirements.txt
|
||||
```
|
||||
|
||||
5. `-DGGML_NNPA=ON` generates gibberish output
|
||||
|
||||
Answer: We are aware of this as detailed in [this issue](https://github.com/ggml-org/llama.cpp/issues/14877). Please either try reducing the number of threads, or disable the compile option using `-DGGML_NNPA=OFF`.
|
||||
|
||||
## Getting Help on IBM Z & LinuxONE
|
||||
|
||||
1. **Bugs, Feature Requests**
|
||||
@@ -244,3 +272,5 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
|
||||
- ✅ - acceleration available
|
||||
- 🚫 - acceleration unavailable, will still run using scalar implementation
|
||||
- ❓ - acceleration unknown, please contribute if you can test it yourself
|
||||
|
||||
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on July 25, 2025.
|
||||
|
||||
@@ -68,6 +68,9 @@ cmake --build build --config Release
|
||||
cmake --build build-x64-windows-llvm-release
|
||||
```
|
||||
- Curl usage is enabled by default and can be turned off with `-DLLAMA_CURL=OFF`. Otherwise you need to install development libraries for libcurl.
|
||||
- **Debian / Ubuntu:** `sudo apt-get install libcurl4-openssl-dev` # (or `libcurl4-gnutls-dev` if you prefer GnuTLS)
|
||||
- **Fedora / RHEL / Rocky / Alma:** `sudo dnf install libcurl-devel`
|
||||
- **Arch / Manjaro:** `sudo pacman -S curl` # includes libcurl headers
|
||||
|
||||
## BLAS Build
|
||||
|
||||
|
||||
@@ -23,11 +23,19 @@ The convert script reads the model configuration, tokenizer, tensor names+data a
|
||||
|
||||
The required steps to implement for an HF model are:
|
||||
|
||||
1. Define the model `Model.register` annotation in a new `Model` subclass, example:
|
||||
1. Define the model `ModelBase.register` annotation in a new `TextModel` or `MmprojModel` subclass, example:
|
||||
|
||||
```python
|
||||
@Model.register("MyModelForCausalLM")
|
||||
class MyModel(Model):
|
||||
@ModelBase.register("MyModelForCausalLM")
|
||||
class MyModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.MYMODEL
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```python
|
||||
@ModelBase.register("MyModelForConditionalGeneration")
|
||||
class MyModel(MmprojModel):
|
||||
model_arch = gguf.MODEL_ARCH.MYMODEL
|
||||
```
|
||||
|
||||
@@ -75,9 +83,10 @@ block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||
`transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF.
|
||||
|
||||
Depending on the model configuration, tokenizer, code and tensors layout, you will have to override:
|
||||
- `Model#set_gguf_parameters`
|
||||
- `Model#set_vocab`
|
||||
- `Model#write_tensors`
|
||||
- `TextModel#set_gguf_parameters`
|
||||
- `MmprojModel#set_gguf_parameters`
|
||||
- `ModelBase#set_vocab`
|
||||
- `ModelBase#modify_tensors`
|
||||
|
||||
NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the convention and several tools like `quantize` expect this to proceed the weights.
|
||||
|
||||
|
||||
@@ -110,7 +110,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment
|
||||
|
||||
The defaults are:
|
||||
|
||||
- `MUSA_VERSION` set to `rc4.0.1`
|
||||
- `MUSA_VERSION` set to `rc4.2.0`
|
||||
|
||||
The resulting images, are essentially the same as the non-MUSA images:
|
||||
|
||||
|
||||
16
docs/ops.md
16
docs/ops.md
@@ -2,6 +2,11 @@
|
||||
|
||||
List of GGML operations and backend support status.
|
||||
|
||||
## How to add a backend to this table:
|
||||
|
||||
1. Run `test-backend-ops support --output csv` with your backend name and redirect output to a csv file in `docs/ops/` (e.g., `docs/ops/CUDA.csv`)
|
||||
2. Regenerate `/docs/ops.md` via `./scripts/create_ops_docs.py`
|
||||
|
||||
Legend:
|
||||
- ✅ Fully supported by this backend
|
||||
- 🟡 Partially supported by this backend
|
||||
@@ -18,7 +23,8 @@ Legend:
|
||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ |
|
||||
| CLAMP | ❌ | ✅ | ✅ | 🟡 |
|
||||
| CONCAT | ❌ | ✅ | 🟡 | ✅ |
|
||||
| CONT | ❌ | ✅ | 🟡 | ✅ |
|
||||
| CONT | ❌ | ✅ | ✅ | ✅ |
|
||||
| CONV_2D | ❌ | ✅ | ❌ | ❌ |
|
||||
| CONV_2D_DW | ❌ | ✅ | ✅ | ❌ |
|
||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ |
|
||||
| CONV_TRANSPOSE_2D | ❌ | ✅ | ✅ | ❌ |
|
||||
@@ -30,7 +36,7 @@ Legend:
|
||||
| DIAG_MASK_INF | ❌ | ✅ | ✅ | 🟡 |
|
||||
| DIV | ❌ | ✅ | ✅ | 🟡 |
|
||||
| DUP | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| ELU | ❌ | ✅ | ❌ | 🟡 |
|
||||
| ELU | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| EXP | ❌ | ✅ | 🟡 | ❌ |
|
||||
| FLASH_ATTN_EXT | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ❌ |
|
||||
@@ -66,14 +72,16 @@ Legend:
|
||||
| REPEAT_BACK | ❌ | ✅ | ✅ | ❌ |
|
||||
| RMS_NORM | ❌ | ✅ | ✅ | 🟡 |
|
||||
| RMS_NORM_BACK | ❌ | ✅ | ✅ | ❌ |
|
||||
| RMS_NORM_MUL | ❌ | ✅ | ✅ | ✅ |
|
||||
| RMS_NORM_MUL | ❌ | ❌ | ❌ | ✅ |
|
||||
| RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ❌ |
|
||||
| ROLL | ❌ | ✅ | ❌ | ❌ |
|
||||
| ROPE | ❌ | ✅ | ✅ | ✅ |
|
||||
| ROPE_BACK | ❌ | ✅ | ✅ | ❌ |
|
||||
| RWKV_WKV6 | ❌ | ✅ | ✅ | ✅ |
|
||||
| RWKV_WKV7 | ❌ | ✅ | ✅ | ✅ |
|
||||
| SCALE | ❌ | ✅ | ✅ | ✅ |
|
||||
| SET | ❌ | ✅ | ❌ | ✅ |
|
||||
| SET_ROWS | ❌ | 🟡 | ❌ | 🟡 |
|
||||
| SET_ROWS | ❌ | 🟡 | 🟡 | 🟡 |
|
||||
| SGN | ❌ | ✅ | 🟡 | ❌ |
|
||||
| SIGMOID | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| SILU | ❌ | ✅ | 🟡 | 🟡 |
|
||||
|
||||
13883
docs/ops/CPU.csv
13883
docs/ops/CPU.csv
File diff suppressed because it is too large
Load Diff
13883
docs/ops/CUDA.csv
13883
docs/ops/CUDA.csv
File diff suppressed because it is too large
Load Diff
@@ -131,7 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
|
||||
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
|
||||
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
|
||||
option(GGML_VXE "ggml: enable vxe" ON)
|
||||
option(GGML_NNPA "ggml: enable nnpa" ON)
|
||||
option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
|
||||
|
||||
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
|
||||
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
|
||||
@@ -174,6 +174,8 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
|
||||
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
|
||||
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
|
||||
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
|
||||
option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
|
||||
option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
|
||||
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
||||
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
||||
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
|
||||
|
||||
@@ -1,12 +1,108 @@
|
||||
@PACKAGE_INIT@
|
||||
|
||||
@GGML_VARIABLES_EXPANDED@
|
||||
|
||||
@PACKAGE_INIT@
|
||||
# Find all dependencies before creating any target.
|
||||
include(CMakeFindDependencyMacro)
|
||||
find_dependency(Threads)
|
||||
if (NOT GGML_SHARED_LIB)
|
||||
set(GGML_CPU_INTERFACE_LINK_LIBRARIES "")
|
||||
set(GGML_CPU_INTERFACE_LINK_OPTIONS "")
|
||||
|
||||
if (APPLE AND GGML_ACCELERATE)
|
||||
find_library(ACCELERATE_FRAMEWORK Accelerate)
|
||||
if(NOT ACCELERATE_FRAMEWORK)
|
||||
set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0)
|
||||
return()
|
||||
endif()
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${ACCELERATE_FRAMEWORK})
|
||||
endif()
|
||||
|
||||
if (GGML_OPENMP_ENABLED)
|
||||
find_dependency(OpenMP)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
endif()
|
||||
|
||||
if (GGML_CPU_HBM)
|
||||
find_library(memkind memkind)
|
||||
if(NOT memkind)
|
||||
set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0)
|
||||
return()
|
||||
endif()
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES memkind)
|
||||
endif()
|
||||
|
||||
if (GGML_BLAS)
|
||||
find_dependency(BLAS)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS})
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA)
|
||||
set(GGML_CUDA_INTERFACE_LINK_LIBRARIES "")
|
||||
find_dependency(CUDAToolkit)
|
||||
if (GGML_STATIC)
|
||||
list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cudart_static>)
|
||||
if (WIN32)
|
||||
list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cublas> $<LINK_ONLY:CUDA::cublasLt>)
|
||||
else()
|
||||
list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cublas_static> $<LINK_ONLY:CUDA::cublasLt_static>)
|
||||
endif()
|
||||
endif()
|
||||
if (NOT GGML_CUDA_NO_VMM)
|
||||
list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cuda_driver>)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_METAL)
|
||||
find_library(FOUNDATION_LIBRARY Foundation)
|
||||
find_library(METAL_FRAMEWORK Metal)
|
||||
find_library(METALKIT_FRAMEWORK MetalKit)
|
||||
if(NOT FOUNDATION_LIBRARY OR NOT METAL_FRAMEWORK OR NOT METALKIT_FRAMEWORK)
|
||||
set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0)
|
||||
return()
|
||||
endif()
|
||||
set(GGML_METAL_INTERFACE_LINK_LIBRARIES
|
||||
${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
|
||||
endif()
|
||||
|
||||
if (GGML_OPENCL)
|
||||
find_dependency(OpenCL)
|
||||
set(GGML_OPENCL_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:OpenCL::OpenCL>)
|
||||
endif()
|
||||
|
||||
if (GGML_VULKAN)
|
||||
find_dependency(Vulkan)
|
||||
set(GGML_VULKAN_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:Vulkan::Vulkan>)
|
||||
endif()
|
||||
|
||||
if (GGML_HIP)
|
||||
find_dependency(hip)
|
||||
find_dependency(hipblas)
|
||||
find_dependency(rocblas)
|
||||
set(GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas)
|
||||
endif()
|
||||
|
||||
if (GGML_SYCL)
|
||||
set(GGML_SYCL_INTERFACE_LINK_LIBRARIES "")
|
||||
find_package(DNNL)
|
||||
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl)
|
||||
endif()
|
||||
if (WIN32)
|
||||
find_dependency(IntelSYCL)
|
||||
find_dependency(MKL)
|
||||
list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
|
||||
set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
|
||||
#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
|
||||
|
||||
if(NOT TARGET ggml::ggml)
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
find_library(GGML_LIBRARY ggml
|
||||
@@ -29,66 +125,6 @@ set_target_properties(ggml::ggml-base
|
||||
PROPERTIES
|
||||
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
|
||||
|
||||
if (NOT GGML_SHARED_LIB)
|
||||
if (APPLE AND GGML_ACCELERATE)
|
||||
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${ACCELERATE_FRAMEWORK})
|
||||
endif()
|
||||
|
||||
if (GGML_OPENMP)
|
||||
find_package(OpenMP REQUIRED)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
endif()
|
||||
|
||||
if (GGML_CPU_HBM)
|
||||
find_library(memkind memkind REQUIRED)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES memkind)
|
||||
endif()
|
||||
|
||||
if (GGML_BLAS)
|
||||
find_package(BLAS REQUIRED)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS})
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA)
|
||||
find_package(CUDAToolkit REQUIRED)
|
||||
endif()
|
||||
|
||||
if (GGML_METAL)
|
||||
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
||||
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
|
||||
list(APPEND GGML_METAL_INTERFACE_LINK_LIBRARIES
|
||||
${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
|
||||
endif()
|
||||
|
||||
if (GGML_VULKAN)
|
||||
find_package(Vulkan REQUIRED)
|
||||
list(APPEND GGML_VULKAN_INTERFACE_LINK_LIBRARIES Vulkan::Vulkan)
|
||||
endif()
|
||||
|
||||
if (GGML_HIP)
|
||||
find_package(hip REQUIRED)
|
||||
find_package(hipblas REQUIRED)
|
||||
find_package(rocblas REQUIRED)
|
||||
list(APPEND GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas)
|
||||
endif()
|
||||
|
||||
if (GGML_SYCL)
|
||||
find_package(DNNL)
|
||||
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl)
|
||||
endif()
|
||||
if (WIN32)
|
||||
find_package(IntelSYCL REQUIRED)
|
||||
find_package(MKL REQUIRED)
|
||||
list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(_ggml_all_targets "")
|
||||
foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
|
||||
string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
|
||||
@@ -149,4 +185,6 @@ set_target_properties(ggml::all
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}")
|
||||
|
||||
endif() # TARGET ggml::ggml
|
||||
|
||||
check_required_components(ggml)
|
||||
|
||||
@@ -647,6 +647,7 @@ struct ggml_backend_sched {
|
||||
// pipeline parallelism support
|
||||
int n_copies;
|
||||
int cur_copy;
|
||||
int next_copy;
|
||||
ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
|
||||
struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
|
||||
int n_graph_inputs;
|
||||
@@ -1433,8 +1434,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
}
|
||||
}
|
||||
|
||||
sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -1535,10 +1534,10 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
||||
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
||||
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
|
||||
|
||||
ggml_backend_sched_split_graph(sched, measure_graph);
|
||||
|
||||
ggml_backend_sched_synchronize(sched);
|
||||
|
||||
ggml_backend_sched_split_graph(sched, measure_graph);
|
||||
|
||||
if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
|
||||
return false;
|
||||
}
|
||||
@@ -1550,6 +1549,10 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
||||
|
||||
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
|
||||
GGML_ASSERT(!sched->is_alloc);
|
||||
|
||||
sched->cur_copy = sched->next_copy;
|
||||
sched->next_copy = (sched->next_copy + 1) % sched->n_copies;
|
||||
|
||||
ggml_backend_sched_split_graph(sched, graph);
|
||||
|
||||
@@ -1590,7 +1593,7 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
|
||||
// if the graph is not already allocated, always use copy 0 after a synchronization
|
||||
// this ensures that during generation the same copy is used every time,
|
||||
// which avoids changes in the graph that could cause CUDA or other graphs to be disabled
|
||||
sched->cur_copy = 0;
|
||||
sched->next_copy = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -77,6 +77,8 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
|
||||
for (int i = 0; i < final_dims; i++) {
|
||||
acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
|
||||
}
|
||||
size_t elem_offset = offset / ggml_element_size(tensor);
|
||||
acl_storage_len += elem_offset;
|
||||
|
||||
// Reverse ne and stride.
|
||||
std::reverse(acl_ne, acl_ne + final_dims);
|
||||
@@ -84,7 +86,7 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
|
||||
|
||||
aclTensor* acl_tensor = aclCreateTensor(
|
||||
acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
|
||||
offset / ggml_element_size(tensor), format, &acl_storage_len, 1,
|
||||
elem_offset, format, &acl_storage_len, 1,
|
||||
tensor->data);
|
||||
|
||||
return acl_tensor;
|
||||
|
||||
@@ -99,7 +99,7 @@ void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclT
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cann_unary_op(
|
||||
void ggml_cann_op_unary(
|
||||
std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
|
||||
ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src = dst->src[0];
|
||||
@@ -111,6 +111,42 @@ void ggml_cann_unary_op(
|
||||
ggml_cann_release_resources(ctx, acl_src, acl_dst);
|
||||
}
|
||||
|
||||
void ggml_cann_op_unary_gated(
|
||||
std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
|
||||
ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src0 = dst->src[0];
|
||||
ggml_tensor* src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous_1(dst));
|
||||
const int32_t swapped = ggml_get_op_params_i32(dst, 1);
|
||||
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
aclTensor *acl_src0 = nullptr, *acl_src1 = nullptr;
|
||||
if(src1) {
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src1));
|
||||
GGML_ASSERT(src0->type == src1->type);
|
||||
|
||||
acl_src0 = ggml_cann_create_tensor(src0);
|
||||
acl_src1 = ggml_cann_create_tensor(src1);
|
||||
} else {
|
||||
int64_t ne[] = {src0->ne[0] / 2, src0->ne[1], src0->ne[2], src0->ne[3]};
|
||||
size_t nb[] = {src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]};
|
||||
acl_src0 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, 0);
|
||||
acl_src1 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, ne[0] * ggml_element_size(src0));
|
||||
if (swapped) {
|
||||
std::swap(acl_src0, acl_src1);
|
||||
}
|
||||
}
|
||||
|
||||
unary_op(ctx, acl_src0, acl_dst);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_dst, acl_src1);
|
||||
|
||||
ggml_cann_release_resources(ctx, acl_src0, acl_dst);
|
||||
if(src1)
|
||||
ggml_cann_release_resources(ctx, acl_src1);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Repeats elements of a tensor along each dimension according to the
|
||||
* specified repeat array.
|
||||
@@ -1785,8 +1821,27 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
|
||||
size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
|
||||
bcast_weight_nb[2], bcast_weight_nb[3],
|
||||
bcast_weight_nb[4], bcast_weight_nb[5]};
|
||||
aclTensor* acl_weight_tensor =
|
||||
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
|
||||
aclTensor* acl_weight_tensor;
|
||||
|
||||
bool weightToNZ = false;
|
||||
#ifdef ASCEND_310P
|
||||
weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
|
||||
#endif
|
||||
if (weightToNZ && is_matmul_weight(weight)) {
|
||||
int64_t acl_stride[2] = {1, transpose_ne[1]};
|
||||
|
||||
// Reverse ne.
|
||||
std::reverse(transpose_ne, transpose_ne + n_dims);
|
||||
|
||||
std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};
|
||||
|
||||
acl_weight_tensor = aclCreateTensor(
|
||||
transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
|
||||
0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
|
||||
} else {
|
||||
acl_weight_tensor =
|
||||
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
|
||||
}
|
||||
aclTensor* acl_dst =
|
||||
ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
|
||||
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
#ifndef CANN_ACLNN_OPS
|
||||
#define CANN_ACLNN_OPS
|
||||
|
||||
#include <unordered_set>
|
||||
#include <functional>
|
||||
#include <aclnnop/aclnn_abs.h>
|
||||
#include <aclnnop/aclnn_neg.h>
|
||||
@@ -1020,6 +1021,37 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
|
||||
*/
|
||||
void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Check whether a tensor is a weight tensor for matrix multiplication.
|
||||
*
|
||||
* @details Checks whether the given tensor serves as weight parameters in matrix multiplication operations,
|
||||
* typically within neural network layers. The function maintains a static set of canonical weight
|
||||
* naming suffixes from Transformer-based architectures. Uses substring matching to identify weight
|
||||
* tensors even with hierarchical naming patterns.
|
||||
*
|
||||
* @param tensor Pointer to the target ggml_tensor object (const-qualified).
|
||||
*/
|
||||
static bool is_matmul_weight(const ggml_tensor* tensor) {
|
||||
std::string name = ggml_get_name(tensor);
|
||||
static const std::unordered_set<std::string> weight_suffixes{
|
||||
"output.weight",
|
||||
"attn_q.weight",
|
||||
"attn_k.weight",
|
||||
"attn_v.weight",
|
||||
"attn_output.weight",
|
||||
"ffn_gate.weight",
|
||||
"ffn_up.weight",
|
||||
"ffn_down.weight"
|
||||
};
|
||||
|
||||
for (const auto& suffix : weight_suffixes) {
|
||||
if (name.find(suffix) != std::string::npos) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Applies a element-wise operation to two input tensors using the CANN
|
||||
* backend.
|
||||
@@ -1066,7 +1098,7 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
* @param dst The destination tensor. Its src[0] is treated as the input tensor.
|
||||
*/
|
||||
template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
|
||||
void ggml_cann_unary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
void ggml_cann_op_unary(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src = dst->src[0];
|
||||
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
||||
@@ -1077,49 +1109,125 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Applies a unary operation to a ggml tensor using the CANN backend.
|
||||
* @brief Applies a unary operation to a ggml tensor using the CANN backend.
|
||||
*
|
||||
* @details This function performs a unary operation on the input tensor using
|
||||
* a user-provided lambda or callable object `unary_op`, which accepts the CANN
|
||||
* context and two ACL tensors (source and destination). Internally, this function
|
||||
* creates ACL representations of the ggml tensors and invokes the unary operation.
|
||||
* The result is stored in the destination tensor `dst`. This utility abstracts the
|
||||
* common boilerplate of tensor conversion and cleanup when implementing unary ops.
|
||||
* @details This function applies a unary operation to the input tensor using
|
||||
* a user-provided lambda or callable `unary_op`. The lambda receives the
|
||||
* CANN backend context and two ACL tensors: the source and the destination.
|
||||
*
|
||||
* @param unary_op A callable that performs the unary operation using CANN APIs.
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the result will be stored.
|
||||
* The source tensor is retrieved from `dst->src[0]`.
|
||||
* Internally, this function handles the conversion from GGML tensors to ACL tensors,
|
||||
* calls the provided unary op, and manages resource cleanup. The input is assumed
|
||||
* to be `dst->src[0]`, and the result is written to `dst`.
|
||||
*
|
||||
* This utility simplifies writing unary op wrappers by abstracting tensor preparation.
|
||||
*
|
||||
* @param unary_op A callable that performs the unary operation using CANN ACL APIs.
|
||||
* @param ctx The CANN context for operation execution.
|
||||
* @param dst The destination ggml_tensor where the result will be stored.
|
||||
* The input tensor is assumed to be `dst->src[0]`.
|
||||
*
|
||||
* @see GGML_CANN_CALL_OP_UNARY
|
||||
*/
|
||||
void ggml_cann_unary_op(
|
||||
void ggml_cann_op_unary(
|
||||
std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
|
||||
ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Helper macro to invoke a unary ACL operation using ggml_cann_unary_op.
|
||||
* @brief Applies a gated (GLU-style) unary operation using the CANN backend.
|
||||
*
|
||||
* This macro defines an inline lambda wrapping a specific ACL operation name,
|
||||
* and passes it to the templated ggml_cann_unary_op function. It simplifies
|
||||
* calling unary ops by hiding the lambda boilerplate.
|
||||
* @details This function performs a gated activation such as GEGLU or ReGLU.
|
||||
* It supports two input modes:
|
||||
*
|
||||
* 1. **Dual input mode**: `dst->src[0]` and `dst->src[1]` are both valid tensors.
|
||||
* These are used directly as the value and gate tensors.
|
||||
*
|
||||
* 2. **Packed input mode**: Only `dst->src[0]` is valid, and it is assumed to
|
||||
* contain a concatenation of value and gate along the first dimension. This tensor
|
||||
* will be split into two equal halves to form the value and gate inputs.
|
||||
*
|
||||
* The function applies a user-provided unary operation (e.g., GELU) to the value tensor,
|
||||
* then multiplies the result in-place with the gate tensor:
|
||||
*
|
||||
* Internally, the lambda will call:
|
||||
* @code
|
||||
* GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
|
||||
* dst = unary_op(value) * gate;
|
||||
* @endcode
|
||||
*
|
||||
* The `swapped` parameter (from `dst->op_params[1]`) allows flipping the
|
||||
* order of value/gate in the packed input case.
|
||||
*
|
||||
* @param unary_op A callable that performs the unary operation using CANN ACL APIs.
|
||||
* It receives (ctx, acl_value_tensor, acl_output_tensor).
|
||||
* @param ctx The CANN context used for execution.
|
||||
* @param dst The destination ggml_tensor. Source tensors are in `dst->src[0]` and optionally `src[1]`.
|
||||
*
|
||||
* @see GGML_CANN_CALL_OP_UNARY_GATED
|
||||
*/
|
||||
void ggml_cann_op_unary_gated(
|
||||
std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
|
||||
ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Helper macro to call a unary ACL operator via ggml_cann_op_unary.
|
||||
*
|
||||
* This macro wraps the specified ACLNN unary operator name into a lambda expression,
|
||||
* and passes it to `ggml_cann_op_unary`, which handles the common logic for executing
|
||||
* unary ops in the CANN backend.
|
||||
*
|
||||
* Internally, this macro expands to a lambda like:
|
||||
* @code
|
||||
* [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
|
||||
* GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
|
||||
* };
|
||||
* @endcode
|
||||
*
|
||||
* This lambda is then passed to `ggml_cann_op_unary`, which applies the operation.
|
||||
*
|
||||
* @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
|
||||
*
|
||||
* @see ggml_cann_unary_op
|
||||
* @see ggml_cann_op_unary
|
||||
* @see GGML_CANN_CALL_ACLNN_OP
|
||||
*/
|
||||
#define GGML_CANN_CALL_UNARY_OP(OP_NAME) \
|
||||
#define GGML_CANN_CALL_OP_UNARY(OP_NAME) \
|
||||
do { \
|
||||
auto lambda = [](ggml_backend_cann_context& ctx, \
|
||||
aclTensor* acl_src, \
|
||||
aclTensor* acl_dst) { \
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
|
||||
}; \
|
||||
ggml_cann_unary_op(lambda, ctx, dst); \
|
||||
ggml_cann_op_unary(lambda, ctx, dst); \
|
||||
} \
|
||||
while (0)
|
||||
|
||||
/**
|
||||
* @brief Helper macro to call a gated unary ACL operator via ggml_cann_op_unary_gated.
|
||||
*
|
||||
* This macro wraps the specified ACLNN unary operator name into a lambda expression,
|
||||
* and passes it to `ggml_cann_op_unary_gated`, which handles the common logic for
|
||||
* executing gated unary ops in the CANN backend.
|
||||
*
|
||||
* Internally, this macro expands to a lambda like:
|
||||
* @code
|
||||
* [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
|
||||
* GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
|
||||
* };
|
||||
* @endcode
|
||||
*
|
||||
* This lambda is then passed to `ggml_cann_op_unary_gated`, which applies the operation.
|
||||
*
|
||||
* @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
|
||||
*
|
||||
* @see ggml_cann_op_unary_gated
|
||||
* @see GGML_CANN_CALL_ACLNN_OP
|
||||
*/
|
||||
#define GGML_CANN_CALL_OP_UNARY_GATED(OP_NAME) \
|
||||
do { \
|
||||
auto lambda = [](ggml_backend_cann_context& ctx, \
|
||||
aclTensor* acl_src, \
|
||||
aclTensor* acl_dst) { \
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
|
||||
}; \
|
||||
ggml_cann_op_unary_gated(lambda, ctx, dst); \
|
||||
} \
|
||||
while (0)
|
||||
|
||||
#endif // CANN_ACLNN_OPS
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
|
||||
#include <acl/acl.h>
|
||||
#include <stdarg.h>
|
||||
#include <aclnnop/aclnn_trans_matmul_weight.h>
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
@@ -1115,6 +1116,63 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
|
||||
aclDataType dataType, aclTensor **tensor)
|
||||
{
|
||||
uint64_t size = 1;
|
||||
for (auto i : shape) {
|
||||
size *= i;
|
||||
}
|
||||
|
||||
const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
|
||||
ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));
|
||||
|
||||
size *= sizeof(int16_t);
|
||||
|
||||
ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);
|
||||
|
||||
std::vector<int64_t> strides(shape.size(), 1);
|
||||
for (int64_t i = shape.size() - 2; i >= 0; i--) {
|
||||
strides[i] = shape[i + 1] * strides[i + 1];
|
||||
}
|
||||
|
||||
*tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
|
||||
shape.data(), shape.size(), *deviceAddr);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
|
||||
aclrtStream stream;
|
||||
ACL_CHECK(aclrtCreateStream(&stream));
|
||||
|
||||
std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
|
||||
void *weightTransposedDeviceAddr = nullptr;
|
||||
aclTensor *weightTransposed = nullptr;
|
||||
CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
|
||||
ggml_cann_type_mapping(tensor->type), &weightTransposed);
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor *executor;
|
||||
void *workspaceAddr = nullptr;
|
||||
|
||||
// TransMatmulWeight
|
||||
ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
|
||||
std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
|
||||
if (workspaceSize > 0) {
|
||||
ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
workspaceAddrPtrTrans.reset(workspaceAddr);
|
||||
}
|
||||
ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));
|
||||
|
||||
size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);
|
||||
|
||||
aclrtMemcpy((char *)tensor->data + offset, size,
|
||||
weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE);
|
||||
ACL_CHECK(aclDestroyTensor(weightTransposed));
|
||||
aclrtFree(weightTransposedDeviceAddr);
|
||||
}
|
||||
|
||||
// TODO: need handle tensor which has paddings.
|
||||
/**
|
||||
* @brief Set tensor data in a CANN buffer.
|
||||
@@ -1139,9 +1197,16 @@ static void ggml_backend_cann_buffer_set_tensor(
|
||||
// For acl, synchronous functions use this default stream.
|
||||
// Why aclrtSynchronizeDevice?
|
||||
|
||||
bool weightToNZ = false;
|
||||
#ifdef ASCEND_310P
|
||||
weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
|
||||
#endif
|
||||
if (!need_transform(tensor->type)) {
|
||||
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
|
||||
ACL_MEMCPY_HOST_TO_DEVICE));
|
||||
if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
|
||||
weight_format_to_nz(tensor, data, offset);
|
||||
}
|
||||
} else {
|
||||
void *transform_buffer = malloc(size);
|
||||
ggml_backend_cann_transform(tensor, data, transform_buffer);
|
||||
@@ -1616,16 +1681,18 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(dst)) {
|
||||
case GGML_UNARY_OP_ABS:
|
||||
GGML_CANN_CALL_UNARY_OP(Abs);
|
||||
GGML_CANN_CALL_OP_UNARY(Abs);
|
||||
break;
|
||||
case GGML_UNARY_OP_NEG:
|
||||
GGML_CANN_CALL_UNARY_OP(Neg);
|
||||
GGML_CANN_CALL_OP_UNARY(Neg);
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU:
|
||||
GGML_CANN_CALL_UNARY_OP(Gelu);
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
// aclnnGelu internally uses the erf-based approximation.
|
||||
GGML_CANN_CALL_OP_UNARY(Gelu);
|
||||
break;
|
||||
case GGML_UNARY_OP_SILU:
|
||||
GGML_CANN_CALL_UNARY_OP(Silu);
|
||||
GGML_CANN_CALL_OP_UNARY(Silu);
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU_QUICK: {
|
||||
auto lambda = [](ggml_backend_cann_context& ctx,
|
||||
@@ -1633,31 +1700,31 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
||||
aclTensor* acl_dst) {
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
|
||||
};
|
||||
ggml_cann_unary_op(lambda, ctx, dst);
|
||||
ggml_cann_op_unary(lambda, ctx, dst);
|
||||
} break;
|
||||
case GGML_UNARY_OP_TANH:
|
||||
GGML_CANN_CALL_UNARY_OP(Tanh);
|
||||
GGML_CANN_CALL_OP_UNARY(Tanh);
|
||||
break;
|
||||
case GGML_UNARY_OP_RELU:
|
||||
GGML_CANN_CALL_UNARY_OP(Relu);
|
||||
GGML_CANN_CALL_OP_UNARY(Relu);
|
||||
break;
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
GGML_CANN_CALL_UNARY_OP(Sigmoid);
|
||||
GGML_CANN_CALL_OP_UNARY(Sigmoid);
|
||||
break;
|
||||
case GGML_UNARY_OP_HARDSIGMOID:
|
||||
GGML_CANN_CALL_UNARY_OP(Hardsigmoid);
|
||||
GGML_CANN_CALL_OP_UNARY(Hardsigmoid);
|
||||
break;
|
||||
case GGML_UNARY_OP_HARDSWISH:
|
||||
GGML_CANN_CALL_UNARY_OP(Hardswish);
|
||||
GGML_CANN_CALL_OP_UNARY(Hardswish);
|
||||
break;
|
||||
case GGML_UNARY_OP_EXP:
|
||||
GGML_CANN_CALL_UNARY_OP(Exp);
|
||||
GGML_CANN_CALL_OP_UNARY(Exp);
|
||||
break;
|
||||
case GGML_UNARY_OP_ELU:
|
||||
ggml_cann_elu(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_SGN:
|
||||
GGML_CANN_CALL_UNARY_OP(Sign);
|
||||
GGML_CANN_CALL_OP_UNARY(Sign);
|
||||
break;
|
||||
case GGML_UNARY_OP_STEP:
|
||||
ggml_cann_step(ctx, dst);
|
||||
@@ -1666,6 +1733,31 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_GLU:
|
||||
switch (ggml_get_glu_op(dst)) {
|
||||
case GGML_GLU_OP_REGLU:
|
||||
GGML_CANN_CALL_OP_UNARY_GATED(Relu);
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
// aclnnGelu internally uses the erf-based approximation.
|
||||
GGML_CANN_CALL_OP_UNARY_GATED(Gelu);
|
||||
break;
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
GGML_CANN_CALL_OP_UNARY_GATED(Silu);
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU_QUICK: {
|
||||
auto lambda = [](ggml_backend_cann_context& ctx,
|
||||
aclTensor* acl_src,
|
||||
aclTensor* acl_dst) {
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
|
||||
};
|
||||
ggml_cann_op_unary_gated(lambda, ctx, dst);
|
||||
} break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_NORM:
|
||||
ggml_cann_norm(ctx, dst);
|
||||
break;
|
||||
@@ -1708,7 +1800,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
||||
ggml_cann_binary_op<aclnn_mul>(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SQRT:
|
||||
GGML_CANN_CALL_UNARY_OP(Sqrt);
|
||||
GGML_CANN_CALL_OP_UNARY(Sqrt);
|
||||
break;
|
||||
case GGML_OP_CLAMP:
|
||||
ggml_cann_clamp(ctx, dst);
|
||||
@@ -1753,16 +1845,16 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
||||
ggml_cann_argmax(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_COS:
|
||||
ggml_cann_unary_op<aclnn_cos>(ctx, dst);
|
||||
ggml_cann_op_unary<aclnn_cos>(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SIN:
|
||||
ggml_cann_unary_op<aclnn_sin>(ctx, dst);
|
||||
ggml_cann_op_unary<aclnn_sin>(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
ggml_cann_conv_transpose_1d(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_LOG:
|
||||
GGML_CANN_CALL_UNARY_OP(Log);
|
||||
GGML_CANN_CALL_OP_UNARY(Log);
|
||||
break;
|
||||
case GGML_OP_MEAN:
|
||||
ggml_cann_mean(ctx, dst);
|
||||
@@ -2036,10 +2128,23 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
case GGML_UNARY_OP_ELU:
|
||||
case GGML_UNARY_OP_SGN:
|
||||
case GGML_UNARY_OP_STEP:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
case GGML_OP_GLU:
|
||||
switch (ggml_get_glu_op(op)) {
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case GGML_OP_MUL_MAT: {
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F16:
|
||||
|
||||
@@ -70,10 +70,12 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
if (GGML_OPENMP)
|
||||
find_package(OpenMP)
|
||||
if (OpenMP_FOUND)
|
||||
set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
|
||||
|
||||
target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
else()
|
||||
set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
|
||||
message(WARNING "OpenMP not found")
|
||||
endif()
|
||||
endif()
|
||||
@@ -456,6 +458,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
list(APPEND ARCH_FLAGS -march=z16)
|
||||
elseif (${S390X_M} MATCHES "9175|9176")
|
||||
# NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
|
||||
# binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
|
||||
message(STATUS "z17 target")
|
||||
list(APPEND ARCH_FLAGS -march=z17)
|
||||
else()
|
||||
|
||||
@@ -544,7 +544,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
||||
__m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs, 0) );
|
||||
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
|
||||
__m128 tmp = max4;
|
||||
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
|
||||
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x1 ));
|
||||
const float max_scalar = ((v4f32)max4)[0];
|
||||
|
||||
// Quantize these floats
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <cassert>
|
||||
#include <cstdlib> // for qsort
|
||||
#include <cstdio> // for GGML_ASSERT
|
||||
|
||||
#include "repack.h"
|
||||
|
||||
@@ -56,7 +56,7 @@
|
||||
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
|
||||
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
|
||||
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
|
||||
#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
|
||||
#define GGML_CUDA_CC_CDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
|
||||
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renameing
|
||||
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
|
||||
|
||||
@@ -72,8 +72,9 @@
|
||||
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
|
||||
#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
|
||||
#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
|
||||
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
|
||||
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
|
||||
#define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
|
||||
#define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
|
||||
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)
|
||||
|
||||
// Moore Threads
|
||||
#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
|
||||
@@ -226,6 +227,10 @@ typedef float2 dfloat2;
|
||||
#define FP16_MMA_AVAILABLE
|
||||
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
|
||||
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
|
||||
#define AMD_MFMA_AVAILABLE
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
#define NEW_MMA_AVAILABLE
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
@@ -288,6 +293,11 @@ static bool fp32_mma_hardware_available(const int cc) {
|
||||
return GGML_CUDA_CC_IS_CDNA(cc);
|
||||
}
|
||||
|
||||
// AMD CDNA3 matrix cores.. Will add support for other CDNA generations later.
|
||||
static bool amd_mfma_available(const int cc) {
|
||||
return cc >= GGML_CUDA_CC_OFFSET_AMD && GGML_CUDA_CC_IS_CDNA3(cc);
|
||||
}
|
||||
|
||||
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
|
||||
static bool new_mma_available(const int cc) {
|
||||
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
|
||||
@@ -765,7 +775,7 @@ struct ggml_tensor_extra_gpu {
|
||||
};
|
||||
|
||||
|
||||
#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS))
|
||||
#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)) || defined(GGML_MUSA_GRAPHS)
|
||||
#define USE_CUDA_GRAPH
|
||||
#endif
|
||||
|
||||
|
||||
@@ -6,24 +6,33 @@
|
||||
#define CUDA_Q8_0_NE_ALIGN 2048
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
|
||||
const int64_t i = (int64_t)2*(blockDim.x*blockIdx.x + threadIdx.x);
|
||||
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02,
|
||||
const int64_t s01, const int64_t s02, const int64_t s03) {
|
||||
const int64_t i00 = 2 * (int64_t(blockDim.x)*blockIdx.x + threadIdx.x);
|
||||
|
||||
if (i >= k) {
|
||||
if (i00 >= ne00) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t ib = i/qk; // block index
|
||||
const int64_t iqs = (i%qk)/qr; // quant index
|
||||
const int64_t iybs = i - i%qk; // y block start index
|
||||
const int64_t i01 = blockIdx.y;
|
||||
const int64_t i02 = blockIdx.z % ne02;
|
||||
const int64_t i03 = blockIdx.z / ne02;
|
||||
|
||||
const int64_t ibx0 = i03*s03 + i02*s02 + i01*s01;
|
||||
|
||||
const int64_t ib = ibx0 + i00/qk; // block index
|
||||
const int64_t iqs = (i00%qk)/qr; // quant index
|
||||
const int64_t iybs = i00 - i00%qk; // y block start index
|
||||
const int64_t y_offset = qr == 1 ? 1 : qk/2;
|
||||
|
||||
// dequantize
|
||||
dfloat2 v;
|
||||
dequantize_kernel(vx, ib, iqs, v);
|
||||
|
||||
y[iybs + iqs + 0] = v.x;
|
||||
y[iybs + iqs + y_offset] = v.y;
|
||||
const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
|
||||
y[iy0 + 0] = float(v.x);
|
||||
y[iy0 + y_offset] = float(v.y);
|
||||
}
|
||||
|
||||
template <bool need_check>
|
||||
@@ -457,9 +466,17 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
|
||||
}
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
|
||||
dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||
static void dequantize_block_cuda(const void * vx, dst_t * y,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
|
||||
const int64_t s01, const int64_t s02, const int64_t s03, cudaStream_t stream) {
|
||||
const dim3 num_blocks((ne00 + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE), ne01, ne02*ne03);
|
||||
dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>
|
||||
(vx, y, ne00, ne01, ne02, s01, s02, s03);
|
||||
}
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static void dequantize_block_cont_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
||||
dequantize_block_cuda<qk, qr, dequantize_kernel, dst_t>(vx, y, k, 1, 1, 1, k/qk, k/qk, k/qk, stream);
|
||||
}
|
||||
|
||||
static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
||||
@@ -624,14 +641,14 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||
case GGML_TYPE_Q4_1:
|
||||
return dequantize_row_q4_1_cuda;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
return dequantize_block_cont_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
return dequantize_block_cont_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
if (fp16_available(ggml_cuda_info().devices[ggml_cuda_get_device()].cc)) {
|
||||
return dequantize_block_q8_0_f16_cuda;
|
||||
}
|
||||
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
return dequantize_block_cont_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_Q2_K:
|
||||
return dequantize_row_q2_K_cuda;
|
||||
case GGML_TYPE_Q3_K:
|
||||
@@ -676,11 +693,11 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||
case GGML_TYPE_Q4_1:
|
||||
return dequantize_row_q4_1_cuda;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
return dequantize_block_cont_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
return dequantize_block_cont_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
return dequantize_block_cont_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_Q2_K:
|
||||
return dequantize_row_q2_K_cuda;
|
||||
case GGML_TYPE_Q3_K:
|
||||
@@ -722,6 +739,16 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
case GGML_TYPE_Q4_1:
|
||||
return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_cuda<nv_bfloat16>;
|
||||
default:
|
||||
@@ -733,6 +760,16 @@ to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float, nv_bfloat16>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
case GGML_TYPE_Q4_1:
|
||||
return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cuda<half, nv_bfloat16>;
|
||||
default:
|
||||
@@ -744,6 +781,16 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cuda<half, float>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
case GGML_TYPE_Q4_1:
|
||||
return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_cuda<nv_bfloat16, float>;
|
||||
default:
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
#include "cpy.cuh"
|
||||
#include "dequantize.cuh"
|
||||
#include "cpy-utils.cuh"
|
||||
#ifdef GGML_USE_MUSA
|
||||
#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
|
||||
#include "ggml-musa/mudnn.cuh"
|
||||
#endif // GGML_USE_MUSA
|
||||
#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
|
||||
|
||||
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
|
||||
|
||||
@@ -121,7 +121,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int
|
||||
// Copy destination pointers to GPU to be available when pointer indirection is in use
|
||||
|
||||
void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) {
|
||||
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
|
||||
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
|
||||
if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
|
||||
CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
if (cuda_graph->dest_ptrs_d != nullptr) {
|
||||
@@ -314,7 +314,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
|
||||
char ** dest_ptrs_d = nullptr;
|
||||
int graph_cpynode_index = -1;
|
||||
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
|
||||
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
|
||||
if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
|
||||
dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
|
||||
graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
|
||||
@@ -324,11 +324,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
#endif
|
||||
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
||||
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
|
||||
#ifdef GGML_USE_MUSA
|
||||
#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
|
||||
if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
|
||||
CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
|
||||
} else
|
||||
#endif // GGML_USE_MUSA
|
||||
#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
|
||||
{
|
||||
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
|
||||
}
|
||||
@@ -379,7 +379,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
}
|
||||
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
|
||||
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
|
||||
if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
|
||||
ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
|
||||
}
|
||||
|
||||
@@ -23,33 +23,13 @@ typedef void (* fattn_kernel_t)(
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int ne33,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb33,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int nb21,
|
||||
const int nb22,
|
||||
const int nb23,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3);
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33);
|
||||
|
||||
typedef half (*vec_dot_KQ_f16_t)(
|
||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds);
|
||||
@@ -745,33 +725,58 @@ void launch_fattn(
|
||||
size_t nb23 = V ? V->nb[3] : nb13;
|
||||
|
||||
if (need_f16_K && K->type != GGML_TYPE_F16) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(K));
|
||||
K_f16.alloc(ggml_nelements(K));
|
||||
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type);
|
||||
to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream);
|
||||
K_data = (char *) K_f16.ptr;
|
||||
|
||||
const size_t bs = ggml_blck_size(K->type);
|
||||
const size_t ts = ggml_type_size(K->type);
|
||||
|
||||
nb11 = nb11*bs*sizeof(half)/ts;
|
||||
nb12 = nb12*bs*sizeof(half)/ts;
|
||||
nb13 = nb13*bs*sizeof(half)/ts;
|
||||
K_f16.alloc(ggml_nelements(K));
|
||||
if (ggml_is_contiguously_allocated(K)) {
|
||||
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type);
|
||||
to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream);
|
||||
|
||||
nb11 = nb11*bs*sizeof(half)/ts;
|
||||
nb12 = nb12*bs*sizeof(half)/ts;
|
||||
nb13 = nb13*bs*sizeof(half)/ts;
|
||||
} else {
|
||||
GGML_ASSERT(K->nb[0] == ts);
|
||||
to_fp16_nc_cuda_t to_fp16 = ggml_get_to_fp16_nc_cuda(K->type);
|
||||
const int64_t s01 = nb11 / ts;
|
||||
const int64_t s02 = nb12 / ts;
|
||||
const int64_t s03 = nb13 / ts;
|
||||
to_fp16(K_data, K_f16.ptr, K->ne[0], K->ne[1], K->ne[2], K->ne[3], s01, s02, s03, main_stream);
|
||||
|
||||
nb11 = K->ne[0] * sizeof(half);
|
||||
nb12 = K->ne[1] * nb11;
|
||||
nb13 = K->ne[2] * nb12;
|
||||
}
|
||||
K_data = (char *) K_f16.ptr;
|
||||
}
|
||||
|
||||
if (V && need_f16_V && V->type != GGML_TYPE_F16) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(V));
|
||||
V_f16.alloc(ggml_nelements(V));
|
||||
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
|
||||
to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream);
|
||||
V_data = (char *) V_f16.ptr;
|
||||
|
||||
const size_t bs = ggml_blck_size(V->type);
|
||||
const size_t ts = ggml_type_size(V->type);
|
||||
|
||||
nb21 = nb21*bs*sizeof(half)/ts;
|
||||
nb22 = nb22*bs*sizeof(half)/ts;
|
||||
nb23 = nb23*bs*sizeof(half)/ts;
|
||||
V_f16.alloc(ggml_nelements(V));
|
||||
if (ggml_is_contiguously_allocated(V)) {
|
||||
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
|
||||
to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream);
|
||||
V_data = (char *) V_f16.ptr;
|
||||
|
||||
nb21 = nb21*bs*sizeof(half)/ts;
|
||||
nb22 = nb22*bs*sizeof(half)/ts;
|
||||
nb23 = nb23*bs*sizeof(half)/ts;
|
||||
} else {
|
||||
GGML_ASSERT(V->nb[0] == ts);
|
||||
to_fp16_nc_cuda_t to_fp16 = ggml_get_to_fp16_nc_cuda(V->type);
|
||||
const int64_t s01 = nb21 / ts;
|
||||
const int64_t s02 = nb22 / ts;
|
||||
const int64_t s03 = nb23 / ts;
|
||||
to_fp16(V_data, V_f16.ptr, V->ne[0], V->ne[1], V->ne[2], V->ne[3], s01, s02, s03, main_stream);
|
||||
|
||||
nb21 = V->ne[0] * sizeof(half);
|
||||
nb22 = V->ne[1] * nb21;
|
||||
nb23 = V->ne[2] * nb22;
|
||||
}
|
||||
V_data = (char *) V_f16.ptr;
|
||||
}
|
||||
|
||||
int parallel_blocks = 1;
|
||||
@@ -867,14 +872,11 @@ void launch_fattn(
|
||||
mask ? ((const char *) mask->data) : nullptr,
|
||||
!stream_k && parallel_blocks > 1 ? dst_tmp.ptr : (float *) KQV->data, dst_tmp_meta.ptr,
|
||||
scale, max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
||||
mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0, mask ? mask->ne[3] : 0,
|
||||
mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0, mask ? mask->nb[3] : 0,
|
||||
Q->nb[1], Q->nb[2], Q->nb[3],
|
||||
nb11, nb12, nb13,
|
||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3], Q->nb[1], Q->nb[2], Q->nb[3],
|
||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3], nb11, nb12, nb13,
|
||||
nb21, nb22, nb23,
|
||||
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
|
||||
mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0, mask ? mask->ne[3] : 0,
|
||||
mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0, mask ? mask->nb[3] : 0
|
||||
);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
|
||||
@@ -408,7 +408,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
const int stride_K,
|
||||
const int stride_V,
|
||||
const int stride_mask,
|
||||
const int jt,
|
||||
half2 * const __restrict__ tile_Q,
|
||||
half2 * const __restrict__ tile_K,
|
||||
half2 * const __restrict__ tile_V,
|
||||
@@ -455,7 +454,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
cp_async_wait_all();
|
||||
__syncthreads();
|
||||
flash_attn_ext_f16_load_tile<stride_tile_V, nwarps, c::nbatch_fa, use_cp_async>
|
||||
(V_h2 + k_VKQ_0*stride_V, tile_V, nbatch_V2, stride_V);
|
||||
(V_h2 + int64_t(k_VKQ_0)*stride_V, tile_V, nbatch_V2, stride_V);
|
||||
} else {
|
||||
constexpr bool use_cp_async = nstages == 1;
|
||||
if (ncols2 > 1 || mask_h2) {
|
||||
@@ -471,7 +470,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
if (nstages <= 1) {
|
||||
constexpr bool use_cp_async = nstages == 1;
|
||||
flash_attn_ext_f16_load_tile<stride_tile_K, nwarps, c::nbatch_fa, use_cp_async>
|
||||
(K_h2 + k_VKQ_0*stride_K + k0_start, tile_K, k0_diff, stride_K);
|
||||
(K_h2 + int64_t(k_VKQ_0)*stride_K + k0_start, tile_K, k0_diff, stride_K);
|
||||
if (use_cp_async) {
|
||||
cp_async_wait_all();
|
||||
}
|
||||
@@ -715,7 +714,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
(mask_h2 + (k_VKQ_0 + c::nbatch_fa)/2, tile_mask, stride_mask);
|
||||
}
|
||||
flash_attn_ext_f16_load_tile<stride_tile_K, nwarps, c::nbatch_fa, use_cp_async>
|
||||
(K_h2 + (k_VKQ_0 + c::nbatch_fa)*stride_K, tile_K, nbatch_K2, stride_K);
|
||||
(K_h2 + int64_t(k_VKQ_0 + c::nbatch_fa)*stride_K, tile_K, nbatch_K2, stride_K);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -732,7 +731,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
if (nstages <= 1 && i0_start < reusable_cutoff) {
|
||||
constexpr bool use_cp_async = nstages == 1;
|
||||
flash_attn_ext_f16_load_tile<stride_tile_V, nwarps, c::nbatch_fa, use_cp_async>
|
||||
(V_h2 + k_VKQ_0*stride_V + i0_start/2, tile_V, i0_diff/2, stride_V);
|
||||
(V_h2 + int64_t(k_VKQ_0)*stride_V + i0_start/2, tile_V, i0_diff/2, stride_V);
|
||||
if (use_cp_async) {
|
||||
cp_async_wait_all();
|
||||
}
|
||||
@@ -771,8 +770,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
|
||||
GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
|
||||
GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_K); GGML_UNUSED(stride_V);
|
||||
GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
|
||||
GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
|
||||
GGML_UNUSED(stride_mask); GGML_UNUSED(tile_K);
|
||||
GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B);
|
||||
GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
|
||||
GGML_UNUSED(kb0); GGML_UNUSED(tile_Q);
|
||||
@@ -920,7 +918,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
||||
(mask_h2 + kb0_start*c::nbatch_fa/2, tile_mask, stride_mask);
|
||||
}
|
||||
flash_attn_ext_f16_load_tile<stride_tile_K, nwarps, c::nbatch_fa, use_cp_async>
|
||||
(K_h2 + kb0_start*c::nbatch_fa*stride_K, tile_K, nbatch_K2, stride_K);
|
||||
(K_h2 + int64_t(kb0_start)*c::nbatch_fa*stride_K, tile_K, nbatch_K2, stride_K);
|
||||
}
|
||||
|
||||
// Iterate over ne11 == previous tokens:
|
||||
@@ -928,13 +926,13 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
||||
constexpr bool last_iter = false;
|
||||
flash_attn_ext_f16_iter<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter>
|
||||
(Q_f2, K_h2, V_h2, mask_h2, dstk, dstk_fixup, scale, slope, logit_softcap,
|
||||
ne01, ne02, stride_K, stride_V, stride_mask, jt, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0);
|
||||
ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0);
|
||||
}
|
||||
{ // kb0_start is always < kb0_stop so the last iter can be executed unconditionally.
|
||||
constexpr bool last_iter = true;
|
||||
flash_attn_ext_f16_iter<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup, last_iter>
|
||||
(Q_f2, K_h2, V_h2, mask_h2, dstk, dstk_fixup, scale, slope, logit_softcap,
|
||||
ne01, ne02, stride_K, stride_V, stride_mask, jt, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0_stop-1);
|
||||
ne01, ne02, stride_K, stride_V, stride_mask, tile_Q, tile_K, tile_V, tile_mask, Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0_stop-1);
|
||||
}
|
||||
|
||||
// With multi-stage loading there is no __syncthreads at the end of the iter,
|
||||
@@ -1214,33 +1212,13 @@ static __global__ void flash_attn_ext_f16(
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int ne33,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb33,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int nb21,
|
||||
const int nb22,
|
||||
const int nb23,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#if defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
@@ -1352,15 +1330,16 @@ static __global__ void flash_attn_ext_f16(
|
||||
ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
|
||||
#else
|
||||
GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
|
||||
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
|
||||
GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
|
||||
GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
|
||||
GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
|
||||
GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
|
||||
GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
|
||||
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
@@ -21,33 +21,13 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int ne33,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb33,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int nb21,
|
||||
const int nb22,
|
||||
const int nb23,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#if defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
@@ -127,7 +107,7 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
KV_tmp[i_KQ][k_KQ] = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
|
||||
KV_tmp[i_KQ][k_KQ] = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -221,7 +201,7 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
KV_tmp[k][i] = V_h2[(k_VKQ_0 + k)*stride_KV2 + i];
|
||||
KV_tmp[k][i] = V_h2[int64_t(k_VKQ_0 + k)*stride_KV2 + i];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -300,8 +280,7 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
|
||||
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
|
||||
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
GGML_UNUSED(nb23);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
|
||||
}
|
||||
|
||||
@@ -21,33 +21,13 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int ne33,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb33,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int nb21,
|
||||
const int nb22,
|
||||
const int nb23,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#ifdef FLASH_ATTN_AVAILABLE
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
@@ -57,17 +37,16 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
#endif // FP16_MMA_AVAILABLE
|
||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||
GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
|
||||
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
|
||||
GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
|
||||
GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
|
||||
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
|
||||
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
|
||||
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
|
||||
GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
|
||||
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
@@ -135,7 +114,7 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 2*WARP_SIZE) {
|
||||
const half2 tmp = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + threadIdx.x];
|
||||
const half2 tmp = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + threadIdx.x];
|
||||
KV_tmp[i_KQ][k_KQ_0 + 0*WARP_SIZE + threadIdx.x] = __low2float(tmp);
|
||||
KV_tmp[i_KQ][k_KQ_0 + 1*WARP_SIZE + threadIdx.x] = __high2float(tmp);
|
||||
}
|
||||
@@ -231,8 +210,9 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
KV_tmp2[k*(D/2) + i].x = __low2float(V_h2[(k_VKQ_0 + k)*stride_KV2 + i]);
|
||||
KV_tmp2[k*(D/2) + i].y = __high2float(V_h2[(k_VKQ_0 + k)*stride_KV2 + i]);
|
||||
const half2 tmp = V_h2[int64_t(k_VKQ_0 + k)*stride_KV2 + i];
|
||||
KV_tmp2[k*(D/2) + i].x = __low2float(tmp);
|
||||
KV_tmp2[k*(D/2) + i].y = __high2float(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -302,17 +282,16 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
|
||||
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
|
||||
GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
|
||||
GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
|
||||
GGML_UNUSED(ne31); GGML_UNUSED(ne32);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32);
|
||||
GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
|
||||
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
|
||||
@@ -18,33 +18,13 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int ne33,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb33,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int nb21,
|
||||
const int nb22,
|
||||
const int nb23,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#if defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
@@ -191,13 +171,16 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
|
||||
half2 VKQ[ncols] = {{0.0f, 0.0f}};
|
||||
|
||||
K += blockIdx.y*D * nb11;
|
||||
V += blockIdx.y*D * nb21;
|
||||
maskh += blockIdx.y*D;
|
||||
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
if (mask) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
maskh_shared[j*D + tid] = slopeh*maskh[j*ne11 + k_VKQ_0 + tid];
|
||||
maskh_shared[j*D + tid] = slopeh*maskh[j*ne11 + tid];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
@@ -244,7 +227,7 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
half sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_h2[j], Q_i32[j], Q_ds[j]);
|
||||
half sum = vec_dot_KQ(K + i_KQ*nb11, Q_h2[j], Q_i32[j], Q_ds[j]);
|
||||
sum = warp_reduce_sum((float)sum);
|
||||
|
||||
if (use_logit_softcap) {
|
||||
@@ -300,14 +283,18 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
}
|
||||
|
||||
half2 V_k;
|
||||
reinterpret_cast<half&>(V_k.x) = dequantize_1_v(V + (k_VKQ_0 + k0 + 0)*nb21, tid);
|
||||
reinterpret_cast<half&>(V_k.y) = dequantize_1_v(V + (k_VKQ_0 + k0 + 1)*nb21, tid);
|
||||
reinterpret_cast<half&>(V_k.x) = dequantize_1_v(V + (k0 + 0)*nb21, tid);
|
||||
reinterpret_cast<half&>(V_k.y) = dequantize_1_v(V + (k0 + 1)*nb21, tid);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
|
||||
}
|
||||
}
|
||||
|
||||
K += gridDim.y*D * nb11;
|
||||
V += gridDim.y*D * nb21;
|
||||
maskh += gridDim.y*D;
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
@@ -342,17 +329,16 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
|
||||
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
|
||||
GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
|
||||
GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
|
||||
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne32);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
|
||||
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
|
||||
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
|
||||
GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
|
||||
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
|
||||
}
|
||||
|
||||
@@ -18,33 +18,13 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int ne33,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb33,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int nb21,
|
||||
const int nb22,
|
||||
const int nb23,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#ifdef FLASH_ATTN_AVAILABLE
|
||||
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
@@ -59,8 +39,7 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
|
||||
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
|
||||
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
GGML_UNUSED(nb23);
|
||||
NO_DEVICE_CODE;
|
||||
return;
|
||||
}
|
||||
@@ -198,13 +177,16 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
|
||||
float VKQ[ncols] = {0.0f};
|
||||
|
||||
K += blockIdx.y*D * nb11;
|
||||
V += blockIdx.y*D * nb21;
|
||||
maskh += blockIdx.y*D;
|
||||
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
if (mask) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
maskf_shared[j*D + tid] = slope*__half2float(maskh[j*ne11 + k_VKQ_0 + tid]);
|
||||
maskf_shared[j*D + tid] = slope*__half2float(maskh[j*ne11 + tid]);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
@@ -246,7 +228,7 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
float sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_f2[j], Q_i32[j], Q_ds[j]);
|
||||
float sum = vec_dot_KQ(K + i_KQ*nb11, Q_f2[j], Q_i32[j], Q_ds[j]);
|
||||
sum = warp_reduce_sum(sum);
|
||||
|
||||
if (use_logit_softcap) {
|
||||
@@ -297,13 +279,17 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
break;
|
||||
}
|
||||
|
||||
const float V_ki = dequantize_1_v(V + (k_VKQ_0 + k)*nb21, tid);
|
||||
const float V_ki = dequantize_1_v(V + k*nb21, tid);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
VKQ[j] += V_ki*KQ[j*D + k];
|
||||
}
|
||||
}
|
||||
|
||||
K += gridDim.y*D * nb11;
|
||||
V += gridDim.y*D * nb21;
|
||||
maskh += gridDim.y*D;
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
@@ -348,7 +334,6 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
|
||||
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
|
||||
@@ -37,33 +37,13 @@ static __global__ void flash_attn_ext_f16(
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const float logit_softcap,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int ne32,
|
||||
const int ne33,
|
||||
const int nb31,
|
||||
const int nb32,
|
||||
const int nb33,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int nb21,
|
||||
const int nb22,
|
||||
const int nb23,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
||||
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
||||
#if defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
|
||||
// Skip unused kernel variants for faster compilation:
|
||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||
@@ -197,7 +177,7 @@ static __global__ void flash_attn_ext_f16(
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 16) {
|
||||
frag_a_K K_a;
|
||||
wmma::load_matrix_sync(K_a, K_h + (k_VKQ_0 + i_KQ_0 + frag_m*threadIdx.y)*stride_KV + k_KQ_0, stride_KV);
|
||||
wmma::load_matrix_sync(K_a, K_h + int64_t(k_VKQ_0 + i_KQ_0 + frag_m*threadIdx.y)*stride_KV + k_KQ_0, stride_KV);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols/frag_n; ++j) {
|
||||
wmma::mma_sync(KQ_c[j], K_a, Q_b[k_KQ_0/16][j], KQ_c[j]);
|
||||
@@ -344,7 +324,7 @@ static __global__ void flash_attn_ext_f16(
|
||||
const int k = k0 + (threadIdx.y % VKQ_ratio)*16;
|
||||
|
||||
frag_a_V v_a;
|
||||
wmma::load_matrix_sync(v_a, V_h + (k_VKQ_0 + k)*stride_KV + i_VKQ_0 + frag_m*(threadIdx.y/VKQ_ratio), stride_KV);
|
||||
wmma::load_matrix_sync(v_a, V_h + int64_t(k_VKQ_0 + k)*stride_KV + i_VKQ_0 + frag_m*(threadIdx.y/VKQ_ratio), stride_KV);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols/frag_n; ++j) {
|
||||
wmma::mma_sync(VKQ_c[i_VKQ_0/VKQ_stride][j], v_a, KQ_b[k0/(VKQ_ratio*16)][j], VKQ_c[i_VKQ_0/VKQ_stride][j]);
|
||||
@@ -451,7 +431,6 @@ static __global__ void flash_attn_ext_f16(
|
||||
GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
|
||||
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
|
||||
}
|
||||
|
||||
@@ -280,22 +280,12 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
|
||||
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
|
||||
|
||||
if (GGML_CUDA_CC_IS_AMD(cc)) {
|
||||
#if defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
if (fp16_mma_available(cc)) {
|
||||
ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
|
||||
return;
|
||||
}
|
||||
#endif // defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
|
||||
// On AMD the tile kernels perform poorly, use the vec kernel instead:
|
||||
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
|
||||
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
|
||||
} else {
|
||||
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_AMD(cc) && fp16_mma_available(cc)) {
|
||||
ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
|
||||
return;
|
||||
}
|
||||
#endif // defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
|
||||
if (!fast_fp16_available(cc)) {
|
||||
if (Q->ne[1] <= 8 || Q->ne[0] == 256) {
|
||||
|
||||
@@ -12,7 +12,8 @@
|
||||
// The methods get_i and get_j can be used to get the physical 32 bit index of the lth element of a thread within a tile.
|
||||
// All matrix tiles have ne physical 32 bit elements per warp.
|
||||
//
|
||||
// As described in the documentation, all pointers for load_ldmatrix must be to shared memory and aligned to 16 bytes.
|
||||
// As described in the PTX documentation, all pointers for load_ldmatrix must be to shared memory and aligned to 16 bytes.
|
||||
// The API in this file also assumes that the pointers for load_generic are aligned to 16 bytes, unaligned pointers are considered undefined behavior.
|
||||
|
||||
#include "common.cuh"
|
||||
|
||||
@@ -66,7 +67,44 @@ namespace ggml_cuda_mma {
|
||||
struct tile {
|
||||
static constexpr int I = I_;
|
||||
static constexpr int J = J_;
|
||||
static constexpr int ne = I * J / WARP_SIZE;
|
||||
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
static constexpr int ne = I * J / 64;
|
||||
T x[ne] = {0};
|
||||
|
||||
static __device__ __forceinline__ int get_i(const int l) {
|
||||
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
|
||||
return threadIdx.x % 16;
|
||||
} else if constexpr (I == 16 && J == 8) {
|
||||
return threadIdx.x % 16;
|
||||
} else if constexpr (I == 32 && J == 4) {
|
||||
return threadIdx.x % 32;
|
||||
} else if constexpr (I == 16 && J == 16) {
|
||||
return 4 * (threadIdx.x / 16) + l;
|
||||
} else if constexpr (I == 32 && J == 32) {
|
||||
return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4);
|
||||
} else {
|
||||
static_assert(I == -1 && J == -1, "template specialization not implemented");
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int get_j(const int l) {
|
||||
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
|
||||
return (2 * ((threadIdx.x / 16) % 2) + l);
|
||||
} else if constexpr (I == 16 && J == 8) {
|
||||
return 2 * (threadIdx.x / 16) + l;
|
||||
} else if constexpr (I == 32 && J == 4) {
|
||||
return 2 * (threadIdx.x / 32) + l;
|
||||
} else if constexpr (I == 16 && J == 16) {
|
||||
return threadIdx.x % 16;
|
||||
} else if constexpr (I == 32 && J == 32) {
|
||||
return threadIdx.x % 32;
|
||||
} else {
|
||||
static_assert(I == -1 && J == -1, "template specialization not implemented");
|
||||
}
|
||||
}
|
||||
#else
|
||||
static constexpr int ne = I * J / 32;
|
||||
T x[ne] = {0};
|
||||
|
||||
static __device__ __forceinline__ int get_i(const int l) {
|
||||
@@ -94,6 +132,7 @@ namespace ggml_cuda_mma {
|
||||
static_assert(I == -1 && J == -1, "template specialization not implemented");
|
||||
}
|
||||
}
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
};
|
||||
|
||||
template <int I_, int J_>
|
||||
@@ -148,10 +187,23 @@ namespace ggml_cuda_mma {
|
||||
|
||||
template <int I, int J, typename T>
|
||||
static __device__ __forceinline__ void load_generic(tile<I, J, T> & t, const T * __restrict__ xs0, const int stride) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
|
||||
#pragma unroll
|
||||
for (int l = 0; l < t.ne; ++l) {
|
||||
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
|
||||
}
|
||||
} else {
|
||||
int64_t * xi = (int64_t *) t.x;
|
||||
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
|
||||
xi[0] = xs[0];
|
||||
}
|
||||
#else
|
||||
#pragma unroll
|
||||
for (int l = 0; l < t.ne; ++l) {
|
||||
t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
|
||||
}
|
||||
#endif // defined(AMD_MFMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -186,7 +238,7 @@ namespace ggml_cuda_mma {
|
||||
template <typename T>
|
||||
static __device__ __forceinline__ void load_ldmatrix(
|
||||
tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
|
||||
#ifdef NEW_MMA_AVAILABLE
|
||||
#if defined(NEW_MMA_AVAILABLE)
|
||||
int * xi = (int * ) t.x;
|
||||
const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
|
||||
asm volatile("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
|
||||
@@ -393,4 +445,60 @@ namespace ggml_cuda_mma {
|
||||
NO_DEVICE_CODE;
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<16, 16, int> & D, const tile<16, 8, int> & A, const tile<16, 8, int> & B) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
|
||||
int32x4_t * acc = (int32x4_t *) D.x;
|
||||
#if defined(CDNA3)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0],
|
||||
((int64_t *) B.x)[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
#elif defined(CDNA2) || defined(CDNA)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0],
|
||||
B.x[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[1],
|
||||
B.x[1],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
#endif // defined(CDNA3)
|
||||
#else
|
||||
GGML_UNUSED(D);
|
||||
GGML_UNUSED(A);
|
||||
GGML_UNUSED(B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // AMD_MFMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
tile<32, 32, int> & D, const tile<32, 4, int> & A, const tile<32, 4, int> & B) {
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
using int32x16_t = __attribute__((__vector_size__(16 * sizeof(int)))) int;
|
||||
int32x16_t * acc = (int32x16_t *) D.x;
|
||||
#if defined(CDNA3)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0],
|
||||
((int64_t *) B.x)[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
#elif defined(CDNA2) || defined(CDNA)
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0],
|
||||
B.x[0],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[1],
|
||||
B.x[1],
|
||||
acc[0],
|
||||
0, 0, 0);
|
||||
#endif // defined(CDNA3)
|
||||
#else
|
||||
GGML_UNUSED(D);
|
||||
GGML_UNUSED(A);
|
||||
GGML_UNUSED(B);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // AMD_MFMA_AVAILABLE
|
||||
}
|
||||
}
|
||||
|
||||
@@ -109,7 +109,8 @@ void ggml_cuda_mul_mat_q(
|
||||
const int64_t s03 = src0->nb[3] / ts_src0;
|
||||
const int64_t s3 = dst->nb[3] / ts_dst;
|
||||
|
||||
const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
|
||||
const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
|
||||
|| (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)));
|
||||
|
||||
if (!ids) {
|
||||
const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
|
||||
@@ -250,8 +251,9 @@ void ggml_cuda_op_mul_mat_q(
|
||||
// The stream-k decomposition is only faster for recent NVIDIA GPUs.
|
||||
// Also its fixup needs to allocate a temporary buffer in the memory pool.
|
||||
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
|
||||
const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) &&
|
||||
ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
|
||||
const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
|
||||
|| (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)))
|
||||
&& src1_ncols == ne11;
|
||||
const mmq_args args = {
|
||||
src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
|
||||
ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst,
|
||||
@@ -304,7 +306,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (new_mma_available(cc)) {
|
||||
if (new_mma_available(cc) || amd_mfma_available(cc)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -44,6 +44,9 @@ static __global__ void k_set_rows_quant(
|
||||
block_type * dst_block = dst_row_ptr + i00 / qk;
|
||||
|
||||
quantize_func(src_block, dst_block);
|
||||
|
||||
GGML_UNUSED(ne10);
|
||||
GGML_UNUSED(ne13);
|
||||
}
|
||||
|
||||
// Template dispatch function for quantized set_rows
|
||||
|
||||
14
ggml/src/ggml-cuda/vendors/hip.h
vendored
14
ggml/src/ggml-cuda/vendors/hip.h
vendored
@@ -160,7 +160,19 @@
|
||||
#endif
|
||||
|
||||
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
|
||||
#define CDNA
|
||||
#define CDNA // For the entire family
|
||||
#endif
|
||||
|
||||
#if defined(__gfx942__)
|
||||
#define CDNA3
|
||||
#endif
|
||||
|
||||
#if defined(__gfx90a__)
|
||||
#define CDNA2
|
||||
#endif
|
||||
|
||||
#if defined(__gfx908__)
|
||||
#define CDNA1
|
||||
#endif
|
||||
|
||||
#if defined(__GFX12__)
|
||||
|
||||
4
ggml/src/ggml-cuda/vendors/musa.h
vendored
4
ggml/src/ggml-cuda/vendors/musa.h
vendored
@@ -13,7 +13,7 @@
|
||||
#define CUBLAS_OP_N MUBLAS_OP_N
|
||||
#define CUBLAS_OP_T MUBLAS_OP_T
|
||||
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
|
||||
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
|
||||
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH
|
||||
#define CUDA_R_16F MUSA_R_16F
|
||||
#define CUDA_R_16BF MUSA_R_16BF
|
||||
#define CUDA_R_32F MUSA_R_32F
|
||||
@@ -29,7 +29,7 @@
|
||||
#define cublasSgemm mublasSgemm
|
||||
#define cublasStatus_t mublasStatus_t
|
||||
#define cublasOperation_t mublasOperation_t
|
||||
#define cublasGetStatusString mublasStatus_to_string
|
||||
#define cublasGetStatusString mublasGetStatusString
|
||||
#define cudaDataType_t musaDataType_t
|
||||
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
|
||||
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
|
||||
|
||||
@@ -528,6 +528,7 @@ typedef struct {
|
||||
int64_t n_group;
|
||||
int64_t n_seq_tokens;
|
||||
int64_t n_seqs;
|
||||
int64_t s_off;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
uint64_t nb03;
|
||||
|
||||
@@ -1955,6 +1955,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
static int ggml_metal_encode_node(
|
||||
ggml_backend_t backend,
|
||||
int idx,
|
||||
int idx_end,
|
||||
id<MTLComputeCommandEncoder> encoder,
|
||||
struct ggml_metal_mem_pool * mem_pool) {
|
||||
struct ggml_backend_metal_context * ctx = backend->context;
|
||||
@@ -2181,7 +2182,9 @@ static int ggml_metal_encode_node(
|
||||
size_t offs_fuse;
|
||||
id<MTLBuffer> id_fuse;
|
||||
|
||||
for (n_fuse = 0; n_fuse <= 6; ++n_fuse) {
|
||||
// note: in metal, we sometimes encode the graph in parallel so we have to avoid fusing nodes
|
||||
// across splits. idx_end indicates the last node in the current split
|
||||
for (n_fuse = 0; n_fuse <= 6 && idx + n_fuse + 1 < idx_end; ++n_fuse) {
|
||||
if (!ggml_can_fuse(gf, idx + n_fuse, ops + n_fuse, 2)) {
|
||||
break;
|
||||
}
|
||||
@@ -3138,6 +3141,7 @@ static int ggml_metal_encode_node(
|
||||
/*.n_group =*/ n_group,
|
||||
/*.n_seq_tokens =*/ n_seq_tokens,
|
||||
/*.n_seqs =*/ n_seqs,
|
||||
/*.s_off =*/ ggml_nelements(src1) * sizeof(float),
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
@@ -3166,12 +3170,22 @@ static int ggml_metal_encode_node(
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:7];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:8];
|
||||
|
||||
// One shared memory bucket for each simd group in the threadgroup
|
||||
// NOTE: Metal kernels require the buffer size to be multiple of 16 bytes
|
||||
// https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength
|
||||
if (d_state >= 32) {
|
||||
GGML_ASSERT((int64_t)(d_state / 32) <= 32);
|
||||
const int64_t shmem_size = 32;
|
||||
GGML_ASSERT(d_state <= (int64_t)pipeline.maxTotalThreadsPerThreadgroup);
|
||||
[encoder setThreadgroupMemoryLength:(shmem_size)*sizeof(float) atIndex:0];
|
||||
}
|
||||
|
||||
if (ne30 == 1) {
|
||||
// Mamba-2
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_head, n_seqs) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_head, n_seqs) threadsPerThreadgroup:MTLSizeMake(d_state, 1, 1)];
|
||||
} else {
|
||||
GGML_ASSERT(d_inner == 1);
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n_head, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n_head, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(d_state, 1, 1)];
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
@@ -4288,7 +4302,7 @@ static int ggml_metal_encode_node(
|
||||
ops[1] = GGML_OP_MUL;
|
||||
ops[2] = GGML_OP_ADD;
|
||||
|
||||
for (n_fuse = 0; n_fuse <= 1; ++n_fuse) {
|
||||
for (n_fuse = 0; n_fuse <= 1 && idx + n_fuse + 1 < idx_end; ++n_fuse) {
|
||||
if (!ggml_can_fuse(gf, idx + n_fuse, ops + n_fuse, 2)) {
|
||||
break;
|
||||
}
|
||||
@@ -6271,7 +6285,11 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
|
||||
[encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
|
||||
}
|
||||
|
||||
const int res = ggml_metal_encode_node(backend, idx, encoder, mem_pool);
|
||||
const int res = ggml_metal_encode_node(backend, idx, node_end, encoder, mem_pool);
|
||||
if (idx + res > node_end) {
|
||||
GGML_ABORT("fusion error: nodes spanning multiple encoders have been fused. this indicates a bug in the fusion logic %s",
|
||||
"https://github.com/ggml-org/llama.cpp/pull/14849");
|
||||
}
|
||||
|
||||
if (should_capture) {
|
||||
[encoder popDebugGroup];
|
||||
|
||||
@@ -1823,10 +1823,16 @@ kernel void kernel_ssm_scan_f32(
|
||||
device const void * src5,
|
||||
device const void * src6,
|
||||
device float * dst,
|
||||
threadgroup float * shared [[threadgroup(0)]],
|
||||
constant ggml_metal_kargs_ssm_scan & args,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort sgptg[[simdgroups_per_threadgroup]],
|
||||
uint3 tgpg[[threadgroups_per_grid]]) {
|
||||
|
||||
const int64_t i0 = tpitg.x;
|
||||
const int64_t i1 = 0;
|
||||
const int64_t ir = tgpig.x; // current head
|
||||
const int64_t i3 = tgpig.y; // current seq
|
||||
@@ -1841,41 +1847,88 @@ kernel void kernel_ssm_scan_f32(
|
||||
const int64_t ng = args.n_group;
|
||||
const int64_t n_t = args.n_seq_tokens;
|
||||
|
||||
const int64_t s_off = nr * nh * n_t * args.n_seqs * sizeof(float);
|
||||
const int64_t s_off = args.s_off;
|
||||
|
||||
device const int32_t * ids = (device const int32_t *) src6;
|
||||
|
||||
device const float * s0 = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03);
|
||||
device float * s = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off);
|
||||
device const float * s0_buff = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03);
|
||||
device float * s_buff = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off);
|
||||
const int64_t i = i0 + i1*nc;
|
||||
float s0 = s0_buff[i];
|
||||
float s = s_buff[i];
|
||||
|
||||
device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31);
|
||||
device const float * x_block = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i3*args.nb13);
|
||||
device const float * dt_block = (device const float *) ((device const char *) src2 + ir*nb20 + i3*args.nb22);
|
||||
device const float * B_block = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i3*args.nb43);
|
||||
device const float * C_block = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i3*args.nb53);
|
||||
device float * y_block = (device float *) ((device char *) dst + (i1 + ir*(nr) + i3*(n_t*nh*nr))*nb00);
|
||||
|
||||
for (int64_t i2 = 0; i2 < n_t; ++i2) {
|
||||
device const float * x = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i2*args.nb12 + i3*args.nb13); // {dim, nh, nt, ns}
|
||||
device const float * dt = (device const float *) ((device const char *) src2 + ir*nb20 + i2*args.nb21 + i3*args.nb22); // {nh, nt, ns}
|
||||
device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31); // {d_state, nh}
|
||||
device const float * B = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i2*args.nb42 + i3*args.nb43); // {d_state, ng, nt, ns}
|
||||
device const float * C = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i2*args.nb52 + i3*args.nb53); // {d_state, ng, nt, ns}
|
||||
device float * y = (device float *) ((device char *) dst + (i1 + ir*(nr) + i2*(nh*nr) + i3*(n_t*nh*nr))*nb00); // {dim, nh, nt, ns}
|
||||
device const float * x = (device const float *) ((device const char *) x_block + i2*args.nb12); // {dim, nh, nt, ns}
|
||||
device const float * dt = (device const float *) ((device const char *) dt_block + i2*args.nb21); // {nh, nt, ns}
|
||||
device const float * B = (device const float *) ((device const char *) B_block + i2*args.nb42); // {d_state, ng, nt, ns}
|
||||
device const float * C = (device const float *) ((device const char *) C_block + i2*args.nb52); // {d_state, ng, nt, ns}
|
||||
device float * y = (device float *) ((device char *) y_block + i2*(nh*nr*nb00)); // {dim, nh, nt, ns}
|
||||
|
||||
const float dt_soft_plus = dt[0] <= 20.0f ? log(1.0f + exp(dt[0])) : dt[0];
|
||||
const float x_dt = x[0] * dt_soft_plus;
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int64_t i0 = 0; i0 < nc; ++i0) {
|
||||
const int64_t i = i0 + i1*nc;
|
||||
const float state = (s0[i] * exp(dt_soft_plus * A[i0])) + (B[i0] * x_dt);
|
||||
sumf += state * C[i0];
|
||||
s[i] = state;
|
||||
const float state = (s0 * exp(dt_soft_plus * A[i0])) + (B[i0] * x_dt);
|
||||
s = state;
|
||||
|
||||
// Parallel sum: This relies on the fact that this kernel will be
|
||||
// dispatched with each threadgroup having (d_state, 1, 1) threads which
|
||||
// are subdivided into SIMD groups of size `sgptg`. The goal is to
|
||||
// compute y = sum({state * C[i] for i in range(d_state)}).
|
||||
// To parallelize this effectively, we first use simd_sum over each SIMD
|
||||
// group to compute the sum of each SIMD group, then place the result in
|
||||
// the SIMD group's indexed bucket in the shared memory. We then sum
|
||||
// over the individual group sums to compute the final sum.
|
||||
|
||||
// Computed for each thread
|
||||
float sumf = state * C[i0];
|
||||
|
||||
// Sum the threads in the simd group => simd sum
|
||||
sumf = simd_sum(sumf);
|
||||
|
||||
if (sgptg > 1) {
|
||||
|
||||
// Once per simd group, place the group sum into the shared buffer
|
||||
if (tiisg == 0) {
|
||||
shared[sgitg] = sumf;
|
||||
}
|
||||
|
||||
// Wait for all threads in the threadgroup to reach this point. This
|
||||
// ensures that all elements of the shared buffer are populated with the
|
||||
// sum of the individual simd groups.
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// For simd group 0 at indices < num simd groups, extract the shared
|
||||
// simd sum
|
||||
sumf = 0.0f;
|
||||
if (sgitg == 0) {
|
||||
if (tiisg < sgptg) {
|
||||
sumf = shared[tiisg];
|
||||
}
|
||||
sumf = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
y[0] = sumf;
|
||||
}
|
||||
}
|
||||
} else if (tiisg == 0) {
|
||||
y[0] = sumf;
|
||||
}
|
||||
|
||||
y[0] = sumf;
|
||||
|
||||
// recurse
|
||||
s0 = s;
|
||||
}
|
||||
|
||||
// Assign the final state to the output buffer
|
||||
s_buff[i] = s;
|
||||
}
|
||||
|
||||
// ref: ggml.c:ggml_compute_forward_ssm_scan_f32, Mamba-2 part
|
||||
// TODO: optimize (e.g. by parallelizing over d_state)
|
||||
kernel void kernel_ssm_scan_f32_group(
|
||||
device const void * src0,
|
||||
device const void * src1,
|
||||
@@ -1885,10 +1938,16 @@ kernel void kernel_ssm_scan_f32_group(
|
||||
device const void * src5,
|
||||
device const void * src6,
|
||||
device float * dst,
|
||||
threadgroup float * shared [[threadgroup(0)]],
|
||||
constant ggml_metal_kargs_ssm_scan & args,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort sgptg[[simdgroups_per_threadgroup]],
|
||||
uint3 tgpg[[threadgroups_per_grid]]) {
|
||||
|
||||
const int64_t i0 = tpitg.x;
|
||||
const int64_t i1 = tgpig.x;
|
||||
const int64_t ir = tgpig.y; // current head
|
||||
const int64_t i3 = tgpig.z; // current seq
|
||||
@@ -1903,38 +1962,81 @@ kernel void kernel_ssm_scan_f32_group(
|
||||
const int64_t ng = args.n_group;
|
||||
const int64_t n_t = args.n_seq_tokens;
|
||||
|
||||
const int64_t s_off = nr * nh * n_t * args.n_seqs * sizeof(float);
|
||||
const int64_t s_off = args.s_off;
|
||||
|
||||
device const int32_t * ids = (device const int32_t *) src6;
|
||||
|
||||
device const float * s0 = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03);
|
||||
device float * s = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off);
|
||||
device const float * s0_buff = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03);
|
||||
device float * s_buff = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off);
|
||||
const int64_t i = i0 + i1*nc;
|
||||
float s0 = s0_buff[i];
|
||||
float s = s_buff[i];
|
||||
|
||||
device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31); // {1, nh}
|
||||
device const float * x_block = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i3*args.nb13);
|
||||
device const float * dt_block = (device const float *) ((device const char *) src2 + ir*nb20 + i3*args.nb22);
|
||||
device const float * B_block = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i3*args.nb43);
|
||||
device const float * C_block = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i3*args.nb53);
|
||||
device float * y_block = (device float *) ((device char *) dst + (i1 + ir*(nr) + i3*(n_t*nh*nr))*nb00);
|
||||
|
||||
for (int64_t i2 = 0; i2 < n_t; ++i2) {
|
||||
device const float * x = (device const float *) ((device const char *) src1 + i1*nb10 + ir*args.nb11 + i2*args.nb12 + i3*args.nb13); // {dim, nh, nt, ns}
|
||||
device const float * dt = (device const float *) ((device const char *) src2 + ir*nb20 + i2*args.nb21 + i3*args.nb22); // {nh, nt, ns}
|
||||
device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31); // {1, nh}
|
||||
device const float * B = (device const float *) ((device const char *) src4 + (ir & (ng - 1))*args.nb41 + i2*args.nb42 + i3*args.nb43); // {d_state, ng, nt, ns}
|
||||
device const float * C = (device const float *) ((device const char *) src5 + (ir & (ng - 1))*args.nb51 + i2*args.nb52 + i3*args.nb53); // {d_state, ng, nt, ns}
|
||||
device float * y = (device float *) ((device char *) dst + (i1 + ir*(nr) + i2*(nh*nr) + i3*(n_t*nh*nr))*nb00); // {dim, nh, nt, ns}
|
||||
device const float * x = (device const float *) ((device const char *) x_block + i2*args.nb12); // {dim, nh, nt, ns}
|
||||
device const float * dt = (device const float *) ((device const char *) dt_block + i2*args.nb21); // {nh, nt, ns}
|
||||
device const float * B = (device const float *) ((device const char *) B_block + i2*args.nb42); // {d_state, ng, nt, ns}
|
||||
device const float * C = (device const float *) ((device const char *) C_block + i2*args.nb52); // {d_state, ng, nt, ns}
|
||||
device float * y = (device float *) ((device char *) y_block + i2*(nh*nr*nb00)); // {dim, nh, nt, ns}
|
||||
|
||||
const float dt_soft_plus = dt[0] <= 20.0f ? log(1.0f + exp(dt[0])) : dt[0];
|
||||
const float x_dt = x[0] * dt_soft_plus;
|
||||
const float dA = exp(dt_soft_plus * A[0]);
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int64_t i0 = 0; i0 < nc; ++i0) {
|
||||
const int64_t i = i0 + i1*nc;
|
||||
const float state = (s0[i] * dA) + (B[i0] * x_dt);
|
||||
sumf += state * C[i0];
|
||||
s[i] = state;
|
||||
const float state = (s0 * dA) + (B[i0] * x_dt);
|
||||
s = state;
|
||||
|
||||
// Parallel sum: This relies on the fact that this kernel will be
|
||||
// dispatched with each threadgroup having (d_state, 1, 1) threads which
|
||||
// are subdivided into SIMD groups of size `sgptg`. The goal is to
|
||||
// compute y = sum({state * C[i] for i in range(d_state)}).
|
||||
// To parallelize this effectively, we first use simd_sum over each SIMD
|
||||
// group to compute the sum of each SIMD group, then place the result in
|
||||
// the SIMD group's indexed bucket in the shared memory. We then sum
|
||||
// over the individual group sums to compute the final sum.
|
||||
|
||||
// Computed for each thread
|
||||
float sumf = state * C[i0];
|
||||
|
||||
// Sum the threads in the simd group => simd sum
|
||||
sumf = simd_sum(sumf);
|
||||
|
||||
// Once per simd group, place the group sum into the shared buffer
|
||||
if (tiisg == 0) {
|
||||
shared[sgitg] = sumf;
|
||||
}
|
||||
|
||||
y[0] = sumf;
|
||||
// Wait for all threads in the threadgroup to reach this point. This
|
||||
// ensures that all elements of the shared buffer are populated with the
|
||||
// sum of the individual simd groups.
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// For simd group 0 at indices < num simd groups, extract the shared
|
||||
// simd sum
|
||||
sumf = 0.0f;
|
||||
if (sgitg == 0) {
|
||||
if (tiisg < sgptg) {
|
||||
sumf = shared[tiisg];
|
||||
}
|
||||
sumf = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
y[0] = sumf;
|
||||
}
|
||||
}
|
||||
|
||||
// recurse
|
||||
s0 = s;
|
||||
}
|
||||
|
||||
// Assign the final state to the output buffer
|
||||
s_buff[i] = s;
|
||||
}
|
||||
|
||||
kernel void kernel_rwkv_wkv6_f32(
|
||||
|
||||
@@ -34,8 +34,12 @@ if (MUSAToolkit_FOUND)
|
||||
list(APPEND GGML_SOURCES_MUSA ${SRCS})
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu")
|
||||
list(APPEND GGML_SOURCES_MUSA ${SRCS})
|
||||
file(GLOB SRCS "../ggml-musa/*.cu")
|
||||
list(APPEND GGML_SOURCES_MUSA ${SRCS})
|
||||
|
||||
if (GGML_MUSA_MUDNN_COPY)
|
||||
file(GLOB SRCS "../ggml-musa/*.cu")
|
||||
list(APPEND GGML_SOURCES_MUSA ${SRCS})
|
||||
add_compile_definitions(GGML_MUSA_MUDNN_COPY)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_FA_ALL_QUANTS)
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
|
||||
@@ -72,6 +76,10 @@ if (MUSAToolkit_FOUND)
|
||||
add_compile_definitions(GGML_USE_MUSA)
|
||||
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
|
||||
|
||||
if (GGML_MUSA_GRAPHS)
|
||||
add_compile_definitions(GGML_MUSA_GRAPHS)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_FORCE_MMQ)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
||||
endif()
|
||||
@@ -97,10 +105,16 @@ if (MUSAToolkit_FOUND)
|
||||
endif()
|
||||
|
||||
if (GGML_STATIC)
|
||||
# TODO: mudnn has not provided static libraries yet
|
||||
target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
|
||||
# TODO: mudnn has not provided static libraries yet
|
||||
# if (GGML_MUSA_MUDNN_COPY)
|
||||
# target_link_libraries(ggml-musa PRIVATE mudnn_static)
|
||||
# endif()
|
||||
else()
|
||||
target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas mudnn)
|
||||
target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas)
|
||||
if (GGML_MUSA_MUDNN_COPY)
|
||||
target_link_libraries(ggml-musa PRIVATE mudnn)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_NO_VMM)
|
||||
|
||||
@@ -333,6 +333,7 @@ struct ggml_backend_opencl_context {
|
||||
size_t max_alloc_size;
|
||||
bool fp16_support;
|
||||
bool has_vector_subgroup_broadcast;
|
||||
bool disable_fusion;
|
||||
ggml_cl_compiler_version adreno_cl_compiler_version;
|
||||
|
||||
int adreno_wave_size;
|
||||
@@ -411,7 +412,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, kernel_geglu_erf, kernel_geglu_quick,
|
||||
kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16;
|
||||
cl_kernel kernel_norm;
|
||||
cl_kernel kernel_rms_norm;
|
||||
cl_kernel kernel_rms_norm, kernel_rms_norm_mul;
|
||||
cl_kernel kernel_group_norm;
|
||||
cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
|
||||
cl_kernel kernel_soft_max, kernel_soft_max_4;
|
||||
@@ -1100,7 +1101,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
backend_ctx->program_rms_norm =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_rms_norm = clCreateKernel(backend_ctx->program_rms_norm, "kernel_rms_norm", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_rms_norm = clCreateKernel(backend_ctx->program_rms_norm, "kernel_rms_norm", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_rms_norm_mul = clCreateKernel(backend_ctx->program_rms_norm, "kernel_rms_norm_mul", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
@@ -2110,6 +2112,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err));
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
backend_ctx->disable_fusion = getenv("GGML_OPENCL_DISABLE_FUSION") != nullptr;
|
||||
|
||||
dev_ctx->backend_ctx = backend_ctx.release();
|
||||
return dev_ctx->backend_ctx;
|
||||
}
|
||||
@@ -2279,7 +2283,45 @@ static void sync_with_other_backends(ggml_backend_t backend) {
|
||||
sync_with_other_backends(backend_ctx);
|
||||
}
|
||||
|
||||
static bool ggml_opencl_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops) {
|
||||
if (!ggml_can_fuse(cgraph, node_idx, ops)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (ops.size() == 2 && ops.begin()[0] == GGML_OP_RMS_NORM && ops.begin()[1] == GGML_OP_MUL) {
|
||||
const ggml_tensor *rms_norm = cgraph->nodes[node_idx];
|
||||
const ggml_tensor *mul = cgraph->nodes[node_idx+1];
|
||||
|
||||
GGML_ASSERT(rms_norm->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(rms_norm->type == GGML_TYPE_F32);
|
||||
|
||||
// rms_norm only supports f32
|
||||
if (mul->src[0]->type != GGML_TYPE_F32 ||
|
||||
mul->src[1]->type != GGML_TYPE_F32 ||
|
||||
mul->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// if rms_norm is the B operand, then we don't handle broadcast
|
||||
if (rms_norm == mul->src[1] &&
|
||||
!ggml_are_same_shape(mul->src[0], rms_norm->src[1])) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// rms_norm assumes contiguous rows
|
||||
if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor * rms_norm_tensor, ggml_tensor * mul_tensor);
|
||||
|
||||
static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
@@ -2292,6 +2334,12 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
|
||||
ggml_opencl_op_rms_norm_fused(backend, node, cgraph->nodes[i+1]);
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
bool ok = ggml_cl_compute_forward(backend, node);
|
||||
if (!ok) {
|
||||
GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
||||
@@ -4455,6 +4503,117 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor * rms_norm_tensor, ggml_tensor * mul_tensor) {
|
||||
GGML_ASSERT(mul_tensor);
|
||||
GGML_ASSERT(rms_norm_tensor);
|
||||
|
||||
// src0 is the src of rms_norm, src1 is the other src of mul (one being rms_norm)
|
||||
const ggml_tensor * src0 = rms_norm_tensor->src[0];
|
||||
const ggml_tensor * src1;
|
||||
if (mul_tensor->src[0] == rms_norm_tensor) {
|
||||
src1 = mul_tensor->src[1];
|
||||
} else if (mul_tensor->src[1] == rms_norm_tensor) {
|
||||
src1 = mul_tensor->src[0];
|
||||
} else {
|
||||
GGML_ASSERT(false && "Invalid args for rms_norm and mul");
|
||||
}
|
||||
const ggml_tensor * dst = mul_tensor;
|
||||
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offset1 = extra1->offset + src0->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
float eps;
|
||||
memcpy(&eps, rms_norm_tensor->op_params, sizeof(float));
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
|
||||
const cl_ulong nb01 = src0->nb[1];
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
const cl_ulong nb03 = src0->nb[3];
|
||||
|
||||
const int ne10 = src1->ne[0];
|
||||
const int ne11 = src1->ne[1];
|
||||
const int ne12 = src1->ne[2];
|
||||
const int ne13 = src1->ne[3];
|
||||
|
||||
const cl_ulong nb11 = src1->nb[1];
|
||||
const cl_ulong nb12 = src1->nb[2];
|
||||
const cl_ulong nb13 = src1->nb[3];
|
||||
|
||||
const cl_ulong nb1 = dst->nb[1];
|
||||
const cl_ulong nb2 = dst->nb[2];
|
||||
const cl_ulong nb3 = dst->nb[3];
|
||||
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
|
||||
size_t sgs;
|
||||
if (backend_ctx->gpu_family == ADRENO) {
|
||||
sgs = 64;
|
||||
} else if (backend_ctx->gpu_family == INTEL) {
|
||||
sgs = 32;
|
||||
} else {
|
||||
GGML_ASSERT(false && "Unsupported GPU");
|
||||
}
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_rms_norm_mul;
|
||||
|
||||
int nth = sgs;
|
||||
int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
|
||||
while (nth < ne00 && nth < max_workgroup_size) {
|
||||
nth *= 2;
|
||||
}
|
||||
nth = MIN(nth, max_workgroup_size);
|
||||
nth = MIN(nth, ne00);
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(float), &eps));
|
||||
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*nth/sgs, NULL));
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
|
||||
@@ -94,3 +94,82 @@ kernel void kernel_rms_norm(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// rms_norm_mul
|
||||
//------------------------------------------------------------------------------
|
||||
#ifdef INTEL_GPU
|
||||
REQD_SUBGROUP_SIZE_32
|
||||
#elif defined (ADRENO_GPU)
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_rms_norm_mul(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
int ne13,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3,
|
||||
float eps,
|
||||
local float * sum
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0);
|
||||
|
||||
global float4 * x = (global float4 *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
||||
global float4 * f = (global float4 *) (src1 + (i03%ne13)*nb13 + (i02%ne12)*nb12 + (i01%ne11)*nb11);
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
// parallel sum
|
||||
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
|
||||
sumf += dot(x[i00], x[i00]);
|
||||
}
|
||||
sumf = sub_group_reduce_add(sumf);
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
sum[get_sub_group_id()] = sumf;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (uint i = get_local_size(0) / get_max_sub_group_size() / 2; i > 0; i /= 2) {
|
||||
if (get_local_id(0) < i) {
|
||||
sum[get_local_id(0)] += sum[get_local_id(0) + i];
|
||||
}
|
||||
}
|
||||
if (get_local_id(0) == 0) {
|
||||
sum[0] /= ne00;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
float mean = sum[0];
|
||||
float scale = 1.0f/sqrt(mean + eps);
|
||||
|
||||
global float4 * y = (global float4 *) (dst + i03*nb3 + i02*nb2 + i01*nb1);
|
||||
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
|
||||
y[i00] = (x[i00] * scale) * f[i00%(ne10/4)];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1055,7 +1055,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
||||
GGML_ASSERT(ctx_ptr != nullptr);
|
||||
ggml_context * ctx = ctx_ptr.get();
|
||||
ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
|
||||
if (tensor == nullptr) {
|
||||
if (tensor == nullptr || tensor->buffer == nullptr) {
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
||||
return false;
|
||||
}
|
||||
@@ -1124,7 +1124,7 @@ bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rp
|
||||
GGML_ASSERT(ctx_ptr != nullptr);
|
||||
ggml_context * ctx = ctx_ptr.get();
|
||||
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
|
||||
if (tensor == nullptr) {
|
||||
if (tensor == nullptr || tensor->buffer == nullptr) {
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
||||
return false;
|
||||
}
|
||||
@@ -1192,7 +1192,7 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<
|
||||
GGML_ASSERT(ctx_ptr != nullptr);
|
||||
ggml_context * ctx = ctx_ptr.get();
|
||||
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
|
||||
if (tensor == nullptr) {
|
||||
if (tensor == nullptr || tensor->buffer == nullptr) {
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
||||
return false;
|
||||
}
|
||||
@@ -1229,7 +1229,7 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
|
||||
|
||||
ggml_tensor * src = deserialize_tensor(ctx, &request.src);
|
||||
ggml_tensor * dst = deserialize_tensor(ctx, &request.dst);
|
||||
if (src == nullptr || dst == nullptr) {
|
||||
if (src == nullptr || dst == nullptr || src->buffer == nullptr || dst->buffer == nullptr) {
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensors\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -3531,7 +3531,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
||||
stream->memset(dev_cur_src1_row.get(), 0, sizeof(int))));
|
||||
|
||||
const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
|
||||
assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
|
||||
assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
|
||||
|
||||
{
|
||||
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
|
||||
|
||||
@@ -48,11 +48,11 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
|
||||
};
|
||||
|
||||
static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
|
||||
return { block_index * (traits::qk / traits::qr), 0 };
|
||||
return { block_index * (QK4_0 / QR4_0), 0 };
|
||||
}
|
||||
|
||||
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
|
||||
return { (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half), 0 };
|
||||
return { (ncols / QR4_0 * nrows) + block_index * sizeof(ggml_half), 0 };
|
||||
}
|
||||
|
||||
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
|
||||
@@ -71,14 +71,12 @@ template <> struct block_q_t<GGML_TYPE_Q4_K> {
|
||||
}
|
||||
|
||||
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
|
||||
auto nblocks = (nrows * (ncols / traits::qk));
|
||||
return { nblocks * (QK_K / 2),
|
||||
auto nblocks = (nrows * (ncols / QK_K));
|
||||
return { nblocks * (QK_K / 2) + (block_index * K_SCALE_SIZE),
|
||||
(nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2)) };
|
||||
}
|
||||
|
||||
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
|
||||
|
||||
constexpr size_t get_total_qs_bytes(int nblocks) { return nblocks * QK_K / 2; }
|
||||
};
|
||||
|
||||
template <> struct block_q_t<GGML_TYPE_Q6_K> {
|
||||
@@ -90,22 +88,23 @@ template <> struct block_q_t<GGML_TYPE_Q6_K> {
|
||||
};
|
||||
|
||||
static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
|
||||
auto low_bits_index = block_index * (traits::qk / traits::qr);
|
||||
auto low_bits_index = block_index * (QK_K / QR6_K);
|
||||
// the index of high bits it's after all low bits
|
||||
auto high_bits_index = n_blocks * (QK_K / 2) + (block_index * (QK_K / 4));
|
||||
return { low_bits_index, high_bits_index };
|
||||
}
|
||||
|
||||
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
|
||||
auto nblocks = (nrows * (ncols / traits::qk));
|
||||
auto nblocks = (nrows * (ncols / QK_K));
|
||||
auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 4);
|
||||
auto block_scales = total_qs_bytes + block_index * (QK_K / 16);
|
||||
auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16);
|
||||
auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16) + block_index * sizeof(ggml_half);
|
||||
return { block_scales, sb_scale };
|
||||
}
|
||||
|
||||
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
|
||||
};
|
||||
|
||||
} // namespace ggml_sycl_reordered
|
||||
|
||||
#endif // GGML_SYCL_QUANTS_HPP
|
||||
|
||||
@@ -350,11 +350,9 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
|
||||
__dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
|
||||
const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
|
||||
const sycl::half2 * q8_1_ds, const int & iqs) {
|
||||
const int ib = ibx_offset.first / (QK_K / 2);
|
||||
|
||||
const uint8_t * base = static_cast<const uint8_t *>(vbq);
|
||||
const uint8_t * qs = base + ibx_offset.first;
|
||||
const uint8_t * scs = base + d_offset.first + ib * K_SCALE_SIZE;
|
||||
const uint8_t * scs = base + d_offset.first;
|
||||
const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset.second);
|
||||
|
||||
const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
|
||||
@@ -427,13 +425,11 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K> {
|
||||
__dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
|
||||
const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr, const sycl::half2 * q8_1_ds,
|
||||
const int iqs) {
|
||||
const int ib = ibx_offset.first / (QK_K / 2);
|
||||
|
||||
const uint8_t * base = static_cast<const uint8_t *>(vbq);
|
||||
const uint8_t * ql = base + ibx_offset.first;
|
||||
const uint8_t * qh = base + ibx_offset.second;
|
||||
const int8_t * scales = reinterpret_cast<const int8_t *>(base + d_offset.first);
|
||||
const ggml_half * d = (const ggml_half *) (base + d_offset.second) + ib;
|
||||
const ggml_half * d = (const ggml_half *) (base + d_offset.second);
|
||||
|
||||
const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 4);
|
||||
const int scale_offset = (QI6_K / 4) * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 8);
|
||||
|
||||
@@ -6640,20 +6640,18 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr
|
||||
static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
|
||||
struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
|
||||
struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
|
||||
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
|
||||
fprintf(fp, " \"%p\" -> \"%p\" [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
|
||||
gparent0 ? (void *) gparent0 : (void *) parent,
|
||||
gparent0 ? "g" : "x",
|
||||
gparent ? (void *) gparent : (void *) node,
|
||||
gparent ? "g" : "x",
|
||||
gparent ? "empty" : "vee",
|
||||
gparent ? "dashed" : "solid",
|
||||
label);
|
||||
}
|
||||
|
||||
static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
|
||||
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
|
||||
(void *) parent, "x",
|
||||
(void *) node, "x",
|
||||
fprintf(fp, " \"%p\" -> \"%p\" [ label = \"%s\"; ]\n",
|
||||
(void *) parent,
|
||||
(void *) node,
|
||||
label);
|
||||
}
|
||||
|
||||
|
||||
@@ -956,6 +956,7 @@ extern "C" {
|
||||
// in the order they have appeared in the batch.
|
||||
// Rows: number of tokens for which llama_batch.logits[i] != 0
|
||||
// Cols: n_vocab
|
||||
// TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
|
||||
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
|
||||
|
||||
// Logits for the ith token. For positive indices, Equivalent to:
|
||||
@@ -970,6 +971,7 @@ extern "C" {
|
||||
// in the order they have appeared in the batch.
|
||||
// shape: [n_outputs*n_embd]
|
||||
// Otherwise, returns NULL.
|
||||
// TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
|
||||
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
||||
|
||||
// Get the embeddings for the ith token. For positive indices, Equivalent to:
|
||||
|
||||
@@ -112,6 +112,11 @@ class DocsGenerator:
|
||||
lines.append("")
|
||||
lines.append("List of GGML operations and backend support status.")
|
||||
lines.append("")
|
||||
lines.append("## How to add a backend to this table:")
|
||||
lines.append("")
|
||||
lines.append("1. Run `test-backend-ops support --output csv` with your backend name and redirect output to a csv file in `docs/ops/` (e.g., `docs/ops/CUDA.csv`)")
|
||||
lines.append("2. Regenerate `/docs/ops.md` via `./scripts/create_ops_docs.py`")
|
||||
lines.append("")
|
||||
lines.append("Legend:")
|
||||
lines.append("- ✅ Fully supported by this backend")
|
||||
lines.append("- 🟡 Partially supported by this backend")
|
||||
|
||||
@@ -1 +1 @@
|
||||
3323219cd3cc050e5c7133cd4fc1e50d1f590faf
|
||||
56938c4a3b2d923f42040f9ad32d229c76c466cd
|
||||
|
||||
@@ -1933,12 +1933,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
||||
}
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_DREAM,
|
||||
{
|
||||
@@ -1956,6 +1950,12 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
|
||||
@@ -718,10 +718,9 @@ int32_t llm_chat_apply_template(
|
||||
}
|
||||
|
||||
ss << message->content << "<|im_end|>";
|
||||
|
||||
if (add_ass) {
|
||||
ss << "<|im_assistant|>assistant<|im_middle|>";
|
||||
}
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "<|im_assistant|>assistant<|im_middle|>";
|
||||
}
|
||||
} else {
|
||||
// template not supported
|
||||
|
||||
@@ -105,7 +105,7 @@ llama_context::llama_context(
|
||||
|
||||
{
|
||||
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
|
||||
const bool supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
|
||||
supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
|
||||
|
||||
if (!supports_set_rows && !cparams.kv_unified) {
|
||||
LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
|
||||
@@ -508,12 +508,16 @@ enum llama_pooling_type llama_context::pooling_type() const {
|
||||
}
|
||||
|
||||
float * llama_context::get_logits() {
|
||||
output_reorder();
|
||||
|
||||
return logits;
|
||||
}
|
||||
|
||||
float * llama_context::get_logits_ith(int32_t i) {
|
||||
int64_t j = -1;
|
||||
|
||||
output_reorder();
|
||||
|
||||
try {
|
||||
if (logits == nullptr) {
|
||||
throw std::runtime_error("no logits");
|
||||
@@ -550,12 +554,16 @@ float * llama_context::get_logits_ith(int32_t i) {
|
||||
}
|
||||
|
||||
float * llama_context::get_embeddings() {
|
||||
output_reorder();
|
||||
|
||||
return embd;
|
||||
}
|
||||
|
||||
float * llama_context::get_embeddings_ith(int32_t i) {
|
||||
int64_t j = -1;
|
||||
|
||||
output_reorder();
|
||||
|
||||
try {
|
||||
if (embd == nullptr) {
|
||||
throw std::runtime_error("no embeddings");
|
||||
@@ -891,6 +899,12 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
||||
}
|
||||
}
|
||||
|
||||
if (!supports_set_rows) {
|
||||
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
|
||||
// overlap with device computation.
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
}
|
||||
|
||||
// TODO: hacky solution
|
||||
if (model.arch == LLM_ARCH_T5 && t_embd) {
|
||||
//cross.t_embd = t_embd;
|
||||
@@ -970,6 +984,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
|
||||
// TODO: this clear of the buffer can easily be forgotten - need something better
|
||||
embd_seq.clear();
|
||||
output_swaps.clear();
|
||||
|
||||
bool did_optimize = false;
|
||||
|
||||
@@ -1189,9 +1204,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
// make the outputs have the same order they had in the user-provided batch
|
||||
// note: this is mostly relevant for recurrent models atm
|
||||
if (!sorted_output) {
|
||||
const uint32_t n_vocab = model.vocab.n_tokens();
|
||||
const uint64_t n_embd = model.hparams.n_embd;
|
||||
|
||||
GGML_ASSERT((size_t) n_outputs == out_ids.size());
|
||||
|
||||
// TODO: is there something more efficient which also minimizes swaps?
|
||||
@@ -1207,16 +1219,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
continue;
|
||||
}
|
||||
std::swap(out_ids[i], out_ids[j_min]);
|
||||
if (logits_size > 0) {
|
||||
for (uint32_t k = 0; k < n_vocab; k++) {
|
||||
std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]);
|
||||
}
|
||||
}
|
||||
if (embd_size > 0) {
|
||||
for (uint32_t k = 0; k < n_embd; k++) {
|
||||
std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]);
|
||||
}
|
||||
}
|
||||
|
||||
// remember the swaps and apply them lazily upon logits/embeddings access
|
||||
output_swaps.push_back({ i, j_min });
|
||||
}
|
||||
|
||||
std::fill(output_ids.begin(), output_ids.end(), -1);
|
||||
@@ -1230,6 +1235,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
// wait for the computation to finish (automatically done when obtaining the model output)
|
||||
//synchronize();
|
||||
|
||||
if (!supports_set_rows) {
|
||||
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
|
||||
// overlap with device computation.
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1307,6 +1318,30 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
return n_outputs_max;
|
||||
}
|
||||
|
||||
void llama_context::output_reorder() {
|
||||
const uint32_t n_vocab = model.vocab.n_tokens();
|
||||
const uint64_t n_embd = model.hparams.n_embd;
|
||||
|
||||
for (uint32_t s = 0; s < output_swaps.size(); ++s) {
|
||||
const uint32_t i0 = output_swaps[s].i0;
|
||||
const uint32_t i1 = output_swaps[s].i1;
|
||||
|
||||
if (logits_size > 0) {
|
||||
for (uint32_t k = 0; k < n_vocab; k++) {
|
||||
std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]);
|
||||
}
|
||||
}
|
||||
|
||||
if (embd_size > 0) {
|
||||
for (uint32_t k = 0; k < n_embd; k++) {
|
||||
std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
output_swaps.clear();
|
||||
}
|
||||
|
||||
//
|
||||
// graph
|
||||
//
|
||||
|
||||
@@ -181,6 +181,8 @@ private:
|
||||
// Returns max number of outputs for which space was reserved.
|
||||
uint32_t output_reserve(int32_t n_outputs);
|
||||
|
||||
void output_reorder();
|
||||
|
||||
//
|
||||
// graph
|
||||
//
|
||||
@@ -250,6 +252,13 @@ private:
|
||||
|
||||
std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
|
||||
|
||||
struct swap_info {
|
||||
uint32_t i0;
|
||||
uint32_t i1;
|
||||
};
|
||||
|
||||
std::vector<swap_info> output_swaps;
|
||||
|
||||
ggml_backend_sched_ptr sched;
|
||||
|
||||
ggml_backend_t backend_cpu = nullptr;
|
||||
@@ -278,6 +287,10 @@ private:
|
||||
|
||||
bool has_evaluated_once = false;
|
||||
|
||||
// env: LLAMA_SET_ROWS (temporary)
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
|
||||
bool supports_set_rows = false;
|
||||
|
||||
// perf
|
||||
mutable int64_t t_start_us = 0;
|
||||
mutable int64_t t_load_us = 0;
|
||||
|
||||
@@ -98,7 +98,7 @@ struct llama_hparams {
|
||||
float rope_freq_scale_train;
|
||||
float rope_freq_scale_train_swa;
|
||||
uint32_t n_ctx_orig_yarn;
|
||||
float rope_yarn_log_mul;
|
||||
float rope_yarn_log_mul = 0.0f;
|
||||
|
||||
std::array<int, 4> rope_sections;
|
||||
|
||||
|
||||
@@ -768,6 +768,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
||||
// Iterate and write all the keys first, each row is a cell
|
||||
// Get whole range at a time
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
|
||||
if (r_l[il] == nullptr) continue;
|
||||
|
||||
// Write key type
|
||||
const int32_t r_type_i = (int32_t)r_l[il]->type;
|
||||
@@ -787,6 +789,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
||||
|
||||
if (!s_trans) {
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
|
||||
if (s_l[il] == nullptr) continue;
|
||||
|
||||
// Write value type
|
||||
const int32_t s_type_i = (int32_t)s_l[il]->type;
|
||||
@@ -807,6 +811,9 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
||||
// When v is transposed, we also need the element size and get the element ranges from each row
|
||||
const uint32_t mem_size = size;
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
|
||||
if (s_l[il] == nullptr) continue;
|
||||
|
||||
const uint32_t n_embd_s = hparams.n_embd_s();
|
||||
|
||||
// Write value type
|
||||
@@ -951,6 +958,8 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
|
||||
|
||||
// For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers
|
||||
if (r_l[il] == nullptr) continue;
|
||||
|
||||
// Read type of key
|
||||
int32_t r_type_i_ref;
|
||||
@@ -978,11 +987,14 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
|
||||
|
||||
if (!s_trans) {
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers
|
||||
if (s_l[il] == nullptr) continue;
|
||||
|
||||
// Read type of value
|
||||
int32_t s_type_i_ref;
|
||||
io.read_to(&s_type_i_ref, sizeof(s_type_i_ref));
|
||||
const int32_t s_type_i = (int32_t)s_l[il]->type;
|
||||
|
||||
if (s_type_i != s_type_i_ref) {
|
||||
LLAMA_LOG_ERROR("%s: mismatched s type (%d != %d, layer %d)\n", __func__, s_type_i, s_type_i_ref, il);
|
||||
return false;
|
||||
@@ -1005,6 +1017,9 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
|
||||
} else {
|
||||
// For each layer, read the values for each cell (transposed)
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers
|
||||
if (s_l[il] == nullptr) continue;
|
||||
|
||||
const uint32_t n_embd_s = hparams.n_embd_s();
|
||||
|
||||
// Read type of value
|
||||
|
||||
@@ -646,6 +646,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
|
||||
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||||
|
||||
// MiniCPM uses rope by default, unlike Granite which uses it as a switch
|
||||
hparams.rope_finetuned = true;
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 52: type = LLM_TYPE_1B; break;
|
||||
case 40: type = LLM_TYPE_2B; break;
|
||||
@@ -1366,7 +1369,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
// that have no expert_gating_func model parameter set
|
||||
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
|
||||
}
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 27: type = LLM_TYPE_16B; break;
|
||||
@@ -16188,7 +16191,7 @@ private:
|
||||
{
|
||||
// PLaMo-2 uses combined QKV tensor
|
||||
ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
|
||||
cb(qkv, "qkv", il);
|
||||
cb(qkv, "wqkv", il);
|
||||
|
||||
// split QKV tensor into Q, K, V
|
||||
const int64_t n_embd_head_q = hparams.n_embd_head_k;
|
||||
@@ -16228,7 +16231,7 @@ private:
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f, il);
|
||||
cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
|
||||
}
|
||||
|
||||
cb(cur, "attn_out", il);
|
||||
@@ -16303,8 +16306,9 @@ private:
|
||||
ggml_build_forward_expand(gf,
|
||||
ggml_cpy(ctx0, last_conv,
|
||||
ggml_view_1d(ctx0, conv_states_all,
|
||||
(d_conv - 1)*(d_inner)*(n_seqs),
|
||||
kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
|
||||
(d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
|
||||
kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
|
||||
cb(conv_states_all, "mamba_conv1d_state", il);
|
||||
|
||||
// 1D convolution
|
||||
x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
|
||||
@@ -16367,9 +16371,9 @@ private:
|
||||
// store last states
|
||||
ggml_build_forward_expand(gf,
|
||||
ggml_cpy(ctx0,
|
||||
ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
|
||||
ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs,
|
||||
kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
|
||||
ggml_view_1d(ctx0, y_ssm, n_heads*head_dim*d_state*n_seqs, n_heads*head_dim*n_seq_tokens*n_seqs*ggml_element_size(y_ssm)),
|
||||
ggml_view_1d(ctx0, ssm_states_all, n_heads*head_dim*d_state*n_seqs, kv_head*n_seqs*n_heads*head_dim*d_state*ggml_element_size(ssm_states_all))));
|
||||
cb(ssm_states_all, "mamba_ssm_states", il);
|
||||
|
||||
ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
|
||||
cb(y, "mamba_y_view", il);
|
||||
|
||||
@@ -868,16 +868,30 @@ struct sql_printer : public printer {
|
||||
|
||||
struct csv_printer : public printer {
|
||||
void print_header() override {
|
||||
std::vector<std::string> fields = test_result::get_fields();
|
||||
|
||||
std::vector<std::string> fields = test_result::get_fields();
|
||||
std::vector<std::string> fields_csv = get_fields_csv();
|
||||
for (size_t i = 0; i < fields.size(); i++) {
|
||||
if (std::find(std::begin(fields_csv), std::end(fields_csv), fields[i]) == std::end(fields_csv)) {
|
||||
continue;
|
||||
}
|
||||
printf("\"%s\"%s", fields[i].c_str(), i < fields.size() - 1 ? "," : "");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
void print_test_result(const test_result & result) override {
|
||||
std::vector<std::string> values = result.get_values();
|
||||
|
||||
std::vector<std::string> values = result.get_values();
|
||||
std::vector<std::string> fields = test_result::get_fields();
|
||||
std::vector<std::string> fields_csv = get_fields_csv();
|
||||
|
||||
for (size_t i = 0; i < values.size(); i++) {
|
||||
|
||||
if (std::find(std::begin(fields_csv), std::end(fields_csv), fields[i]) == std::end(fields_csv)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Escape quotes and wrap in quotes for CSV
|
||||
std::string escaped_value = values[i];
|
||||
size_t pos = 0;
|
||||
@@ -889,6 +903,19 @@ struct csv_printer : public printer {
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static std::vector<std::string> get_fields_csv() {
|
||||
return {
|
||||
"op_name",
|
||||
"op_params",
|
||||
"supported",
|
||||
"error_message",
|
||||
"test_mode",
|
||||
"backend_reg_name",
|
||||
"backend_name",
|
||||
};
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
static std::unique_ptr<printer> create_printer(output_formats format) {
|
||||
@@ -4366,26 +4393,32 @@ struct test_flash_attn_ext : public test_case {
|
||||
const int64_t hsk_padded = GGML_PAD(hsk, ggml_blck_size(type_KV));
|
||||
const int64_t hsv_padded = GGML_PAD(hsv, ggml_blck_size(type_KV));
|
||||
|
||||
auto const &create_permuted = [&](ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) -> ggml_tensor * {
|
||||
auto const &create_permuted = [&](ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, bool is_view) -> ggml_tensor * {
|
||||
int64_t ne[4] = {ne0, ne1, ne2, ne3};
|
||||
int64_t ne_perm[4];
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
ne_perm[permute[i]] = ne[i];
|
||||
}
|
||||
ggml_tensor * t = ggml_new_tensor_4d(ctx, type, ne_perm[0], ne_perm[1], ne_perm[2], ne_perm[3]);
|
||||
ggml_tensor * t;
|
||||
if (is_view) {
|
||||
ggml_tensor * t0 = ggml_new_tensor_4d(ctx, type, ne_perm[0], 2*ne_perm[1], ne_perm[2], ne_perm[3]);
|
||||
t = ggml_view_4d(ctx, t0, ne_perm[0], ne_perm[1], ne_perm[2], ne_perm[3], t0->nb[1], t0->nb[2], t0->nb[3], 0);
|
||||
} else {
|
||||
t = ggml_new_tensor_4d(ctx, type, ne_perm[0], ne_perm[1], ne_perm[2], ne_perm[3]);
|
||||
}
|
||||
if (permute != std::array<int32_t, 4>{0, 1, 2, 3}) {
|
||||
t = ggml_permute(ctx, t, permute[0], permute[1], permute[2], permute[3]);
|
||||
}
|
||||
return t;
|
||||
};
|
||||
|
||||
ggml_tensor * q = create_permuted(GGML_TYPE_F32, hsk_padded, nb, nh*nr23[0], nr23[1]);
|
||||
ggml_tensor * q = create_permuted(GGML_TYPE_F32, hsk_padded, nb, nh*nr23[0], nr23[1], false);
|
||||
ggml_set_name(q, "q");
|
||||
|
||||
ggml_tensor * k = create_permuted(type_KV, hsk_padded, kv, nh, nr23[1]);
|
||||
ggml_tensor * k = create_permuted(type_KV, hsk_padded, kv, nh, nr23[1], true); // the K tensor is usually a view of the K cache
|
||||
ggml_set_name(k, "k");
|
||||
|
||||
ggml_tensor * v = create_permuted(type_KV, hsv_padded, kv, nh, nr23[1]);
|
||||
ggml_tensor * v = create_permuted(type_KV, hsv_padded, kv, nh, nr23[1], true); // the V tensor is usually a view of the V cache
|
||||
ggml_set_name(v, "v");
|
||||
|
||||
ggml_tensor * m = nullptr;
|
||||
|
||||
@@ -148,7 +148,7 @@ struct lora_merge_ctx {
|
||||
|
||||
ctx_out = gguf_init_empty();
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(),
|
||||
/*.mem_size =*/ static_cast<size_t>(gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead()),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
@@ -2315,7 +2315,7 @@ struct clip_model_loader {
|
||||
|
||||
// create data context
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ (gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(),
|
||||
/*.mem_size =*/ static_cast<size_t>(gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user