Compare commits

..

6 Commits

Author SHA1 Message Date
Francis Couture-Harpin
93fbd407f3 Merge branch 'master' into compilade/convert-prequant
Some checks failed
Check Pre-Tokenizer Hashes / pre-tokenizer-hashes (push) Has been cancelled
Python check requirements.txt / check-requirements (push) Has been cancelled
Python Type-Check / pyright type-check (push) Has been cancelled
Update Operations Documentation / update-ops-docs (push) Has been cancelled
2025-10-23 14:23:12 -04:00
Francis Couture-Harpin
0d5cfed596 Merge branch 'master' into compilade/convert-prequant
Some checks failed
Copilot Setup Steps / copilot-setup-steps (push) Has been cancelled
Check Pre-Tokenizer Hashes / pre-tokenizer-hashes (push) Has been cancelled
Python check requirements.txt / check-requirements (push) Has been cancelled
Python Type-Check / pyright type-check (push) Has been cancelled
Update Operations Documentation / update-ops-docs (push) Has been cancelled
2025-09-09 14:23:06 -04:00
Francis Couture-Harpin
adec43d774 Merge branch 'master' into compilade/convert-prequant 2025-09-01 10:13:29 -04:00
Francis Couture-Harpin
899398277d convert : fix conversion from FP8 for Deepseek-V3.1-Base 2025-08-19 17:27:59 -04:00
Francis Couture-Harpin
1ae6ab7601 Merge branch 'master' into compilade/convert-prequant 2025-08-14 17:05:21 -04:00
Francis Couture-Harpin
de12f8ac50 convert : begin handling pre-quantized models 2025-07-22 04:11:34 -04:00
354 changed files with 19318 additions and 71691 deletions

View File

@@ -34,7 +34,6 @@
rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
enableCurl ? true,
useVulkan ? false,
useRpc ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
# It's necessary to consistently use backendStdenv when building with CUDA support,
@@ -176,7 +175,6 @@ effectiveStdenv.mkDerivation (finalAttrs: {
(cmakeBool "GGML_METAL" useMetalKit)
(cmakeBool "GGML_VULKAN" useVulkan)
(cmakeBool "GGML_STATIC" enableStatic)
(cmakeBool "GGML_RPC" useRpc)
]
++ optionals useCuda [
(

View File

@@ -24,9 +24,8 @@ RUN --mount=type=cache,target=/root/.ccache \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DLLAMA_BUILD_TESTS=OFF \
-DGGML_BACKEND_DL=OFF \
-DGGML_NATIVE=OFF \
-DGGML_BACKEND_DL=ON \
-DGGML_CPU_ALL_VARIANTS=ON \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS && \
cmake --build build --config Release -j $(nproc) && \
@@ -104,7 +103,6 @@ FROM base AS light
WORKDIR /llama.cpp/bin
# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin
ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
@@ -118,7 +116,6 @@ ENV LLAMA_ARG_HOST=0.0.0.0
WORKDIR /llama.cpp/bin
# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin
EXPOSE 8080

View File

@@ -1,4 +1,4 @@
ARG UBUNTU_VERSION=25.10
ARG UBUNTU_VERSION=24.04
FROM ubuntu:$UBUNTU_VERSION AS build
@@ -7,16 +7,32 @@ FROM ubuntu:$UBUNTU_VERSION AS build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget xz-utils
# Install Vulkan SDK
ARG VULKAN_VERSION=1.4.321.1
RUN ARCH=$(uname -m) && \
wget -qO /tmp/vulkan-sdk.tar.xz https://sdk.lunarg.com/sdk/download/${VULKAN_VERSION}/linux/vulkan-sdk-linux-${ARCH}-${VULKAN_VERSION}.tar.xz && \
mkdir -p /opt/vulkan && \
tar -xf /tmp/vulkan-sdk.tar.xz -C /tmp --strip-components=1 && \
mv /tmp/${ARCH}/* /opt/vulkan/ && \
rm -rf /tmp/*
# Install cURL and Vulkan SDK dependencies
RUN apt install -y libcurl4-openssl-dev curl \
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev
# Set environment variables
ENV VULKAN_SDK=/opt/vulkan
ENV PATH=$VULKAN_SDK/bin:$PATH
ENV LD_LIBRARY_PATH=$VULKAN_SDK/lib:$LD_LIBRARY_PATH
ENV CMAKE_PREFIX_PATH=$VULKAN_SDK:$CMAKE_PREFIX_PATH
ENV PKG_CONFIG_PATH=$VULKAN_SDK/lib/pkgconfig:$PKG_CONFIG_PATH
# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
@@ -34,7 +50,7 @@ RUN mkdir -p /app/full \
FROM ubuntu:$UBUNTU_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
&& apt-get install -y libgomp1 curl libvulkan-dev \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \

View File

@@ -60,11 +60,3 @@ end_of_line = unset
charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset
[benches/**]
indent_style = unset
indent_size = unset
end_of_line = unset
charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset

4
.github/labeler.yml vendored
View File

@@ -76,10 +76,6 @@ ggml:
- changed-files:
- any-glob-to-any-file:
- ggml/**
model:
- changed-files:
- any-glob-to-any-file:
- src/models/**
nix:
- changed-files:
- any-glob-to-any-file:

View File

@@ -4,49 +4,49 @@ on:
workflow_call:
jobs:
# ubuntu-24-riscv64-cpu-cross:
# runs-on: ubuntu-24.04
ubuntu-24-riscv64-cpu-cross:
runs-on: ubuntu-24.04
# steps:
# - uses: actions/checkout@v4
# - name: Setup Riscv
# run: |
# sudo dpkg --add-architecture riscv64
steps:
- uses: actions/checkout@v4
- name: Setup Riscv
run: |
sudo dpkg --add-architecture riscv64
# # Add arch-specific repositories for non-amd64 architectures
# cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
# deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
# deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
# deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
# deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
# EOF
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF
# sudo apt-get update || true ;# Prevent failure due to missing URLs.
sudo apt-get update || true ;# Prevent failure due to missing URLs.
# sudo apt-get install -y --no-install-recommends \
# build-essential \
# gcc-14-riscv64-linux-gnu \
# g++-14-riscv64-linux-gnu
sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-riscv64-linux-gnu \
g++-14-riscv64-linux-gnu
# - name: Build
# run: |
# cmake -B build -DLLAMA_CURL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_OPENMP=OFF \
# -DLLAMA_BUILD_EXAMPLES=ON \
# -DLLAMA_BUILD_TOOLS=ON \
# -DLLAMA_BUILD_TESTS=OFF \
# -DCMAKE_SYSTEM_NAME=Linux \
# -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
# -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
# -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
# -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
# -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
# -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
# -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
# -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
# cmake --build build --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
# ubuntu-24-riscv64-vulkan-cross:
# runs-on: ubuntu-24.04

View File

@@ -161,16 +161,15 @@ jobs:
- name: Dawn Dependency
id: dawn-depends
run: |
DAWN_VERSION="v2.0.0"
DAWN_VERSION="v1.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.zip"
DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-macos-latest-Release.tar.gz"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
curl -L -o artifact.zip \
curl -L -o artifact.tar.gz \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
mkdir dawn
unzip artifact.zip
tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.tar.gz -C dawn --strip-components=1
tar -xvf artifact.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build
@@ -522,16 +521,15 @@ jobs:
id: dawn-depends
run: |
sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
DAWN_VERSION="v2.0.0"
DAWN_VERSION="v1.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.zip"
DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-ubuntu-latest-Release.tar.gz"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
curl -L -o artifact.zip \
curl -L -o artifact.tar.gz \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
mkdir dawn
unzip artifact.zip
tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.tar.gz -C dawn --strip-components=1
tar -xvf artifact.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build

View File

@@ -40,7 +40,7 @@ jobs:
# https://github.com/ggml-org/llama.cpp/issues/11888
#- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }

View File

@@ -134,8 +134,8 @@ jobs:
include:
- build: 'x64'
os: ubuntu-22.04
- build: 's390x'
os: ubuntu-24.04-s390x
- build: 's390x-z15' # z15 because our CI runners are on z15
os: ubuntu-22.04-s390x
# GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
# - build: 'arm64'
# os: ubuntu-22.04-arm

View File

@@ -65,7 +65,7 @@
/ggml/src/ggml-impl.h @ggerganov @slaren
/ggml/src/ggml-metal/ @ggerganov
/ggml/src/ggml-opencl/ @lhez @max-krasnyansky
/ggml/src/ggml-hexagon/ @max-krasnyansky @lhez
/ggml/src/ggml-hexagon/ @max-krasnyansky
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/ggml-quants.* @ggerganov
/ggml/src/ggml-rpc/ @rgerganov
@@ -89,7 +89,6 @@
/src/llama-model-loader.* @slaren
/src/llama-model.* @CISC
/src/llama-vocab.* @CISC
/src/models/ @CISC
/tests/ @ggerganov
/tests/test-backend-ops.cpp @slaren
/tests/test-thread-safety.cpp @slaren

View File

@@ -17,13 +17,14 @@ LLM inference in C/C++
## Hot topics
- **[guide : using the new WebUI of llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/16938)**
- [guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)
- [[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)
- **[guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)**
- **[[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)**
- Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
- Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
@@ -83,7 +84,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
- [x] [Jamba](https://huggingface.co/ai21labs)
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)

View File

@@ -1,6 +0,0 @@
{
"chars": 2296.1916666666666,
"chars:std": 986.051306946325,
"score": 0.925,
"score:std": 0.26339134382131846
}

File diff suppressed because one or more lines are too long

View File

@@ -1,264 +0,0 @@
## System info
```bash
uname --all
Linux spark-17ed 6.11.0-1016-nvidia #16-Ubuntu SMP PREEMPT_DYNAMIC Sun Sep 21 16:52:46 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux
g++ --version
g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
nvidia-smi
Sun Nov 2 10:43:25 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 |
+-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GB10 On | 0000000F:01:00.0 Off | N/A |
| N/A 35C P8 4W / N/A | Not Supported | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
```
## ggml-org/gpt-oss-20b-GGUF
Model: https://huggingface.co/ggml-org/gpt-oss-20b-GGUF
- `llama-batched-bench`
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 512 | 32 | 1 | 544 | 0.374 | 1369.01 | 0.383 | 83.64 | 0.757 | 719.01 |
| 512 | 32 | 2 | 1088 | 0.274 | 3741.35 | 0.659 | 97.14 | 0.933 | 1166.66 |
| 512 | 32 | 4 | 2176 | 0.526 | 3896.47 | 0.817 | 156.73 | 1.342 | 1621.08 |
| 512 | 32 | 8 | 4352 | 1.044 | 3925.10 | 0.987 | 259.44 | 2.030 | 2143.56 |
| 512 | 32 | 16 | 8704 | 2.076 | 3945.84 | 1.248 | 410.32 | 3.324 | 2618.60 |
| 512 | 32 | 32 | 17408 | 4.170 | 3929.28 | 1.630 | 628.40 | 5.799 | 3001.76 |
| 4096 | 32 | 1 | 4128 | 1.083 | 3782.66 | 0.394 | 81.21 | 1.477 | 2795.13 |
| 4096 | 32 | 2 | 8256 | 2.166 | 3782.72 | 0.725 | 88.28 | 2.891 | 2856.14 |
| 4096 | 32 | 4 | 16512 | 4.333 | 3780.88 | 0.896 | 142.82 | 5.230 | 3157.38 |
| 4096 | 32 | 8 | 33024 | 8.618 | 3802.14 | 1.155 | 221.69 | 9.773 | 3379.08 |
| 4096 | 32 | 16 | 66048 | 17.330 | 3781.73 | 1.598 | 320.34 | 18.928 | 3489.45 |
| 4096 | 32 | 32 | 132096 | 34.671 | 3780.48 | 2.336 | 438.35 | 37.007 | 3569.51 |
| 8192 | 32 | 1 | 8224 | 2.233 | 3668.56 | 0.438 | 72.98 | 2.671 | 3078.44 |
| 8192 | 32 | 2 | 16448 | 4.425 | 3702.95 | 0.756 | 84.66 | 5.181 | 3174.95 |
| 8192 | 32 | 4 | 32896 | 8.859 | 3698.64 | 0.967 | 132.38 | 9.826 | 3347.72 |
| 8192 | 32 | 8 | 65792 | 17.714 | 3699.57 | 1.277 | 200.52 | 18.991 | 3464.35 |
| 8192 | 32 | 16 | 131584 | 35.494 | 3692.84 | 1.841 | 278.12 | 37.335 | 3524.46 |
| 8192 | 32 | 32 | 263168 | 70.949 | 3694.82 | 2.798 | 365.99 | 73.747 | 3568.53 |
- `llama-bench`
| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 3714.25 ± 20.36 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 86.58 ± 0.43 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 3445.17 ± 17.85 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 81.72 ± 0.53 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 3218.78 ± 11.34 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 74.86 ± 0.64 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 2732.83 ± 7.17 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 71.57 ± 0.51 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 2119.75 ± 12.81 |
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 62.33 ± 0.24 |
build: eeee367de (6989)
## ggml-org/gpt-oss-120b-GGUF
Model: https://huggingface.co/ggml-org/gpt-oss-120b-GGUF
- `llama-batched-bench`
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 512 | 32 | 1 | 544 | 0.571 | 897.18 | 0.543 | 58.96 | 1.113 | 488.60 |
| 512 | 32 | 2 | 1088 | 0.593 | 1725.37 | 1.041 | 61.45 | 1.635 | 665.48 |
| 512 | 32 | 4 | 2176 | 1.043 | 1963.15 | 1.334 | 95.95 | 2.377 | 915.36 |
| 512 | 32 | 8 | 4352 | 2.099 | 1951.63 | 1.717 | 149.07 | 3.816 | 1140.45 |
| 512 | 32 | 16 | 8704 | 4.207 | 1947.12 | 2.311 | 221.56 | 6.518 | 1335.35 |
| 512 | 32 | 32 | 17408 | 8.422 | 1945.36 | 3.298 | 310.46 | 11.720 | 1485.27 |
| 4096 | 32 | 1 | 4128 | 2.138 | 1915.88 | 0.571 | 56.09 | 2.708 | 1524.12 |
| 4096 | 32 | 2 | 8256 | 4.266 | 1920.25 | 1.137 | 56.27 | 5.404 | 1527.90 |
| 4096 | 32 | 4 | 16512 | 8.564 | 1913.02 | 1.471 | 86.99 | 10.036 | 1645.29 |
| 4096 | 32 | 8 | 33024 | 17.092 | 1917.19 | 1.979 | 129.33 | 19.071 | 1731.63 |
| 4096 | 32 | 16 | 66048 | 34.211 | 1915.65 | 2.850 | 179.66 | 37.061 | 1782.15 |
| 4096 | 32 | 32 | 132096 | 68.394 | 1916.44 | 4.381 | 233.72 | 72.775 | 1815.13 |
| 8192 | 32 | 1 | 8224 | 4.349 | 1883.45 | 0.620 | 51.65 | 4.969 | 1655.04 |
| 8192 | 32 | 2 | 16448 | 8.674 | 1888.83 | 1.178 | 54.33 | 9.852 | 1669.48 |
| 8192 | 32 | 4 | 32896 | 17.351 | 1888.55 | 1.580 | 81.01 | 18.931 | 1737.68 |
| 8192 | 32 | 8 | 65792 | 34.743 | 1886.31 | 2.173 | 117.80 | 36.916 | 1782.20 |
| 8192 | 32 | 16 | 131584 | 69.413 | 1888.29 | 3.297 | 155.28 | 72.710 | 1809.70 |
| 8192 | 32 | 32 | 263168 | 138.903 | 1887.24 | 5.004 | 204.63 | 143.907 | 1828.73 |
- `llama-bench`
| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 1919.36 ± 5.01 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 60.40 ± 0.30 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 1825.30 ± 6.37 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 56.94 ± 0.29 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 1739.19 ± 6.00 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 52.51 ± 0.42 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 1536.75 ± 4.27 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 49.33 ± 0.27 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1255.85 ± 3.26 |
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 42.99 ± 0.18 |
build: eeee367de (6989)
## ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF
Model: https://huggingface.co/ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF
- `llama-batched-bench`
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 512 | 32 | 1 | 544 | 0.398 | 1285.90 | 0.530 | 60.41 | 0.928 | 586.27 |
| 512 | 32 | 2 | 1088 | 0.386 | 2651.65 | 0.948 | 67.50 | 1.334 | 815.38 |
| 512 | 32 | 4 | 2176 | 0.666 | 3076.37 | 1.209 | 105.87 | 1.875 | 1160.71 |
| 512 | 32 | 8 | 4352 | 1.325 | 3091.39 | 1.610 | 158.98 | 2.935 | 1482.65 |
| 512 | 32 | 16 | 8704 | 2.664 | 3075.58 | 2.150 | 238.19 | 4.813 | 1808.39 |
| 512 | 32 | 32 | 17408 | 5.336 | 3070.31 | 2.904 | 352.59 | 8.240 | 2112.50 |
| 4096 | 32 | 1 | 4128 | 1.444 | 2836.81 | 0.581 | 55.09 | 2.025 | 2038.81 |
| 4096 | 32 | 2 | 8256 | 2.872 | 2852.14 | 1.084 | 59.06 | 3.956 | 2086.99 |
| 4096 | 32 | 4 | 16512 | 5.744 | 2852.32 | 1.440 | 88.90 | 7.184 | 2298.47 |
| 4096 | 32 | 8 | 33024 | 11.463 | 2858.68 | 2.068 | 123.78 | 13.531 | 2440.65 |
| 4096 | 32 | 16 | 66048 | 22.915 | 2859.95 | 3.018 | 169.67 | 25.933 | 2546.90 |
| 4096 | 32 | 32 | 132096 | 45.956 | 2852.10 | 4.609 | 222.18 | 50.565 | 2612.39 |
| 8192 | 32 | 1 | 8224 | 3.063 | 2674.72 | 0.693 | 46.20 | 3.755 | 2189.92 |
| 8192 | 32 | 2 | 16448 | 6.109 | 2681.87 | 1.214 | 52.71 | 7.323 | 2245.98 |
| 8192 | 32 | 4 | 32896 | 12.197 | 2686.63 | 1.682 | 76.11 | 13.878 | 2370.30 |
| 8192 | 32 | 8 | 65792 | 24.409 | 2684.94 | 2.556 | 100.17 | 26.965 | 2439.95 |
| 8192 | 32 | 16 | 131584 | 48.753 | 2688.50 | 3.994 | 128.20 | 52.747 | 2494.64 |
| 8192 | 32 | 32 | 263168 | 97.508 | 2688.42 | 6.528 | 156.86 | 104.037 | 2529.57 |
- `llama-bench`
| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 2925.55 ± 4.25 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 62.80 ± 0.27 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 2531.01 ± 6.79 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 55.86 ± 0.33 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 2244.39 ± 5.33 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 45.95 ± 0.33 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 1783.17 ± 3.68 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 39.07 ± 0.10 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1241.90 ± 3.13 |
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 29.92 ± 0.06 |
build: eeee367de (6989)
## ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF
Model: https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF
- `llama-batched-bench`
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 512 | 32 | 1 | 544 | 0.211 | 2421.57 | 1.055 | 30.33 | 1.266 | 429.57 |
| 512 | 32 | 2 | 1088 | 0.419 | 2441.34 | 1.130 | 56.65 | 1.549 | 702.32 |
| 512 | 32 | 4 | 2176 | 0.873 | 2345.54 | 1.174 | 108.99 | 2.048 | 1062.74 |
| 512 | 32 | 8 | 4352 | 1.727 | 2371.85 | 1.254 | 204.22 | 2.980 | 1460.19 |
| 512 | 32 | 16 | 8704 | 3.452 | 2373.22 | 1.492 | 343.16 | 4.944 | 1760.56 |
| 512 | 32 | 32 | 17408 | 6.916 | 2368.93 | 1.675 | 611.51 | 8.591 | 2026.36 |
| 4096 | 32 | 1 | 4128 | 1.799 | 2277.26 | 1.084 | 29.51 | 2.883 | 1431.91 |
| 4096 | 32 | 2 | 8256 | 3.577 | 2290.01 | 1.196 | 53.50 | 4.774 | 1729.51 |
| 4096 | 32 | 4 | 16512 | 7.172 | 2284.36 | 1.313 | 97.50 | 8.485 | 1946.00 |
| 4096 | 32 | 8 | 33024 | 14.341 | 2284.96 | 1.520 | 168.46 | 15.860 | 2082.18 |
| 4096 | 32 | 16 | 66048 | 28.675 | 2285.44 | 1.983 | 258.21 | 30.658 | 2154.33 |
| 4096 | 32 | 32 | 132096 | 57.354 | 2285.32 | 2.640 | 387.87 | 59.994 | 2201.82 |
| 8192 | 32 | 1 | 8224 | 3.701 | 2213.75 | 1.119 | 28.59 | 4.820 | 1706.34 |
| 8192 | 32 | 2 | 16448 | 7.410 | 2211.19 | 1.272 | 50.31 | 8.682 | 1894.56 |
| 8192 | 32 | 4 | 32896 | 14.802 | 2213.83 | 1.460 | 87.68 | 16.261 | 2022.96 |
| 8192 | 32 | 8 | 65792 | 29.609 | 2213.35 | 1.781 | 143.74 | 31.390 | 2095.93 |
| 8192 | 32 | 16 | 131584 | 59.229 | 2212.96 | 2.495 | 205.17 | 61.725 | 2131.79 |
| 8192 | 32 | 32 | 263168 | 118.449 | 2213.15 | 3.714 | 275.75 | 122.162 | 2154.25 |
- `llama-bench`
| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 2272.74 ± 4.68 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 30.66 ± 0.02 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 2107.80 ± 9.55 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 29.71 ± 0.05 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 1937.80 ± 6.75 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 28.86 ± 0.04 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 1641.12 ± 1.78 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 27.24 ± 0.04 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1296.02 ± 2.67 |
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 23.78 ± 0.03 |
build: eeee367de (6989)
## ggml-org/gemma-3-4b-it-qat-GGUF
Model: https://huggingface.co/ggml-org/gemma-3-4b-it-qat-GGUF
- `llama-batched-bench`
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 512 | 32 | 1 | 544 | 0.094 | 5434.73 | 0.394 | 81.21 | 0.488 | 1114.15 |
| 512 | 32 | 2 | 1088 | 0.168 | 6091.68 | 0.498 | 128.52 | 0.666 | 1633.41 |
| 512 | 32 | 4 | 2176 | 0.341 | 6010.68 | 0.542 | 236.37 | 0.882 | 2466.43 |
| 512 | 32 | 8 | 4352 | 0.665 | 6161.46 | 0.678 | 377.74 | 1.342 | 3241.72 |
| 512 | 32 | 16 | 8704 | 1.323 | 6193.19 | 0.902 | 567.41 | 2.225 | 3911.74 |
| 512 | 32 | 32 | 17408 | 2.642 | 6202.03 | 1.231 | 832.03 | 3.872 | 4495.36 |
| 4096 | 32 | 1 | 4128 | 0.701 | 5840.49 | 0.439 | 72.95 | 1.140 | 3621.23 |
| 4096 | 32 | 2 | 8256 | 1.387 | 5906.82 | 0.574 | 111.48 | 1.961 | 4210.12 |
| 4096 | 32 | 4 | 16512 | 2.758 | 5940.33 | 0.651 | 196.58 | 3.409 | 4843.33 |
| 4096 | 32 | 8 | 33024 | 5.491 | 5967.56 | 0.876 | 292.40 | 6.367 | 5187.12 |
| 4096 | 32 | 16 | 66048 | 10.978 | 5969.58 | 1.275 | 401.69 | 12.253 | 5390.38 |
| 4096 | 32 | 32 | 132096 | 21.944 | 5972.93 | 1.992 | 514.16 | 23.936 | 5518.73 |
| 8192 | 32 | 1 | 8224 | 1.402 | 5841.91 | 0.452 | 70.73 | 1.855 | 4434.12 |
| 8192 | 32 | 2 | 16448 | 2.793 | 5865.34 | 0.637 | 100.55 | 3.430 | 4795.51 |
| 8192 | 32 | 4 | 32896 | 5.564 | 5889.64 | 0.770 | 166.26 | 6.334 | 5193.95 |
| 8192 | 32 | 8 | 65792 | 11.114 | 5896.44 | 1.122 | 228.07 | 12.237 | 5376.51 |
| 8192 | 32 | 16 | 131584 | 22.210 | 5901.38 | 1.789 | 286.15 | 24.000 | 5482.74 |
| 8192 | 32 | 32 | 263168 | 44.382 | 5906.56 | 3.044 | 336.38 | 47.426 | 5549.02 |
- `llama-bench`
| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 5810.04 ± 21.71 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 84.54 ± 0.18 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 5288.04 ± 3.54 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 78.82 ± 1.37 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 4960.43 ± 16.64 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 74.13 ± 0.30 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 4495.92 ± 31.11 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 72.37 ± 0.29 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 3746.90 ± 40.01 |
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 63.02 ± 0.20 |
build: eeee367de (6989)

File diff suppressed because one or more lines are too long

View File

@@ -56,8 +56,6 @@ add_library(${TARGET} STATIC
common.h
console.cpp
console.h
download.cpp
download.h
http.h
json-partial.cpp
json-partial.h

File diff suppressed because it is too large Load Diff

View File

@@ -59,8 +59,8 @@ struct common_arg {
common_arg & set_sparam();
bool in_example(enum llama_example ex);
bool is_exclude(enum llama_example ex);
bool get_value_from_env(std::string & output) const;
bool has_value_from_env() const;
bool get_value_from_env(std::string & output);
bool has_value_from_env();
std::string to_string();
};

View File

@@ -9,11 +9,8 @@
#include <minja/chat-template.hpp>
#include <minja/minja.hpp>
#include <algorithm>
#include <cstdio>
#include <cctype>
#include <exception>
#include <functional>
#include <iostream>
#include <optional>
#include <stdexcept>
@@ -313,6 +310,7 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
}
if (!msg.reasoning_content.empty()) {
jmsg["reasoning_content"] = msg.reasoning_content;
jmsg["thinking"] = msg.reasoning_content; // gpt-oss
}
if (!msg.tool_name.empty()) {
jmsg["name"] = msg.tool_name;
@@ -642,7 +640,6 @@ const char * common_chat_format_name(common_chat_format format) {
case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
case COMMON_CHAT_FORMAT_APERTUS: return "Apertus";
case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS: return "LFM2 with JSON tools";
default:
throw std::runtime_error("Unknown chat format");
}
@@ -989,126 +986,6 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
return data;
}
// Case-insensitive find
static size_t ifind_string(const std::string & haystack, const std::string & needle, size_t pos = 0) {
auto it = std::search(
haystack.begin() + pos, haystack.end(),
needle.begin(), needle.end(),
[](char a, char b) { return std::tolower(a) == std::tolower(b); }
);
return (it == haystack.end()) ? std::string::npos : std::distance(haystack.begin(), it);
}
static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
const auto is_json_schema_provided = !inputs.json_schema.is_null();
const auto is_grammar_provided = !inputs.grammar.empty();
const auto are_tools_provided = inputs.tools.is_array() && !inputs.tools.empty();
// the logic requires potentially modifying the messages
auto tweaked_messages = inputs.messages;
auto replace_json_schema_marker = [](json & messages) -> bool {
static std::string marker1 = "force json schema.\n";
static std::string marker2 = "force json schema.";
if (messages.empty() || messages.at(0).at("role") != "system") {
return false;
}
std::string content = messages.at(0).at("content");
for (const auto & marker : {marker1, marker2}) {
const auto pos = ifind_string(content, marker);
if (pos != std::string::npos) {
content.replace(pos, marker.length(), "");
// inject modified content back into the messages
messages.at(0).at("content") = content;
return true;
}
}
return false;
};
// Lfm2 model does not natively work with json, but can generally understand the tools structure
//
// Example of the pytorch dialog structure:
// <|startoftext|><|im_start|>system
// List of tools: <|tool_list_start|>[{"name": "get_candidate_status", "description": "Retrieves the current status of a candidate in the recruitment process", "parameters": {"type": "object", "properties": {"candidate_id": {"type": "string", "description": "Unique identifier for the candidate"}}, "required": ["candidate_id"]}}]<|tool_list_end|><|im_end|>
// <|im_start|>user
// What is the current status of candidate ID 12345?<|im_end|>
// <|im_start|>assistant
// <|tool_call_start|>[get_candidate_status(candidate_id="12345")]<|tool_call_end|>Checking the current status of candidate ID 12345.<|im_end|>
// <|im_start|>tool
// <|tool_response_start|>{"candidate_id": "12345", "status": "Interview Scheduled", "position": "Clinical Research Associate", "date": "2023-11-20"}<|tool_response_end|><|im_end|>
// <|im_start|>assistant
// The candidate with ID 12345 is currently in the "Interview Scheduled" stage for the position of Clinical Research Associate, with an interview date set for 2023-11-20.<|im_end|>
//
// For the llama server compatibility with json tools semantic,
// the client can add "Follow json schema." line into the system message prompt to force the json output.
//
if (are_tools_provided && (is_json_schema_provided || is_grammar_provided)) {
// server/utils.hpp prohibits that branch for the custom grammar anyways
throw std::runtime_error("Tools call must not use \"json_schema\" or \"grammar\", use non-tool invocation if you want to use custom grammar");
} else if (are_tools_provided && replace_json_schema_marker(tweaked_messages)) {
LOG_INF("%s: Using tools to build a grammar\n", __func__);
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
auto schemas = json::array();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
schemas.push_back({
{"type", "object"},
{"properties", {
{"name", {
{"type", "string"},
{"const", function.at("name")},
}},
{"arguments", function.at("parameters")},
}},
{"required", json::array({"name", "arguments", "id"})},
});
});
auto schema = json {
{"type", "array"},
{"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
{"minItems", 1},
};
if (!inputs.parallel_tool_calls) {
schema["maxItems"] = 1;
}
builder.add_rule("root", "\"<|tool_call_start|>\"" + builder.add_schema("tool_calls", schema) + "\"<|tool_call_end|>\"");
});
// model has no concept of tool selection mode choice,
// if the system prompt rendered correctly it will produce a tool call
// the grammar goes inside the tool call body
data.grammar_lazy = true;
data.grammar_triggers = {{COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, "\\s*<\\|tool_call_start\\|>\\s*\\["}};
data.preserved_tokens = {"<|tool_call_start|>", "<|tool_call_end|>"};
data.format = COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS;
} else if (are_tools_provided && (!is_json_schema_provided && !is_grammar_provided)) {
LOG_INF("%s: Using tools without json schema or grammar\n", __func__);
// output those tokens
data.preserved_tokens = {"<|tool_call_start|>", "<|tool_call_end|>"};
} else if (is_json_schema_provided) {
LOG_INF("%s: Using provided json schema to build a grammar\n", __func__);
data.grammar = json_schema_to_grammar(inputs.json_schema);
} else if (is_grammar_provided) {
LOG_INF("%s: Using provided grammar\n", __func__);
data.grammar = inputs.grammar;
} else {
LOG_INF("%s: Using content relying on the template\n", __func__);
}
data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
LOG_DBG("%s: Prompt: %s\n", __func__, data.prompt.c_str());
return data;
}
static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
data.prompt = apply(tmpl, inputs);
@@ -1809,23 +1686,7 @@ static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
// Copy reasoning to the "thinking" field as expected by the gpt-oss template
auto adjusted_messages = json::array();
for (const auto & msg : inputs.messages) {
auto has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string();
auto has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array();
if (has_reasoning_content && has_tool_calls) {
auto adjusted_message = msg;
adjusted_message["thinking"] = msg.at("reasoning_content");
adjusted_messages.push_back(adjusted_message);
} else {
adjusted_messages.push_back(msg);
}
}
auto prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
auto prompt = apply(tmpl, inputs);
// Check if we need to replace the return token with end token during
// inference and without generation prompt. For more details see:
@@ -2638,71 +2499,6 @@ static void common_chat_parse_apertus(common_chat_msg_parser & builder) {
builder.add_content(builder.consume_rest());
}
static void common_chat_parse_lfm2(common_chat_msg_parser & builder) {
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
}
// LFM2 format: <|tool_call_start|>[{"name": "get_current_time", "arguments": {"location": "Paris"}}]<|tool_call_end|>
static const common_regex tool_call_start_regex(regex_escape("<|tool_call_start|>"));
static const common_regex tool_call_end_regex(regex_escape("<|tool_call_end|>"));
// Loop through all tool calls
while (auto res = builder.try_find_regex(tool_call_start_regex, std::string::npos, /* add_prelude_to_content= */ true)) {
builder.move_to(res->groups[0].end);
// Parse JSON array format: [{"name": "...", "arguments": {...}}]
auto tool_calls_data = builder.consume_json();
// Consume end marker
builder.consume_spaces();
if (!builder.try_consume_regex(tool_call_end_regex)) {
throw common_chat_msg_partial_exception("Expected <|tool_call_end|>");
}
// Process each tool call in the array
if (tool_calls_data.json.is_array()) {
for (const auto & tool_call : tool_calls_data.json) {
if (!tool_call.is_object()) {
throw common_chat_msg_partial_exception("Tool call must be an object");
}
if (!tool_call.contains("name")) {
throw common_chat_msg_partial_exception("Tool call missing 'name' field");
}
std::string function_name = tool_call.at("name");
std::string arguments = "{}";
if (tool_call.contains("arguments")) {
if (tool_call.at("arguments").is_object()) {
arguments = tool_call.at("arguments").dump();
} else if (tool_call.at("arguments").is_string()) {
arguments = tool_call.at("arguments");
}
}
if (!builder.add_tool_call(function_name, "", arguments)) {
throw common_chat_msg_partial_exception("Incomplete tool call");
}
}
} else {
throw common_chat_msg_partial_exception("Expected JSON array for tool calls");
}
// Consume any trailing whitespace after this tool call
builder.consume_spaces();
}
// Consume any remaining content after all tool calls
auto remaining = builder.consume_rest();
if (!string_strip(remaining).empty()) {
builder.add_content(remaining);
}
}
static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
// Parse thinking tags first - this handles the main reasoning content
builder.try_parse_reasoning("<seed:think>", "</seed:think>");
@@ -2952,12 +2748,6 @@ static common_chat_params common_chat_templates_apply_jinja(
return common_chat_params_init_apertus(tmpl, params);
}
// LFM2 (w/ tools)
if (src.find("List of tools: <|tool_list_start|>[") != std::string::npos &&
src.find("]<|tool_list_end|>") != std::string::npos) {
return common_chat_params_init_lfm2(tmpl, params);
}
// Use generic handler when mixing tools + JSON schema.
// TODO: support that mix in handlers below.
if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -3136,9 +2926,6 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
case COMMON_CHAT_FORMAT_APERTUS:
common_chat_parse_apertus(builder);
break;
case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS:
common_chat_parse_lfm2(builder);
break;
default:
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
}

View File

@@ -116,7 +116,6 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_SEED_OSS,
COMMON_CHAT_FORMAT_NEMOTRON_V2,
COMMON_CHAT_FORMAT_APERTUS,
COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS,
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};

View File

@@ -908,39 +908,6 @@ std::string fs_get_cache_file(const std::string & filename) {
return cache_directory + filename;
}
std::vector<common_file_info> fs_list_files(const std::string & path) {
std::vector<common_file_info> files;
if (path.empty()) return files;
std::filesystem::path dir(path);
if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) {
return files;
}
for (const auto & entry : std::filesystem::directory_iterator(dir)) {
try {
// Only include regular files (skip directories)
const auto & p = entry.path();
if (std::filesystem::is_regular_file(p)) {
common_file_info info;
info.path = p.string();
info.name = p.filename().string();
try {
info.size = static_cast<size_t>(std::filesystem::file_size(p));
} catch (const std::filesystem::filesystem_error &) {
info.size = 0;
}
files.push_back(std::move(info));
}
} catch (const std::filesystem::filesystem_error &) {
// skip entries we cannot inspect
continue;
}
}
return files;
}
//
// Model utils

View File

@@ -406,8 +406,6 @@ struct common_params {
bool mmproj_use_gpu = true; // use GPU for multimodal model
bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s)
int image_min_tokens = -1;
int image_max_tokens = -1;
// finetune
struct lr_opt lr;
@@ -460,8 +458,7 @@ struct common_params {
float slot_prompt_similarity = 0.1f;
// batched-bench params
bool is_pp_shared = false;
bool is_tg_separate = false;
bool is_pp_shared = false;
std::vector<int32_t> n_pp;
std::vector<int32_t> n_tg;
@@ -508,10 +505,6 @@ struct common_params {
// return false from callback to abort model loading or true to continue
llama_progress_callback load_progress_callback = NULL;
void * load_progress_callback_user_data = NULL;
bool has_speculative() const {
return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
}
};
// call once at the start of a program if it uses libcommon
@@ -612,13 +605,6 @@ bool fs_create_directory_with_parents(const std::string & path);
std::string fs_get_cache_directory();
std::string fs_get_cache_file(const std::string & filename);
struct common_file_info {
std::string path;
std::string name;
size_t size = 0; // in bytes
};
std::vector<common_file_info> fs_list_files(const std::string & path);
//
// Model utils
//

File diff suppressed because it is too large Load Diff

View File

@@ -1,55 +0,0 @@
#pragma once
#include <string>
struct common_params_model;
//
// download functionalities
//
struct common_cached_model_info {
std::string manifest_path;
std::string user;
std::string model;
std::string tag;
size_t size = 0; // GGUF size in bytes
std::string to_string() const {
return user + "/" + model + ":" + tag;
}
};
struct common_hf_file_res {
std::string repo; // repo name with ":tag" removed
std::string ggufFile;
std::string mmprojFile;
};
/**
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
* Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
*
* Return pair of <repo, file> (with "repo" already having tag removed)
*
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
*/
common_hf_file_res common_get_hf_file(
const std::string & hf_repo_with_tag,
const std::string & bearer_token,
bool offline);
// returns true if download succeeded
bool common_download_model(
const common_params_model & model,
const std::string & bearer_token,
bool offline);
// returns list of cached models
std::vector<common_cached_model_info> common_list_cached_models();
// resolve and download model from Docker registry
// return local path to downloaded model file
std::string common_docker_resolve_model(const std::string & docker);

View File

@@ -601,10 +601,7 @@ private:
}
std::string _resolve_ref(const std::string & ref) {
auto it = ref.find('#');
std::string ref_fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
static const std::regex nonalphanumeric_regex(R"([^a-zA-Z0-9-]+)");
std::string ref_name = "ref" + std::regex_replace(ref_fragment, nonalphanumeric_regex, "-");
std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
_refs_being_resolved.insert(ref);
json resolved = _refs[ref];
@@ -777,24 +774,11 @@ public:
std::vector<std::string> tokens = string_split(pointer, "/");
for (size_t i = 1; i < tokens.size(); ++i) {
std::string sel = tokens[i];
if (target.is_object() && target.contains(sel)) {
target = target[sel];
} else if (target.is_array()) {
size_t sel_index;
try {
sel_index = std::stoul(sel);
} catch (const std::invalid_argument & e) {
sel_index = target.size();
}
if (sel_index >= target.size()) {
_errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
return;
}
target = target[sel_index];
} else {
if (target.is_null() || !target.contains(sel)) {
_errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
return;
}
target = target[sel];
}
_refs[ref] = target;
}

View File

@@ -218,7 +218,8 @@ class ModelBase:
logger.info(f"gguf: indexing model part '{part_name}'")
ctx: ContextManager[Any]
if is_safetensors:
ctx = cast(ContextManager[Any], gguf.utility.SafetensorsLocal(self.dir_model / part_name))
from safetensors import safe_open
ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
else:
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
@@ -227,18 +228,18 @@ class ModelBase:
for name in model_part.keys():
if is_safetensors:
data: gguf.utility.LocalTensor = model_part[name]
if self.lazy:
data_gen = lambda data=data: LazyTorchTensor.from_local_tensor(data) # noqa: E731
data = model_part.get_slice(name)
data_gen = lambda data=data: LazyTorchTensor.from_safetensors_slice(data) # noqa: E731
else:
dtype = LazyTorchTensor._dtype_str_map[data.dtype]
data_gen = lambda data=data, dtype=dtype: torch.from_numpy(data.mmap_bytes()).view(dtype).reshape(data.shape) # noqa: E731
data = model_part.get_tensor(name)
data_gen = lambda data=data: data # noqa: E731
else:
data_torch: Tensor = model_part[name]
data = model_part[name]
if self.lazy:
data_gen = lambda data=data_torch: LazyTorchTensor.from_eager(data) # noqa: E731
data_gen = lambda data=data: LazyTorchTensor.from_eager(data) # noqa: E731
else:
data_gen = lambda data=data_torch: data # noqa: E731
data_gen = lambda data=data: data # noqa: E731
tensors[name] = data_gen
# verify tensor name presence and identify potentially missing files
@@ -277,14 +278,15 @@ class ModelBase:
# The scale is inverted
return data / scale.float()
def dequant_simple(weight: Tensor, scale: Tensor, block_size: Sequence[int] | None = None) -> Tensor:
def dequant_simple(weight: Tensor, scale: Tensor) -> Tensor:
scale = scale.float()
if block_size is not None:
for i, size in enumerate(block_size):
if (weight_block_size := quant_config.get("weight_block_size")):
# TODO: make sure it's a list of integers
for i, size in enumerate(weight_block_size):
scale = scale.repeat_interleave(size, i)
# unpad the scale (e.g. when the tensor size isn't a multiple of the block size)
scale = scale[tuple(slice(0, size) for size in weight.shape)]
# unpad the scale (e.g. when the tensor size isn't a multiple of the block size)
scale = scale[tuple(slice(0, size) for size in weight.shape)]
return weight.float() * scale
@@ -331,40 +333,6 @@ class ModelBase:
return (scales[g_idx].float() * (weight - zeros[g_idx]).float()).T
def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: Tensor | None, num_bits: int, group_size: int):
assert w.dtype == torch.int32
shape = tuple(shape_tensor.tolist())
assert len(shape) == 2
mask = (1 << num_bits) - 1
shifts = torch.arange(0, 32 - (num_bits - 1), num_bits, dtype=torch.int32)
if self.lazy:
shifts = LazyTorchTensor.from_eager(shifts)
if zero_point is None:
offset = 1 << (num_bits - 1)
else:
assert len(zero_point.shape) == 2
offset = (zero_point.unsqueeze(1) >> shifts.reshape(1, -1, 1)) & mask
offset = offset.reshape(-1, zero_point.shape[1])
# trim padding, and prepare for broadcast
# NOTE: the zero-point is packed along dim 0
offset = offset[:shape[0], :].unsqueeze(-1)
# extract values
# NOTE: the weights are packed along dim 1
unpacked = (w.unsqueeze(-1) >> shifts.reshape(1, 1, -1)) & mask
unpacked = unpacked.reshape(shape[0], -1)
# trim padding
unpacked = unpacked[:, :shape[1]]
# prepare for broadcast of the scale
unpacked = unpacked.reshape(shape[0], (unpacked.shape[-1] + group_size - 1) // group_size, group_size)
unpacked = unpacked - offset
return (unpacked * scale.unsqueeze(-1).float()).reshape(shape)
if quant_method == "bitnet":
for name in self.model_tensors.keys():
if name.endswith(".weight_scale"):
@@ -374,13 +342,12 @@ class ModelBase:
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_bitnet(w(), s())
tensors_to_remove.append(name)
elif quant_method == "fp8":
block_size = quant_config.get("weight_block_size")
for name in self.model_tensors.keys():
if name.endswith(".weight_scale_inv"):
weight_name = name.removesuffix("_scale_inv")
w = self.model_tensors[weight_name]
s = self.model_tensors[name]
self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s())
tensors_to_remove.append(name)
elif quant_method == "gptq":
for name in self.model_tensors.keys():
@@ -404,49 +371,6 @@ class ModelBase:
".scales",
)
]
elif quant_method == "compressed-tensors":
quant_format = quant_config["format"]
groups = quant_config["config_groups"]
if len(groups) > 1:
raise NotImplementedError("Can't handle multiple config groups for compressed-tensors yet")
weight_config = tuple(groups.values())[0]["weights"]
if quant_format == "float-quantized" or quant_format == "int-quantized" or quant_format == "naive-quantized":
block_size = weight_config.get("block_structure", None)
strategy = weight_config.get("strategy")
assert strategy == "channel" or strategy == "block"
assert weight_config.get("group_size") is None # didn't find a model using this yet
for name in self.model_tensors.keys():
if name.endswith(".weight_scale"):
weight_name = name.removesuffix("_scale")
w = self.model_tensors[weight_name]
s = self.model_tensors[name]
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), block_size)
tensors_to_remove.append(name)
elif quant_format == "pack-quantized":
assert weight_config.get("strategy") == "group"
assert weight_config.get("type", "int") == "int"
num_bits = weight_config.get("num_bits")
group_size = weight_config.get("group_size")
assert isinstance(num_bits, int)
assert isinstance(group_size, int)
for name in self.model_tensors.keys():
if name.endswith(".weight_packed"):
base_name = name.removesuffix("_packed")
w = self.model_tensors[name]
scale = self.model_tensors[base_name + "_scale"]
shape = self.model_tensors[base_name + "_shape"]
zero_point = self.model_tensors.get(base_name + "_zero_point", lambda: None)
new_tensors[base_name] = (
lambda w=w, scale=scale, shape=shape, zero_point=zero_point: dequant_packed(
w(), scale(), shape(), zero_point(), num_bits, group_size,
)
)
tensors_to_remove += [base_name + n for n in ("_packed", "_shape", "_scale")]
if (base_name + "_zero_point") in self.model_tensors:
tensors_to_remove.append(base_name + "_zero_point")
else:
raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported")
else:
raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
@@ -818,12 +742,6 @@ class TextModel(ModelBase):
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used)
logger.info(f"gguf: experts used count = {n_experts_used}")
if (n_expert_groups := self.hparams.get("n_group")) is not None:
self.gguf_writer.add_expert_group_count(n_expert_groups)
logger.info(f"gguf: expert groups count = {n_expert_groups}")
if (n_group_used := self.hparams.get("topk_group")) is not None:
self.gguf_writer.add_expert_group_used_count(n_group_used)
logger.info(f"gguf: expert groups used count = {n_group_used}")
if (head_dim := self.hparams.get("head_dim")) is not None:
self.gguf_writer.add_key_length(head_dim)
@@ -1130,9 +1048,6 @@ class TextModel(ModelBase):
if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e":
# ref: https://huggingface.co/ibm-granite/granite-docling-258M
res = "granite-docling"
if chkhsh == "f4f37b6c8eb9ea29b3eac6bb8c8487c5ab7885f8d8022e67edc1c68ce8403e95":
# ref: https://huggingface.co/MiniMaxAI/MiniMax-M2
res = "minimax-m2"
if res is None:
logger.warning("\n")
@@ -1582,17 +1497,6 @@ class MmprojModel(ModelBase):
def set_type(self):
self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
def prepare_metadata(self, vocab_only: bool):
super().prepare_metadata(vocab_only=vocab_only)
output_type: str = self.ftype.name.partition("_")[2]
if self.fname_out.is_dir():
fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=output_type, model_type=None)
self.fname_out = self.fname_out / f"mmproj-{fname_default}.gguf"
else:
self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
def set_gguf_parameters(self):
self.gguf_writer.add_file_type(self.ftype)
@@ -1607,7 +1511,7 @@ class MmprojModel(ModelBase):
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads"]))
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
# preprocessor config
image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
@@ -2539,21 +2443,18 @@ class ArceeModel(LlamaModel):
)
class LlavaVisionModel(MmprojModel):
img_break_tok_id = -1
use_break_tok = True
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.hparams.get("model_type") == "pixtral":
# layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py
self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
if self.use_break_tok:
self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
elif self.is_mistral_format:
# hparams is already vision config here so norm_eps is only defined in global_config.
self.hparams["norm_eps"] = self.global_config.get("norm_eps", None)
assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json"
if self.use_break_tok:
self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
else:
raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
logger.info(f"Image break token id: {self.img_break_tok_id}")
@@ -3931,43 +3832,7 @@ class Qwen2MoeModel(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# process the experts separately
name = name.replace("language_model.", "") # InternVL
# handle aggregated expert tensors
# GGUF stores dimensions reversed from PyTorch, so:
# PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A}
# Input shapes from HF: (n_expert, n_ff_exp, n_embd) or (n_expert, n_embd, n_ff_exp)
# Expected GGML ne: {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down
if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"):
mapped = f"{name}.weight" if not name.endswith(".weight") else name
# Input: (n_expert=128, n_ff_exp=768, n_embd=2048)
# Want GGML ne: {n_ff_exp, n_embd, n_expert} = {768, 2048, 128}
# Need PyTorch: (128, 2048, 768) [reversed of GGML]
# So: permute(0, 2, 1): (128, 768, 2048) -> (128, 2048, 768)
permuted = data_torch.permute(0, 2, 1).contiguous()
return [(self.map_tensor_name(mapped), permuted)]
if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"):
if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0:
raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}")
split_dim = data_torch.shape[-1] // 2
gate = data_torch[..., :split_dim].contiguous()
up = data_torch[..., split_dim:].contiguous()
# Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768)
# Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128}
# Need PyTorch: (128, 768, 2048) [reversed of GGML]
# So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048)
base_name = name.removesuffix(".weight")
base = base_name.rsplit('.', 1)[0]
mapped_gate = f"{base}.gate_proj.weight"
mapped_up = f"{base}.up_proj.weight"
perm_gate = gate.permute(0, 2, 1).contiguous()
perm_up = up.permute(0, 2, 1).contiguous()
return [
(self.map_tensor_name(mapped_gate), perm_gate),
(self.map_tensor_name(mapped_up), perm_up),
]
if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") or name.startswith("model.visual"):
if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
# skip visual tensors
return []
if name.find("experts") != -1:
@@ -4080,10 +3945,6 @@ class Qwen3Model(Qwen2Model):
return torch.stack([true_row, false_row], dim=0)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if "model.vision_" in name:
# skip multimodal tensors
return []
if self.is_rerank:
is_tied_head = self.is_tied_embeddings and "embed_tokens" in name
is_real_head = not self.is_tied_embeddings and "lm_head" in name
@@ -4119,187 +3980,6 @@ class Qwen3MoeModel(Qwen2MoeModel):
super().set_vocab()
@ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration")
class Qwen3VLVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
# Compute image_size if not present
if "image_size" not in self.hparams_vision:
# For Qwen3VL/Qwen3VLMoe, compute from num_position_embeddings
num_pos = self.hparams_vision.get("num_position_embeddings", 2304)
patch_size = self.hparams_vision.get("patch_size", 16)
# num_position_embeddings = (image_size / patch_size) ** 2
# So image_size = sqrt(num_position_embeddings) * patch_size
image_size = int(num_pos**0.5 * patch_size)
self.hparams_vision["image_size"] = image_size
# Rename config values for compatibility
self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
self.is_deepstack_layers = [False] * int(self.hparams_vision["num_hidden_layers"] or 0)
for idx in self.hparams_vision.get("deepstack_visual_indexes", []):
self.is_deepstack_layers[idx] = True
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)
self.gguf_writer.add_vision_use_gelu(True)
if self.hparams_vision is not None:
merge_size = self.hparams_vision.get("spatial_merge_size")
if merge_size is not None:
self.gguf_writer.add_vision_spatial_merge_size(int(merge_size))
# Use text config's rms_norm_eps for vision attention layernorm eps
rms_norm_eps = self.global_config.get("text_config", {}).get("rms_norm_eps", 1e-6)
self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps)
if self.is_deepstack_layers:
self.gguf_writer.add_vision_is_deepstack_layers(self.is_deepstack_layers)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
assert self.hparams_vision is not None
# Skip text model tensors - they go in the text model file
if name.startswith("model.language_model.") or name.startswith("lm_head."):
return []
if name.startswith("model.visual."):
name = name.replace("model.visual.", "visual.", 1)
if name.startswith("visual.deepstack_merger_list."):
prefix, rest = name.split(".", maxsplit=3)[2:]
# prefix is the layer index, convert to absolute clip layer index!
idx = self.hparams_vision.get("deepstack_visual_indexes", [])[int(prefix)]
target = rest
tensor_type: gguf.MODEL_TENSOR
if target.startswith("norm."):
tensor_type = gguf.MODEL_TENSOR.V_DS_NORM
suffix = target.split(".", 1)[1]
elif target.startswith("linear_fc1."):
tensor_type = gguf.MODEL_TENSOR.V_DS_FC1
suffix = target.split(".", 1)[1]
elif target.startswith("linear_fc2."):
tensor_type = gguf.MODEL_TENSOR.V_DS_FC2
suffix = target.split(".", 1)[1]
else:
raise ValueError(f"Unexpected deepstack tensor: {name}")
new_name = self.format_tensor_name(tensor_type, idx, suffix=f".{suffix}")
return [(new_name, data_torch)]
if name.startswith("visual.merger."):
suffix = name.split(".", 2)[2]
if suffix.startswith("linear_fc"):
fc_idx_str, tail = suffix.split(".", 1)
fc_num = int(fc_idx_str.replace("linear_fc", ""))
# Qwen3VL has linear_fc1 and linear_fc2
# Map to indices 0 and 2 (matching Qwen2VL which uses indices 0 and 2)
if fc_num == 1:
fc_idx = 0
elif fc_num == 2:
fc_idx = 2
else:
raise ValueError(f"unexpected fc index {fc_num} in {name}")
new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, fc_idx, suffix=f".{tail}")
elif suffix.startswith("norm."):
new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_POST_NORM, suffix=f".{suffix.split('.', 1)[1]}")
else:
raise ValueError(f"Unexpected merger tensor: {name}")
return [(new_name, data_torch)]
if name == "visual.patch_embed.proj.weight":
# split Conv3D into Conv2Ds along temporal dimension
c1, c2, kt, _, _ = data_torch.shape
del c1, c2
if kt != 2:
raise ValueError("Current implementation only supports temporal_patch_size of 2")
return [
(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...]),
(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]),
]
if name == "visual.patch_embed.proj.bias":
# Include the bias - it's used by the C++ code
return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".bias", data_torch)]
if name.startswith("visual."):
return [(self.map_tensor_name(name), data_torch)]
# Fall back to parent class for other tensors
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Qwen3VLForConditionalGeneration")
class Qwen3VLTextModel(Qwen3Model):
model_arch = gguf.MODEL_ARCH.QWEN3VL
def set_gguf_parameters(self):
super().set_gguf_parameters()
# Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
text_config = self.hparams.get("text_config", {})
# rope_scaling is deprecated in V5, use rope_parameters instead
rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}
if rope_scaling.get("mrope_section"):
# mrope_section contains [time, height, width] dimensions
mrope_section = rope_scaling["mrope_section"]
# Pad to 4 dimensions [time, height, width, extra]
while len(mrope_section) < 4:
mrope_section.append(0)
self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
logger.info(f"MRoPE sections: {mrope_section[:4]}")
vision_config = self.hparams.get("vision_config", {})
deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Skip vision tensors - they go in the mmproj file
if name.startswith("model.visual."):
return []
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
class Qwen3VLMoeTextModel(Qwen3MoeModel):
model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
def set_gguf_parameters(self):
super().set_gguf_parameters()
# Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
text_config = self.hparams.get("text_config", {})
# rope_scaling is deprecated in V5, use rope_parameters instead
rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}
if rope_scaling.get("mrope_section"):
# mrope_section contains [time, height, width] dimensions
mrope_section = rope_scaling["mrope_section"]
# Pad to 4 dimensions [time, height, width, extra]
while len(mrope_section) < 4:
mrope_section.append(0)
self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
logger.info(f"MRoPE sections: {mrope_section[:4]}")
vision_config = self.hparams.get("vision_config", {})
deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Skip vision tensors - they go in the mmproj file
if name.startswith("model.visual."):
return []
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("GPT2LMHeadModel")
class GPT2Model(TextModel):
model_arch = gguf.MODEL_ARCH.GPT2
@@ -7205,100 +6885,6 @@ class DeepseekV2Model(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("MiniMaxM2ForCausalLM")
class MiniMaxM2Model(TextModel):
model_arch = gguf.MODEL_ARCH.MINIMAXM2
_experts_cache: dict[int, dict[str, Tensor]] = {}
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.hparams["num_experts"] = self.hparams["num_local_experts"]
def set_gguf_parameters(self):
super().set_gguf_parameters()
if self.hparams["scoring_func"] == "sigmoid":
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
elif self.hparams["scoring_func"] == "softmax":
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
else:
raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
self.gguf_writer.add_expert_feed_forward_length(self.find_hparam(["intermediate_size"]))
self.gguf_writer.add_rope_dimension_count(self.find_hparam(["rotary_dim"]))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
if name.endswith("e_score_correction_bias"):
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
# merge expert weights
if 'experts' in name:
n_experts = self.hparams["num_experts"]
assert bid is not None
expert_cache = self._experts_cache.setdefault(bid, {})
expert_cache[name] = data_torch
expert_weights = ["w1", "w2", "w3"]
# not enough expert weights to merge
if len(expert_cache) < n_experts * len(expert_weights):
return []
tensors: list[tuple[str, Tensor]] = []
for w_name in expert_weights:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
datas.append(expert_cache[ename])
del expert_cache[ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
del self._experts_cache[bid]
return tensors
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("PanguEmbeddedForCausalLM")
class PanguEmbeddedModel(TextModel):
model_arch = gguf.MODEL_ARCH.PANGU_EMBED
def set_vocab(self):
self._set_vocab_sentencepiece()
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
if tokenizer_config_file.is_file():
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
tokenizer_config_json = json.load(f)
if "add_prefix_space" in tokenizer_config_json:
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
# PanguEmbedded's hparam loaded from config.json without head_dim
if (rope_dim := hparams.get("head_dim")) is None:
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(rope_dim)
if hparams.get("head_dim") is None:
self.gguf_writer.add_key_length(rope_dim)
self.gguf_writer.add_value_length(rope_dim)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name == "lm_head.weight":
if self.hparams.get("tie_word_embeddings", False):
logger.info("Skipping tied output layer 'lm_head.weight'")
return []
return [(self.map_tensor_name(name), data_torch)]
@ModelBase.register("Dots1ForCausalLM")
class Dots1Model(Qwen2MoeModel):
model_arch = gguf.MODEL_ARCH.DOTS1
@@ -7354,7 +6940,6 @@ class PLMModel(TextModel):
@ModelBase.register("T5ForConditionalGeneration")
@ModelBase.register("MT5ForConditionalGeneration")
@ModelBase.register("UMT5ForConditionalGeneration")
@ModelBase.register("UMT5Model")
class T5Model(TextModel):
model_arch = gguf.MODEL_ARCH.T5
@@ -8637,6 +8222,8 @@ class BailingMoeV2Model(TextModel):
self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
self.gguf_writer.add_expert_count(hparams["num_experts"])
self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
self.gguf_writer.add_expert_group_count(hparams["n_group"])
self.gguf_writer.add_expert_group_used_count(hparams["topk_group"])
self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
if hparams["score_function"] == "sigmoid":
@@ -9356,13 +8943,6 @@ class SmolLM3Model(LlamaModel):
class GptOssModel(TextModel):
model_arch = gguf.MODEL_ARCH.GPT_OSS
# TODO: remove once MXFP4 is supported more generally
def dequant_model(self):
quant_config = self.hparams.get("quantization_config")
if quant_config is not None and quant_config.get("quant_method") == "mxfp4":
return
return super().dequant_model()
def transform_nibble_layout(self, tensor):
assert tensor.dtype == torch.uint8
assert tensor.shape[-1] == 16
@@ -9833,21 +9413,6 @@ class PixtralModel(LlavaVisionModel):
return super().map_tensor_name(name, try_suffixes)
@ModelBase.register("LightOnOCRForConditionalGeneration")
class LightOnOCRVisionModel(LlavaVisionModel):
is_mistral_format = False
use_break_tok = False
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LIGHTONOCR)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
name = name.replace("model.vision_encoder.", "vision_tower.")
name = name.replace("model.vision_projection.", "multi_modal_projector.")
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("KimiVLForConditionalGeneration")
class KimiVLModel(MmprojModel):
def __init__(self, *args, **kwargs):
@@ -9884,144 +9449,6 @@ class KimiVLModel(MmprojModel):
return [] # skip other tensors
@ModelBase.register("CogVLMForCausalLM")
class CogVLMVisionModel(MmprojModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
if not name.startswith("model.vision."):
return []
return [(self.map_tensor_name(name), data_torch)]
@ModelBase.register("CogVLMForCausalLM")
class CogVLMModel(LlamaModel):
model_arch = gguf.MODEL_ARCH.COGVLM
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
# block vision tensors
if name.startswith("model.vision."):
return []
return [(self.map_tensor_name(name), data_torch)]
@ModelBase.register("JanusForConditionalGeneration")
class JanusProModel(LlamaModel):
model_arch = gguf.MODEL_ARCH.LLAMA # reuse Llama arch
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Skip vision, aligner, and generation tensors
skip_prefixes = (
'model.vision_model.',
'model.aligner.',
'model.vqmodel.',
'model.generation_embeddings.',
'model.generation_aligner.',
'model.generation_head.',
)
if name.startswith(skip_prefixes):
return []
if name.startswith('model.language_model.'):
name = name.replace('model.language_model.', 'model.')
elif name.startswith('language_model.'):
name = name.replace('language_model.', '')
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("JanusForConditionalGeneration")
class JanusProVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
if "intermediate_size" not in self.hparams_vision:
mlp_ratio = self.hparams_vision.get("mlp_ratio")
hidden_size = self.hparams_vision.get("hidden_size")
if mlp_ratio is not None and hidden_size is not None:
self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))
def set_gguf_parameters(self):
super().set_gguf_parameters()
assert self.hparams_vision is not None
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JANUS_PRO)
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
if hidden_act == "gelu":
self.gguf_writer.add_vision_use_gelu(True)
elif hidden_act == "silu":
self.gguf_writer.add_vision_use_silu(True)
def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[str, Tensor]]:
"""Map aligner tensors to projector format"""
suffix = ".bias" if name.endswith(".bias") else ".weight"
if name.startswith("model.aligner."):
local_name = name[len("model.aligner."):]
elif name.startswith("aligner."):
local_name = name[len("aligner."):]
else:
raise ValueError(f"Unsupported Janus aligner prefix: {name}")
if local_name.startswith("fc1."):
mm_index = 0
elif local_name.startswith("hidden_layers."):
parts = local_name.split(".", 2)
if len(parts) < 3:
raise ValueError(f"Unexpected Janus aligner tensor name: {name}")
mm_index = int(parts[1]) + 1
else:
raise ValueError(f"Unsupported Janus aligner tensor: {name}")
tensor_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_index, suffix=suffix)
return [(tensor_name, data_torch)]
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
# Skip language model tensors as they will be handled by `JanusProModel`
if name.startswith(('model.language_model.', 'language_model.')):
return []
# Skip generation-related components
skip_generation_prefixes = (
'model.vqmodel.',
'vqmodel.',
'model.generation_embeddings.',
'generation_embeddings.',
'model.generation_aligner.',
'generation_aligner.',
'model.generation_head.',
'generation_head.',
)
if name.startswith(skip_generation_prefixes):
return []
# Handle aligner tensors
if name.startswith(('model.aligner.', 'aligner.')):
return list(self._map_aligner_tensor(data_torch, name))
# Handle vision tensors
if name.startswith(('model.vision_model.', 'vision_model.')):
return [(self.map_tensor_name(name), data_torch)]
return []
###### CONVERSION LOGIC ######
@@ -10079,16 +9506,6 @@ class LazyTorchTensor(gguf.LazyBase):
lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[...] if len(s.get_shape()) == 0 else s[:])
return cast(torch.Tensor, lazy)
@classmethod
def from_local_tensor(cls, t: gguf.utility.LocalTensor) -> Tensor:
def load_tensor(tensor: gguf.utility.LocalTensor) -> Tensor:
dtype = cls._dtype_str_map[tensor.dtype]
return torch.from_numpy(tensor.mmap_bytes()).view(dtype).reshape(tensor.shape)
dtype = cls._dtype_str_map[t.dtype]
shape = t.shape
lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(t,), func=lambda r: load_tensor(r))
return cast(torch.Tensor, lazy)
@classmethod
def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor):
dtype = cls._dtype_str_map[remote_tensor.dtype]
@@ -10305,6 +9722,10 @@ def main() -> None:
logger.info(f"Loading model: {dir_model.name}")
if args.mmproj:
if "mmproj" not in fname_out.name:
fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-")
is_mistral_format = args.mistral_format
if is_mistral_format and not _mistral_common_installed:
raise ImportError(_mistral_import_error_msg)

View File

@@ -141,7 +141,6 @@ models = [
{"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
{"name": "bailingmoe2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-mini-base-2.0", },
{"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
{"name": "minimax-m2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", },
]
# some models are known to be broken upstream, so we will skip them as exceptions
@@ -436,7 +435,7 @@ for model in models:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
else:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except (OSError, TypeError) as e:
except OSError as e:
logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
continue # Skip this model and continue with the next one in the loop

View File

@@ -39,23 +39,18 @@ The llama.cpp OpenCL backend is designed to enable llama.cpp on **Qualcomm Adren
| Adreno 830 (Snapdragon 8 Elite) | Support |
| Adreno X85 (Snapdragon X Elite) | Support |
> A6x GPUs with a recent driver and compiler are supported; they are usually found in IoT platforms.
However, A6x GPUs in phones are likely not supported due to the outdated driver and compiler.
## DataType Supports
| DataType | Status |
|:----------------------:|:--------------------------:|
| Q4_0 | Support |
| Q6_K | Support, but not optimized |
| Q8_0 | Support |
| MXFP4 | Support |
## Model Preparation
You can refer to the general [llama-quantize tool](/tools/quantize/README.md) for steps to convert a model in Hugging Face safetensor format to GGUF with quantization.
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration.
Currently we support `Q4_0` quantization and have optimized for it. To achieve best performance on Adreno GPU, add `--pure` to `llama-quantize` (i.e., make all weights in `Q4_0`). For example,
Currently we support `Q4_0` quantization and have optimize for it. To achieve best performance on Adreno GPU, add `--pure` to `llama-quantize`. For example,
```sh
./llama-quantize --pure ggml-model-qwen2.5-3b-f16.gguf ggml-model-qwen-3b-Q4_0.gguf Q4_0
@@ -63,17 +58,6 @@ Currently we support `Q4_0` quantization and have optimized for it. To achieve b
Since `Q6_K` is also supported, `Q4_0` quantization without `--pure` will also work. However, the performance will be worse compared to pure `Q4_0` quantization.
### `MXFP4` MoE Models
OpenAI gpt-oss models are MoE models in `MXFP4`. The quantized model will be in `MXFP4_MOE`, a mixture of `MXFP4` and `Q8_0`.
For this quantization, there is no need to specify `--pure`.
For gpt-oss-20b model, you can directly [download](https://huggingface.co/ggml-org/gpt-oss-20b-GGUF) the quantized GGUF file in `MXFP4_MOE` from Hugging Face.
Although it is possible to quantize gpt-oss-20b model in pure `Q4_0` (all weights in `Q4_0`), it is not recommended since `MXFP4` has been optimized for MoE while `Q4_0` is not. In addition, accuracy should degrade with such pure `Q4_0` quantization.
Hence, using the default `MXFP4_MOE` quantization (see the link above) is recommended for this model.
> Note that the `Q4_0` model found [here](https://huggingface.co/unsloth/gpt-oss-20b-GGUF/blob/main/gpt-oss-20b-Q4_0.gguf) is a mixture of `Q4_0`, `Q8_0` and `MXFP4` and gives better performance than `MXFP4_MOE` quantization.
## CMake Options
The OpenCL backend has the following CMake options that control the behavior of the backend.
@@ -162,13 +146,10 @@ A Snapdragon X Elite device with Windows 11 Arm64 is used. Make sure the followi
* Ninja
* Visual Studio 2022
* Powershell 7
* Python
Visual Studio provides necessary headers and libraries although it is not directly used for building.
Alternatively, Visual Studio Build Tools can be installed instead of the full Visual Studio.
> Note that building using Visual Studio's cl compiler is not supported. Clang must be used. Clang depends on libraries provided by Visual Studio to work. Therefore, Visual Studio must be installed. Alternatively, Visual Studio Build Tools can be installed instead of the full Visual Studio.
Powershell 7 is used for the following commands.
If an older version of Powershell is used, these commands may not work as they are.
@@ -220,12 +201,9 @@ ninja
## Known Issues
- Flash attention does not always improve performance.
- Currently OpenCL backend works on A6xx GPUs with recent drivers and compilers (usually found in IoT platforms).
However, it does not work on A6xx GPUs found in phones with old drivers and compilers.
- Currently OpenCL backend does not work on Adreno 6xx GPUs.
## TODO
- Optimization for Q6_K
- Support and optimization for Q4_K
- Improve flash attention

View File

@@ -178,48 +178,6 @@ GeForce RTX 3070 8.6
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86;89"
```
### Overriding the CUDA Version
If you have multiple CUDA installations on your system and want to compile llama.cpp for a specific one, e.g. for CUDA 11.7 installed under `/opt/cuda-11.7`:
```bash
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/opt/cuda-11.7/bin/nvcc -DCMAKE_INSTALL_RPATH="/opt/cuda-11.7/lib64;\$ORIGIN" -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
```
#### Fixing Compatibility Issues with Old CUDA and New glibc
If you try to use an old CUDA version (e.g. v11.7) with a new glibc version you can get errors like this:
```
/usr/include/bits/mathcalls.h(83): error: exception specification is
incompatible with that of previous function "cospi"
/opt/cuda-11.7/bin/../targets/x86_64-linux/include/crt/math_functions.h(5545):
here
```
It seems the least bad solution is to patch the CUDA installation to declare the correct signatures.
Replace the following lines in `/path/to/your/cuda/installation/targets/x86_64-linux/include/crt/math_functions.h`:
```C++
// original lines
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double cospi(double x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float cospif(float x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double sinpi(double x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float sinpif(float x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double rsqrt(double x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float rsqrtf(float x);
// edited lines
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double cospi(double x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float cospif(float x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double sinpi(double x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float sinpif(float x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double rsqrt(double x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float rsqrtf(float x) noexcept (true);
```
### Runtime CUDA environmental variables
You may set the [cuda environmental variables](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) at runtime.
@@ -303,12 +261,10 @@ You can download it from your Linux distro's package manager or from here: [ROCm
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIP=ON -DGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build --config Release -- -j 16
```
Note: `GPU_TARGETS` is optional, omitting it will build the code for all GPUs in the current system.
To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
The rocWMMA library is included by default when installing the ROCm SDK using the `rocm` meta package provided by AMD. Alternatively, if you are not using the meta package, you can install the library using the `rocwmma-dev` or `rocwmma-devel` package, depending on your system's package manager.
@@ -326,17 +282,17 @@ You can download it from your Linux distro's package manager or from here: [ROCm
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
cmake -S . -B build -DGGML_HIP=ON -DGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build -- -j 16
```
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
```bash
set PATH=%HIP_PATH%\bin;%PATH%
cmake -S . -B build -G Ninja -DGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
cmake --build build
```
If necessary, adapt `GPU_TARGETS` to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.

View File

@@ -7,9 +7,9 @@
## Images
We have three Docker images available for this project:
1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
Additionally, there the following images, similar to the above:

View File

@@ -22,11 +22,11 @@ Legend:
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | ❌ | ❌ |
| CEIL | ❌ | ❌ | ✅ | | ❌ | ❌ | ✅ | ❌ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | | ✅ | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | | ❌ | ✅ | ❌ | ✅ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| CONV_3D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
@@ -42,7 +42,7 @@ Legend:
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | ❌ | ❌ |
| FLOOR | ❌ | ❌ | ✅ | | ❌ | ❌ | ✅ | ❌ | ❌ |
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
@@ -79,12 +79,12 @@ Legend:
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ |
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | | ✅ | ❌ |
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | | ✅ | ❌ |
| RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | ❌ | ❌ |
| ROUND | ❌ | ❌ | ✅ | | ❌ | ❌ | ✅ | ❌ | ❌ |
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
@@ -111,6 +111,6 @@ Legend:
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| TOPK_MOE | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | ❌ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | | ❌ | ❌ | ✅ | ❌ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |
| XIELU | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |

File diff suppressed because it is too large Load Diff

View File

@@ -5637,25 +5637,25 @@
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000000,inplace=0","support","1","yes","SYCL"
"SYCL0","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000000","support","1","yes","SYCL"
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000000,inplace=0","support","1","yes","SYCL"
"SYCL0","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000000","support","1","yes","SYCL"
"SYCL0","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000000","support","0","no","SYCL"
"SYCL0","L2_NORM","type=f32,ne=[64,5,4,3]","support","1","yes","SYCL"
"SYCL0","NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001","support","1","yes","SYCL"
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=0","support","1","yes","SYCL"
"SYCL0","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000001","support","1","yes","SYCL"
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000001,inplace=0","support","1","yes","SYCL"
"SYCL0","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000001","support","1","yes","SYCL"
"SYCL0","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000001","support","0","no","SYCL"
"SYCL0","L2_NORM","type=f32,ne=[64,5,4,3]","support","1","yes","SYCL"
"SYCL0","NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000100","support","1","yes","SYCL"
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000100,inplace=0","support","1","yes","SYCL"
"SYCL0","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000100","support","1","yes","SYCL"
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000100,inplace=0","support","1","yes","SYCL"
"SYCL0","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000100","support","1","yes","SYCL"
"SYCL0","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000100","support","0","no","SYCL"
"SYCL0","L2_NORM","type=f32,ne=[64,5,4,3]","support","1","yes","SYCL"
"SYCL0","NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.100000","support","1","yes","SYCL"
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.100000,inplace=0","support","1","yes","SYCL"
"SYCL0","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.100000","support","1","yes","SYCL"
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.100000,inplace=0","support","1","yes","SYCL"
"SYCL0","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.100000","support","1","yes","SYCL"
"SYCL0","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.100000","support","0","no","SYCL"
"SYCL0","L2_NORM","type=f32,ne=[64,5,4,3]","support","1","yes","SYCL"
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=1","support","1","yes","SYCL"
"SYCL0","RMS_NORM_MUL_ADD","type=f32,ne=[64,5,4,3],eps=0.000000,broadcast=0,multi_add=0","support","1","yes","SYCL"
@@ -9307,37 +9307,37 @@
"SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=1","support","1","yes","SYCL"
"SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=1","support","1","yes","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","0","no","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[16,10,10,10],order=0","support","1","yes","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[60,10,10,10],order=0","support","1","yes","SYCL"
Can't render this file because it is too large.

View File

@@ -38,7 +38,6 @@ The above command will output space-separated float values.
| | multiple embeddings | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
| 'json' | openai style |
| 'json+' | add cosine similarity matrix |
| 'raw' | plain text output |
### --embd-separator $"string"$
| $"string"$ | |

View File

@@ -70,29 +70,6 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
}
}
// plain, pipe-friendly output: one embedding per line
static void print_raw_embeddings(const float * emb,
int n_embd_count,
int n_embd,
const llama_model * model,
enum llama_pooling_type pooling_type,
int embd_normalize) {
const uint32_t n_cls_out = llama_model_n_cls_out(model);
const bool is_rank = (pooling_type == LLAMA_POOLING_TYPE_RANK);
const int cols = is_rank ? std::min<int>(n_embd, (int) n_cls_out) : n_embd;
for (int j = 0; j < n_embd_count; ++j) {
for (int i = 0; i < cols; ++i) {
if (embd_normalize == 0) {
LOG("%1.0f%s", emb[j * n_embd + i], (i + 1 < cols ? " " : ""));
} else {
LOG("%1.7f%s", emb[j * n_embd + i], (i + 1 < cols ? " " : ""));
}
}
LOG("\n");
}
}
int main(int argc, char ** argv) {
common_params params;
@@ -395,8 +372,6 @@ int main(int argc, char ** argv) {
}
if (notArray) LOG("\n}\n");
} else if (params.embd_out == "raw") {
print_raw_embeddings(emb, n_embd_count, n_embd, model, pooling_type, params.embd_normalize);
}
LOG("\n");

View File

@@ -184,13 +184,8 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
const char * name = gguf_get_tensor_name (ctx, i);
const size_t size = gguf_get_tensor_size (ctx, i);
const size_t offset = gguf_get_tensor_offset(ctx, i);
const auto type = gguf_get_tensor_type (ctx, i);
const char * type_name = ggml_type_name(type);
const size_t type_size = ggml_type_size(type);
const size_t n_elements = size / type_size;
printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu, type = %s, n_elts = %zu\n", __func__, i, name, size, offset, type_name, n_elements);
printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
}
}

View File

@@ -371,17 +371,8 @@ class SchemaConverter:
raise ValueError(f'Unsupported ref {ref}')
for sel in ref.split('#')[-1].split('/')[1:]:
assert target is not None, f'Error resolving ref {ref}: {sel} not in {target}'
if isinstance(target, list):
try:
sel_index = int(sel)
except ValueError:
raise ValueError(f'Error resolving ref {ref}: {sel} not in {target}')
assert 0 <= sel_index < len(target), f'Error resolving ref {ref}: {sel} not in {target}'
target = target[sel_index]
else:
assert sel in target, f'Error resolving ref {ref}: {sel} not in {target}'
target = target[sel]
assert target is not None and sel in target, f'Error resolving ref {ref}: {sel} not in {target}'
target = target[sel]
self._refs[ref] = target
else:
@@ -556,8 +547,7 @@ class SchemaConverter:
def _resolve_ref(self, ref):
ref_fragment = ref.split('#')[-1]
ref_name = 'ref' + re.sub(r'[^a-zA-Z0-9-]+', '-', ref_fragment)
ref_name = ref.split('/')[-1]
if ref_name not in self._rules and ref not in self._refs_being_resolved:
self._refs_being_resolved.add(ref)
resolved = self._refs[ref]

View File

@@ -138,10 +138,7 @@ if model_path is None:
"Model path must be specified either via --model-path argument or MODEL_PATH environment variable"
)
print("Loading model and tokenizer using AutoTokenizer:", model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_path)
print("Model type: ", config.model_type)
print("Vocab size: ", config.vocab_size)
@@ -150,6 +147,10 @@ print("Number of layers: ", config.num_hidden_layers)
print("BOS token id: ", config.bos_token_id)
print("EOS token id: ", config.eos_token_id)
print("Loading model and tokenizer using AutoTokenizer:", model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
unreleased_module_path = (
@@ -170,7 +171,7 @@ if unreleased_model_name:
exit(1)
else:
model = AutoModelForCausalLM.from_pretrained(
model_path, device_map="auto", offload_folder="offload", trust_remote_code=True, config=config
model_path, device_map="auto", offload_folder="offload"
)
for name, module in model.named_modules():

View File

@@ -168,7 +168,7 @@ option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
option(GGML_VXE "ggml: enable vxe" ${GGML_NATIVE})
option(GGML_VXE "ggml: enable vxe" ON)
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")

View File

@@ -242,7 +242,6 @@
#define GGML_ROPE_TYPE_NEOX 2
#define GGML_ROPE_TYPE_MROPE 8
#define GGML_ROPE_TYPE_VISION 24
#define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000
#define GGML_MROPE_SECTIONS 4
@@ -2108,7 +2107,6 @@ extern "C" {
enum ggml_scale_mode {
GGML_SCALE_MODE_NEAREST = 0,
GGML_SCALE_MODE_BILINEAR = 1,
GGML_SCALE_MODE_BICUBIC = 2,
GGML_SCALE_MODE_COUNT
};

View File

@@ -211,11 +211,6 @@ add_library(ggml-base
ggml-quants.h
gguf.cpp)
set_target_properties(ggml-base PROPERTIES
VERSION ${GGML_VERSION}
SOVERSION ${GGML_VERSION_MAJOR}
)
target_include_directories(ggml-base PRIVATE .)
if (GGML_BACKEND_DL)
target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
@@ -225,11 +220,6 @@ add_library(ggml
ggml-backend-reg.cpp)
add_library(ggml::ggml ALIAS ggml)
set_target_properties(ggml PROPERTIES
VERSION ${GGML_VERSION}
SOVERSION ${GGML_VERSION_MAJOR}
)
if (GGML_BACKEND_DIR)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_BACKEND_DIR requires GGML_BACKEND_DL")
@@ -269,12 +259,6 @@ function(ggml_add_backend_library backend)
target_compile_definitions(${backend} PUBLIC GGML_BACKEND_SHARED)
endif()
# Set versioning properties for all backend libraries
set_target_properties(${backend} PROPERTIES
VERSION ${GGML_VERSION}
SOVERSION ${GGML_VERSION_MAJOR}
)
if(NOT GGML_AVAILABLE_BACKENDS)
set(GGML_AVAILABLE_BACKENDS "${backend}"
CACHE INTERNAL "List of backends for cmake package")
@@ -324,10 +308,6 @@ function(ggml_add_cpu_backend_variant tag_name)
set(GGML_INTERNAL_${feat} ON)
endforeach()
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
foreach (feat VXE2 NNPA)
set(GGML_INTERNAL_${feat} OFF)
endforeach()
foreach (feat ${ARGN})
set(GGML_INTERNAL_${feat} ON)
endforeach()
@@ -397,8 +377,9 @@ if (GGML_CPU_ALL_VARIANTS)
endif()
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
ggml_add_cpu_backend_variant(z15 Z15 VXE2)
ggml_add_cpu_backend_variant(z16 Z16 VXE2 NNPA)
ggml_add_cpu_backend_variant(s390x_z15 Z15 VXE)
# ggml_add_cpu_backend_variant(s390x_z16 Z16 VXE)
# ggml_add_cpu_backend_variant(s390x_z17 Z17 VXE)
else()
message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
endif()

View File

@@ -226,23 +226,16 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
}
if (best_fit_block == -1) {
// no suitable block found, try the last block (this may grow a chunks size)
int64_t best_reuse = INT64_MIN;
// no suitable block found, try the last block (this will grow a chunks size)
for (int c = 0; c < alloc->n_chunks; ++c) {
struct tallocr_chunk * chunk = alloc->chunks[c];
if (chunk->n_free_blocks > 0) {
struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
max_avail = MAX(max_avail, block->size);
int64_t reuse_factor = chunk->max_size - block->offset - size;
// reuse_factor < 0 : amount of extra memory that needs to be allocated
// reuse_factor = 0 : allocated free space exactly matches tensor size
// reuse_factor > 0 : superfluous memory that will remain unused
bool better_reuse = best_reuse < 0 && reuse_factor > best_reuse;
bool better_fit = reuse_factor >= 0 && reuse_factor < best_reuse;
if (block->size >= size && (better_reuse || better_fit)) {
if (block->size >= size) {
best_fit_chunk = c;
best_fit_block = chunk->n_free_blocks - 1;
best_reuse = reuse_factor;
break;
}
}
}
@@ -275,7 +268,7 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
#ifdef GGML_ALLOCATOR_DEBUG
add_allocated_tensor(alloc, addr, tensor);
size_t cur_max = addr.offset + size;
if (cur_max > chunk->max_size) {
if (cur_max > alloc->max_size[addr.chunk]) {
// sort allocated_tensors by chunk/offset
for (int i = 0; i < 1024; i++) {
for (int j = i + 1; j < 1024; j++) {

View File

@@ -2234,7 +2234,7 @@ static void aclnn_cache_init(ggml_backend_cann_context & ctx,
ACL_MEM_MALLOC_HUGE_FIRST));
acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
theta_scale_ne, theta_scale_nb, 1);
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
float start = 0;
float step = 1;
@@ -2251,7 +2251,7 @@ static void aclnn_cache_init(ggml_backend_cann_context & ctx,
yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
void * yarn_ramp_buffer = yarn_ramp_allocator.get();
acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float), theta_scale_ne,
theta_scale_nb, 1);
theta_scale_nb, GGML_MAX_DIMS);
float zero_value = 0, one_value = 1;
float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
aclScalar * low = aclCreateScalar(&corr_dims[0], aclDataType::ACL_FLOAT);

View File

@@ -67,30 +67,19 @@
GGML_ABORT("CANN error");
}
// Thread-local variable to record the current device of this thread.
thread_local int g_current_cann_device = -1;
/**
* @brief Set the CANN device to be used.
* @brief Sets the device to be used by CANN.
*
* @param device The target device ID to set.
* @param device The device ID to set.
*/
void ggml_cann_set_device(const int32_t device) {
// int current_device = -1;
// Note: In some CANN versions, if no device has been set yet,
// aclrtGetDevice(&current_device) may return 0 by default.
// aclrtGetDevice(&current_device);
int current_device = -1;
aclrtGetDevice(&current_device);
// If the current device is already the target one, no need to switch.
if (device == g_current_cann_device) {
if (device == current_device) {
return;
}
// Switch to the new device.
ACL_CHECK(aclrtSetDevice(device));
// Update the global device record.
g_current_cann_device = device;
}
/**

View File

@@ -126,36 +126,25 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
)
if (NOT ARM_MCPU_RESULT)
string(REGEX MATCH "-mcpu=[^ ']+" ARM_MCPU_FLAG "${ARM_MCPU}")
string(REGEX MATCH "-march=[^ ']+" ARM_MARCH_FLAG "${ARM_MCPU}")
# on some old GCC we need to read -march=
if (ARM_MARCH_FLAG AND NOT "${ARM_MARCH_FLAG}" STREQUAL "-march=native")
set(ARM_NATIVE_FLAG "${ARM_MARCH_FLAG}")
elseif(ARM_MCPU_FLAG AND NOT "${ARM_MCPU_FLAG}" STREQUAL "-mcpu=native")
set(ARM_NATIVE_FLAG "${ARM_MCPU_FLAG}")
endif()
endif()
if ("${ARM_NATIVE_FLAG}" STREQUAL "")
set(ARM_NATIVE_FLAG -mcpu=native)
message(WARNING "ARM -march/-mcpu not found, -mcpu=native will be used")
else()
message(STATUS "ARM detected flags: ${ARM_NATIVE_FLAG}")
if ("${ARM_MCPU_FLAG}" STREQUAL "")
set(ARM_MCPU_FLAG -mcpu=native)
message(STATUS "ARM -mcpu not found, -mcpu=native will be used")
endif()
include(CheckCXXSourceRuns)
function(check_arm_feature tag code)
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
set(CMAKE_REQUIRED_FLAGS "${ARM_NATIVE_FLAG}+${tag}")
set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+${tag}")
check_cxx_source_runs("${code}" GGML_MACHINE_SUPPORTS_${tag})
if (GGML_MACHINE_SUPPORTS_${tag})
set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+${tag}" PARENT_SCOPE)
set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+${tag}" PARENT_SCOPE)
else()
set(CMAKE_REQUIRED_FLAGS "${ARM_NATIVE_FLAG}+no${tag}")
set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+no${tag}")
check_cxx_source_compiles("int main() { return 0; }" GGML_MACHINE_SUPPORTS_no${tag})
if (GGML_MACHINE_SUPPORTS_no${tag})
set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+no${tag}" PARENT_SCOPE)
set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+no${tag}" PARENT_SCOPE)
endif()
endif()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
@@ -166,7 +155,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
check_arm_feature(sve "#include <arm_sve.h>\nint main() { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
check_arm_feature(sme "#include <arm_sme.h>\n__arm_locally_streaming int main() { __asm__ volatile(\"smstart; smstop;\"); return 0; }")
list(APPEND ARCH_FLAGS "${ARM_NATIVE_FLAG}${ARM_NATIVE_FLAG_FIX}")
list(APPEND ARCH_FLAGS "${ARM_MCPU_FLAG}${ARM_MCPU_FLAG_FIX}")
else()
if (GGML_CPU_ARM_ARCH)
list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
@@ -515,18 +504,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endforeach()
endif()
if (GGML_VXE OR GGML_INTERNAL_VXE2)
message(STATUS "VXE2 enabled")
if (GGML_VXE OR GGML_INTERNAL_VXE)
message(STATUS "VX/VXE/VXE2 enabled")
list(APPEND ARCH_FLAGS -mvx -mzvector)
list(APPEND ARCH_DEFINITIONS GGML_USE_VXE2)
list(APPEND ARCH_DEFINITIONS GGML_VXE)
endif()
if (GGML_INTERNAL_NNPA)
message(STATUS "NNPA enabled")
list(APPEND ARCH_DEFINITIONS GGML_USE_NNPA)
endif()
ggml_add_cpu_backend_features(${GGML_CPU_NAME} s390 ${ARCH_DEFINITIONS})
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
message(STATUS "Wasm detected")
list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
@@ -590,7 +572,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
${KLEIDIAI_SRC}/kai/ukernels/
${KLEIDIAI_SRC}/kai/ukernels/matmul/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
@@ -609,34 +590,23 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p4x8sb_f32_neon.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.c)
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
if (NOT DOTPROD_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.c)
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
endif()
if (NOT I8MM_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.c)
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.c)
endif()
if (NOT SME_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_asm.S
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot_asm.S
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_asm.S
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c

View File

@@ -2044,26 +2044,6 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
#ifdef __ARM_FEATURE_SVE
static inline svuint32_t ggml_decode_q4scales_and_mins_for_mmla(const uint32_t * vx_scales) {
const svbool_t pg_all = svptrue_pat_b32(SV_VL4);
const svbool_t pg_false = svpfalse_b(); // 0x0000
const svbool_t pg_lo_8 = svwhilelt_b8_s32(0, 8); // 0x00ff
const svbool_t pg_odd = svzip1_b32(pg_false, pg_lo_8);
svuint32_t vutmp_hi, vutmp_lo;
svuint32_t vx01 = svld1_u32(pg_lo_8, vx_scales);
vutmp_hi = svzip1_u32(vx01, vx01);
vutmp_hi = svlsr_n_u32_m(pg_odd, vutmp_hi, 2);
vutmp_hi = svreinterpret_u32_u64(svand_n_u64_x(pg_all, svreinterpret_u64_u32(vutmp_hi), UINT64_C(0x303030303f3f3f3f)));
const svuint32_t vx2 = svdup_u32(vx_scales[2]);
vutmp_lo = svlsr_u32_x(pg_all, vx2, svreinterpret_u32_s32(svindex_s32(-2, 2)));
vutmp_lo = svand_n_u32_z(pg_odd, vutmp_lo, UINT32_C(0x0f0f0f0f));
svuint32_t vutmp = svorr_u32_z(pg_all, vutmp_hi, vutmp_lo);
return vutmp;
}
#endif
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
#ifdef __ARM_FEATURE_MATMUL_INT8
@@ -2086,220 +2066,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
static const uint32_t kmask3 = 0x03030303;
uint32_t utmp[4];
#ifdef __ARM_FEATURE_SVE
const int vector_length = ggml_cpu_get_sve_cnt()*8;
#endif
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
if (nrc == 2) {
svbool_t pg32_2 = svptrue_pat_b32(SV_VL2);
const block_q4_K * GGML_RESTRICT vx0 = vx;
const block_q8_K * GGML_RESTRICT vy0 = vy;
const block_q4_K * GGML_RESTRICT vx1 = (const block_q4_K *) ((const uint8_t*)vx + bx);
const block_q8_K * GGML_RESTRICT vy1 = (const block_q8_K *) ((const uint8_t*)vy + by);
union {
uint32_t u32[8];
uint64_t u64[4];
} new_utmp;
svfloat32_t sumf1 = svdup_n_f32(0);
switch (vector_length) {
case 128:
{
svbool_t pg_false = svpfalse_b();
svbool_t pg_lo_8 = svwhilelt_b8_s32(0, 8);
svbool_t vmins_mask1= svzip1_b32(pg_lo_8, pg_false);
svbool_t vmins_mask2 = svzip1_b32(pg_false, pg_lo_8);
svbool_t pg128_all = svptrue_pat_b8(SV_VL16);
for (int i = 0; i < nb; ++i) {
svfloat32_t vy_d = svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d));
svfloat32_t vx_d = svzip1_f32(svdup_n_f32(GGML_FP16_TO_FP32(vx0[i].d)), svdup_n_f32(GGML_FP16_TO_FP32(vx1[i].d)));
svfloat32_t svsuper_block_scales = svmul_f32_x(pg128_all, vy_d, vx_d);
svfloat32_t vx_dmins = svzip1_f32(svdup_n_f32(GGML_FP16_TO_FP32(vx0[i].dmin)), svdup_n_f32(GGML_FP16_TO_FP32(vx1[i].dmin)));
svfloat32_t vy_dmins = svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d));
svfloat32_t svdmins = svmul_n_f32_x(pg128_all, svmul_f32_x(pg128_all, vy_dmins, vx_dmins), -1);
const uint8_t * GGML_RESTRICT q4_0 = vx0[i].qs;
const int8_t * GGML_RESTRICT q8_0 = vy0[i].qs;
const uint8_t * GGML_RESTRICT q4_1 = vx1[i].qs;
const int8_t * GGML_RESTRICT q8_1 = vy1[i].qs;
svint16_t lo = svld1_s16(pg128_all, vy0[i].bsums + 0);
svint16_t hi = svld1_s16(pg128_all, vy0[i].bsums + 8);
svint16_t sum_tmp1 = svuzp1_s16(lo, hi);
svint16_t sum_tmp2 = svuzp2_s16(lo, hi);
svint16_t svq8sums_0 = svadd_s16_x(pg128_all, sum_tmp1, sum_tmp2);
lo = svld1_s16(pg128_all, vy1[i].bsums + 0);
hi = svld1_s16(pg128_all, vy1[i].bsums + 8);
sum_tmp1 = svuzp1(lo, hi);
sum_tmp2 = svuzp2(lo, hi);
svint16_t svq8sums_1 = svadd_s16_x(pg128_all, sum_tmp1, sum_tmp2);
svuint32_t decoded_scales0 = ggml_decode_q4scales_and_mins_for_mmla((const uint32_t *)vx0[i].scales);
svuint32_t decoded_scales1 = ggml_decode_q4scales_and_mins_for_mmla((const uint32_t *)vx1[i].scales);
svuint32x2_t decoded_scales = svcreate2_u32(decoded_scales0, decoded_scales1);
svst2_u32(pg128_all, new_utmp.u32, decoded_scales);
svint16_t svmins8_0 = svreinterpret_s16_u16(svunpklo_u16(svreinterpret_u8_u32(svuzp1_u32(svld1_u32(vmins_mask1, new_utmp.u32+4), svdup_n_u32(0)))));
svint16_t svmins8_1 = svreinterpret_s16_u16(svunpklo_u16(svreinterpret_u8_u32(svuzp2_u32(svld1_u32(vmins_mask2, new_utmp.u32+4), svdup_n_u32(0)))));
svint32_t svsumfs_tmp1 = svreinterpret_s32_s64(svdot_s64(svdup_n_s64(0), svq8sums_0, svmins8_0));
svint32_t svsumfs_tmp2 = svreinterpret_s32_s64(svdot_s64(svdup_n_s64(0), svq8sums_0, svmins8_1));
svint32_t svsumfs_tmp3 = svtrn1_s32(svsumfs_tmp1, svsumfs_tmp2);
svint32_t svsumfs_tmp4 = svreinterpret_s32_s64(svdot_s64(svdup_n_s64(0), svq8sums_1, svmins8_0));
svint32_t svsumfs_tmp5 = svreinterpret_s32_s64(svdot_s64(svdup_n_s64(0), svq8sums_1, svmins8_1));
svint32_t svsumfs_tmp6 = svtrn1_s32(svsumfs_tmp4, svsumfs_tmp5);
svint32_t svsumfs_tmp7 = svreinterpret_s32_s64(svtrn2_s64(svreinterpret_s64_s32(svsumfs_tmp3), svreinterpret_s64_s32(svsumfs_tmp6)));
svint32_t svsumfs_tmp8 = svreinterpret_s32_s64(svtrn1_s64(svreinterpret_s64_s32(svsumfs_tmp3), svreinterpret_s64_s32(svsumfs_tmp6)));
svint32_t svsumfs_tmp = svadd_s32_x(pg128_all, svsumfs_tmp7, svsumfs_tmp8);
svint32_t svscales, sumi1, sumi2;
svint32_t acc_sumif1 = svdup_n_s32(0);
svint32_t acc_sumif2 = svdup_n_s32(0);
svint8_t q4bytes_0_l, q4bytes_0_h, q4bytes_1_l, q4bytes_1_h, l0, l1, l2, l3,
q8bytes_0_h, q8bytes_0_l, q8bytes_1_h, q8bytes_1_l, r0, r1, r2, r3;
#pragma GCC unroll 1
for (int j = 0; j < QK_K/64; ++j) {
q4bytes_0_l = svreinterpret_s8_u8(svand_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_0), 0xf));
q4bytes_1_l = svreinterpret_s8_u8(svand_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_1), 0xf));
q4bytes_0_h = svreinterpret_s8_u8(svand_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_0+16), 0xf));
q4bytes_1_h = svreinterpret_s8_u8(svand_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_1+16), 0xf));
l0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q4bytes_0_l), svreinterpret_s64_s8(q4bytes_1_l)));
l1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q4bytes_0_l), svreinterpret_s64_s8(q4bytes_1_l)));
l2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q4bytes_0_h), svreinterpret_s64_s8(q4bytes_1_h)));
l3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q4bytes_0_h), svreinterpret_s64_s8(q4bytes_1_h)));
q8bytes_0_h = svld1_s8(pg128_all, q8_0);
q8bytes_1_h = svld1_s8(pg128_all, q8_1);
q8bytes_0_l = svld1_s8(pg128_all, q8_0+16);
q8bytes_1_l = svld1_s8(pg128_all, q8_1+16);
r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0_h), svreinterpret_s64_s8(q8bytes_1_h)));
r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0_h), svreinterpret_s64_s8(q8bytes_1_h)));
r2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0_l), svreinterpret_s64_s8(q8bytes_1_l)));
r3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0_l), svreinterpret_s64_s8(q8bytes_1_l)));
sumi1 = svmmla_s32(svmmla_s32(svmmla_s32(svmmla_s32(svdup_n_s32(0), r0, l0), r1, l1), r2, l2), r3, l3);
svscales = svreinterpret_s32_u32(svlsr_n_u32_x(pg128_all, svlsl_n_u32_x(pg128_all, svreinterpret_u32_u64(svdup_n_u64(new_utmp.u64[j/2])), 8*(4-2*(j%2)-1)), 24));
acc_sumif1 = svmla_s32_x(pg128_all, acc_sumif1, svscales, sumi1);
q4bytes_0_l = svreinterpret_s8_u8(svlsr_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_0), 4));
q4bytes_1_l = svreinterpret_s8_u8(svlsr_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_1), 4));
q4bytes_0_h = svreinterpret_s8_u8(svlsr_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_0+16), 4));
q4bytes_1_h = svreinterpret_s8_u8(svlsr_n_u8_x(pg128_all, svld1_u8(pg128_all, q4_1+16), 4));
l0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q4bytes_0_l), svreinterpret_s64_s8(q4bytes_1_l)));
l1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q4bytes_0_l), svreinterpret_s64_s8(q4bytes_1_l)));
l2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q4bytes_0_h), svreinterpret_s64_s8(q4bytes_1_h)));
l3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q4bytes_0_h), svreinterpret_s64_s8(q4bytes_1_h)));
q8bytes_0_h = svld1_s8(pg128_all, q8_0+32);
q8bytes_1_h = svld1_s8(pg128_all, q8_1+32);
q8bytes_0_l = svld1_s8(pg128_all, q8_0+48);
q8bytes_1_l = svld1_s8(pg128_all, q8_1+48);
r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0_h), svreinterpret_s64_s8(q8bytes_1_h)));
r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0_h), svreinterpret_s64_s8(q8bytes_1_h)));
r2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0_l), svreinterpret_s64_s8(q8bytes_1_l)));
r3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0_l), svreinterpret_s64_s8(q8bytes_1_l)));
sumi2 = svmmla_s32(svmmla_s32(svmmla_s32(svmmla_s32(svdup_n_s32(0), r0, l0), r1, l1), r2, l2), r3, l3);
svscales = svreinterpret_s32_u32(svlsr_n_u32_x(pg128_all, svlsl_n_u32_x(pg128_all, svreinterpret_u32_u64(svdup_n_u64(new_utmp.u64[j/2])), 8*(4-2*(j%2)-2)), 24));
acc_sumif2 = svmla_s32_x(pg128_all, acc_sumif2, svscales, sumi2);
q4_0 += 32; q4_1 += 32; q8_0 += 64; q8_1 += 64;
}
sumf1 = svmla_f32_x(pg128_all,
svmla_f32_x(pg128_all,
sumf1,
svcvt_f32_x(pg128_all,
svadd_s32_x(pg128_all, acc_sumif1, acc_sumif2)),
svsuper_block_scales),
svdmins,
svcvt_f32_s32_x(pg128_all, svsumfs_tmp));
} //end of for nb
} // end of case 128
break;
case 256:
case 512:
{
const svbool_t pg32_4 = svptrue_pat_b32(SV_VL4);
const svbool_t pg8_16 = svptrue_pat_b8(SV_VL16);
const svbool_t pg256_all = svptrue_pat_b8(SV_ALL);
for (int i = 0; i < nb; ++i) {
const uint8_t * GGML_RESTRICT q4_0 = vx0[i].qs;
const int8_t * GGML_RESTRICT q8_0 = vy0[i].qs;
const uint8_t * GGML_RESTRICT q4_1 = vx1[i].qs;
const int8_t * GGML_RESTRICT q8_1 = vy1[i].qs;
svint32_t svscales, sumi1, sumi2;
svint32_t acc_sumif1 = svdup_n_s32(0);
svint32_t acc_sumif2 = svdup_n_s32(0);
svint8_t l0, l1, l2, l3, r0, r1, r2, r3;
svfloat32_t vx_d = svzip1_f32(svdup_n_f32(GGML_FP16_TO_FP32(vx0[i].d)), svdup_n_f32(GGML_FP16_TO_FP32(vx1[i].d)));
svfloat64_t vy_d_tmp = svreinterpret_f64_f32(svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d)));
svfloat32_t vy_d = svreinterpret_f32_f64(svuzp1_f64(vy_d_tmp, vy_d_tmp));
svfloat32_t svsuper_block_scales = svmul_f32_z(pg32_4, vy_d, vx_d);
svfloat32_t vx_dmins = svzip1_f32(svdup_n_f32(GGML_FP16_TO_FP32(vx0[i].dmin)), svdup_n_f32(GGML_FP16_TO_FP32(vx1[i].dmin)));
svfloat64_t vy_dmins_tmp = svreinterpret_f64_f32(svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d)));
svfloat32_t vy_dmins = svreinterpret_f32_f64(svuzp1_f64(vy_dmins_tmp, vy_dmins_tmp));
svfloat32_t svdmins = svmul_n_f32_x(pg32_4, svmul_f32_x(pg32_4, vx_dmins, vy_dmins), -1);
svint16_t rc1 = svuzp1_s16(svld1_s16(pg256_all, vy0[i].bsums), svld1_s16(pg256_all, vy1[i].bsums));
svint16_t rc2 = svuzp2_s16(svld1_s16(pg256_all, vy0[i].bsums), svld1_s16(pg256_all, vy1[i].bsums));
svint16_t svq8sums = svadd_s16_x(pg256_all, rc1, rc2);
svuint32_t decoded_scales0 = ggml_decode_q4scales_and_mins_for_mmla((const uint32_t *)vx0[i].scales);
svuint32_t decoded_scales1 = ggml_decode_q4scales_and_mins_for_mmla((const uint32_t *)vx1[i].scales);
svuint32x2_t decoded_scales = svcreate2_u32(decoded_scales0, decoded_scales1);
svst2_u32(pg8_16, new_utmp.u32, decoded_scales);
svint16_t new_svq8sums_0 = svreinterpret_s16_u64(svtrn1_u64(svreinterpret_u64_s16(svq8sums), svreinterpret_u64_s16(svq8sums)));
svint16_t new_svq8sums_1 = svreinterpret_s16_u64(svtrn2_u64(svreinterpret_u64_s16(svq8sums), svreinterpret_u64_s16(svq8sums)));
svuint64_t new_mins_0 = svdup_u64(new_utmp.u64[2]);
svuint64_t new_mins_1 = svdup_u64(new_utmp.u64[3]);
svint16_t new_svmins8_0 = svreinterpret_s16_u16(svunpklo_u16(svreinterpret_u8_u64(new_mins_0)));
svint16_t new_svmins8_1 = svreinterpret_s16_u16(svunpklo_u16(svreinterpret_u8_u64(new_mins_1)));
svint64_t dot_prod_0 = svdot_s64(svdup_s64(0), new_svmins8_0, new_svq8sums_0);
svint64_t dot_prod_1 = svdot_s64(dot_prod_0, new_svmins8_1, new_svq8sums_1);
svfloat32_t converted_dot_prod_1 = svcvt_f32_s64_x(pg256_all, dot_prod_1);
svfloat32_t svsumfs_tmp = svuzp1_f32(converted_dot_prod_1, converted_dot_prod_1);
#pragma GCC unroll 1
for (int j = 0; j < QK_K/64; ++j) {
svuint8_t q4bytes_0 = svand_n_u8_x(pg256_all, svld1_u8(pg256_all, q4_0), 0xf);
svuint8_t q4bytes_1 = svand_n_u8_x(pg256_all, svld1_u8(pg256_all, q4_1), 0xf);
svuint8_t q4bytes_2 = svlsr_n_u8_x(pg256_all, svld1_u8(pg256_all, q4_0), 4);
svuint8_t q4bytes_3 = svlsr_n_u8_x(pg256_all, svld1_u8(pg256_all, q4_1), 4);
l0 = svreinterpret_s8_u64(svzip1_u64(svreinterpret_u64_u8(q4bytes_0), svreinterpret_u64_u8(q4bytes_1)));
l1 = svreinterpret_s8_u64(svzip2_u64(svreinterpret_u64_u8(q4bytes_0), svreinterpret_u64_u8(q4bytes_1)));
l2 = svreinterpret_s8_u64(svzip1_u64(svreinterpret_u64_u8(q4bytes_2), svreinterpret_u64_u8(q4bytes_3)));
l3 = svreinterpret_s8_u64(svzip2_u64(svreinterpret_u64_u8(q4bytes_2), svreinterpret_u64_u8(q4bytes_3)));
svint8_t q8bytes_0 = svld1_s8(pg256_all, q8_0);
svint8_t q8bytes_1 = svld1_s8(pg256_all, q8_1);
svint8_t q8bytes_2 = svld1_s8(pg256_all, q8_0+32);
svint8_t q8bytes_3 = svld1_s8(pg256_all, q8_1+32);
r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1)));
r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1)));
r2 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_2), svreinterpret_s64_s8(q8bytes_3)));
r3 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_2), svreinterpret_s64_s8(q8bytes_3)));
sumi1 = svmmla(svmmla(svdup_n_s32(0), r0, l0), r1, l1);
svscales = svreinterpret_s32_u32(svlsr_n_u32_x(pg256_all, svlsl_n_u32_x(pg256_all, svreinterpret_u32_u64(svdup_n_u64(new_utmp.u64[j/2])), 8*(4-2*(j%2)-1)), 24));
acc_sumif1 = svmla_s32_x(pg256_all, acc_sumif1, svscales, sumi1);
sumi2 = svmmla(svmmla(svdup_n_s32(0), r2, l2), r3, l3);
svscales = svreinterpret_s32_u32(svlsr_n_u32_x(pg256_all, svlsl_n_u32_x(pg256_all, svreinterpret_u32_u64(svdup_n_u64(new_utmp.u64[j/2])), 8*(4-2*(j%2)-2)), 24));
acc_sumif2 = svmla_s32_x(pg256_all, acc_sumif2, svscales, sumi2);
q4_0 += 32; q4_1 += 32; q8_0 += 64; q8_1 += 64;
}
svint32_t acc_sumif = svadd_s32_x(pg256_all, acc_sumif1, acc_sumif2);
svint32_t swap_acc_sumif = svext_s32(acc_sumif, acc_sumif, 4);
acc_sumif = svadd_s32_x(pg32_4, acc_sumif, swap_acc_sumif);
sumf1 = svmla_f32_x(pg32_4,
svmla_f32_x(pg32_4,
sumf1,
svcvt_f32_x(pg32_4, acc_sumif),
svsuper_block_scales),
svdmins,
svsumfs_tmp);
} // end of for nb
} // end of case 256-512
break;
default:
assert(false && "Unsupported vector length");
break;
}
svst1_f32(pg32_2, s, sumf1);
svst1_f32(pg32_2, s + bs, svreinterpret_f32_u8(svext_u8(svreinterpret_u8_f32(sumf1), svdup_n_u8(0), 8)));
return;
}
#elif defined(__ARM_FEATURE_MATMUL_INT8)
#if defined(__ARM_FEATURE_MATMUL_INT8)
if (nrc == 2) {
const block_q4_K * GGML_RESTRICT x0 = x;
const block_q4_K * GGML_RESTRICT x1 = (const block_q4_K *) ((const uint8_t *)vx + bx);
@@ -2467,6 +2235,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
const int vector_length = ggml_cpu_get_sve_cnt()*8;
const svuint8_t m4b = svdup_n_u8(0xf);
const svint32_t mzero = svdup_n_s32(0);
svint32_t sumi1 = svdup_n_s32(0);
@@ -2711,201 +2480,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int nb = n / QK_K;
#ifdef __ARM_FEATURE_SVE
const int vector_length = ggml_cpu_get_sve_cnt()*8;
#endif
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
if (nrc == 2) {
const svbool_t pg32_2 = svptrue_pat_b32(SV_VL2);
svfloat32_t sum = svdup_n_f32(0);
const block_q6_K * GGML_RESTRICT vx0 = vx;
const block_q8_K * GGML_RESTRICT vy0 = vy;
const block_q6_K * GGML_RESTRICT vx1 = (const block_q6_K *) ((const uint8_t*)vx + bx);
const block_q8_K * GGML_RESTRICT vy1 = (const block_q8_K *) ((const uint8_t*)vy + by);
switch (vector_length) {
case 128:
{
const svbool_t pg128_all = svptrue_pat_b8(SV_ALL);
for (int i = 0; i < nb; ++i) {
const uint8_t * GGML_RESTRICT ql0 = vx0[i].ql;
const uint8_t * GGML_RESTRICT qh0 = vx0[i].qh;
const uint8_t * GGML_RESTRICT ql1 = vx1[i].ql;
const uint8_t * GGML_RESTRICT qh1 = vx1[i].qh;
const int8_t * GGML_RESTRICT q80 = vy0[i].qs;
const int8_t * GGML_RESTRICT q81 = vy1[i].qs;
const int8_t * GGML_RESTRICT scale0 = vx0[i].scales;
const int8_t * GGML_RESTRICT scale1 = vx1[i].scales;
svfloat32_t vy_d = svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d));
svfloat32_t vx_d = svzip1_f32(svdup_n_f32(GGML_FP16_TO_FP32(vx0[i].d)), svdup_n_f32(GGML_FP16_TO_FP32(vx1[i].d)));
svfloat32_t svsuper_block_scales = svmul_f32_x(pg128_all, vy_d, vx_d);
// process q8sum summation 128 bit route
const svint16_t q8sums_01 = svld1_s16(pg128_all, vy0[i].bsums);
const svint16_t q8sums_02 = svld1_s16(pg128_all, vy0[i].bsums + 8);
const svint16_t q8sums_11 = svld1_s16(pg128_all, vy1[i].bsums);
const svint16_t q8sums_12 = svld1_s16(pg128_all, vy1[i].bsums + 8);
const svint64x2_t q6scales_0_tmp = svld2_s64(pg128_all, (const int64_t *)scale0);
const svint16_t q6scales_01 = svunpklo_s16(svreinterpret_s8_s64(svget2_s64(q6scales_0_tmp, 0)));
const svint16_t q6scales_02 = svunpklo_s16(svreinterpret_s8_s64(svget2_s64(q6scales_0_tmp, 1)));
const svint64x2_t q6scales_1_tmp = svld2_s64(pg128_all, (const int64_t *)scale1);
const svint16_t q6scales_11 = svunpklo_s16(svreinterpret_s8_s64(svget2_s64(q6scales_1_tmp, 0)));
const svint16_t q6scales_12 = svunpklo_s16(svreinterpret_s8_s64(svget2_s64(q6scales_1_tmp, 1)));
const svint64_t prod = svdup_n_s64(0);
svint32_t isum_tmp1 = svreinterpret_s32_s64(svdot_s64(svdot_s64(prod, q8sums_01, q6scales_01), q8sums_02, q6scales_02));
svint32_t isum_tmp2 = svreinterpret_s32_s64(svdot_s64(svdot_s64(prod, q8sums_01, q6scales_11), q8sums_02, q6scales_12));
svint32_t isum_tmp3 = svtrn1_s32(isum_tmp1, isum_tmp2);
svint32_t isum_tmp4 = svreinterpret_s32_s64(svdot_s64(svdot_s64(prod, q8sums_11, q6scales_01), q8sums_12, q6scales_02));
svint32_t isum_tmp5 = svreinterpret_s32_s64(svdot_s64(svdot_s64(prod, q8sums_11, q6scales_11), q8sums_12, q6scales_12));
svint32_t isum_tmp6 = svtrn1_s32(isum_tmp4, isum_tmp5);
svint32_t isum_tmp7 = svreinterpret_s32_s64(svtrn2_s64(svreinterpret_s64_s32(isum_tmp3), svreinterpret_s64_s32(isum_tmp6)));
svint32_t isum_tmp8 = svreinterpret_s32_s64(svtrn1_s64(svreinterpret_s64_s32(isum_tmp3), svreinterpret_s64_s32(isum_tmp6)));
svint32_t svisum_mins = svadd_s32_x(pg128_all, isum_tmp7, isum_tmp8);
// process mmla
svint8_t l0, l1, r0, r1;
svint32_t isum_tmp = svdup_n_s32(0);
for (int j = 0; j < QK_K/128; ++j) {
for (int k = 0; k < 8; ++k) {
svuint8_t qhbits_0 = svld1_u8(pg128_all, qh0+16*(k%2));
svuint8_t qhbits_1 = svld1_u8(pg128_all, qh1+16*(k%2));
svuint8_t q6bits_0 = svld1_u8(pg128_all, ql0+16*(k%4));
svuint8_t q6bits_1 = svld1_u8(pg128_all, ql1+16*(k%4));
const int ql_pos = (k/4)*4;
svuint8_t q6bytes_0_lo = (ql_pos < 4) ? svand_n_u8_x(pg128_all, q6bits_0, 0xf) : svlsr_n_u8_x(pg128_all, q6bits_0, 4);
svuint8_t q6bytes_1_lo = (ql_pos < 4) ? svand_n_u8_x(pg128_all, q6bits_1, 0xf) : svlsr_n_u8_x(pg128_all, q6bits_1, 4);
const int qh_pos = (k/2)*2;
svuint8_t q6bytes_0_hi = svand_n_u8_x(pg128_all, qhbits_0, 0x3 << qh_pos);
svuint8_t q6bytes_1_hi = svand_n_u8_x(pg128_all, qhbits_1, 0x3 << qh_pos);
svint8_t q6bytes_0, q6bytes_1;
if (qh_pos <= 4) {
q6bytes_0 = svreinterpret_s8_u8(svmla_n_u8_x(pg128_all, q6bytes_0_lo, q6bytes_0_hi, 1 << (4 - qh_pos)));
q6bytes_1 = svreinterpret_s8_u8(svmla_n_u8_x(pg128_all, q6bytes_1_lo, q6bytes_1_hi, 1 << (4 - qh_pos)));
} else {
q6bytes_0 = svreinterpret_s8_u8(svorr_u8_x(pg128_all, q6bytes_0_lo, svlsr_n_u8_x(pg128_all, q6bytes_0_hi, (qh_pos - 4))));
q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg128_all, q6bytes_1_lo, svlsr_n_u8_x(pg128_all, q6bytes_1_hi, (qh_pos - 4))));
}
svint8_t q8bytes_0 = svld1_s8(pg128_all, q80+16*(k%8));
svint8_t q8bytes_1 = svld1_s8(pg128_all, q81+16*(k%8));
l0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q6bytes_0), svreinterpret_s64_s8(q6bytes_1)));
l1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q6bytes_0), svreinterpret_s64_s8(q6bytes_1)));
r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1)));
r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1)));
svint32_t svscale = svzip1_s32(svdup_n_s32(scale0[k]), svdup_n_s32(scale1[k]));
isum_tmp = svmla_s32_x(pg128_all, isum_tmp, svmmla_s32(svmmla_s32(svdup_n_s32(0), r0, l0), r1, l1), svscale);
}
qh0 += 32; qh1 += 32;
ql0 += 64; ql1 += 64;
q80 += 128; q81 += 128;
scale0 += 8; scale1 += 8;
}
sum = svmla_f32_x(pg128_all, sum,
svcvt_f32_x(pg128_all, svmla_s32_x(pg128_all, isum_tmp,
svisum_mins, svdup_n_s32(-32))),
svsuper_block_scales);
}
} // end of case 128
break;
case 256:
case 512:
{
const svbool_t pg256_all = svptrue_pat_b8(SV_ALL);
const svbool_t pg32_4 = svptrue_pat_b32(SV_VL4);
for (int i = 0; i < nb; ++i) {
const uint8_t * GGML_RESTRICT ql0 = vx0[i].ql;
const uint8_t * GGML_RESTRICT qh0 = vx0[i].qh;
const uint8_t * GGML_RESTRICT ql1 = vx1[i].ql;
const uint8_t * GGML_RESTRICT qh1 = vx1[i].qh;
const int8_t * GGML_RESTRICT q80 = vy0[i].qs;
const int8_t * GGML_RESTRICT q81 = vy1[i].qs;
const int8_t * GGML_RESTRICT scale0 = vx0[i].scales;
const int8_t * GGML_RESTRICT scale1 = vx1[i].scales;
svfloat32_t vx_d = svzip1_f32(svdup_n_f32(GGML_FP16_TO_FP32(vx0[i].d)), svdup_n_f32(GGML_FP16_TO_FP32(vx1[i].d)));
svfloat64_t vy_d_tmp = svreinterpret_f64_f32(svuzp1_f32(svdup_n_f32(vy0[i].d), svdup_n_f32(vy1[i].d)));
svfloat32_t vy_d = svreinterpret_f32_f64(svuzp1_f64(vy_d_tmp, vy_d_tmp));
svfloat32_t svsuper_block_scales = svmul_f32_x(pg32_4, vy_d, vx_d);
// process q8sum summation 256 bit route
const svint16_t q8sums_0 = svld1_s16(pg256_all, vy0[i].bsums);
const svint16_t q8sums_1 = svld1_s16(pg256_all, vy1[i].bsums);
const svint16_t q6scales_0 = svunpklo_s16(svld1_s8(pg256_all, scale0));
const svint16_t q6scales_1 = svunpklo_s16(svld1_s8(pg256_all, scale1));
const svint64_t prod = svdup_n_s64(0);
svint32_t isum_tmp1 = svreinterpret_s32_s64(svdot_s64(prod, q8sums_0, q6scales_0));
svint32_t isum_tmp2 = svreinterpret_s32_s64(svdot_s64(prod, q8sums_0, q6scales_1));
svint32_t isum_tmp3 = svreinterpret_s32_s64(svdot_s64(prod, q8sums_1, q6scales_0));
svint32_t isum_tmp4 = svreinterpret_s32_s64(svdot_s64(prod, q8sums_1, q6scales_1));
svint32_t isum_tmp5 = svtrn1_s32(isum_tmp1, isum_tmp2);
svint32_t isum_tmp6 = svtrn1_s32(isum_tmp3, isum_tmp4);
svint32_t isum_tmp7 = svreinterpret_s32_s64(svtrn2_s64(svreinterpret_s64_s32(isum_tmp5), svreinterpret_s64_s32(isum_tmp6)));
svint32_t isum_tmp8 = svreinterpret_s32_s64(svtrn1_s64(svreinterpret_s64_s32(isum_tmp5), svreinterpret_s64_s32(isum_tmp6)));
svint32_t isum_tmp9 = svadd_s32_x(pg256_all, isum_tmp7, isum_tmp8);
svint32_t isum_tmp10 = svreinterpret_s32_u8(svext_u8(svreinterpret_u8_s32(isum_tmp9), svreinterpret_u8_s32(isum_tmp9), 16));
svint32_t svisum_mins = svadd_s32_z(pg32_4, isum_tmp9, isum_tmp10);
// process mmla
svint8_t l0, l1, r0, r1;
svint32_t isum_tmp = svdup_n_s32(0);
for (int j = 0; j < QK_K/128; ++j) {
for (int k = 0; k < 8; k+=2) { // process 2 block
svuint8_t qhbits_0 = svld1_u8(pg256_all, qh0);
svuint8_t qhbits_1 = svld1_u8(pg256_all, qh1);
svuint8_t q6bits_0 = svld1_u8(pg256_all, ql0+32*((k%4)/2));
svuint8_t q6bits_1 = svld1_u8(pg256_all, ql1+32*((k%4)/2));
const int ql_pos = (k/4)*4;
svuint8_t q6bytes_0_lo = (ql_pos < 4) ? svand_n_u8_x(pg256_all, q6bits_0, 0xf) : svlsr_n_u8_x(pg256_all, q6bits_0, 4);
svuint8_t q6bytes_1_lo = (ql_pos < 4) ? svand_n_u8_x(pg256_all, q6bits_1, 0xf) : svlsr_n_u8_x(pg256_all, q6bits_1, 4);
const int qh_pos = (k/2)*2;
svuint8_t q6bytes_0_hi = svand_n_u8_x(pg256_all, qhbits_0, 0x3 << qh_pos);
svuint8_t q6bytes_1_hi = svand_n_u8_x(pg256_all, qhbits_1, 0x3 << qh_pos);
svint8_t q6bytes_0, q6bytes_1;
if (qh_pos <= 4) {
q6bytes_0 = svreinterpret_s8_u8(svmla_n_u8_x(pg256_all, q6bytes_0_lo, q6bytes_0_hi, 1 << (4 - qh_pos)));
q6bytes_1 = svreinterpret_s8_u8(svmla_n_u8_x(pg256_all, q6bytes_1_lo, q6bytes_1_hi, 1 << (4 - qh_pos)));
} else {
q6bytes_0 = svreinterpret_s8_u8(svorr_u8_x(pg256_all, q6bytes_0_lo, svlsr_n_u8_x(pg256_all, q6bytes_0_hi, (qh_pos - 4))));
q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg256_all, q6bytes_1_lo, svlsr_n_u8_x(pg256_all, q6bytes_1_hi, (qh_pos - 4))));
}
svint8_t q8bytes_0 = svld1_s8(pg256_all, q80+32*(k/2));
svint8_t q8bytes_1 = svld1_s8(pg256_all, q81+32*(k/2));
l0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q6bytes_0), svreinterpret_s64_s8(q6bytes_1)));
l1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q6bytes_0), svreinterpret_s64_s8(q6bytes_1)));
r0 = svreinterpret_s8_s64(svzip1_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1)));
r1 = svreinterpret_s8_s64(svzip2_s64(svreinterpret_s64_s8(q8bytes_0), svreinterpret_s64_s8(q8bytes_1)));
svint32_t svscale0 = svzip1_s32(svdup_n_s32(scale0[k]), svdup_n_s32(scale1[k]));
svint32_t svscale1 = svzip1_s32(svdup_n_s32(scale0[k+1]), svdup_n_s32(scale1[k+1]));
isum_tmp = svmla_s32_x(pg256_all, isum_tmp, svmmla_s32(svdup_n_s32(0), r0, l0), svscale0);
isum_tmp = svmla_s32_x(pg256_all, isum_tmp, svmmla_s32(svdup_n_s32(0), r1, l1), svscale1);
}
qh0 += 32; qh1 += 32;
ql0 += 64; ql1 += 64;
q80 += 128; q81 += 128;
scale0 += 8; scale1 += 8;
} // end of for
svint32_t swap_isum_tmp = svext_s32(isum_tmp, isum_tmp, 4);
isum_tmp = svadd_s32_x(pg32_4, isum_tmp, swap_isum_tmp);
sum = svmla_f32_x(pg32_4, sum,
svcvt_f32_x(pg32_4, svmla_s32_x(pg32_4, isum_tmp,
svisum_mins, svdup_n_s32(-32))),
svsuper_block_scales);
}
} // end of case 256
break;
default:
assert(false && "Unsupported vector length");
break;
} // end of switch
svst1_f32(pg32_2, s, sum);
svst1_f32(pg32_2, s + bs, svreinterpret_f32_u8(svext_u8(svreinterpret_u8_f32(sum), svdup_n_u8(0), 8)));
return;
}
#elif defined(__ARM_FEATURE_MATMUL_INT8)
#if defined(__ARM_FEATURE_MATMUL_INT8)
if (nrc == 2) {
const block_q6_K * GGML_RESTRICT x0 = x;
const block_q6_K * GGML_RESTRICT x1 = (const block_q6_K *) ((const uint8_t *)vx + bx);
@@ -3019,6 +2594,27 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
// adjust bias, apply superblock scale
{
int32_t bias[4];
#ifdef __ARM_FEATURE_SVE
const svbool_t pg16_8 = svptrue_pat_b16(SV_VL8);
const svbool_t pg8_8 = svptrue_pat_b8(SV_VL8);
const svint16_t y0_q8sums_0 = svld1_s16(pg16_8, y0->bsums);
const svint16_t y0_q8sums_1 = svld1_s16(pg16_8, y0->bsums + 8);
const svint16_t y1_q8sums_0 = svld1_s16(pg16_8, y1->bsums);
const svint16_t y1_q8sums_1 = svld1_s16(pg16_8, y1->bsums + 8);
const svint16_t x0_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x0->scales));
const svint16_t x0_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x0->scales + 8));
const svint16_t x1_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x1->scales));
const svint16_t x1_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x1->scales + 8));
const svint64_t zero = svdup_n_s64(0);
bias[0] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x0_q6scales_0),
svdot_s64(zero, y0_q8sums_1, x0_q6scales_1)));
bias[1] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x0_q6scales_0),
svdot_s64(zero, y1_q8sums_1, x0_q6scales_1)));
bias[2] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x1_q6scales_0),
svdot_s64(zero, y0_q8sums_1, x1_q6scales_1)));
bias[3] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x1_q6scales_0),
svdot_s64(zero, y1_q8sums_1, x1_q6scales_1)));
#else
// NEON doesn't support int16 dot product, fallback to separated mul and add
const int16x8x2_t q8sums0 = vld1q_s16_x2(y0->bsums);
const int16x8x2_t q8sums1 = vld1q_s16_x2(y1->bsums);
@@ -3050,6 +2646,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales1.val[1]))));
bias[3] = vaddvq_s32(prod);
#endif
const int32x4_t vibias = vmulq_n_s32(vld1q_s32(bias), 32);
const float32x4_t superblock_scale = {
@@ -3075,6 +2672,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
#endif
#ifdef __ARM_FEATURE_SVE
const int vector_length = ggml_cpu_get_sve_cnt()*8;
float sum = 0;
svuint8_t m4b = svdup_n_u8(0xf);
svint32_t vzero = svdup_n_s32(0);

View File

@@ -700,8 +700,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (; ib + 1 < nb; ib += 2) {
// Compute combined scale for the block 0 and 1
const float ft0 = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
const __m128 d_0_1 = (__m128)(v4f32){ft0, ft0, ft0, ft0};
const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);
@@ -715,9 +714,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
bx_1 = __lsx_vsub_b(bx_1, off);
const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
//_mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
//_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
// Compute combined scale for the block 2 and 3
const float ft1 = GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d);
const __m128 d_2_3 = (__m128)(v4f32){ft1, ft1, ft1, ft1};
const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );
const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);

View File

@@ -580,19 +580,16 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
uint8_t *patmp = atmp;
int vsums;
int tmp, t1, t2, t3, t4, t5, t6, t7;
int tmp;
__asm__ __volatile__(
"vsetivli zero, 16, e8, m1\n\t"
"vmv.v.x v8, zero\n\t"
"lb zero, 15(%[sc])\n\t"
"vle8.v v1, (%[sc])\n\t"
"vle8.v v2, (%[bsums])\n\t"
"addi %[tmp], %[bsums], 16\n\t"
"vand.vi v0, v1, 0xF\n\t"
"vsrl.vi v1, v1, 4\n\t"
"vle8.v v3, (%[tmp])\n\t"
"vse8.v v0, (%[scale])\n\t"
"vsetivli zero, 16, e16, m2\n\t"
"vle16.v v2, (%[bsums])\n\t"
"vzext.vf2 v0, v1\n\t"
"vwmul.vv v4, v0, v2\n\t"
"vsetivli zero, 16, e32, m4\n\t"
@@ -611,89 +608,46 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
for (int j = 0; j < QK_K/128; ++j) {
__asm__ __volatile__(
"lb zero, 31(%[q2])\n\t"
"addi %[tmp], %[q2], 16\n\t"
"addi %[t1], %[q8], 16\n\t"
"vsetivli zero, 16, e8, m1\n\t"
"vsetvli zero, %[vl32], e8, m2\n\t"
"vle8.v v0, (%[q2])\n\t"
"vle8.v v1, (%[tmp])\n\t"
"vsrl.vi v2, v0, 2\n\t"
"vsrl.vi v3, v1, 2\n\t"
"vsrl.vi v4, v0, 4\n\t"
"addi %[tmp], %[q8], 32\n\t"
"vle8.v v8, (%[q8])\n\t"
"vle8.v v9, (%[t1])\n\t"
"addi %[t1], %[t1], 32\n\t"
"vsrl.vi v5, v1, 4\n\t"
"vsrl.vi v6, v0, 6\n\t"
"vsrl.vi v7, v1, 6\n\t"
"vle8.v v10, (%[tmp])\n\t"
"vle8.v v11, (%[t1])\n\t"
"addi %[tmp], %[tmp], 32\n\t"
"addi %[t1], %[t1], 32\n\t"
"vand.vi v0, v0, 0x3\n\t"
"vand.vi v1, v1, 0x3\n\t"
"vand.vi v2, v2, 0x3\n\t"
"vle8.v v12, (%[tmp])\n\t"
"vle8.v v13, (%[t1])\n\t"
"addi %[tmp], %[tmp], 32\n\t"
"addi %[t1], %[t1], 32\n\t"
"vand.vi v3, v3, 0x3\n\t"
"vand.vi v4, v4, 0x3\n\t"
"vand.vi v5, v5, 0x3\n\t"
"vle8.v v14, (%[tmp])\n\t"
"vle8.v v15, (%[t1])\n\t"
"vsetvli zero, %[vl128], e8, m8\n\t"
"vle8.v v8, (%[q8])\n\t"
"vsetvli zero, %[vl64], e8, m4\n\t"
"vwmul.vv v16, v0, v8\n\t"
"vwmul.vv v18, v1, v9\n\t"
"vwmul.vv v20, v2, v10\n\t"
"vwmul.vv v22, v3, v11\n\t"
"vwmul.vv v24, v4, v12\n\t"
"vwmul.vv v26, v5, v13\n\t"
"vwmul.vv v28, v6, v14\n\t"
"vwmul.vv v30, v7, v15\n\t"
"vsetivli zero, 8, e16, m1\n\t"
"vsetivli zero, 16, e16, m2\n\t"
"vmv.v.x v0, zero\n\t"
"lbu %[tmp], 0(%[scale])\n\t"
"vwredsum.vs v8, v16, v0\n\t"
"vwredsum.vs v10, v16, v0\n\t"
"vwredsum.vs v9, v18, v0\n\t"
"lbu %[t1], 1(%[scale])\n\t"
"vwredsum.vs v10, v20, v0\n\t"
"vwredsum.vs v11, v22, v0\n\t"
"lbu %[t2], 2(%[scale])\n\t"
"vwredsum.vs v12, v24, v0\n\t"
"vwredsum.vs v13, v26, v0\n\t"
"lbu %[t3], 3(%[scale])\n\t"
"vwredsum.vs v14, v28, v0\n\t"
"vwredsum.vs v15, v30, v0\n\t"
"lbu %[t4], 4(%[scale])\n\t"
"vwredsum.vs v8, v17, v8\n\t"
"vwredsum.vs v9, v19, v9\n\t"
"lbu %[t5], 5(%[scale])\n\t"
"vwredsum.vs v10, v21, v10\n\t"
"vwredsum.vs v11, v23, v11\n\t"
"lbu %[t6], 6(%[scale])\n\t"
"vwredsum.vs v12, v25, v12\n\t"
"vwredsum.vs v13, v27, v13\n\t"
"lbu %[t7], 7(%[scale])\n\t"
"vwredsum.vs v14, v29, v14\n\t"
"vwredsum.vs v15, v31, v15\n\t"
"vwredsum.vs v8, v20, v0\n\t"
"vwredsum.vs v7, v22, v0\n\t"
"vwredsum.vs v11, v24, v0\n\t"
"vwredsum.vs v12, v26, v0\n\t"
"vwredsum.vs v13, v28, v0\n\t"
"vwredsum.vs v14, v30, v0\n\t"
"vsetivli zero, 4, e32, m1\n\t"
"vmul.vx v0, v8, %[tmp]\n\t"
"vmul.vx v1, v9, %[t1]\n\t"
"vmacc.vx v0, %[t2], v10\n\t"
"vmacc.vx v1, %[t3], v11\n\t"
"vmacc.vx v0, %[t4], v12\n\t"
"vmacc.vx v1, %[t5], v13\n\t"
"vmacc.vx v0, %[t6], v14\n\t"
"vmacc.vx v1, %[t7], v15\n\t"
"vslideup.vi v10, v9, 1\n\t"
"vslideup.vi v8, v7, 1\n\t"
"vslideup.vi v11, v12, 1\n\t"
"vslideup.vi v13, v14, 1\n\t"
"vslideup.vi v10, v8, 2\n\t"
"vslideup.vi v11, v13, 2\n\t"
"vsetivli zero, 8, e32, m2\n\t"
"vle8.v v15, (%[scale])\n\t"
"vzext.vf4 v12, v15\n\t"
"vmul.vv v10, v10, v12\n\t"
"vredsum.vs v0, v10, v0\n\t"
"vmv.x.s %[tmp], v0\n\t"
"vmv.x.s %[t1], v1\n\t"
"add %[isum], %[isum], %[tmp]\n\t"
"add %[isum], %[isum], %[t1]"
: [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
, [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
, [isum] "+&r" (isum)
"add %[isum], %[isum], %[tmp]"
: [tmp] "=&r" (tmp), [isum] "+&r" (isum)
: [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8)
, [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
@@ -975,7 +929,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int8_t * restrict q8 = y[i].qs;
int8_t * scale = (int8_t *)utmp;
int tmp, t1, t2, t3, t4, t5, t6, t7;
int tmp;
__asm__ __volatile__(
"vsetivli zero, 12, e8, m1\n\t"
"vle8.v v0, (%[s6b])\n\t"
@@ -1013,23 +967,19 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
int isum = 0;
for (int j = 0; j < QK_K; j += 128) {
__asm__ __volatile__(
"lb zero, 31(%[q3])\n\t"
"vsetvli zero, %[vl32], e8, m2, ta, mu\n\t"
"vle8.v v8, (%[q3])\n\t"
"vsrl.vi v10, v8, 2\n\t"
"vsrl.vi v12, v8, 4\n\t"
"vsrl.vi v14, v8, 6\n\t"
"lb zero, 64(%[q8])\n\t"
"vand.vi v8, v8, 3\n\t"
"vand.vi v10, v10, 3\n\t"
"vand.vi v12, v12, 3\n\t"
"vle8.v v2, (%[qh])\n\t"
"lb zero, 127(%[q8])\n\t"
"vand.vx v4, v2, %[m]\n\t"
"slli %[m], %[m], 1\n\t"
"vmseq.vx v0, v4, zero\n\t"
"vadd.vi v8, v8, -4, v0.t\n\t"
"lb zero, 0(%[q8])\n\t"
"vand.vx v4, v2, %[m]\n\t"
"slli %[m], %[m], 1\n\t"
"vmseq.vx v0, v4, zero\n\t"
@@ -1044,43 +994,34 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
"vadd.vi v14, v14, -4, v0.t\n\t"
"vsetvli zero, %[vl128], e8, m8\n\t"
"vle8.v v0, (%[q8])\n\t"
"lb %[tmp], 0(%[scale])\n\t"
"lb %[t1], 1(%[scale])\n\t"
"lb %[t2], 2(%[scale])\n\t"
"lb %[t3], 3(%[scale])\n\t"
"vsetvli zero, %[vl64], e8, m4\n\t"
"vwmul.vv v16, v0, v8\n\t"
"vwmul.vv v24, v4, v12\n\t"
"vsetivli zero, 16, e16, m2\n\t"
"vmv.v.x v0, zero\n\t"
"vwredsum.vs v8, v16, v0\n\t"
"lb %[t4], 4(%[scale])\n\t"
"lb %[t5], 5(%[scale])\n\t"
"vwredsum.vs v10, v16, v0\n\t"
"vwredsum.vs v9, v18, v0\n\t"
"vwredsum.vs v10, v20, v0\n\t"
"vwredsum.vs v11, v22, v0\n\t"
"vwredsum.vs v12, v24, v0\n\t"
"lb %[t6], 6(%[scale])\n\t"
"lb %[t7], 7(%[scale])\n\t"
"vwredsum.vs v13, v26, v0\n\t"
"vwredsum.vs v14, v28, v0\n\t"
"vwredsum.vs v15, v30, v0\n\t"
"vwredsum.vs v8, v20, v0\n\t"
"vwredsum.vs v7, v22, v0\n\t"
"vwredsum.vs v11, v24, v0\n\t"
"vwredsum.vs v12, v26, v0\n\t"
"vwredsum.vs v13, v28, v0\n\t"
"vwredsum.vs v14, v30, v0\n\t"
"vsetivli zero, 4, e32, m1\n\t"
"vmul.vx v0, v8, %[tmp]\n\t"
"vmul.vx v1, v9, %[t1]\n\t"
"vmacc.vx v0, %[t2], v10\n\t"
"vmacc.vx v1, %[t3], v11\n\t"
"vmacc.vx v0, %[t4], v12\n\t"
"vmacc.vx v1, %[t5], v13\n\t"
"vmacc.vx v0, %[t6], v14\n\t"
"vmacc.vx v1, %[t7], v15\n\t"
"vslideup.vi v10, v9, 1\n\t"
"vslideup.vi v8, v7, 1\n\t"
"vslideup.vi v11, v12, 1\n\t"
"vslideup.vi v13, v14, 1\n\t"
"vslideup.vi v10, v8, 2\n\t"
"vslideup.vi v11, v13, 2\n\t"
"vsetivli zero, 8, e32, m2\n\t"
"vle8.v v15, (%[scale])\n\t"
"vsext.vf4 v12, v15\n\t"
"vmul.vv v10, v10, v12\n\t"
"vredsum.vs v0, v10, v0\n\t"
"vmv.x.s %[tmp], v0\n\t"
"vmv.x.s %[t1], v1\n\t"
"add %[isum], %[isum], %[tmp]\n\t"
"add %[isum], %[isum], %[t1]"
: [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
, [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
, [m] "+&r" (m), [isum] "+&r" (isum)
"add %[isum], %[isum], %[tmp]"
: [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum)
: [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32)
, [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8)
: "memory"

View File

@@ -1,50 +0,0 @@
#include "ggml-backend-impl.h"
#if defined(__s390x__)
#include <sys/auxv.h>
// find hwcap bits in asm/elf.h
#ifndef HWCAP_VXRS_EXT2
#define HWCAP_VXRS_EXT2 (1 << 15)
#endif
#ifndef HWCAP_NNPA
#define HWCAP_NNPA (1 << 20)
#endif
struct s390x_features {
bool has_vxe2 = false;
bool has_nnpa = false;
s390x_features() {
uint32_t hwcap = getauxval(AT_HWCAP);
// NOTE: use hwcap2 with DFLT for z17 and later
// uint32_t hwcap2 = getauxval(AT_HWCAP2);
has_vxe2 = !!(hwcap & HWCAP_VXRS_EXT2);
has_nnpa = !!(hwcap & HWCAP_NNPA);
}
};
static int ggml_backend_cpu_s390x_score() {
int score = 1;
s390x_features sf;
// IBM z15 / LinuxONE 3
#ifdef GGML_USE_VXE2
if (!sf.has_vxe2) { return 0; }
score += 1 << 1;
#endif
// IBM z16 / LinuxONE 4 and z17 / LinuxONE 5
#ifdef GGML_USE_NNPA
if (!sf.has_nnpa) { return 0; }
score += 1 << 2;
#endif
return score;
}
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_s390x_score)
#endif // __s390x__

View File

@@ -500,15 +500,13 @@ inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
#endif
#if defined(__loongarch_sx)
#if defined(__loongarch_asx)
/* float type data load instructions */
static __m128 __lsx_vreplfr2vr_s(const float val) {
v4f32 res = {val, val, val, val};
return (__m128)res;
}
#endif
#if defined(__loongarch_asx)
static __m256 __lasx_xvreplfr2vr_s(const float val) {
v8f32 res = {val, val, val, val, val, val, val, val};
return (__m256)res;

View File

@@ -1613,8 +1613,13 @@ static void ggml_compute_forward_mul_mat_id(
chunk_size = 64;
}
#if defined(__aarch64__)
// disable for ARM
const bool disable_chunking = true;
#else
// disable for NUMA
const bool disable_chunking = ggml_is_numa();
#endif // defined(__aarch64__)
int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
@@ -1807,6 +1812,22 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_cont(params, tensor);
} break;
case GGML_OP_RESHAPE:
{
ggml_compute_forward_reshape(params, tensor);
} break;
case GGML_OP_VIEW:
{
ggml_compute_forward_view(params, tensor);
} break;
case GGML_OP_PERMUTE:
{
ggml_compute_forward_permute(params, tensor);
} break;
case GGML_OP_TRANSPOSE:
{
ggml_compute_forward_transpose(params, tensor);
} break;
case GGML_OP_GET_ROWS:
{
ggml_compute_forward_get_rows(params, tensor);
@@ -2026,22 +2047,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
// nop
} break;
case GGML_OP_RESHAPE:
{
// nop
} break;
case GGML_OP_PERMUTE:
{
// nop
} break;
case GGML_OP_VIEW:
{
// nop
} break;
case GGML_OP_TRANSPOSE:
{
// nop
} break;
case GGML_OP_COUNT:
{
GGML_ABORT("fatal error");
@@ -2884,11 +2889,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
struct ggml_tensor * node = cgraph->nodes[node_n];
if (ggml_op_is_empty(node->op)) {
// skip NOPs
continue;
}
ggml_compute_forward(&params, node);
if (state->ith == 0 && cplan->abort_callback &&

View File

@@ -4,7 +4,6 @@
// KleidiAI micro-kernels
#include "kai_matmul_clamp_f32_qsi8d32p_qsi4c32p_interface.h"
#include "kai_matmul_clamp_f32_qai8dxp_qsi8cxp_interface.h"
#include "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h"
#include "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h"
#include "kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h"
@@ -12,31 +11,20 @@
#include "kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h"
#include "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h"
#include "kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.h"
#include "kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.h"
#include "kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot.h"
#include "kai_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod.h"
#include "kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.h"
#include "kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.h"
#include "kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.h"
#include "kai_lhs_pack_bf16p2vlx2_f32_sme.h"
#include "kai_lhs_quant_pack_qsi8d32p_f32.h"
#include "kai_lhs_quant_pack_qsi8d32p4x8sb_f32_neon.h"
#include "kai_lhs_quant_pack_qsi8d32p_f32_neon.h"
#include "kai_lhs_quant_pack_qai8dxp_f32.h"
#include "kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.h"
#include "kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h"
#include "kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h"
#include "kai_rhs_pack_nxk_qsi8cxp_qsi8cx_neon.h"
#include "kai_common.h"
#include "simd-mappings.h"
#define GGML_COMMON_DECL_CPP
#include "ggml-common.h"
#include "kernels.h"
#define NELEMS(x) sizeof(x) / sizeof(*x)
@@ -67,14 +55,6 @@ static inline void kernel_run_fn10(size_t m, size_t n, size_t k, size_t /*bl*/,
Fn(m, n, k, lhs, rhs, dst, dst_stride_row, dst_stride_col, clamp_min, clamp_max);
}
template<void(*Fn)(size_t,size_t,size_t,const void*,const void*,float*,size_t,size_t,float,float)>
static inline void kernel_run_float_fn10(size_t m, size_t n, size_t k, size_t /*bl*/,
const void* lhs, const void* rhs, void* dst,
size_t dst_stride_row, size_t dst_stride_col,
float clamp_min, float clamp_max) {
Fn(m, n, k, lhs, rhs, static_cast<float*>(dst), dst_stride_row, dst_stride_col, clamp_min, clamp_max);
}
template<size_t(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t)>
static inline size_t lhs_ps_fn6(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr) {
return Fn(m, k, bl, mr, kr, sr);
@@ -113,12 +93,6 @@ static inline void lhs_pack_void_fn9(size_t m, size_t k, size_t /*bl*/, size_t m
Fn(m, k, mr, kr, sr, m_idx_start, lhs, lhs_stride, lhs_packed);
}
template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,const float*,size_t,void*)>
static inline void lhs_pack_float_fn9_no_bl(size_t m, size_t k, size_t /*bl*/, size_t mr, size_t kr, size_t sr,
size_t m_idx_start, const void * lhs, size_t lhs_stride, void * lhs_packed) {
Fn(m, k, mr, kr, sr, m_idx_start, static_cast<const float*>(lhs), lhs_stride, lhs_packed);
}
template<size_t(*Fn)(size_t,size_t,size_t,size_t,size_t)>
static inline size_t rhs_ps_fn5(size_t n, size_t k, size_t nr, size_t kr, size_t bl) {
return Fn(n, k, nr, kr, bl);
@@ -150,18 +124,6 @@ static inline void rhs_pack_fn12(size_t num_groups, size_t n, size_t k, size_t n
static_cast<const kai_rhs_pack_qs4cxs1s0_param*>(params));
}
template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,const int8_t*,const float*,const float*,void*,size_t,const struct kai_rhs_pack_qsi8cx_params*)>
static inline void rhs_pack_scale_fn12(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t /*bl*/,
size_t /*rhs_stride*/, const void* rhs, const void* bias, const void* scale,
void* rhs_packed, size_t extra_bytes, const void* params) {
Fn(num_groups, n, k, nr, kr, sr,
static_cast<const int8_t*>(rhs),
static_cast<const float*>(bias),
static_cast<const float*>(scale),
rhs_packed, extra_bytes,
static_cast<const kai_rhs_pack_qsi8cx_params*>(params));
}
template<void(*Fn)(size_t,size_t,size_t,size_t,size_t,size_t,size_t,const void*,const void*,const void*,void*,size_t,const void*)>
static inline void rhs_pack_fn13(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t /*bl*/,
size_t rhs_stride, const void* rhs, const void* bias, const void* scale,
@@ -251,57 +213,6 @@ static void dequantize_row_qsi4c32ps1s0scalef16(
GGML_UNUSED(kr);
}
static void dequantize_row_qsi8cxp(
const void *packed_data,
int32_t row_idx,
int64_t k,
float *out,
size_t nr,
size_t packed_row_stride,
size_t kr,
size_t bl,
size_t num_bytes_multiplier
) {
GGML_UNUSED(bl);
GGML_UNUSED(num_bytes_multiplier);
const size_t k_internal = ((size_t) k + QK8_0 - 1) / QK8_0 * QK8_0;
const size_t group_idx = row_idx / nr;
const size_t row_in_group = row_idx % nr;
const uint8_t * group_ptr = static_cast<const uint8_t *>(packed_data) + group_idx * packed_row_stride;
const int8_t * data_base = reinterpret_cast<const int8_t *>(group_ptr);
const size_t num_blocks = k_internal / kr;
for (size_t block = 0; block < num_blocks; ++block) {
const int8_t * block_ptr = data_base + (block * nr + row_in_group) * kr;
for (size_t i = 0; i < kr; ++i) {
const size_t k_idx = block * kr + i;
if (k_idx < (size_t) k) {
out[k_idx] = static_cast<float>(block_ptr[i]);
}
}
}
const uint8_t * sums_ptr = group_ptr + nr * k_internal;
GGML_UNUSED(sums_ptr);
const float * scale_ptr = reinterpret_cast<const float *>(sums_ptr + nr * sizeof(int32_t));
const float scale = scale_ptr[row_in_group];
if (scale == 0.0f) {
for (size_t i = 0; i < (size_t) k; ++i) {
out[i] = 0.0f;
}
return;
}
for (size_t i = 0; i < (size_t) k; ++i) {
out[i] *= scale;
}
}
static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
#if defined(__ARM_FEATURE_SME)
{
@@ -637,174 +548,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
#endif
};
static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
#if defined(__ARM_FEATURE_SME)
{
/* SME GEMM */
{
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa>,
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa>,
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa>,
},
/* .gemm_lhs_info = */ {
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
/* .get_packed_offset_ex = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
/* .packed_size_ex = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
/* .pack_func_ex = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
},
/* SME GEMV */
{
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot,
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot>,
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot>,
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot>,
},
/* .gemv_lhs_info = */ {
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
/* .get_packed_offset_ex = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
/* .packed_size_ex = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
/* .pack_func_ex = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
},
/* .rhs_info = */ {
/* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon,
/* .to_float = */ dequantize_row_qsi8cxp,
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
/* .pack_func_ex = */ &rhs_pack_scale_fn12<kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
},
/* .required_cpu = */ CPU_FEATURE_SME,
/* .lhs_type = */ GGML_TYPE_F32,
/* .rhs_type = */ GGML_TYPE_Q8_0,
/* .op_type = */ GGML_TYPE_F32,
},
#endif
#if defined(__ARM_FEATURE_MATMUL_INT8)
{
/* I8MM GEMM */
{
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm>,
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm>,
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm>,
},
/* .gemm_lhs_info = */ {
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
/* .get_packed_offset_ex = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
/* .packed_size_ex = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
/* .pack_func_ex = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
},
/* I8MM GEMV (dotprod fallback) */
{
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod,
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod>,
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod>,
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1x8_qsi8cxp4x8_1x4_neon_dotprod>,
},
/* .gemv_lhs_info = */ {
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
/* .get_packed_offset_ex = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
/* .packed_size_ex = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
/* .pack_func_ex = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
},
/* .rhs_info = */ {
/* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon,
/* .to_float = */ dequantize_row_qsi8cxp,
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
/* .pack_func_ex = */ &rhs_pack_scale_fn12<kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
},
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
/* .lhs_type = */ GGML_TYPE_F32,
/* .rhs_type = */ GGML_TYPE_Q8_0,
/* .op_type = */ GGML_TYPE_F32,
},
#endif
#if defined(__ARM_FEATURE_DOTPROD)
{
/* DOTPROD GEMM */
{
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod,
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod>,
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod>,
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod>,
},
/* .gemm_lhs_info = */ {
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
/* .get_packed_offset_ex = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
/* .packed_size_ex = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
/* .pack_func_ex = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
},
/* DOTPROD GEMV */
{
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
/* .get_lhs_offset_ex = */ &kernel_offs_fn2<kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod>,
/* .get_rhs_packed_offset_ex = */ &kernel_offs_fn2<kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod>,
/* .run_kernel_ex = */ &kernel_run_float_fn10<kai_run_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod>,
},
/* .gemv_lhs_info = */ {
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qai8dxp_f32,
/* .get_packed_offset_ex = */ &lhs_offs_fn5<kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32>,
/* .packed_size_ex = */ &lhs_ps_fn5<kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32>,
/* .pack_func_ex = */ &lhs_pack_float_fn9_no_bl<kai_run_lhs_quant_pack_qai8dxp_f32>,
},
/* .rhs_info = */ {
/* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon,
/* .to_float = */ dequantize_row_qsi8cxp,
/* .packed_size_ex = */ &rhs_ps_fn5<kai_get_rhs_packed_size_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
/* .pack_func_ex = */ &rhs_pack_scale_fn12<kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
},
/* .required_cpu = */ CPU_FEATURE_DOTPROD,
/* .lhs_type = */ GGML_TYPE_F32,
/* .rhs_type = */ GGML_TYPE_Q8_0,
/* .op_type = */ GGML_TYPE_F32,
},
#endif
};
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor) {
ggml_kleidiai_kernels * kernel = nullptr;
@@ -819,17 +562,6 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c
break;
}
}
if (!kernel) {
for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8); ++i) {
if ((cpu_features & gemm_gemv_kernels_q8[i].required_cpu) == gemm_gemv_kernels_q8[i].required_cpu &&
gemm_gemv_kernels_q8[i].lhs_type == tensor->src[1]->type &&
gemm_gemv_kernels_q8[i].rhs_type == tensor->src[0]->type &&
gemm_gemv_kernels_q8[i].op_type == tensor->type) {
kernel = &gemm_gemv_kernels_q8[i];
break;
}
}
}
#endif
}
@@ -850,18 +582,3 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features)
return kernels;
}
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q8_0(cpu_feature features) {
ggml_kleidiai_kernels * kernels = nullptr;
#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)
for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8); ++i) {
if ((features & gemm_gemv_kernels_q8[i].required_cpu) == gemm_gemv_kernels_q8[i].required_cpu) {
kernels = &gemm_gemv_kernels_q8[i];
break;
}
}
#endif
return kernels;
}

View File

@@ -87,4 +87,3 @@ struct ggml_kleidiai_kernels {
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor);
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features);
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q8_0(cpu_feature features);

View File

@@ -5,13 +5,10 @@
#include <assert.h>
#include <atomic>
#include <cfloat>
#include <cmath>
#include <algorithm>
#include <stdexcept>
#include <stdint.h>
#include <string.h>
#include <string>
#include <vector>
#if defined(__linux__)
#include <asm/hwcap.h>
#include <sys/auxv.h>
@@ -41,9 +38,8 @@
struct ggml_kleidiai_context {
cpu_feature features;
ggml_kleidiai_kernels * kernels_q4;
ggml_kleidiai_kernels * kernels_q8;
} static ctx = { CPU_FEATURE_NONE, NULL, NULL };
ggml_kleidiai_kernels * kernels;
} static ctx = { CPU_FEATURE_NONE, NULL };
static const char* cpu_feature_to_string(cpu_feature f) {
switch (f) {
@@ -77,14 +73,10 @@ static void init_kleidiai_context(void) {
if (sme_enabled != 0) {
ctx.features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
}
ctx.kernels_q4 = ggml_kleidiai_select_kernels_q4_0(ctx.features);
ctx.kernels_q8 = ggml_kleidiai_select_kernels_q8_0(ctx.features);
ctx.kernels = ggml_kleidiai_select_kernels_q4_0(ctx.features);
#ifndef NDEBUG
if (ctx.kernels_q4) {
GGML_LOG_DEBUG("kleidiai: using q4 kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels_q4->required_cpu));
}
if (ctx.kernels_q8) {
GGML_LOG_DEBUG("kleidiai: using q8 kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels_q8->required_cpu));
if (ctx.kernels) {
GGML_LOG_DEBUG("kleidiai: using kernel with CPU feature %s\n", cpu_feature_to_string(ctx.kernels->required_cpu));
}
#endif
}
@@ -138,9 +130,6 @@ class tensor_traits : public ggml::cpu::tensor_traits {
if (kernels->rhs_type == GGML_TYPE_Q4_0) {
if (!lhs_info->packed_size_ex) return false;
size = lhs_info->packed_size_ex(m, k, QK4_0, mr, kr, sr);
} else if (kernels->rhs_type == GGML_TYPE_Q8_0) {
if (!lhs_info->packed_size_ex) return false;
size = lhs_info->packed_size_ex(m, k, QK8_0, mr, kr, sr);
} else if (kernels->rhs_type == GGML_TYPE_F16) {
if (!lhs_info->packed_size_ex || !kernels->rhs_info.packed_size_ex) return false;
const int64_t lhs_batch_size0 = op->src[1]->ne[2];
@@ -160,13 +149,11 @@ class tensor_traits : public ggml::cpu::tensor_traits {
if (dst->op == GGML_OP_MUL_MAT) {
if (dst->src[0]->type == GGML_TYPE_Q4_0) {
return compute_forward_q4_0(params, dst);
} else if (dst->src[0]->type == GGML_TYPE_Q8_0) {
return compute_forward_q8_0(params, dst);
} else if (dst->src[0]->type == GGML_TYPE_F16) {
return compute_forward_fp16(params, dst);
}
} else if (dst->op == GGML_OP_GET_ROWS) {
if (dst->src[0]->type == GGML_TYPE_Q4_0 || dst->src[0]->type == GGML_TYPE_Q8_0) {
if (dst->src[0]->type == GGML_TYPE_Q4_0) {
return compute_forward_get_rows(params, dst);
}
}
@@ -413,120 +400,19 @@ class tensor_traits : public ggml::cpu::tensor_traits {
return true;
}
bool compute_forward_q8_0(struct ggml_compute_params * params, struct ggml_tensor * dst) {
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q8_0);
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
GGML_TENSOR_BINARY_OP_LOCALS
ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
if (!kernels) {
return false;
}
bool is_gemv = src1->ne[1] == 1;
kernel_info * kernel = is_gemv ? &kernels->gemv : &kernels->gemm;
lhs_packing_info * lhs_info = is_gemv ? &kernels->gemv_lhs_info : &kernels->gemm_lhs_info;
if (!kernel || !lhs_info->get_packed_offset_ex || !lhs_info->pack_func_ex ||
!kernel->get_rhs_packed_offset_ex || !kernel->run_kernel_ex || !kernel->get_dst_offset) {
return false;
}
const int ith = params->ith;
const int nth_raw = params->nth;
const int nth = nth_raw > 0 ? nth_raw : 1;
const size_t k = ne00;
const size_t m = ne11;
const size_t n = ne01;
size_t mr = kernel->get_mr();
size_t kr = kernel->get_kr();
size_t sr = kernel->get_sr();
const uint8_t * lhs = static_cast<const uint8_t *>(src1->data);
uint8_t * lhs_packed = static_cast<uint8_t *>(params->wdata);
const uint8_t * rhs_packed = static_cast<const uint8_t *>(src0->data);
const size_t n_step = kernel->get_n_step();
const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
const size_t n_start = ith * num_n_per_thread;
size_t n_to_process = 0;
if (n_start < n) {
n_to_process = num_n_per_thread;
if ((n_start + n_to_process) > n) {
n_to_process = n - n_start;
}
}
const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
const size_t m_start = ith * num_m_per_thread;
size_t m_to_process = num_m_per_thread;
if ((m_start + m_to_process) > m) {
m_to_process = m - m_start;
}
if (m_start < m) {
const size_t src_stride = src1->nb[1];
const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
const size_t lhs_packed_offset = lhs_info->get_packed_offset_ex(m_start, k, 0, mr, kr, sr);
void * lhs_packed_ptr = static_cast<void *>(lhs_packed + lhs_packed_offset);
lhs_info->pack_func_ex(m_to_process, k, 0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
}
ggml_barrier(params->threadpool);
const size_t dst_stride = dst->nb[1];
const size_t lhs_packed_offset = lhs_info->get_packed_offset_ex(0, k, 0, mr, kr, sr);
const size_t rhs_packed_offset = kernel->get_rhs_packed_offset_ex(n_start, k, 0);
const size_t dst_offset = kernel->get_dst_offset(0, n_start, dst_stride);
const void * rhs_ptr = static_cast<const void *>(rhs_packed + rhs_packed_offset);
const void * lhs_ptr = static_cast<const void *>(lhs_packed + lhs_packed_offset);
float * dst_ptr = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);
if (n_to_process > 0) {
kernel->run_kernel_ex(m, n_to_process, k, 0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
sizeof(float), -FLT_MAX, FLT_MAX);
}
return true;
}
bool compute_forward_get_rows(struct ggml_compute_params * params, struct ggml_tensor * dst) {
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q4_0);
if (!ctx.kernels) {
return false;
}
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
GGML_TENSOR_BINARY_OP_LOCALS
ggml_kleidiai_kernels * kernels = nullptr;
size_t block_len = 0;
size_t num_bytes_multiplier = 0;
if (dst->src[0]->type == GGML_TYPE_Q4_0) {
if (!ctx.kernels_q4) {
return false;
}
kernels = ctx.kernels_q4;
block_len = QK4_0;
num_bytes_multiplier = sizeof(uint16_t);
} else if (dst->src[0]->type == GGML_TYPE_Q8_0) {
if (!ctx.kernels_q8) {
return false;
}
kernels = ctx.kernels_q8;
block_len = QK8_0;
num_bytes_multiplier = sizeof(float);
} else {
return false;
}
rhs_packing_info * rhs_info = &kernels->rhs_info;
kernel_info * kernel = &kernels->gemm;
rhs_packing_info * rhs_info = &ctx.kernels->rhs_info;
kernel_info * kernel = &ctx.kernels->gemm;
if (!rhs_info->to_float || !kernel->get_nr) {
return false;
}
@@ -537,7 +423,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {
const size_t block_rows = kernel->get_nr();
const size_t kr = kernel->get_kr();
const size_t packed_stride = rhs_info->packed_stride(nc, block_rows, kr, block_len);
const size_t num_bytes_multiplier = sizeof(uint16_t);
const size_t packed_stride = rhs_info->packed_stride(nc, block_rows, kr, QK4_0);
const int ith = params->ith;
const int nth = params->nth;
@@ -552,7 +439,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
GGML_ASSERT(row_idx >= 0 && row_idx < src0->ne[1]);
float *out = (float *)((char *)dst->data + i * nb1);
rhs_info->to_float(src0->data, row_idx, nc, out, block_rows, packed_stride, kr, block_len, num_bytes_multiplier);
rhs_info->to_float(src0->data, row_idx, nc, out, block_rows, packed_stride, kr, QK4_0, num_bytes_multiplier);
}
return true;
@@ -560,91 +447,21 @@ class tensor_traits : public ggml::cpu::tensor_traits {
public:
int repack(struct ggml_tensor * tensor, const void * data, size_t data_size) {
GGML_ASSERT(tensor->type == GGML_TYPE_Q4_0);
GGML_ASSERT(ctx.kernels);
const size_t n = tensor->ne[1];
const size_t k = tensor->ne[0];
size_t nr = ctx.kernels->gemm.get_nr();
size_t kr = ctx.kernels->gemm.get_kr();
size_t sr = ctx.kernels->gemm.get_sr();
if (tensor->type == GGML_TYPE_Q4_0) {
if (!ctx.kernels_q4) {
return -1;
}
size_t nr = ctx.kernels_q4->gemm.get_nr();
size_t kr = ctx.kernels_q4->gemm.get_kr();
size_t sr = ctx.kernels_q4->gemm.get_sr();
struct kai_rhs_pack_qs4cxs1s0_param params;
params.lhs_zero_point = 1;
params.rhs_zero_point = 8;
ctx.kernels_q4->rhs_info.pack_func_ex(1, n, k, nr, kr, sr, QK4_0, 0,
static_cast<const uint8_t *>(data),
nullptr, nullptr, tensor->data, 0, &params);
GGML_UNUSED(data_size);
return 0;
} else if (tensor->type == GGML_TYPE_Q8_0) {
if (!ctx.kernels_q8) {
return -1;
}
const size_t row_stride = tensor->nb[1];
const size_t k_blocks = (k + QK8_0 - 1) / QK8_0;
std::vector<int8_t> qdata(n * k, 0);
std::vector<float> scales(n, 0.0f);
for (size_t row = 0; row < n; ++row) {
const auto * row_blocks = reinterpret_cast<const block_q8_0 *>(
static_cast<const uint8_t *>(data) + row * row_stride);
float max_abs = 0.0f;
for (size_t block = 0; block < k_blocks; ++block) {
const block_q8_0 & blk = row_blocks[block];
const float d = GGML_FP16_TO_FP32(blk.d);
for (size_t l = 0; l < QK8_0; ++l) {
const size_t linear_idx = block * QK8_0 + l;
if (linear_idx >= k) {
break;
}
const float value = d * blk.qs[l];
max_abs = std::max(max_abs, std::fabs(value));
}
}
float scale = max_abs > 0.0f ? max_abs / 127.0f : 0.0f;
scales[row] = scale;
const float inv_scale = scale > 0.0f ? 1.0f / scale : 0.0f;
for (size_t block = 0; block < k_blocks; ++block) {
const block_q8_0 & blk = row_blocks[block];
const float d = GGML_FP16_TO_FP32(blk.d);
for (size_t l = 0; l < QK8_0; ++l) {
const size_t linear_idx = block * QK8_0 + l;
if (linear_idx >= k) {
break;
}
const float value = d * blk.qs[l];
int32_t q = scale > 0.0f ? static_cast<int32_t>(std::lround(value * inv_scale)) : 0;
q = std::clamp(q, -127, 127);
qdata[row * k + linear_idx] = static_cast<int8_t>(q);
}
}
}
size_t nr = ctx.kernels_q8->gemm.get_nr();
size_t kr = ctx.kernels_q8->gemm.get_kr();
size_t sr = ctx.kernels_q8->gemm.get_sr();
struct kai_rhs_pack_qsi8cx_params params;
params.lhs_zero_point = 1;
params.scale_multiplier = 1.0f;
ctx.kernels_q8->rhs_info.pack_func_ex(1, n, k, nr, kr, sr, 0, 0,
qdata.data(), nullptr, scales.data(),
tensor->data, 0, &params);
GGML_UNUSED(data_size);
return 0;
}
struct kai_rhs_pack_qs4cxs1s0_param params;
params.lhs_zero_point = 1;
params.rhs_zero_point = 8;
ctx.kernels->rhs_info.pack_func_ex(1, n, k, nr, kr, sr, QK4_0, 0, (const uint8_t*)data, nullptr, nullptr, tensor->data, 0, &params);
return 0;
GGML_UNUSED(data_size);
return -1;
}
};
@@ -701,45 +518,27 @@ static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alignment(ggml_backend_b
}
static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
GGML_ASSERT(tensor->type == GGML_TYPE_Q4_0);
GGML_ASSERT(ctx.kernels);
const size_t n = tensor->ne[1];
const size_t k = tensor->ne[0];
const size_t nr = ctx.kernels->gemm.get_nr();
const size_t kr = ctx.kernels->gemm.get_kr();
return ctx.kernels->rhs_info.packed_size_ex(n, k, nr, kr, QK4_0);
GGML_UNUSED(buft);
const size_t n = tensor->ne[1];
const size_t k = tensor->ne[0];
ggml_kleidiai_kernels * kernels = nullptr;
size_t block_len = 0;
if (tensor->type == GGML_TYPE_Q4_0) {
GGML_ASSERT(ctx.kernels_q4);
kernels = ctx.kernels_q4;
block_len = QK4_0;
} else if (tensor->type == GGML_TYPE_Q8_0) {
GGML_ASSERT(ctx.kernels_q8);
kernels = ctx.kernels_q8;
block_len = QK8_0;
} else {
return 0;
}
const size_t nr = kernels->gemm.get_nr();
const size_t kr = kernels->gemm.get_kr();
const size_t packed = kernels->rhs_info.packed_size_ex(n, k, nr, kr, block_len);
const size_t raw = ggml_nbytes(tensor);
return packed > raw ? packed : raw;
}
namespace ggml::cpu::kleidiai {
class extra_buffer_type : ggml::cpu::extra_buffer_type {
bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
if ((op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_GET_ROWS) &&
(op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_Q8_0) &&
op->src[0]->type == GGML_TYPE_Q4_0 &&
op->src[0]->buffer &&
(ggml_n_dims(op->src[0]) == 2) &&
op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
if (((op->src[0]->type == GGML_TYPE_Q4_0) ? ctx.kernels_q4 : ctx.kernels_q8) == nullptr) {
return false;
}
op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels) {
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
return false;
}

View File

@@ -4455,6 +4455,46 @@ void ggml_compute_forward_cont(
ggml_compute_forward_dup(params, dst);
}
// ggml_compute_forward_reshape
void ggml_compute_forward_reshape(
const ggml_compute_params * params,
ggml_tensor * dst) {
// NOP
GGML_UNUSED(params);
GGML_UNUSED(dst);
}
// ggml_compute_forward_view
void ggml_compute_forward_view(
const ggml_compute_params * params,
ggml_tensor * dst) {
// NOP
GGML_UNUSED(params);
GGML_UNUSED(dst);
}
// ggml_compute_forward_permute
void ggml_compute_forward_permute(
const ggml_compute_params * params,
ggml_tensor * dst) {
// NOP
GGML_UNUSED(params);
GGML_UNUSED(dst);
}
// ggml_compute_forward_transpose
void ggml_compute_forward_transpose(
const ggml_compute_params * params,
ggml_tensor * dst) {
// NOP
GGML_UNUSED(params);
GGML_UNUSED(dst);
}
// ggml_compute_forward_get_rows
static void ggml_compute_forward_get_rows_q(
@@ -5434,7 +5474,7 @@ static void ggml_rope_cache_init(
}
static void ggml_mrope_cache_init(
float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool is_imrope, bool indep_sects,
float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool indep_sects,
float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
float * cache, float sin_sign, float theta_scale) {
// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
@@ -5469,26 +5509,14 @@ static void ggml_mrope_cache_init(
}
float theta = theta_t;
if (is_imrope) { // qwen3vl apply interleaved mrope
if (sector % 3 == 1 && sector < 3 * sections[1]) {
theta = theta_h;
} else if (sector % 3 == 2 && sector < 3 * sections[2]) {
theta = theta_w;
} else if (sector % 3 == 0 && sector < 3 * sections[0]) {
theta = theta_t;
} else {
theta = theta_e;
}
} else {
if (sector >= sections[0] && sector < sec_w) {
theta = theta_h;
}
else if (sector >= sec_w && sector < sec_w + sections[2]) {
theta = theta_w;
}
else if (sector >= sec_w + sections[2]) {
theta = theta_e;
}
if (sector >= sections[0] && sector < sec_w) {
theta = theta_h;
}
else if (sector >= sec_w && sector < sec_w + sections[2]) {
theta = theta_w;
}
else if (sector >= sec_w + sections[2]) {
theta = theta_e;
}
rope_yarn(
@@ -5561,7 +5589,6 @@ static void ggml_compute_forward_rope_f32(
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, multimodal rotary position embedding
const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
if (is_mrope) {
@@ -5600,7 +5627,7 @@ static void ggml_compute_forward_rope_f32(
const int64_t p_w = pos[i2 + ne2 * 2];
const int64_t p_e = pos[i2 + ne2 * 3];
ggml_mrope_cache_init(
p_t, p_h, p_w, p_e, sections, is_imrope, is_vision,
p_t, p_h, p_w, p_e, sections, is_vision,
freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
}
@@ -5748,7 +5775,6 @@ static void ggml_compute_forward_rope_f16(
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
if (is_mrope) {
@@ -5787,7 +5813,7 @@ static void ggml_compute_forward_rope_f16(
const int64_t p_w = pos[i2 + ne2 * 2];
const int64_t p_e = pos[i2 + ne2 * 3];
ggml_mrope_cache_init(
p_t, p_h, p_w, p_e, sections, is_imrope, is_vision,
p_t, p_h, p_w, p_e, sections, is_vision,
freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
}
@@ -7044,11 +7070,7 @@ static void ggml_compute_forward_conv_2d_dw_cwhn(
const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
#ifdef GGML_SIMD
#if defined(__ARM_FEATURE_SVE)
const int64_t pkg_size = svcntw();
#else
const int64_t pkg_size = GGML_F32_EPR;
#endif
const int64_t pkg_size = GGML_F32_EPR;
const int64_t pkg_count = c / pkg_size;
const int64_t c_pkg_end = pkg_count * pkg_size;
#else
@@ -7471,17 +7493,10 @@ static void ggml_compute_forward_upscale_f32(
float sf1 = (float)ne1/src0->ne[1];
float sf2 = (float)ne2/src0->ne[2];
float sf3 = (float)ne3/src0->ne[3];
float pixel_offset = 0.5f;
const int32_t mode_flags = ggml_get_op_params_i32(dst, 0);
const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);
if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
pixel_offset = 0.0f;
sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
}
if (mode == GGML_SCALE_MODE_NEAREST) {
for (int64_t i3 = 0; i3 < ne3; i3++) {
const int64_t i03 = i3 / sf3;
@@ -7501,6 +7516,13 @@ static void ggml_compute_forward_upscale_f32(
}
}
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
float pixel_offset = 0.5f;
if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
pixel_offset = 0.0f;
sf0 = (float)(ne0 - 1) / (src0->ne[0] - 1);
sf1 = (float)(ne1 - 1) / (src0->ne[1] - 1);
}
for (int64_t i3 = 0; i3 < ne3; i3++) {
const int64_t i03 = i3 / sf3;
for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
@@ -7535,51 +7557,6 @@ static void ggml_compute_forward_upscale_f32(
const float val = a*(1 - dx)*(1 - dy) + b*dx*(1 - dy) + c*(1 - dx)*dy + d*dx*dy;
float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
*y_dst = val;
}
}
}
}
} else if (mode == GGML_SCALE_MODE_BICUBIC) {
// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
const float a = -0.75f; // use alpha = -0.75 (same as PyTorch)
auto weight1 = [a](float x) { return ((a + 2) * x - (a + 3)) * x * x + 1; };
auto weight2 = [a](float x) { return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a; };
auto bicubic = [=](float p0, float p1, float p2, float p3, float x) {
const float w0 = weight2(x + 1);
const float w1 = weight1(x + 0);
const float w2 = weight1(1 - x);
const float w3 = weight2(2 - x);
return p0*w0 + p1*w1 + p2*w2 + p3*w3;
};
for (int64_t i3 = 0; i3 < ne3; i3++) {
const int64_t i03 = i3 / sf3;
for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
const int64_t i02 = i2 / sf2;
for (int64_t i1 = 0; i1 < ne1; i1++) {
const float y = ((float)i1 + pixel_offset) / sf1 - pixel_offset;
const int64_t y0 = (int64_t)floorf(y);
const float dy = y - (float)y0;
for (int64_t i0 = 0; i0 < ne0; i0++) {
const float x = ((float)i0 + pixel_offset) / sf0 - pixel_offset;
const int64_t x0 = (int64_t)floorf(x);
const float dx = x - (float)x0;
auto p = [=](int64_t x_off, int64_t y_off) -> float {
int64_t i00 = std::max(int64_t(0), std::min(x0 + x_off, ne00 - 1));
int64_t i01 = std::max(int64_t(0), std::min(y0 + y_off, ne01 - 1));
return *(const float *)((const char *)src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
};
const float val = bicubic(
bicubic(p(-1,-1), p(0,-1), p(1,-1), p(2,-1), dx),
bicubic(p(-1, 0), p(0, 0), p(1, 0), p(2, 0), dx),
bicubic(p(-1, 1), p(0, 1), p(1, 1), p(2, 1), dx),
bicubic(p(-1, 2), p(0, 2), p(1, 2), p(2, 2), dx), dy);
float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
*y_dst = val;
}
@@ -7932,10 +7909,10 @@ void ggml_compute_forward_argsort(
// ggml_compute_forward_flash_attn_ext
static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
static void ggml_compute_forward_flash_attn_ext_f16(
const ggml_compute_params * params,
ggml_tensor * dst,
int ir0, int ir1) {
ggml_tensor * dst) {
const ggml_tensor * q = dst->src[0];
const ggml_tensor * k = dst->src[1];
const ggml_tensor * v = dst->src[2];
@@ -7951,6 +7928,9 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
const int ith = params->ith;
const int nth = params->nth;
const int64_t DK = nek0;
const int64_t DV = nev0;
const int64_t N = neq1;
@@ -7984,6 +7964,16 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
// parallelize by q rows using ggml_vec_dot_f32
// total rows in q
const int nr = neq1*neq2*neq3;
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
float scale = 1.0f;
float max_bias = 0.0f;
float logit_softcap = 0.0f;
@@ -8010,8 +8000,6 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
GGML_ASSERT(( q_to_vec_dot) && "fattn: unsupported K-type");
GGML_ASSERT((v->type == GGML_TYPE_F32 || v_to_float ) && "fattn: unsupported V-type");
int ith = params->ith;
// loop over n_batch and n_head
for (int ir = ir0; ir < ir1; ++ir) {
// q indices
@@ -8159,91 +8147,6 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
}
}
static void ggml_compute_forward_flash_attn_ext_f16(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * q = dst->src[0];
const ggml_tensor * k = dst->src[1];
const ggml_tensor * v = dst->src[2];
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
const int64_t DK = nek0;
const int64_t DV = nev0;
const int64_t N = neq1;
GGML_ASSERT(ne0 == DV);
GGML_ASSERT(ne2 == N);
// input tensor rows must be contiguous
GGML_ASSERT(nbq0 == ggml_type_size(q->type));
GGML_ASSERT(nbk0 == ggml_type_size(k->type));
GGML_ASSERT(nbv0 == ggml_type_size(v->type));
GGML_ASSERT(neq0 == DK);
GGML_ASSERT(nek0 == DK);
GGML_ASSERT(nev0 == DV);
GGML_ASSERT(neq1 == N);
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
// parallelize by q rows using ggml_vec_dot_f32
// total rows in q
const int64_t nr = neq1*neq2*neq3;
// rows per thread
const int ith = params->ith;
const int nth = params->nth;
// disable for NUMA
const bool disable_chunking = ggml_is_numa();
// 4x chunks per thread
int nth_scaled = nth * 4;
int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
int64_t nchunk = (nr + chunk_size - 1) / chunk_size;
if (nth == 1 || nchunk < nth || disable_chunking) {
nchunk = nth;
}
if (ith == 0) {
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
ggml_threadpool_chunk_set(params->threadpool, nth);
}
ggml_barrier(params->threadpool);
// The number of elements in each chunk
const int64_t dr = (nr + nchunk - 1) / nchunk;
// The first chunk comes from our thread_id, the rest will get auto-assigned.
int current_chunk = ith;
while (current_chunk < nchunk) {
const int64_t ir0 = dr * current_chunk;
const int64_t ir1 = MIN(ir0 + dr, nr);
ggml_compute_forward_flash_attn_ext_f16_one_chunk(params, dst, ir0, ir1);
current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
}
}
void ggml_compute_forward_flash_attn_ext(
const ggml_compute_params * params,
ggml_tensor * dst) {

View File

@@ -51,6 +51,10 @@ void ggml_compute_forward_scale(const struct ggml_compute_params * params, struc
void ggml_compute_forward_set(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_cpy(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_cont(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_reshape(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_view(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_permute(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_set_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);

View File

@@ -1600,32 +1600,6 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
return false;
}
void forward_mul_mat_one_chunk(ggml_compute_params * params, ggml_tensor * op, int64_t src0_start, int64_t src0_end) {
const ggml_tensor * src0 = op->src[0];
const ggml_tensor * src1 = op->src[1];
ggml_tensor * dst = op;
GGML_TENSOR_BINARY_OP_LOCALS
const void * src1_wdata = params->wdata;
const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
// If there are more than three rows in src1, use gemm; otherwise, use gemv.
if (ne11 > 3) {
gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
(float *) ((char *) dst->data) + src0_start, ne01,
(const char *) src0->data + src0_start * nb01,
(const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
}
for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
(float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
(const char *) src0->data + src0_start * nb01,
(const char *) src1_wdata + (src1_col_stride * iter), 1,
src0_end - src0_start);
}
}
void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
const ggml_tensor * src0 = op->src[0];
const ggml_tensor * src1 = op->src[1];
@@ -1669,62 +1643,31 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
}
// disable for NUMA
const bool disable_chunking = ggml_is_numa();
// 4x chunks per thread
int64_t nr = ggml_nrows(op->src[0]);
int nth_scaled = nth * 4;
int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
int64_t nchunk = (nr + chunk_size - 1) / chunk_size;
// Ensure minimum chunk size to avoid alignment issues with high thread counts
// Minimum chunk size should be at least NB_COLS to prevent overlapping chunks after alignment
const int64_t min_chunk_size = NB_COLS;
if (nchunk > 0 && (nr / nchunk) < min_chunk_size && nr >= min_chunk_size) {
nchunk = (nr + min_chunk_size - 1) / min_chunk_size;
}
if (nth == 1 || nchunk < nth || disable_chunking) {
nchunk = nth;
}
// Ensure nchunk doesn't exceed the number of rows divided by minimum chunk size
// This prevents creating too many tiny chunks that could overlap after alignment
const int64_t max_nchunk = (nr + min_chunk_size - 1) / min_chunk_size;
if (nchunk > max_nchunk) {
nchunk = max_nchunk;
}
if (ith == 0) {
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
ggml_threadpool_chunk_set(params->threadpool, nth);
}
ggml_barrier(params->threadpool);
// The first chunk comes from our thread_id, the rest will get auto-assigned.
int current_chunk = ith;
const void * src1_wdata = params->wdata;
const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
int64_t src0_start = (ith * ne01) / nth;
int64_t src0_end = ((ith + 1) * ne01) / nth;
src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
if (src0_start >= src0_end) {
return;
}
while (current_chunk < nchunk) {
int64_t src0_start = (current_chunk * ne01) / nchunk;
int64_t src0_end = ((current_chunk + 1) * ne01) / nchunk;
// Align boundaries to NB_COLS - round up to ensure all data is included
// The chunk size limiting above ensures chunks are large enough to prevent overlaps
src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
if (src0_end > ne01) {
src0_end = ne01;
}
if (src0_start >= src0_end) {
break;
}
forward_mul_mat_one_chunk(params, dst, src0_start, src0_end);
current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
// If there are more than three rows in src1, use gemm; otherwise, use gemv.
if (ne11 > 3) {
gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
(float *) ((char *) dst->data) + src0_start, ne01,
(const char *) src0->data + src0_start * nb01,
(const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
}
for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
(float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
(const char *) src0->data + src0_start * nb01,
(const char *) src1_wdata + (src1_col_stride * iter), 1,
src0_end - src0_start);
}
}
@@ -1829,12 +1772,8 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
int64_t src0_cur_start = (ith * ne01) / nth;
int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
// Align boundaries to NB_COLS - round up to ensure all data is included
src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end;
if (src0_cur_end > ne01) {
src0_cur_end = ne01;
}
if (src0_cur_start >= src0_cur_end) {
return;

View File

@@ -956,7 +956,7 @@ do { \
#define GGML_F32Cx8 __m256
#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
__m256i a;
@@ -999,34 +999,34 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
#define GGML_F32x4 __m128
#define GGML_F32x4_ZERO (__m128)__lsx_vldi(0)
#define GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
#define GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
#define GGML_F32x4_ADD __lsx_vfadd_s
#define GGML_F32x4_MUL __lsx_vfmul_s
#define GGML_F32x4_REDUCE(res, x) \
{ \
int offset = GGML_F32_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
} \
__m128i t0 = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]); \
__m128i t1 = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]); \
__m128 t2 = __lsx_vfadd_s((__m128)t0, (__m128)t1); \
__m128i t3 = __lsx_vpickev_w((__m128i)t2, (__m128i)t2); \
__m128i t4 = __lsx_vpickod_w((__m128i)t2, (__m128i)t2); \
__m128 t5 = __lsx_vfadd_s((__m128)t3, (__m128)t4); \
res = (ggml_float) ((v4f32)t5)[0]; \
#define GGML_F32x4_REDUCE(res, x) \
{ \
int offset = GGML_F32_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
} \
__m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88); \
tmp = __lsx_vsrli_d((__m128i) t0, 32); \
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
}
#define GGML_F32_VEC GGML_F32x4
@@ -1068,7 +1068,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
#define GGML_F32Cx4 __m128
#define GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0)
#define GGML_F32Cx4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
#define GGML_F32Cx4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
#define GGML_F32Cx4_FMA GGML_F32x4_FMA

View File

@@ -124,7 +124,6 @@ if (CUDAToolkit_FOUND)
if (GGML_CUDA_DEBUG)
list(APPEND CUDA_FLAGS -lineinfo)
add_compile_definitions(GGML_CUDA_DEBUG)
endif()
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")

View File

@@ -1,81 +1,5 @@
#include "argsort.cuh"
#ifdef GGML_CUDA_USE_CUB
# include <cub/cub.cuh>
using namespace cub;
#endif // GGML_CUDA_USE_CUB
static __global__ void init_indices(int * indices, const int ncols, const int nrows) {
const int col = blockIdx.x * blockDim.x + threadIdx.x;
const int row = blockIdx.y;
if (col < ncols && row < nrows) {
indices[row * ncols + col] = col;
}
}
static __global__ void init_offsets(int * offsets, const int ncols, const int nrows) {
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx <= nrows) {
offsets[idx] = idx * ncols;
}
}
#ifdef GGML_CUDA_USE_CUB
static void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
const float * x,
int * dst,
const int ncols,
const int nrows,
ggml_sort_order order,
cudaStream_t stream) {
ggml_cuda_pool_alloc<int> temp_indices_alloc(pool, ncols * nrows);
ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ncols * nrows);
ggml_cuda_pool_alloc<int> offsets_alloc(pool, nrows + 1);
int * temp_indices = temp_indices_alloc.get();
float * temp_keys = temp_keys_alloc.get();
int * d_offsets = offsets_alloc.get();
static const int block_size = 256;
const dim3 grid_size((ncols + block_size - 1) / block_size, nrows);
init_indices<<<grid_size, block_size, 0, stream>>>(temp_indices, ncols, nrows);
const dim3 offset_grid((nrows + block_size - 1) / block_size);
init_offsets<<<offset_grid, block_size, 0, stream>>>(d_offsets, ncols, nrows);
cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream);
size_t temp_storage_bytes = 0;
if (order == GGML_SORT_ORDER_ASC) {
DeviceSegmentedRadixSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols * nrows, nrows, // num items, num segments
d_offsets, d_offsets + 1, 0, sizeof(float) * 8, // all bits
stream);
} else {
DeviceSegmentedRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices,
dst, ncols * nrows, nrows, d_offsets, d_offsets + 1, 0,
sizeof(float) * 8, stream);
}
ggml_cuda_pool_alloc<uint8_t> temp_storage_alloc(pool, temp_storage_bytes);
void * d_temp_storage = temp_storage_alloc.get();
if (order == GGML_SORT_ORDER_ASC) {
DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst,
ncols * nrows, nrows, d_offsets, d_offsets + 1, 0, sizeof(float) * 8,
stream);
} else {
DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
temp_indices, dst, ncols * nrows, nrows, d_offsets, d_offsets + 1,
0, sizeof(float) * 8, stream);
}
}
#endif // GGML_CUDA_USE_CUB
// Bitonic sort implementation
template<typename T>
static inline __device__ void ggml_cuda_swap(T & a, T & b) {
T tmp = a;
@@ -87,7 +11,7 @@ template<ggml_sort_order order>
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
// bitonic sort
int col = threadIdx.x;
int row = blockIdx.x;
int row = blockIdx.y;
if (col >= ncols_pad) {
return;
@@ -141,28 +65,21 @@ static int next_power_of_2(int x) {
return n;
}
static void argsort_f32_i32_cuda_bitonic(const float * x,
int * dst,
const int ncols,
const int nrows,
ggml_sort_order order,
cudaStream_t stream) {
static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
// bitonic sort requires ncols to be power of 2
const int ncols_pad = next_power_of_2(ncols);
const dim3 block_dims(ncols_pad, 1, 1);
const dim3 block_nums(nrows, 1, 1);
const dim3 block_nums(1, nrows, 1);
const size_t shared_mem = ncols_pad * sizeof(int);
// FIXME: this limit could be raised by ~2-4x on Ampere or newer
GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
if (order == GGML_SORT_ORDER_ASC) {
k_argsort_f32_i32<GGML_SORT_ORDER_ASC>
<<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
} else if (order == GGML_SORT_ORDER_DESC) {
k_argsort_f32_i32<GGML_SORT_ORDER_DESC>
<<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
} else {
GGML_ABORT("fatal error");
}
@@ -183,18 +100,5 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
#ifdef GGML_CUDA_USE_CUB
const int ncols_pad = next_power_of_2(ncols);
const size_t shared_mem = ncols_pad * sizeof(int);
const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;
if (shared_mem > max_shared_mem || ncols > 1024) {
ggml_cuda_pool & pool = ctx.pool();
argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream);
} else {
argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
}
#else
argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
#endif
argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
}

View File

@@ -272,7 +272,7 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
const uint3 ne12 = init_fastdiv_values((uint32_t) cne1[2]);
const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]);
if (block_nums.z > 65535 || block_nums.y > 65535) {
if (block_nums.z > 65535) {
int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
const uint3 prod_012 = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2));
const uint3 prod_01 = init_fastdiv_values((uint32_t) (ne0 * ne1));

View File

@@ -224,11 +224,6 @@ static const char * cu_get_error_str(CUresult err) {
#define AMD_MFMA_AVAILABLE
#endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
// The Volta instructions are in principle available on Turing or newer but they are effectively unusable:
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#define VOLTA_MMA_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
#define TURING_MMA_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
@@ -283,10 +278,7 @@ static bool amd_mfma_available(const int cc) {
#endif //!defined(GGML_HIP_NO_MMQ_MFMA)
}
static bool volta_mma_available(const int cc) {
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_VOLTA;
}
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
static bool turing_mma_available(const int cc) {
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
}
@@ -633,11 +625,8 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
// and a shift:
//
// n/d = (mulhi(n, mp) + n) >> L;
static const uint3 init_fastdiv_values(uint64_t d_64) {
GGML_ASSERT(d_64 != 0);
GGML_ASSERT(d_64 <= std::numeric_limits<uint32_t>::max());
uint32_t d = (uint32_t)d_64;
static const uint3 init_fastdiv_values(uint32_t d) {
GGML_ASSERT(d != 0);
// compute L = ceil(log2(d));
uint32_t L = 0;
@@ -1016,16 +1005,3 @@ struct ggml_backend_cuda_context {
return pool(device);
}
};
struct ggml_cuda_mm_fusion_args_host {
const ggml_tensor * x_bias = nullptr;
const ggml_tensor * gate = nullptr;
const ggml_tensor * gate_bias = nullptr;
ggml_glu_op glu_op;
};
struct ggml_cuda_mm_fusion_args_device {
const void * x_bias = nullptr;
const void * gate = nullptr;
const void * gate_bias = nullptr;
ggml_glu_op glu_op;
};

View File

@@ -1,4 +1,3 @@
#pragma once
#include "common.cuh"
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256

View File

@@ -7,10 +7,6 @@
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
const int CUDA_CPY_TILE_DIM_2D = 32; // 2D tile dimension for transposed blocks
const int CUDA_CPY_BLOCK_NM = 8; // block size of 3rd dimension if available
const int CUDA_CPY_BLOCK_ROWS = 8; // block dimension for marching through rows
template <cpy_kernel_t cpy_1>
static __global__ void cpy_flt(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -39,55 +35,6 @@ static __global__ void cpy_flt(const char * cx, char * cdst, const int ne,
cpy_1(cx + x_offset, cdst + dst_offset);
}
template <typename T>
static __global__ void cpy_flt_transpose(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
const int nb12, const int nb13) {
const T* src = reinterpret_cast<const T*>(cx);
T* dst = reinterpret_cast<T*>(cdst);
const int64_t nmat = ne / (ne00 * ne01);
const int64_t n = ne00 * ne01;
const int x = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.x;
const int y = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
const int tx = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x; // transpose block offset
const int ty = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
__shared__ float tile[CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1];
#pragma unroll
for (int i = 0; i < CUDA_CPY_BLOCK_NM; ++i) {
const unsigned int imat = blockIdx.z * CUDA_CPY_BLOCK_NM + i;
if (imat >= nmat)
break;
#pragma unroll
for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
if(x < ne01 && y + j < ne00){
const int row = threadIdx.y+j;
const int col = threadIdx.x * sizeof(float)/sizeof(T);
T *tile2 = reinterpret_cast<T*>(tile[row]);
tile2[col] = src[imat*n + (y+j)*ne01 + x];
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
if (ty + j < ne01 && tx < ne00) {
const int col = (threadIdx.y+j)*sizeof(float)/sizeof(T);
const T *tile2 = reinterpret_cast<const T*>(tile[threadIdx.x]);
dst[imat*n + (ty+j)*ne00 + tx] = tile2[col];
}
}
}
}
static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
float * cdstf = (float *)(cdsti);
@@ -166,59 +113,14 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
}
template<typename src_t, typename dst_t>
static __global__ void cpy_flt_contiguous(const char * cx, char * cdst, const int64_t ne) {
const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= ne) {
return;
}
const src_t * x = (const src_t *) cx;
dst_t * dst = (dst_t *) cdst;
dst[i] = ggml_cuda_cast<dst_t>(x[i]);
}
template<typename src_t, typename dst_t>
static void ggml_cpy_flt_contiguous_cuda(
const char * cx, char * cdst, const int64_t ne,
cudaStream_t stream) {
const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_flt_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
(cx, cdst, ne);
}
template<typename src_t, typename dst_t, bool transposed = false>
static void ggml_cpy_flt_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
if (transposed) {
GGML_ASSERT(ne == ne00*ne01*ne02); // ne[3] is 1 assumed
int ne00n, ne01n, ne02n;
if (nb00 <= nb02) { // most likely safe to handle nb00 = nb02 case here
ne00n = ne00;
ne01n = ne01;
ne02n = ne02;
} else if (nb00 > nb02) {
ne00n = ne00;
ne01n = ne01*ne02;
ne02n = 1;
}
dim3 dimGrid( (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
(ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
(ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM);
dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
cpy_flt_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>>
(cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} else {
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
static void ggml_cpy_f32_q8_0_cuda(
@@ -383,10 +285,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
char * src0_ddc = (char *) src0->data;
char * src1_ddc = (char *) src1->data;
const bool contiguous_srcs = ggml_is_contiguous(src0) && ggml_is_contiguous(src1);
const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) && src0->ne[3] == 1;
if (src0->type == src1->type && contiguous_srcs) {
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
@@ -397,23 +296,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
}
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
if (can_be_transposed) {
ggml_cpy_flt_cuda<float, float, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else {
ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream);
} else {
ggml_cpy_flt_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<float, half> (src0_ddc, src1_ddc, ne, main_stream);
} else {
ggml_cpy_flt_cuda<float, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<float, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
@@ -440,53 +327,21 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
} else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
if (can_be_transposed) {
ggml_cpy_flt_cuda<half, half, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else {
ggml_cpy_flt_cuda<half, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<half, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) {
if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream);
} else {
ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<half, float> (src0_ddc, src1_ddc, ne, main_stream);
} else {
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
if (can_be_transposed) {
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else {
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, main_stream);
} else {
ggml_cpy_flt_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32) {
if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, main_stream);
} else {
ggml_cpy_flt_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_I32) {
if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, main_stream);
} else {
ggml_cpy_flt_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_F32) {
if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, main_stream);
} else {
ggml_cpy_flt_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else {
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));

View File

@@ -14,10 +14,6 @@ void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor
GGML_ASSERT(V->ne[0] == K->ne[0]);
ggml_cuda_flash_attn_ext_tile_case< 64, 64>(ctx, dst);
} break;
case 72: {
GGML_ASSERT(V->ne[0] == K->ne[0]);
ggml_cuda_flash_attn_ext_tile_case< 72, 72>(ctx, dst);
} break;
case 80: {
GGML_ASSERT(V->ne[0] == K->ne[0]);
ggml_cuda_flash_attn_ext_tile_case< 80, 80>(ctx, dst);

View File

@@ -6,7 +6,7 @@
// nbatch_K == number of K columns to load in parallel for KQ calculation
// TODO optimize kernel parameters for FP16 NVIDIA (P100)
// TODO optimize kernel parameters for head sizes 40, 72, 80, 96, 112
// TODO optimize kernel parameters for head sizes 40, 80, 96, 112
// The ROCm compiler cannot handle templating in __launch_bounds__.
// As a workaround, define a macro to package the kernel parameters as uint32_t:
@@ -32,12 +32,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 16, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 32, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 2, 64, 2, 64, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 4, 128, 2, 64, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 8, 256, 2, 64, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 16, 256, 2, 64, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 32, 256, 2, 64, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 2, 64, 2, 64, 40)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 4, 128, 2, 64, 40)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 8, 256, 2, 64, 40)
@@ -86,12 +80,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 16, 128, 3, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 32, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 2, 64, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 4, 128, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 8, 256, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 16, 256, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 32, 256, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 2, 64, 2, 32, 40)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 4, 128, 2, 32, 40)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 8, 256, 2, 32, 40)
@@ -142,13 +130,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 32, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 64, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 2, 64, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 4, 128, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 8, 256, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 16, 256, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 32, 256, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 64, 256, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 2, 64, 2, 32, 40)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 4, 128, 2, 32, 40)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 8, 256, 2, 32, 40)
@@ -204,13 +185,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 32, 128, 4, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 64, 128, 5, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 2, 64, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 4, 128, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 8, 256, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 16, 256, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 32, 256, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 64, 256, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 2, 64, 2, 32, 40)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 4, 128, 2, 32, 40)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 8, 256, 2, 32, 40)
@@ -749,7 +723,7 @@ static __global__ void flash_attn_tile(
if (
#ifdef GGML_USE_WMMA_FATTN
(ncols2 != 1 && DV != 40 && DV != 72 && DV != 512) ||
(ncols2 != 1 && DV != 40 && DV != 512) ||
#endif // GGML_USE_WMMA_FATTN
(use_logit_softcap && !(DV == 128 || DV == 256))
) {
@@ -1224,7 +1198,6 @@ void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor
extern DECL_FATTN_TILE_CASE( 40, 40);
extern DECL_FATTN_TILE_CASE( 64, 64);
extern DECL_FATTN_TILE_CASE( 72, 72);
extern DECL_FATTN_TILE_CASE( 80, 80);
extern DECL_FATTN_TILE_CASE( 96, 96);
extern DECL_FATTN_TILE_CASE(112, 112);

View File

@@ -223,7 +223,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
switch (K->ne[0]) {
case 40:
case 64:
case 72:
case 80:
case 96:
case 128:
@@ -276,7 +275,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;
// If Turing tensor cores available, use them:
if (turing_mma_available(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 72) {
if (turing_mma_available(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40) {
if (can_use_vector_kernel) {
if (!ggml_is_quantized(K->type) && !ggml_is_quantized(V->type)) {
if (cc >= GGML_CUDA_CC_ADA_LOVELACE && Q->ne[1] == 1 && Q->ne[3] == 1 && !(gqa_ratio > 4 && K->ne[1] >= 8192)) {
@@ -302,7 +301,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
}
// Use the WMMA kernel if possible:
if (ggml_cuda_should_use_wmma_fattn(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 576) {
if (ggml_cuda_should_use_wmma_fattn(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 576) {
if (can_use_vector_kernel && Q->ne[1] <= 2) {
return BEST_FATTN_KERNEL_VEC;
}

View File

@@ -50,7 +50,6 @@
#include "ggml-cuda/upscale.cuh"
#include "ggml-cuda/wkv.cuh"
#include "ggml-cuda/gla.cuh"
#include "ggml-cuda/set.cuh"
#include "ggml-cuda/set-rows.cuh"
#include "ggml-cuda/pad_reflect_1d.cuh"
#include "ggml.h"
@@ -1958,15 +1957,8 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
size_t src1_stride_size = sizeof(cuda_t);
const int threads_x = 16;
const int threads_y = 16;
dim3 block_dims(threads_x, threads_y);
dim3 grid_dims(
(ne13 + threads_x - 1) / threads_x,
(ne12 + threads_y - 1) / threads_y
);
k_compute_batched_ptrs<<<grid_dims, block_dims, 0, main_stream>>>(
dim3 block_dims(ne13, ne12);
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
src0_ptr, src1_ptr, dst_t,
ptrs_src.get(), ptrs_dst.get(),
ne12, ne13,
@@ -2015,164 +2007,6 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
}
}
static bool ggml_cuda_should_fuse_mul_mat(const ggml_tensor * ffn_up,
const ggml_tensor * ffn_gate,
const ggml_tensor * glu,
const ggml_tensor * ffn_up_bias = nullptr,
const ggml_tensor * ffn_gate_bias = nullptr) {
const bool has_bias = ffn_up_bias != nullptr || ffn_gate_bias != nullptr;
if (has_bias && (!ffn_up_bias || !ffn_gate_bias)) {
return false;
}
const bool is_mul_mat = ffn_up->op == GGML_OP_MUL_MAT && ffn_gate->op == GGML_OP_MUL_MAT && glu->op == GGML_OP_GLU;
const bool is_mul_mat_id = ffn_up->op == GGML_OP_MUL_MAT_ID && ffn_gate->op == GGML_OP_MUL_MAT_ID && glu->op == GGML_OP_GLU;
GGML_ASSERT(ffn_up && ffn_gate && glu);
if (!is_mul_mat && !is_mul_mat_id) {
return false;
}
const ggml_op expected_bias_op = is_mul_mat ? GGML_OP_ADD : GGML_OP_ADD_ID;
if (has_bias) {
if (ffn_up_bias->op != expected_bias_op || ffn_gate_bias->op != expected_bias_op) {
return false;
}
if (glu->src[0] != ffn_gate_bias || glu->src[1] != ffn_up_bias) {
return false;
}
if (expected_bias_op == GGML_OP_ADD) {
const bool up_has_mul = ffn_up_bias->src[0] == ffn_up || ffn_up_bias->src[1] == ffn_up;
const bool gate_has_mul = ffn_gate_bias->src[0] == ffn_gate || ffn_gate_bias->src[1] == ffn_gate;
if (!up_has_mul || !gate_has_mul) {
return false;
}
} else { // GGML_OP_ADD_ID
if (ffn_up_bias->src[0] != ffn_up || ffn_gate_bias->src[0] != ffn_gate) {
return false;
}
if (ffn_up_bias->src[2] != ffn_up->src[2] || ffn_gate_bias->src[2] != ffn_gate->src[2]) {
return false;
}
}
} else {
if (glu->src[0] != ffn_gate && glu->src[1] != ffn_up) {
return false;
}
}
if (ffn_up->src[0]->type != ffn_gate->src[0]->type || !ggml_are_same_shape(ffn_up->src[0], ffn_gate->src[0]) ||
!ggml_are_same_stride(ffn_up->src[0], ffn_gate->src[0])) {
return false;
}
if (ffn_up->src[1] != ffn_gate->src[1]) {
return false;
}
if (ffn_up->src[2] && (ffn_up->src[2] != ffn_gate->src[2])) {
return false;
}
static constexpr std::array<ggml_glu_op, 3> valid_glu_ops = { GGML_GLU_OP_SWIGLU, GGML_GLU_OP_GEGLU, GGML_GLU_OP_SWIGLU_OAI };
if (std::find(valid_glu_ops.begin(), valid_glu_ops.end(), ggml_get_glu_op(glu)) == valid_glu_ops.end()) {
return false;
}
if (const bool swapped = ggml_get_op_params_i32(glu, 1); swapped) {
return false;
}
const bool split = ggml_backend_buft_is_cuda_split(ffn_up->src[0]->buffer->buft) ||
ggml_backend_buft_is_cuda_split(ffn_gate->src[0]->buffer->buft);
//TODO: add support for fusion for split buffers
if (split) {
return false;
}
return true;
}
static bool ggml_cuda_should_fuse_mul_mat_vec_f(const ggml_tensor * tensor) {
ggml_tensor * src0 = tensor->src[0];
ggml_tensor * src1 = tensor->src[1];
const ggml_tensor * dst = tensor;
const bool is_mul_mat_id = tensor->op == GGML_OP_MUL_MAT_ID;
bool use_mul_mat_vec_f =
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) &&
src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, is_mul_mat_id ? src1->ne[2] : src1->ne[1]);
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft) ||
ggml_backend_buft_is_cuda_split(src1->buffer->buft);
//TODO: add support for fusion for split buffers
if (split) {
return false;
}
//we only support fusion for ncols_dst = 1
if (tensor->op == GGML_OP_MUL_MAT && dst->ne[1] != 1) {
return false;
}
if (tensor->op == GGML_OP_MUL_MAT_ID && dst->ne[2] != 1) {
return false;
}
return use_mul_mat_vec_f;
}
static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
ggml_tensor * src0 = tensor->src[0];
ggml_tensor * src1 = tensor->src[1];
const ggml_tensor * dst = tensor;
const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE &&
ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) &&
src0->view_src;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
// fusion is not universally faster on Pascal
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
if (cc <= GGML_CUDA_CC_PASCAL) {
return false;
}
//we only support fusion for ncols_dst = 1
if (tensor->op == GGML_OP_MUL_MAT && dst->ne[1] != 1) {
return false;
}
if (tensor->op == GGML_OP_MUL_MAT_ID && dst->ne[2] != 1) {
return false;
}
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft) ||
ggml_backend_buft_is_cuda_split(src1->buffer->buft);
//TODO: add support for fusion for split buffers
if (split) {
return false;
}
return use_mul_mat_vec_q;
}
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
@@ -2206,16 +2040,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
const int cc = ggml_cuda_info().devices[id].cc;
const int warp_size = ggml_cuda_info().devices[id].warp_size;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false);
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
}
} else {
const int cc = ggml_cuda_info().devices[ctx.device].cc;
const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false);
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
}
@@ -2286,7 +2120,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
return;
}
if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true)) {
if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne[2], /*mul_mat_id=*/true)) {
ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst);
return;
}
@@ -2434,9 +2268,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_SET_ROWS:
ggml_cuda_op_set_rows(ctx, dst);
break;
case GGML_OP_SET:
ggml_cuda_op_set(ctx, dst);
break;
case GGML_OP_DUP:
ggml_cuda_dup(ctx, dst);
break;
@@ -2515,18 +2346,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_UNARY_OP_XIELU:
ggml_cuda_op_xielu(ctx, dst);
break;
case GGML_UNARY_OP_FLOOR:
ggml_cuda_op_floor(ctx, dst);
break;
case GGML_UNARY_OP_CEIL:
ggml_cuda_op_ceil(ctx, dst);
break;
case GGML_UNARY_OP_ROUND:
ggml_cuda_op_round(ctx, dst);
break;
case GGML_UNARY_OP_TRUNC:
ggml_cuda_op_trunc(ctx, dst);
break;
default:
return false;
}
@@ -2926,7 +2745,7 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
}
}
if ((node->op == GGML_OP_SCALE || node->op == GGML_OP_GLU) &&
if (node->op == GGML_OP_SCALE &&
memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
return false;
}
@@ -3007,9 +2826,9 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);
if (ops.size() == topk_moe_ops_with_norm.size() &&
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 8 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx];
ggml_tensor * weights = cgraph->nodes[node_idx + 9];
ggml_tensor * weights = cgraph->nodes[node_idx+8];
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
return true;
@@ -3019,14 +2838,14 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
if (ops.size() == topk_moe_ops.size() &&
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx];
ggml_tensor * weights = cgraph->nodes[node_idx + 4];
ggml_tensor * weights = cgraph->nodes[node_idx+4];
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
return true;
}
}
if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2, node_idx + 5 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
ggml_tensor * weights = cgraph->nodes[node_idx + 5];
@@ -3035,38 +2854,6 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
}
}
std::initializer_list<enum ggml_op> mul_mat_bias_glu_ops = { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_GLU };
std::initializer_list<enum ggml_op> mul_mat_id_bias_glu_ops = { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_GLU };
std::initializer_list<enum ggml_op> mul_mat_id_glu_ops = { GGML_OP_MUL_MAT_ID, GGML_OP_MUL_MAT_ID, GGML_OP_GLU };
std::initializer_list<enum ggml_op> mul_mat_glu_ops = { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT, GGML_OP_GLU };
if (ops.size() == 5 && (ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 4}) ||
ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 4}))) {
const ggml_tensor * ffn_gate = cgraph->nodes[node_idx];
const ggml_tensor * ffn_gate_bias = cgraph->nodes[node_idx + 1];
const ggml_tensor * ffn_up = cgraph->nodes[node_idx + 2];
const ggml_tensor * ffn_up_bias = cgraph->nodes[node_idx + 3];
const ggml_tensor * glu = cgraph->nodes[node_idx + 4];
if (ggml_cuda_should_fuse_mul_mat(ffn_up, ffn_gate, glu, ffn_up_bias, ffn_gate_bias)) {
return true;
}
}
if (ops.size() == 3 && (ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 2}) ||
ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 2}))) {
const ggml_tensor * ffn_gate = cgraph->nodes[node_idx];
const ggml_tensor * ffn_up = cgraph->nodes[node_idx + 1];
const ggml_tensor * glu = cgraph->nodes[node_idx + 2];
if (ggml_cuda_should_fuse_mul_mat(ffn_up, ffn_gate, glu)) {
return true;
}
}
if (!ggml_can_fuse(cgraph, node_idx, ops)) {
return false;
}
@@ -3147,17 +2934,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
// With the use of CUDA graphs, the execution will be performed by the graph launch.
if (!use_cuda_graph || cuda_graph_update_required) {
[[maybe_unused]] int prev_i = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
#ifdef GGML_CUDA_DEBUG
const int nodes_fused = i - prev_i - 1;
prev_i = i;
if (nodes_fused > 0) {
GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
}
#endif
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
continue;
@@ -3167,18 +2945,17 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
if (!disable_fusion) {
if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ true), {})) {
ggml_tensor * weights = cgraph->nodes[i + 9];
ggml_tensor * selected_experts = cgraph->nodes[i + 3];
ggml_tensor * clamp = cgraph->nodes[i + 7];
ggml_tensor * weights = cgraph->nodes[i+8];
ggml_tensor * selected_experts = cgraph->nodes[i+3];
ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ true,
/*delayed softmax*/ false, clamp);
i += 9;
/*delayed softmax*/ false);
i += 8;
continue;
}
if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) {
ggml_tensor * weights = cgraph->nodes[i + 4];
ggml_tensor * selected_experts = cgraph->nodes[i + 3];
ggml_tensor * weights = cgraph->nodes[i+4];
ggml_tensor * selected_experts = cgraph->nodes[i+3];
ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ false,
/*delayed softmax*/ false);
i += 4;
@@ -3227,195 +3004,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
}
}
bool fused_mul_mat_vec = false;
int fused_node_count = 0;
for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
if (ggml_cuda_can_fuse(cgraph, i, { op, bias_op, op, bias_op, GGML_OP_GLU }, {})) {
ggml_tensor * glu = cgraph->nodes[i + 4];
ggml_tensor * gate_bias_n = glu->src[0];
ggml_tensor * up_bias_n = glu->src[1];
//we don't assume the order for {gate, up}. Instead infer it from the bias tensor
ggml_tensor * gate_n = nullptr;
ggml_tensor * up_n = nullptr;
if (gate_bias_n->src[0] == cgraph->nodes[i] || gate_bias_n->src[1] == cgraph->nodes[i]) {
gate_n = cgraph->nodes[i];
up_n = cgraph->nodes[i + 2];
} else if (gate_bias_n->src[0] == cgraph->nodes[i + 2] || gate_bias_n->src[1] == cgraph->nodes[i + 2]) {
gate_n = cgraph->nodes[i + 2];
up_n = cgraph->nodes[i];
} else {
continue;
}
auto get_bias_tensor = [](const ggml_tensor * bias_node, const ggml_tensor * mul_node, ggml_op op_bias) {
if (op_bias == GGML_OP_ADD) {
if (bias_node->src[0] == mul_node) {
return bias_node->src[1];
}
if (bias_node->src[1] == mul_node) {
return bias_node->src[0];
}
return (ggml_tensor *) nullptr;
}
GGML_ASSERT(op_bias == GGML_OP_ADD_ID);
GGML_ASSERT(bias_node->src[0] == mul_node);
return bias_node->src[1];
};
ggml_tensor * up_bias_tensor = get_bias_tensor(up_bias_n, up_n, bias_op);
ggml_tensor * gate_bias_tensor = get_bias_tensor(gate_bias_n, gate_n, bias_op);
if (!up_bias_tensor || !gate_bias_tensor) {
continue;
}
// we don't support repeating adds
if (bias_op == GGML_OP_ADD &&
(!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) ||
!ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) {
continue;
}
const ggml_tensor * src0 = up_n->src[0];
const ggml_tensor * src1 = up_n->src[1];
const ggml_tensor * ids = up_n->src[2];
if (ggml_cuda_should_fuse_mul_mat_vec_f(up_n)) {
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.gate = gate_n->src[0];
fusion_data.x_bias = up_bias_tensor;
fusion_data.gate_bias = gate_bias_tensor;
fusion_data.glu_op = ggml_get_glu_op(glu);
ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 5;
break;
}
if (ggml_cuda_should_fuse_mul_mat_vec_q(up_n)) {
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.gate = gate_n->src[0];
fusion_data.x_bias = up_bias_tensor;
fusion_data.gate_bias = gate_bias_tensor;
fusion_data.glu_op = ggml_get_glu_op(glu);
ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 5;
break;
}
} else if (ggml_cuda_can_fuse(cgraph, i, { op, op, GGML_OP_GLU }, {})) {
ggml_tensor * glu = cgraph->nodes[i + 2];
ggml_tensor * gate = glu->src[0];
ggml_tensor * up = glu->src[1];
bool ok = (gate == cgraph->nodes[i] && up == cgraph->nodes[i + 1])
|| (gate == cgraph->nodes[i + 1] && up == cgraph->nodes[i]);
if (!ok) continue;
const ggml_tensor * src0 = up->src[0];
const ggml_tensor * src1 = up->src[1];
const ggml_tensor * ids = up->src[2];
if (ggml_cuda_should_fuse_mul_mat_vec_f(up)) {
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.gate = gate->src[0];
fusion_data.glu_op = ggml_get_glu_op(glu);
ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 3;
break;
}
if (ggml_cuda_should_fuse_mul_mat_vec_q(up)) {
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.gate = gate->src[0];
fusion_data.glu_op = ggml_get_glu_op(glu);
ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 3;
break;
}
}
}
if (fused_mul_mat_vec) {
i += fused_node_count - 1;
continue;
}
fused_mul_mat_vec = false;
fused_node_count = 0;
for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
if (!ggml_can_fuse(cgraph, i, { op, bias_op })) {
continue;
}
ggml_tensor * mm_node = cgraph->nodes[i];
ggml_tensor * bias_node = cgraph->nodes[i + 1];
ggml_tensor * bias_tensor = nullptr;
if (bias_op == GGML_OP_ADD) {
if (bias_node->src[0] == mm_node) {
bias_tensor = bias_node->src[1];
} else if (bias_node->src[1] == mm_node) {
bias_tensor = bias_node->src[0];
} else {
continue;
}
} else {
if (bias_node->src[0] != mm_node) {
continue;
}
bias_tensor = bias_node->src[1];
}
const ggml_tensor * src0 = mm_node->src[0];
const ggml_tensor * src1 = mm_node->src[1];
const ggml_tensor * ids = mm_node->src[2];
if (bias_op == GGML_OP_ADD_ID && bias_node->src[2] != ids) {
continue;
}
if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) {
continue;
}
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.x_bias = bias_tensor;
if (ggml_cuda_should_fuse_mul_mat_vec_f(mm_node)) {
ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 2;
break;
}
if (ggml_cuda_should_fuse_mul_mat_vec_q(mm_node)) {
ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 2;
break;
}
}
if (fused_mul_mat_vec) {
i += fused_node_count - 1;
continue;
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD}, {})) {
ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
@@ -3781,10 +3369,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_EXP:
case GGML_UNARY_OP_ELU:
case GGML_UNARY_OP_FLOOR:
case GGML_UNARY_OP_CEIL:
case GGML_UNARY_OP_ROUND:
case GGML_UNARY_OP_TRUNC:
return ggml_is_contiguous(op->src[0]);
default:
return false;
@@ -3899,13 +3483,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
op->src[0]->type == GGML_TYPE_F32 &&
(op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
} break;
case GGML_OP_SET:
{
const ggml_type t = op->type;
return (t == GGML_TYPE_F32 || t == GGML_TYPE_I32) &&
t == op->src[0]->type &&
t == op->src[1]->type;
} break;
case GGML_OP_CPY:
{
ggml_type src0_type = op->src[0]->type;
@@ -4065,11 +3642,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_SUM:
return ggml_is_contiguous_rows(op->src[0]);
case GGML_OP_ARGSORT:
#ifndef GGML_CUDA_USE_CUB
// TODO: Support arbitrary column width
return op->src[0]->ne[0] <= 1024;
#else
return true;
#endif
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
case GGML_OP_GROUP_NORM:

View File

@@ -18,10 +18,6 @@
#include "common.cuh"
// On Volta each warp is doing 4 8x8 mma operations in parallel.
// The basic memory layout for a 32x8 output tile is to stack 4 input tiles in I direction and to mirror the B tile.
// However, the i indices in this file are by default permuted to simplify the index calculations.
// #define GGML_CUDA_MMA_NO_VOLTA_PERM
#if CUDART_VERSION >= 11080
@@ -77,15 +73,6 @@ namespace ggml_cuda_mma {
static constexpr int ne = I * J / 64;
T x[ne] = {0};
static constexpr __device__ bool supported() {
if (I == 64 && J == 2) return true;
if (I == 16 && J == 8) return true;
if (I == 32 && J == 4) return true;
if (I == 16 && J == 16) return true;
if (I == 32 && J == 32) return true;
return false;
}
static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
return threadIdx.x % 16;
@@ -98,8 +85,7 @@ namespace ggml_cuda_mma {
} else if constexpr (I == 32 && J == 32) {
return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4);
} else {
NO_DEVICE_CODE;
return -1;
static_assert(I == -1 && J == -1, "template specialization not implemented");
}
}
@@ -115,67 +101,22 @@ namespace ggml_cuda_mma {
} else if constexpr (I == 32 && J == 32) {
return threadIdx.x % 32;
} else {
NO_DEVICE_CODE;
return -1;
}
}
#elif __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
static constexpr int ne = I * J / 32;
T x[ne] = {0};
static constexpr __device__ bool supported() {
if (I == 32 && J == 8) return true;
return false;
}
static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 32 && J == 8) {
#ifdef GGML_CUDA_MMA_NO_VOLTA_PERM
return (((threadIdx.x % 16) / 4) * 8) | ((threadIdx.x / 16) * 4) | (l & 2) | (threadIdx.x % 2);
#else
return (l & 2) | (threadIdx.x & ~2);
#endif // GGML_CUDA_MMA_NO_VOLTA_PERM
} else {
NO_DEVICE_CODE;
return -1;
}
}
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 32 && J == 8) {
return (threadIdx.x & 2) | (l & (4 + 1));
} else {
NO_DEVICE_CODE;
return -1;
static_assert(I == -1 && J == -1, "template specialization not implemented");
}
}
#else
static constexpr int ne = I * J / 32;
T x[ne] = {0};
static constexpr __device__ bool supported() {
if (I == 8 && J == 4) return true;
if (I == 8 && J == 8) return true;
if (I == 16 && J == 8) return true;
if (I == 16 && J == 16) return true;
if (I == 32 && J == 8) return true;
return false;
}
static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 8 && J == 4) {
return threadIdx.x / 4;
} else if constexpr (I == 8 && J == 8) {
if constexpr (I == 8 && (J == 4 || J == 8)) {
return threadIdx.x / 4;
} else if constexpr (I == 16 && J == 8) {
return ((l / 2) * 8) | (threadIdx.x / 4);
return (l / 2) * 8 + threadIdx.x / 4;
} else if constexpr (I == 16 && J == 16) {
return (((l / 2) % 2) * 8) | (threadIdx.x / 4);
} else if constexpr (I == 32 && J == 8) {
return tile<16, 8, T>::get_i(l); // Memory layout simply repeated with same pattern in i direction.
return ((l / 2) % 2) * 8 + threadIdx.x / 4;
} else {
NO_DEVICE_CODE;
return -1;
static_assert(I == -1 && J == -1, "template specialization not implemented");
}
}
@@ -183,16 +124,13 @@ namespace ggml_cuda_mma {
if constexpr (I == 8 && J == 4) {
return threadIdx.x % 4;
} else if constexpr (I == 8 && J == 8) {
return (l * 4) | (threadIdx.x % 4);
return 4 * l + threadIdx.x % 4;
} else if constexpr (I == 16 && J == 8) {
return ((threadIdx.x % 4) * 2) | (l % 2);
return 2 * (threadIdx.x % 4) + l % 2;
} else if constexpr (I == 16 && J == 16) {
return ((l / 4) * 8) | ((threadIdx.x % 4) * 2) | (l % 2);
} else if constexpr (I == 32 && J == 8) {
return tile<16, 8, T>::get_j(l); // Memory layout simply repeated with same pattern in i direction.
return 8 * (l / 4) + 2 * (threadIdx.x % 4) + l % 2;
} else {
NO_DEVICE_CODE;
return -1;
static_assert(I == -1 && J == -1, "template specialization not implemented");
}
}
#endif // defined(GGML_USE_HIP)
@@ -202,83 +140,32 @@ namespace ggml_cuda_mma {
struct tile<I_, J_, half2> {
static constexpr int I = I_;
static constexpr int J = J_;
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
static constexpr int ne = I == 8 && J == 8 ? I * J / (WARP_SIZE/4) : I * J / WARP_SIZE;
half2 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() {
if (I == 8 && J == 8) return true;
if (I == 32 && J == 8) return true;
return false;
}
static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 8 && J == 8) {
return ((threadIdx.x / 16) * 4) | (threadIdx.x % 4);
} else if constexpr (I == 32 && J == 8) {
#ifdef GGML_CUDA_MMA_NO_VOLTA_PERM
return (((threadIdx.x % 16) / 4) * 8) | ((threadIdx.x / 16) * 4) | (threadIdx.x % 4);
#else
return threadIdx.x;
#endif // GGML_CUDA_MMA_NO_VOLTA_PERM
} else {
NO_DEVICE_CODE;
return -1;
}
}
static __device__ __forceinline__ int get_j(const int l) {
if constexpr ((I == 8 || I == 32) && J == 8) {
return l;
} else {
NO_DEVICE_CODE;
return -1;
}
}
#else
static constexpr int ne = I * J / WARP_SIZE;
half2 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() {
if (I == 8 && J == 4) return true;
if (I == 8 && J == 8) return true;
if (I == 16 && J == 8) return true;
if (I == 16 && J == 16) return true;
if (I == 32 && J == 8) return true;
return false;
}
static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 8 && J == 8) {
return threadIdx.x / 4;
} else if constexpr (I == 16 && J == 4) {
return (l * 8) | (threadIdx.x / 4);
return l * 8 + threadIdx.x / 4;
} else if constexpr (I == 16 && J == 8) {
return ((l % 2) * 8) | (threadIdx.x / 4);
} else if constexpr (I == 32 && J == 8) {
return ((l / 4) * 16) | ((l % 2) * 8) | (threadIdx.x / 4);
return (l % 2) * 8 + threadIdx.x / 4;
} else {
NO_DEVICE_CODE;
return -1;
static_assert(I == -1 && J == -1, "template specialization not implemented");
}
}
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 8 && J == 8) {
return (l * 4) | (threadIdx.x % 4);
return l * 4 + threadIdx.x % 4;
} else if constexpr (I == 16 && J == 4) {
return threadIdx.x % 4;
} else if constexpr (I == 16 && J == 8) {
return ((l / 2) * 4) | (threadIdx.x % 4);
} else if constexpr (I == 32 && J == 8) {
return ((l & 2) * 2) | (threadIdx.x % 4);
return (l / 2) * 4 + threadIdx.x % 4;
} else {
NO_DEVICE_CODE;
return -1;
static_assert(I == -1 && J == -1, "template specialization not implemented");
}
}
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
};
template <int I_, int J_>
@@ -288,36 +175,27 @@ namespace ggml_cuda_mma {
static constexpr int ne = I * J / WARP_SIZE;
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() {
if (I == 8 && J == 8) return true;
if (I == 16 && J == 4) return true;
if (I == 16 && J == 8) return true;
return false;
}
static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 8 && J == 8) {
return threadIdx.x / 4;
} else if constexpr (I == 16 && J == 4) {
return (l * 8) | (threadIdx.x / 4);
return l * 8 + threadIdx.x / 4;
} else if constexpr (I == 16 && J == 8) {
return ((l % 2) * 8) | (threadIdx.x / 4);
return (l % 2) * 8 + threadIdx.x / 4;
} else {
NO_DEVICE_CODE;
return -1;
static_assert(I == -1 && J == -1, "template specialization not implemented");
}
}
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 8 && J == 8) {
return (l * 4) | (threadIdx.x % 4);
return l * 4 + threadIdx.x % 4;
} else if constexpr (I == 16 && J == 4) {
return threadIdx.x % 4;
} else if constexpr (I == 16 && J == 8) {
return ((l / 2) * 4) | (threadIdx.x % 4);
return (l / 2) * 4 + threadIdx.x % 4;
} else {
NO_DEVICE_CODE;
return -1;
static_assert(I == -1 && J == -1, "template specialization not implemented");
}
}
};
@@ -385,12 +263,8 @@ namespace ggml_cuda_mma {
: "=r"(xi[0]), "=r"(xi[1])
: "l"(xs));
#else
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
GGML_UNUSED_VARS(t, xs0, stride);
NO_DEVICE_CODE;
#else
load_generic(t, xs0, stride);
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
load_generic(xs0, stride);
GGML_UNUSED(t);
#endif // TURING_MMA_AVAILABLE
}
@@ -403,35 +277,11 @@ namespace ggml_cuda_mma {
asm volatile("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
: "=r"(xi[0]), "=r"(xi[1]), "=r"(xi[2]), "=r"(xi[3])
: "l"(xs));
#else
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
GGML_UNUSED_VARS(t, xs0, stride);
NO_DEVICE_CODE;
#else
load_generic(t, xs0, stride);
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#endif // TURING_MMA_AVAILABLE
}
template <typename T>
static __device__ __forceinline__ void load_ldmatrix(
tile<32, 8, T> & t, const T * __restrict__ xs0, const int stride) {
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#if 1
// TODO: more generic handling
static_assert(sizeof(T) == 4, "bad type size");
ggml_cuda_memcpy_1<4*sizeof(T)>(t.x + 0, xs0 + t.get_i(0)*stride + 0);
ggml_cuda_memcpy_1<4*sizeof(T)>(t.x + 4, xs0 + t.get_i(4)*stride + 4);
#else
load_generic(t, xs0, stride);
#endif // 1
#else
tile<16, 8, T> * t16 = (tile<16, 8, T> *) &t;
load_ldmatrix(t16[0], xs0 + 0*stride, stride);
load_ldmatrix(t16[1], xs0 + 16*stride, stride);
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
}
template <typename T>
static __device__ __forceinline__ void load_ldmatrix_trans(
tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
@@ -696,43 +546,4 @@ namespace ggml_cuda_mma {
NO_DEVICE_CODE;
#endif // AMD_MFMA_AVAILABLE
}
template <typename T1, typename T2, int J, int K>
static __device__ __forceinline__ void mma(
tile<32, J, T1> & D, const tile<32, K, T2> & A, const tile<J, K, T2> & B) {
tile<16, J, T1> * D16 = (tile<16, J, T1> *) &D;
tile<16, K, T2> * A16 = (tile<16, K, T2> *) &A;
mma(D16[0], A16[0], B);
mma(D16[1], A16[1], B);
}
static __device__ __forceinline__ void mma(
tile<32, 8, float> & D, const tile<32, 8, half2> & A, const tile<8, 8, half2> & B) {
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
const int * Axi = (const int *) A.x;
const int * Bxi = (const int *) B.x;
int * Dxi = (int *) D.x;
asm("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 "
"{%0, %1, %2, %3, %4, %5, %6, %7}, {%8, %9}, {%10, %11}, {%0, %1, %2, %3, %4, %5, %6, %7};"
: "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]), "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
: "r"(Axi[0]), "r"(Axi[1]), "r"(Bxi[0]), "r"(Bxi[1]));
asm("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 "
"{%0, %1, %2, %3, %4, %5, %6, %7}, {%8, %9}, {%10, %11}, {%0, %1, %2, %3, %4, %5, %6, %7};"
: "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]), "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
: "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[2]), "r"(Bxi[3]));
asm("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 "
"{%0, %1, %2, %3, %4, %5, %6, %7}, {%8, %9}, {%10, %11}, {%0, %1, %2, %3, %4, %5, %6, %7};"
: "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]), "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
: "r"(Axi[4]), "r"(Axi[5]), "r"(Bxi[4]), "r"(Bxi[5]));
asm("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 "
"{%0, %1, %2, %3, %4, %5, %6, %7}, {%8, %9}, {%10, %11}, {%0, %1, %2, %3, %4, %5, %6, %7};"
: "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]), "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
: "r"(Axi[6]), "r"(Axi[7]), "r"(Bxi[6]), "r"(Bxi[7]));
#else
tile<16, 8, float> * D16 = (tile<16, 8, float> *) &D;
tile<16, 8, half2> * A16 = (tile<16, 8, half2> *) &A;
mma(D16[0], A16[0], B);
mma(D16[1], A16[1], B);
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
}
}

View File

@@ -119,27 +119,15 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr
}
}
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne,
const size_t * src0_nb, const int src1_ncols, bool mul_mat_id) {
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int src1_ncols, bool mul_mat_id) {
if (ggml_is_quantized(type)) {
return false;
}
const size_t ts = ggml_type_size(type);
if (src0_ne[0] % (warp_size * (4/ts)) != 0) {
if (src0_ne[0] % (warp_size * (4/ggml_type_size(type))) != 0) {
return false;
}
if (src0_nb[0] != ts) {
return false;
}
// Pointers not aligned to the size of half2/nv_bfloat162/float2 would result in a crash:
for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {
if (src0_nb[i] % (2*ts) != 0) {
return false;
}
}
if (src0_ne[1] % MMF_ROWS_PER_BLOCK != 0) {
return false;
}
@@ -160,7 +148,7 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
case GGML_TYPE_F32:
return ampere_mma_available(cc);
case GGML_TYPE_F16:
return volta_mma_available(cc) || turing_mma_available(cc);
return turing_mma_available(cc);
case GGML_TYPE_BF16:
return ampere_mma_available(cc);
default:

View File

@@ -17,7 +17,7 @@ struct mmf_ids_data {
void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const size_t * src0_nb, const int src1_ncols, bool mul_mat_id);
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const int src1_ncols, bool mul_mat_id);
template <typename T, int rows_per_block, int cols_per_block, int nwarps, bool has_ids>
__launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1)
@@ -28,19 +28,9 @@ static __global__ void mul_mat_f(
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported();
constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported();
if (!I_16_supported && !I_32_supported) {
NO_DEVICE_CODE;
return;
}
constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster.
typedef tile<I_preferred, 8, T> tile_A;
typedef tile<8, 8, T> tile_B;
typedef tile<I_preferred, 8, float> tile_C;
typedef tile<16, 8, T> tile_A;
typedef tile< 8, 8, T> tile_B;
typedef tile<16, 8, float> tile_C;
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
constexpr int tile_k_padded = warp_size + 4;
@@ -242,6 +232,7 @@ static __global__ void mul_mat_f(
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
}
//This kernel is for larger batch sizes of mul_mat_id
template <typename T, int rows_per_block, int cols_per_block, int nwarps>
__launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1)
@@ -254,19 +245,9 @@ static __global__ void mul_mat_f_ids(
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
const uint3 sis1_fd, const uint3 nch_fd) {
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported();
constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported();
if (!I_16_supported && !I_32_supported) {
NO_DEVICE_CODE;
return;
}
constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work butr 16 is ~1% faster.
typedef tile<I_preferred, 8, T> tile_A;
typedef tile<8, 8, T> tile_B;
typedef tile<I_preferred, 8, float> tile_C;
typedef tile<16, 8, T> tile_A;
typedef tile< 8, 8, T> tile_B;
typedef tile<16, 8, float> tile_C;
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
constexpr int tile_k_padded = warp_size + 4;
@@ -552,8 +533,7 @@ void mul_mat_f_cuda(
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
cudaStream_t stream, const mmf_ids_data * ids_data) {
typedef tile<16, 8, T> tile_A_16;
typedef tile<32, 8, T> tile_A_32;
typedef tile<16, 8, T> tile_A;
typedef tile< 8, 8, T> tile_B;
GGML_ASSERT(ncols_x % 2 == 0);
@@ -564,8 +544,7 @@ void mul_mat_f_cuda(
const int64_t channel_ratio = nchannels_dst / nchannels_x;
const int64_t sample_ratio = nsamples_dst / nsamples_x;
const int device = ggml_cuda_get_device();
const int cc = ggml_cuda_info().devices[device].cc;
const int device = ggml_cuda_get_device();
const int warp_size = ggml_cuda_info().devices[device].warp_size;
int64_t nwarps_best = 1;
@@ -580,7 +559,7 @@ void mul_mat_f_cuda(
}
constexpr int rows_per_block = MMF_ROWS_PER_BLOCK;
const int nbytes_shared_iter = nwarps_best * (volta_mma_available(cc) ? tile_A_32::I : tile_A_16::I) * (warp_size + 4) * 4;
const int nbytes_shared_iter = nwarps_best * tile_A::I * (warp_size + 4) * 4;
const int nbytes_shared_combine = GGML_PAD(cols_per_block, tile_B::I) * (nwarps_best*rows_per_block + 4) * 4;
const int nbytes_shared = std::max(nbytes_shared_iter, nbytes_shared_combine);
const int nbytes_slotmap = ids ? GGML_PAD(cols_per_block, 16) * sizeof(int) : 0;

View File

@@ -3494,7 +3494,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
const int col_diff = col_high - col_low;
for (int j = threadIdx.y*warp_size + threadIdx.x; j < mmq_x; j += nwarps*warp_size) {
ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
ids_dst_shared[j] = ids_dst[col_low + j];
}
__syncthreads();

View File

@@ -1,12 +1,11 @@
#include "ggml.h"
#include "common.cuh"
#include "unary.cuh"
#include "mmvf.cuh"
#include "convert.cuh"
#include "mmvf.cuh"
template <typename T, typename type_acc, int ncols_dst, int block_size, bool has_fusion = false>
template <typename T, typename type_acc, int ncols_dst, int block_size>
static __global__ void mul_mat_vec_f(
const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
const int ncols2, const int nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst,
const uint3 channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
@@ -25,164 +24,58 @@ static __global__ void mul_mat_vec_f(
y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y;
dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst;
bool use_gate = false;
bool use_bias = false;
bool use_gate_bias = false;
ggml_glu_op glu_op = ggml_glu_op::GGML_GLU_OP_SWIGLU;
const T * gate_x = nullptr;
const float * x_bias = nullptr;
const float * gate_bias = nullptr;
if constexpr (has_fusion) {
use_gate = fusion.gate != nullptr;
use_bias = fusion.x_bias != nullptr;
use_gate_bias = fusion.gate_bias != nullptr;
glu_op = fusion.glu_op;
if (use_gate) {
gate_x = static_cast<const T *>(fusion.gate);
}
if (use_bias) {
x_bias = static_cast<const float *>(fusion.x_bias);
}
if (use_gate_bias) {
gate_bias = static_cast<const float *>(fusion.gate_bias);
use_gate_bias = use_gate;
} else {
use_gate_bias = false;
}
}
if (use_gate) {
gate_x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row;
}
if constexpr (has_fusion) {
const int channel_bias = ids ? channel_x : channel_dst;
if (use_bias) {
x_bias += int64_t(sample_dst)*stride_sample_dst + channel_bias*stride_channel_dst;
}
if (use_gate_bias) {
gate_bias += int64_t(sample_dst)*stride_sample_dst + channel_bias*stride_channel_dst;
}
}
const float2 * y2 = (const float2 *) y;
extern __shared__ char data_mmv[];
float * buf_iw = (float *) data_mmv;
float * buf_iw_gate = nullptr;
if constexpr (has_fusion) {
buf_iw_gate = (float *) (data_mmv + warp_size*sizeof(float));
}
if (block_size > warp_size) {
if (tid < warp_size) {
buf_iw[tid] = 0.0f;
if constexpr (has_fusion) {
if (use_gate) {
buf_iw_gate[tid] = 0.0f;
}
}
}
__syncthreads();
}
float sumf[ncols_dst] = {0.0f};
float sumf_gate[ncols_dst];
if constexpr (has_fusion) {
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
sumf_gate[j] = 0.0f;
}
}
if constexpr (std::is_same_v<T, float>) {
const float2 * x2 = (const float2 *) x;
const float2 * gate_x2 = nullptr;
if constexpr (has_fusion) {
if (use_gate) {
gate_x2 = (const float2 *) gate_x;
}
}
for (int col2 = tid; col2 < ncols2; col2 += block_size) {
const float2 tmpx = x2[col2];
float2 tmpx_gate = make_float2(0.0f, 0.0f);
if constexpr (has_fusion) {
if (use_gate) {
tmpx_gate = gate_x2[col2];
}
}
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
const float2 tmpy = y2[j*stride_col_y2 + col2];
ggml_cuda_mad(sumf[j], tmpx.x, tmpy.x);
ggml_cuda_mad(sumf[j], tmpx.y, tmpy.y);
if constexpr (has_fusion) {
if (use_gate) {
ggml_cuda_mad(sumf_gate[j], tmpx_gate.x, tmpy.x);
ggml_cuda_mad(sumf_gate[j], tmpx_gate.y, tmpy.y);
}
}
}
}
} else if constexpr (std::is_same_v<T, half>) {
const half2 * x2 = (const half2 *) x;
const half2 * gate_x2 = nullptr;
if constexpr (has_fusion) {
if (use_gate) {
gate_x2 = (const half2 *) gate_x;
}
}
if (std::is_same_v<type_acc, float>) {
for (int col2 = tid; col2 < ncols2; col2 += block_size) {
const float2 tmpx = __half22float2(x2[col2]);
float2 tmpx_gate = make_float2(0.0f, 0.0f);
if constexpr (has_fusion) {
if (use_gate) {
tmpx_gate = __half22float2(gate_x2[col2]);
}
}
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
const float2 tmpy = y2[j*stride_col_y2 + col2];
ggml_cuda_mad(sumf[j], tmpx.x, tmpy.x);
ggml_cuda_mad(sumf[j], tmpx.y, tmpy.y);
if constexpr (has_fusion) {
if (use_gate) {
ggml_cuda_mad(sumf_gate[j], tmpx_gate.x, tmpy.x);
ggml_cuda_mad(sumf_gate[j], tmpx_gate.y, tmpy.y);
}
}
}
}
} else {
#ifdef FP16_AVAILABLE
half2 sumh2[ncols_dst] = {{0.0f, 0.0f}};
half2 sumh2_gate[ncols_dst] = {{0.0f, 0.0f}};
for (int col2 = tid; col2 < ncols2; col2 += block_size) {
const half2 tmpx = x2[col2];
half2 tmpx_gate = make_half2(0.0f, 0.0f);
if constexpr (has_fusion) {
if (use_gate) {
tmpx_gate = gate_x2[col2];
}
}
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
const float2 tmpy = y2[j*stride_col_y2 + col2];
sumh2[j] += tmpx * make_half2(tmpy.x, tmpy.y);
if constexpr (has_fusion) {
if (use_gate) {
sumh2_gate[j] += tmpx_gate * make_half2(tmpy.x, tmpy.y);
}
}
}
}
@@ -190,15 +83,6 @@ static __global__ void mul_mat_vec_f(
for (int j = 0; j < ncols_dst; ++j) {
sumf[j] = __low2float(sumh2[j]) + __high2float(sumh2[j]);
}
if constexpr (has_fusion) {
if (use_gate) {
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
sumf_gate[j] = __low2float(sumh2_gate[j]) + __high2float(sumh2_gate[j]);
}
}
}
#else
NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
@@ -207,20 +91,8 @@ static __global__ void mul_mat_vec_f(
//TODO: add support for ggml_cuda_mad for hip_bfloat162
#if defined(GGML_USE_HIP)
const int * x2 = (const int *) x;
const int * gate_x2 = nullptr;
if constexpr (has_fusion) {
if (use_gate) {
gate_x2 = (const int *) gate_x;
}
}
for (int col2 = tid; col2 < ncols2; col2 += block_size) {
const int tmpx = x2[col2];
int tmpx_gate = 0;
if constexpr (has_fusion) {
if (use_gate) {
tmpx_gate = gate_x2[col2];
}
}
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
const float2 tmpy = y2[j*stride_col_y2 + col2];
@@ -228,45 +100,17 @@ static __global__ void mul_mat_vec_f(
const float tmpx1 = ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]);
ggml_cuda_mad(sumf[j], tmpx0, tmpy.x);
ggml_cuda_mad(sumf[j], tmpx1, tmpy.y);
if constexpr (has_fusion) {
if (use_gate) {
const float tmpx0_gate = ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx_gate)[0]);
const float tmpx1_gate = ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx_gate)[1]);
ggml_cuda_mad(sumf_gate[j], tmpx0_gate, tmpy.x);
ggml_cuda_mad(sumf_gate[j], tmpx1_gate, tmpy.y);
}
}
}
}
#else
const nv_bfloat162 * x2 = (const nv_bfloat162 *) x;
const nv_bfloat162 * gate_x2 = nullptr;
if constexpr (has_fusion) {
if (use_gate) {
gate_x2 = (const nv_bfloat162 *) gate_x;
}
}
for (int col2 = tid; col2 < ncols2; col2 += block_size) {
const nv_bfloat162 tmpx = x2[col2];
nv_bfloat162 tmpx_gate;
if constexpr (has_fusion) {
if (use_gate) {
tmpx_gate = gate_x2[col2];
}
}
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
const float2 tmpy = y2[j*stride_col_y2 + col2];
ggml_cuda_mad(sumf[j], tmpx.x, tmpy.x);
ggml_cuda_mad(sumf[j], tmpx.y, tmpy.y);
if constexpr (has_fusion) {
if (use_gate) {
ggml_cuda_mad(sumf_gate[j], tmpx_gate.x, tmpy.x);
ggml_cuda_mad(sumf_gate[j], tmpx_gate.y, tmpy.y);
}
}
}
}
#endif
@@ -278,31 +122,13 @@ static __global__ void mul_mat_vec_f(
for (int j = 0; j < ncols_dst; ++j) {
sumf[j] = warp_reduce_sum<warp_size>(sumf[j]);
if constexpr (has_fusion) {
if (use_gate) {
sumf_gate[j] = warp_reduce_sum<warp_size>(sumf_gate[j]);
}
}
if (block_size > warp_size) {
buf_iw[tid/warp_size] = sumf[j];
if constexpr (has_fusion) {
if (use_gate) {
buf_iw_gate[tid/warp_size] = sumf_gate[j];
}
}
__syncthreads();
if (tid < warp_size) {
sumf[j] = buf_iw[tid];
sumf[j] = warp_reduce_sum<warp_size>(sumf[j]);
if constexpr (has_fusion) {
if (use_gate) {
sumf_gate[j] = buf_iw_gate[tid];
sumf_gate[j] = warp_reduce_sum<warp_size>(sumf_gate[j]);
}
}
}
if (j < ncols_dst) {
__syncthreads();
}
@@ -313,74 +139,12 @@ static __global__ void mul_mat_vec_f(
return;
}
float value = sumf[tid];
if constexpr (has_fusion) {
if (use_bias) {
value += x_bias[tid*stride_col_dst + row];
}
if (use_gate) {
float gate_value = sumf_gate[tid];
if (use_gate_bias) {
gate_value += gate_bias[tid*stride_col_dst + row];
}
switch (glu_op) {
case GGML_GLU_OP_SWIGLU:
value *= ggml_cuda_op_silu_single(gate_value);
break;
case GGML_GLU_OP_GEGLU:
value *= ggml_cuda_op_gelu_single(gate_value);
break;
case GGML_GLU_OP_SWIGLU_OAI: {
value = ggml_cuda_op_swiglu_oai_single(gate_value, value);
break;
}
default:
break;
}
}
}
dst[tid*stride_col_dst + row] = value;
if constexpr (!has_fusion) {
GGML_UNUSED_VARS(use_gate, use_bias, use_gate_bias, glu_op, gate_x, x_bias, gate_bias, sumf_gate);
}
}
template<typename T, typename type_acc, int ncols_dst, int block_size>
static void mul_mat_vec_f_switch_fusion(
const T * x, const float * y, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
const int64_t ncols, const int64_t nrows,
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
const uint3 channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
const dim3 & block_dims, const dim3 & block_nums, const int nbytes_shared, const cudaStream_t stream) {
const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
if constexpr (ncols_dst == 1) {
if (has_fusion) {
mul_mat_vec_f<T, type_acc, ncols_dst, block_size, true><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
return;
}
}
GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1");
mul_mat_vec_f<T, type_acc, ncols_dst, block_size><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
dst[tid*stride_col_dst + row] = sumf[tid];
}
template <typename T, typename type_acc, int ncols_dst>
void launch_mul_mat_vec_f_cuda(
const T * x, const float * y, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
static void launch_mul_mat_vec_f_cuda(
const T * x, const float * y, const int32_t * ids, float * dst,
const int64_t ncols, const int64_t nrows,
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
@@ -412,59 +176,57 @@ void launch_mul_mat_vec_f_cuda(
}
}
const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
const int nbytes_shared = warp_size*sizeof(float) + (has_fusion ? warp_size*sizeof(float) : 0);
const int nbytes_shared = warp_size*sizeof(float);
const dim3 block_nums(nrows, nchannels_dst, nsamples_dst);
const dim3 block_dims(block_size_best, 1, 1);
switch (block_size_best) {
case 32: {
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 32>
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
mul_mat_vec_f<T, type_acc, ncols_dst, 32><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 64: {
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 64>
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
mul_mat_vec_f<T, type_acc, ncols_dst, 64><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 96: {
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 96>
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
mul_mat_vec_f<T, type_acc, ncols_dst, 96><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 128: {
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 128>
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
mul_mat_vec_f<T, type_acc, ncols_dst, 128><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 160: {
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 160>
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
mul_mat_vec_f<T, type_acc, ncols_dst, 160><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 192: {
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 192>
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
mul_mat_vec_f<T, type_acc, ncols_dst, 192><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 224: {
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 224>
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
mul_mat_vec_f<T, type_acc, ncols_dst, 224><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 256: {
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 256>
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
mul_mat_vec_f<T, type_acc, ncols_dst, 256><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
default: {
GGML_ABORT("fatal error");
@@ -474,7 +236,7 @@ void launch_mul_mat_vec_f_cuda(
template <typename T, typename type_acc>
static void mul_mat_vec_f_cuda_switch_ncols_dst(
const T * x, const float * y, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
const T * x, const float * y, const int32_t * ids, float * dst,
const int64_t ncols, const int64_t nrows, const int64_t ncols_dst,
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
@@ -484,49 +246,49 @@ static void mul_mat_vec_f_cuda_switch_ncols_dst(
switch (ncols_dst) {
case 1:
launch_mul_mat_vec_f_cuda<T, type_acc, 1>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
break;
case 2:
launch_mul_mat_vec_f_cuda<T, type_acc, 2>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
break;
case 3:
launch_mul_mat_vec_f_cuda<T, type_acc, 3>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
break;
case 4:
launch_mul_mat_vec_f_cuda<T, type_acc, 4>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
break;
case 5:
launch_mul_mat_vec_f_cuda<T, type_acc, 5>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
break;
case 6:
launch_mul_mat_vec_f_cuda<T, type_acc, 6>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
break;
case 7:
launch_mul_mat_vec_f_cuda<T, type_acc, 7>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
break;
case 8:
launch_mul_mat_vec_f_cuda<T, type_acc, 8>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
break;
@@ -538,31 +300,29 @@ static void mul_mat_vec_f_cuda_switch_ncols_dst(
template<typename T>
static void mul_mat_vec_f_cuda(
const T * x, const float * y, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
const T * x, const float * y, const int32_t * ids, float * dst,
const int64_t ncols, const int64_t nrows, const int64_t ncols_dst,
const int64_t stride_row, const int64_t stride_col_y, const int stride_col_dst,
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
enum ggml_prec prec, cudaStream_t stream) {
if constexpr(std::is_same_v<T, half>) {
if (prec == GGML_PREC_DEFAULT) {
mul_mat_vec_f_cuda_switch_ncols_dst<T, half>
(x, y, ids, fusion, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
(x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
return;
}
}
mul_mat_vec_f_cuda_switch_ncols_dst<T, float>
(x, y, ids, fusion, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
(x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
}
void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst,
const ggml_cuda_mm_fusion_args_host * fusion) {
void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
GGML_ASSERT( src1->type == GGML_TYPE_F32);
GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -588,30 +348,6 @@ void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor
const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
float * dst_d = (float *) dst->data;
ggml_cuda_mm_fusion_args_device fusion_local{};
if (fusion) {
GGML_ASSERT( !ids || dst->ne[2] == 1);
GGML_ASSERT( ids || dst->ne[1] == 1);
if (fusion->x_bias) {
GGML_ASSERT(fusion->x_bias->type == GGML_TYPE_F32);
GGML_ASSERT(fusion->x_bias->ne[0] == dst->ne[0]);
GGML_ASSERT(!ids || fusion->x_bias->ne[1] == src0->ne[2]);
fusion_local.x_bias = fusion->x_bias->data;
}
if (fusion->gate) {
GGML_ASSERT(fusion->gate->type == src0->type && ggml_are_same_stride(fusion->gate, src0));
fusion_local.gate = fusion->gate->data;
}
if (fusion->gate_bias) {
GGML_ASSERT(fusion->gate_bias->type == GGML_TYPE_F32);
GGML_ASSERT(fusion->gate_bias->ne[0] == dst->ne[0]);
GGML_ASSERT(!ids || fusion->gate_bias->ne[1] == src0->ne[2]);
fusion_local.gate_bias = fusion->gate_bias->data;
}
fusion_local.glu_op = fusion->glu_op;
}
const int64_t s01 = src0->nb[1] / ts_src0;
const int64_t s11 = src1->nb[1] / ts_src1;
const int64_t s1 = dst->nb[1] / ts_dst;
@@ -634,19 +370,19 @@ void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor
switch (src0->type) {
case GGML_TYPE_F32: {
const float * src0_d = (const float *) src0->data;
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
ne03, ne3, s03, s13, s3, prec, ctx.stream());
} break;
case GGML_TYPE_F16: {
const half * src0_d = (const half *) src0->data;
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
ne03, ne3, s03, s13, s3, prec, ctx.stream());
} break;
case GGML_TYPE_BF16: {
const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data;
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
ne03, ne3, s03, s13, s3, prec, ctx.stream());
} break;
@@ -673,6 +409,7 @@ void ggml_cuda_op_mul_mat_vec_f(
const int cc = ggml_cuda_info().devices[id].cc;
const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
// ggml_cuda_op provides single, contiguous matrices
const int64_t stride_row = ne00;
const int64_t stride_col_y = ne10;
@@ -689,23 +426,22 @@ void ggml_cuda_op_mul_mat_vec_f(
const int64_t stride_sample_y = 0;
const int64_t stride_sample_dst = 0;
ggml_cuda_mm_fusion_args_device empty{};
switch (src0->type) {
case GGML_TYPE_F32: {
const float * src0_d = (const float *) src0_dd_i;
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, empty, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
} break;
case GGML_TYPE_F16: {
const half * src0_d = (const half *) src0_dd_i;
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, empty, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
} break;
case GGML_TYPE_BF16: {
const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i;
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, empty, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
} break;
@@ -716,23 +452,10 @@ void ggml_cuda_op_mul_mat_vec_f(
GGML_UNUSED_VARS(ctx, src1, dst, src1_ddq_i, src1_ncols, src1_padded_row_size);
}
bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, const size_t * src0_nb, int64_t ne11) {
bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11) {
if (src0_ne[0] % 2 != 0) {
return false;
}
const size_t ts = ggml_type_size(type);
if (src0_nb[0] != ts) {
return false;
}
// Pointers not aligned to the size of half2/nv_bfloat162/float2 would result in a crash:
for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {
if (src0_nb[i] % (2*ts) != 0) {
return false;
}
}
switch (type) {
case GGML_TYPE_F32:
if (GGML_CUDA_CC_IS_NVIDIA(cc)) {

View File

@@ -1,7 +1,6 @@
#include "common.cuh"
void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst,
const ggml_cuda_mm_fusion_args_host * fusion = nullptr);
void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
void ggml_cuda_op_mul_mat_vec_f(
ggml_backend_cuda_context & ctx,
@@ -9,4 +8,4 @@ void ggml_cuda_op_mul_mat_vec_f(
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream);
bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, const size_t * src0_nb, int64_t ne11);
bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11);

View File

@@ -1,6 +1,5 @@
#include "mmvq.cuh"
#include "quantize.cuh"
#include "unary.cuh"
#include "vecdotq.cuh"
#include <cstdint>
@@ -83,7 +82,7 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
return MMVQ_PARAMETERS_GENERIC;
}
static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_parameter_table_id table_id) {
static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_parameter_table_id table_id) {
if (table_id == MMVQ_PARAMETERS_GENERIC) {
switch (ncols_dst) {
case 1:
@@ -137,11 +136,11 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int
return 1;
}
template <ggml_type type, int ncols_dst>
// tell the compiler to use as many registers as it wants, see nwarps definition below
template <ggml_type type, int ncols_dst, bool has_fusion>
__launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
static __global__ void mul_mat_vec_q(
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, float * __restrict__ dst,
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
const uint32_t stride_col_dst, const uint3 channel_ratio, const uint32_t stride_channel_x,
const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint3 sample_ratio,
@@ -170,56 +169,8 @@ static __global__ void mul_mat_vec_q(
const uint32_t sample_x = fastdiv(sample_dst, sample_ratio);
const uint32_t sample_y = sample_dst;
bool use_gate = false;
bool use_bias = false;
bool use_gate_bias = false;
const void * vgate = nullptr;
const float * x_bias = nullptr;
const float * gate_bias = nullptr;
ggml_glu_op active_glu;
if constexpr (has_fusion) {
use_gate = fusion.gate != nullptr;
use_bias = fusion.x_bias != nullptr;
use_gate_bias = fusion.gate_bias != nullptr && use_gate;
vgate = fusion.gate;
x_bias = (const float *) fusion.x_bias;
gate_bias = (const float *) fusion.gate_bias;
active_glu = fusion.glu_op;
}
const uint32_t channel_bias = ids ? channel_x : channel_dst;
float x_biases[ncols_dst] = { 0.0f };
float gate_biases[ncols_dst] = { 0.0f };
if constexpr (has_fusion) {
if (use_bias) {
x_bias = x_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0;
// 1. Hide latency by prefetching bias and gate here
// 2. load only on threads that won't die after partial sum calculation
if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 &&
(rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
x_biases[j] = x_bias[j * stride_col_dst + threadIdx.x];
}
}
}
if (use_gate_bias) {
gate_bias = gate_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0;
if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 &&
(rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
gate_biases[j] = gate_bias[j * stride_col_dst + threadIdx.x];
}
}
}
}
// partial sum for each thread
float tmp[ncols_dst][rows_per_cuda_block] = {{0.0f}};
float tmp_gate[ncols_dst][rows_per_cuda_block] = {{0.0f}};
const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y;
const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x;
@@ -236,35 +187,17 @@ static __global__ void mul_mat_vec_q(
for (int i = 0; i < rows_per_cuda_block; ++i) {
tmp[j][i] += vec_dot_q_cuda(
vx, &y[j*stride_col_y + kby], kbx_offset + i*stride_row_x + kbx, kqs);
if constexpr (has_fusion) {
if (use_gate) {
tmp_gate[j][i] += vec_dot_q_cuda(
vgate, &y[j*stride_col_y + kby], kbx_offset + i*stride_row_x + kbx, kqs);
}
}
}
}
}
__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size];
__shared__ float tmp_shared_gate[(has_fusion && (nwarps-1 > 0)) ? nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size];
if constexpr (!has_fusion) {
(void) tmp_shared_gate;
} else if (!use_gate) {
(void) tmp_shared_gate;
}
if (threadIdx.y > 0) {
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
#pragma unroll
for (int i = 0; i < rows_per_cuda_block; ++i) {
tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
if constexpr (has_fusion) {
if (use_gate) {
tmp_shared_gate[threadIdx.y-1][j][i][threadIdx.x] = tmp_gate[j][i];
}
}
}
}
}
@@ -283,55 +216,14 @@ static __global__ void mul_mat_vec_q(
#pragma unroll
for (int l = 0; l < nwarps-1; ++l) {
tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
if constexpr (has_fusion) {
if (use_gate) {
tmp_gate[j][i] += tmp_shared_gate[l][j][i][threadIdx.x];
}
}
}
tmp[j][i] = warp_reduce_sum<warp_size>(tmp[j][i]);
if constexpr (has_fusion) {
if (use_gate) {
tmp_gate[j][i] = warp_reduce_sum<warp_size>(tmp_gate[j][i]);
}
}
}
if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
float result = tmp[j][threadIdx.x];
if constexpr (has_fusion) {
if (use_bias) {
result += x_biases[j];
}
if (use_gate) {
float gate_value = tmp_gate[j][threadIdx.x];
if (use_gate_bias) {
gate_value += gate_biases[j];
}
switch (active_glu) {
case GGML_GLU_OP_SWIGLU:
result *= ggml_cuda_op_silu_single(gate_value);
break;
case GGML_GLU_OP_GEGLU:
result *= ggml_cuda_op_gelu_single(gate_value);
break;
case GGML_GLU_OP_SWIGLU_OAI: {
result = ggml_cuda_op_swiglu_oai_single(gate_value, result);
break;
}
default:
result = result * gate_value;
break;
}
}
}
dst[j*stride_col_dst + threadIdx.x] = result;
dst[j*stride_col_dst + threadIdx.x] = tmp[j][threadIdx.x];
}
}
if constexpr (!has_fusion) {
GGML_UNUSED_VARS(use_gate, use_bias, use_gate_bias, active_glu, gate_bias, x_bias, tmp_gate);
}
}
static std::pair<dim3, dim3> calc_launch_params(
@@ -343,37 +235,9 @@ static std::pair<dim3, dim3> calc_launch_params(
return {block_nums, block_dims};
}
template<ggml_type type, int c_ncols_dst>
static void mul_mat_vec_q_switch_fusion(
const void * vx, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
const uint32_t stride_col_dst, const uint3 channel_ratio, const uint32_t stride_channel_x,
const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint3 sample_ratio,
const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst,
const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared, cudaStream_t stream) {
const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
if constexpr (c_ncols_dst == 1) {
if (has_fusion) {
mul_mat_vec_q<type, c_ncols_dst, true><<<block_nums, block_dims, nbytes_shared, stream>>>
(vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
return;
}
}
GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1");
mul_mat_vec_q<type, c_ncols_dst, false><<<block_nums, block_dims, nbytes_shared, stream>>>
(vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
}
template <ggml_type type>
static void mul_mat_vec_q_switch_ncols_dst(
const void * vx, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
const void * vx, const void * vy, const int32_t * ids, float * dst,
const int ncols_x, const int nrows_x, const int ncols_dst,
const int stride_row_x, const int stride_col_y, const int stride_col_dst,
const int nchannels_x, const int nchannels_y, const int nchannels_dst,
@@ -392,83 +256,80 @@ static void mul_mat_vec_q_switch_ncols_dst(
const int warp_size = ggml_cuda_info().devices[device].warp_size;
const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc);
const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
GGML_ASSERT(!ids || ncols_dst == 1);
switch (ncols_dst) {
case 1: {
constexpr int c_ncols_dst = 1;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 2: {
constexpr int c_ncols_dst = 2;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 3: {
constexpr int c_ncols_dst = 3;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 4: {
constexpr int c_ncols_dst = 4;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 5: {
constexpr int c_ncols_dst = 5;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 6: {
constexpr int c_ncols_dst = 6;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 7: {
constexpr int c_ncols_dst = 7;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 8: {
constexpr int c_ncols_dst = 8;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
default:
GGML_ABORT("fatal error");
break;
}
GGML_UNUSED(has_fusion);
}
static void mul_mat_vec_q_switch_type(
const void * vx, const ggml_type type_x, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
const void * vx, const ggml_type type_x, const void * vy, const int32_t * ids, float * dst,
const int ncols_x, const int nrows_x, const int ncols_dst,
const int stride_row_x, const int stride_col_y, const int stride_col_dst,
const int nchannels_x, const int nchannels_y, const int nchannels_dst,
@@ -478,123 +339,143 @@ static void mul_mat_vec_q_switch_type(
switch (type_x) {
case GGML_TYPE_Q4_0:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_0>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q4_1:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_1>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q5_0:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_0>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q5_1:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_1>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q8_0:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q8_0>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_MXFP4:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_MXFP4>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q2_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q2_K>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q3_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q3_K>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q4_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_K>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q5_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_K>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q6_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q6_K>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ2_XXS:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_XXS>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ2_XS:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_XS>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ2_S:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_S>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ3_XXS:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ3_XXS>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ1_S:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ1_S>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ1_M:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ1_M>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ4_NL:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ4_NL>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ4_XS:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ4_XS>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ3_S:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ3_S>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
default:
GGML_ABORT("fatal error");
@@ -603,8 +484,7 @@ static void mul_mat_vec_q_switch_type(
}
void ggml_cuda_mul_mat_vec_q(
ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst,
const ggml_cuda_mm_fusion_args_host * fusion) {
ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
GGML_ASSERT( src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID.
@@ -628,31 +508,6 @@ void ggml_cuda_mul_mat_vec_q(
const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
float * dst_d = (float *) dst->data;
ggml_cuda_mm_fusion_args_device fusion_local{};
if (fusion) {
GGML_ASSERT( !ids || dst->ne[2] == 1);
GGML_ASSERT( ids || dst->ne[1] == 1);
if (fusion->x_bias) {
GGML_ASSERT(fusion->x_bias->type == GGML_TYPE_F32);
GGML_ASSERT(fusion->x_bias->ne[0] == dst->ne[0]);
GGML_ASSERT(!ids || fusion->x_bias->ne[1] == src0->ne[2]);
fusion_local.x_bias = fusion->x_bias->data;
}
if (fusion->gate) {
GGML_ASSERT(fusion->gate->type == src0->type && ggml_are_same_stride(fusion->gate, src0));
fusion_local.gate = fusion->gate->data;
}
if (fusion->gate_bias) {
GGML_ASSERT(fusion->gate_bias->type == GGML_TYPE_F32);
GGML_ASSERT(fusion->gate_bias->ne[0] == dst->ne[0]);
GGML_ASSERT(!ids || fusion->gate_bias->ne[1] == src0->ne[2]);
fusion_local.gate_bias = fusion->gate_bias->data;
}
fusion_local.glu_op = fusion->glu_op;
}
// If src0 is a temporary compute buffer, clear any potential padding.
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
const size_t size_data = ggml_nbytes(src0);
@@ -694,10 +549,10 @@ void ggml_cuda_mul_mat_vec_q(
const int64_t stride_channel_y = ids ? s11 : s12;
mul_mat_vec_q_switch_type(
src0->data, src0->type, src1_q8_1.get(), ids_d, fusion_local, dst_d, ne00,
src0->data, src0->type, src1_q8_1.get(), ids_d, dst_d, ne00,
ne01, ncols_dst, s01, stride_col_y, stride_col_dst,
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
ne03, ne3, s03, s13, s3, stream);
ne03, ne3, s03, s13, s3, stream);
}
void ggml_cuda_op_mul_mat_vec_q(
@@ -723,9 +578,8 @@ void ggml_cuda_op_mul_mat_vec_q(
const int stride_row_x = ne00 / ggml_blck_size(src0->type);
const int stride_col_y = src1_padded_row_size / QK8_1;
ggml_cuda_mm_fusion_args_device fusion_local{};
mul_mat_vec_q_switch_type(
src0_dd_i, src0->type, src1_ddq_i, nullptr, fusion_local, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst,
src0_dd_i, src0->type, src1_ddq_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, stream);
GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_ncols, src1_padded_row_size);

View File

@@ -3,7 +3,7 @@
#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
void ggml_cuda_mul_mat_vec_q(ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, const ggml_cuda_mm_fusion_args_host * fusion = nullptr);
const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
void ggml_cuda_op_mul_mat_vec_q(
ggml_backend_cuda_context & ctx,

View File

@@ -125,7 +125,7 @@ template<bool forward, bool has_ff, typename T>
static __global__ void rope_multi(
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2,
const int n_dims, const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, const mrope_sections sections, const bool is_imrope) {
const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, const mrope_sections sections) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) {
@@ -152,29 +152,17 @@ static __global__ void rope_multi(
const int sector = (i0 / 2) % sect_dims;
float theta_base = 0.0;
if (is_imrope) {
if (sector % 3 == 1 && sector < 3 * sections.v[1]) { // h
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
} else if (sector % 3 == 2 && sector < 3 * sections.v[2]) { // w
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
} else if (sector % 3 == 0 && sector < 3 * sections.v[0]) { // t
theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
} else {
theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
}
} else {
if (sector < sections.v[0]) {
theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sections.v[0] && sector < sec_w) {
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sec_w + sections.v[2]) {
theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
}
if (sector < sections.v[0]) {
theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sections.v[0] && sector < sec_w) {
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sec_w + sections.v[2]) {
theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
}
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
@@ -288,7 +276,7 @@ template<bool forward, typename T>
static void rope_multi_cuda(
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, const bool is_imrope, cudaStream_t stream) {
const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
@@ -299,11 +287,11 @@ static void rope_multi_cuda(
if (freq_factors == nullptr) {
rope_multi<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope);
attn_factor, corr_dims, theta_scale, freq_factors, sections);
} else {
rope_multi<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope);
attn_factor, corr_dims, theta_scale, freq_factors, sections);
}
}
@@ -381,7 +369,6 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
if (is_mrope) {
@@ -419,11 +406,11 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
if (src0->type == GGML_TYPE_F32) {
rope_multi_cuda<forward>(
(const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, is_imrope, stream);
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
} else if (src0->type == GGML_TYPE_F16) {
rope_multi_cuda<forward>(
(const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, is_imrope, stream);
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
} else {
GGML_ABORT("fatal error");
}

View File

@@ -4,53 +4,30 @@
typedef void (*set_rows_kernel_t)(const char * src, char * dst);
// Generic quantized set_rows kernel template
template <typename idx_t, typename block_type, int qk, void (*quantize_func)(const float *, block_type *)>
static __global__ void k_set_rows_quant(const float * __restrict__ src0,
const idx_t * __restrict__ src1,
block_type * __restrict__ dst,
const int64_t ne_total,
const int64_t ne10,
const int64_t ne11,
const int64_t ne12,
const int64_t ne13,
const int64_t s01,
const int64_t s02,
const int64_t s03,
const int64_t s10,
const int64_t s11,
const int64_t s12,
const int64_t s1,
const int64_t s2,
const int64_t s3,
const uint3 ne00,
const uint3 ne01,
const uint3 ne02,
const uint3 ne11_fd,
const uint3 ne12_fd) {
template<typename idx_t, typename block_type, int qk, void (*quantize_func)(const float*, block_type*)>
static __global__ void k_set_rows_quant(
const float * __restrict__ src0, const idx_t * __restrict__ src1, block_type * __restrict__ dst,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t s10, const int64_t s11, const int64_t s12,
const int64_t s1, const int64_t s2, const int64_t s3) {
const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x;
const int64_t ne_total = (ne00 * ne01 * ne02 * ne03) / qk;
if (i >= ne_total) {
return;
}
const int64_t i_base = i * qk;
uint32_t tmp = (uint32_t) i_base;
uint2 div_mod;
const int64_t i03 = i_base / (ne00 * ne01 * ne02);
const int64_t i02 = (i_base - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
const int64_t i01 = (i_base - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00;
const int64_t i00 = i_base - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00;
div_mod = fast_div_modulo(tmp, ne00);
const int64_t i00 = div_mod.y;
tmp = div_mod.x;
div_mod = fast_div_modulo(tmp, ne01);
const int64_t i01 = div_mod.y;
tmp = div_mod.x;
div_mod = fast_div_modulo(tmp, ne02);
const int64_t i02 = div_mod.y;
const int64_t i03 = div_mod.x;
const int64_t i12 = fastmodulo((uint32_t) i03, ne12_fd);
const int64_t i11 = fastmodulo((uint32_t) i02, ne11_fd);
const int64_t i12 = i03 % ne12;
const int64_t i11 = i02 % ne11;
const int64_t i10 = i01;
const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
@@ -64,8 +41,6 @@ static __global__ void k_set_rows_quant(const float * __restrict__ src0,
quantize_func(src_block, dst_block);
GGML_UNUSED(ne10);
GGML_UNUSED(ne11);
GGML_UNUSED(ne12);
GGML_UNUSED(ne13);
}
@@ -96,65 +71,40 @@ static void set_rows_cuda_quant(
const int64_t s2 = nb2;
const int64_t s3 = nb3;
if (ne_total > 0 && ne00 > 0 && ne01 > 0 && ne02 > 0 && ne11 > 0 && ne12 > 0) {
const uint3 ne00_fd = init_fastdiv_values((uint32_t) ne00);
const uint3 ne01_fd = init_fastdiv_values((uint32_t) ne01);
const uint3 ne02_fd = init_fastdiv_values((uint32_t) ne02);
const uint3 ne11_fd = init_fastdiv_values((uint32_t) ne11);
const uint3 ne12_fd = init_fastdiv_values((uint32_t) ne12);
if (ne_total > 0) {
k_set_rows_quant<idx_t, block_type, qk, quantize_func><<<grid_size, block_size, 0, stream>>>(
src0_d, src1_d, dst_d, ne_total, ne10, ne11, ne12, ne13, s01, s02, s03, s10, s11, s12, s1, s2, s3, ne00_fd,
ne01_fd, ne02_fd, ne11_fd, ne12_fd);
src0_d, src1_d, dst_d,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
s01, s02, s03,
s10, s11, s12,
s1, s2, s3);
}
}
template <typename src_t, typename idx_t, typename dst_t>
static __global__ void k_set_rows(const src_t * __restrict__ src0,
const idx_t * __restrict__ src1,
dst_t * __restrict__ dst,
const int64_t ne_total,
const int64_t ne10,
const int64_t ne11,
const int64_t ne12,
const int64_t ne13,
const int64_t s01,
const int64_t s02,
const int64_t s03,
const int64_t s10,
const int64_t s11,
const int64_t s12,
const int64_t s1,
const int64_t s2,
const int64_t s3,
const uint3 ne00,
const uint3 ne01,
const uint3 ne02,
const uint3 ne11_fd,
const uint3 ne12_fd) {
template<typename src_t, typename idx_t, typename dst_t>
static __global__ void k_set_rows(
const src_t * __restrict__ src0, const idx_t * __restrict__ src1, dst_t * __restrict__ dst,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t s10, const int64_t s11, const int64_t s12,
const int64_t s1, const int64_t s2, const int64_t s3) {
const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x;
const int64_t ne_total = ne00 * ne01 * ne02 * ne03;
if (i >= ne_total) {
return;
}
uint32_t tmp = (uint32_t) i;
uint2 div_mod;
const int64_t i03 = i / (ne00 * ne01 * ne02);
const int64_t i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
const int64_t i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00;
const int64_t i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00;
div_mod = fast_div_modulo(tmp, ne00);
const int64_t i00 = div_mod.y;
tmp = div_mod.x;
div_mod = fast_div_modulo(tmp, ne01);
const int64_t i01 = div_mod.y;
tmp = div_mod.x;
div_mod = fast_div_modulo(tmp, ne02);
const int64_t i02 = div_mod.y;
const int64_t i03 = div_mod.x;
const int64_t i12 = fastmodulo((uint32_t) i03, ne12_fd);
const int64_t i11 = fastmodulo((uint32_t) i02, ne11_fd);
const int64_t i12 = i03 % ne12;
const int64_t i11 = i02 % ne11;
const int64_t i10 = i01;
const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
@@ -165,8 +115,6 @@ static __global__ void k_set_rows(const src_t * __restrict__ src0,
dst_row_ptr[i00] = ggml_cuda_cast<dst_t>(src0_row[i00]);
GGML_UNUSED(ne10);
GGML_UNUSED(ne11);
GGML_UNUSED(ne12);
GGML_UNUSED(ne13);
}
@@ -196,16 +144,14 @@ static void set_rows_cuda(
const int64_t s2 = nb2/sizeof(dst_t);
const int64_t s3 = nb3/sizeof(dst_t);
if (ne_total > 0 && ne00 > 0 && ne01 > 0 && ne02 > 0 && ne11 > 0 && ne12 > 0) {
const uint3 ne00_fd = init_fastdiv_values((uint32_t) ne00);
const uint3 ne01_fd = init_fastdiv_values((uint32_t) ne01);
const uint3 ne02_fd = init_fastdiv_values((uint32_t) ne02);
const uint3 ne11_fd = init_fastdiv_values((uint32_t) ne11);
const uint3 ne12_fd = init_fastdiv_values((uint32_t) ne12);
k_set_rows<<<grid_size, block_size, 0, stream>>>(src0_d, src1_d, dst_d, ne_total, ne10, ne11, ne12, ne13, s01,
s02, s03, s10, s11, s12, s1, s2, s3, ne00_fd, ne01_fd, ne02_fd,
ne11_fd, ne12_fd);
if (ne_total > 0) {
k_set_rows<<<grid_size, block_size, 0, stream>>>(
src0_d, src1_d, dst_d,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
s01, s02, s03,
s10, s11, s12,
s1, s2, s3);
}
}

View File

@@ -1,39 +0,0 @@
#include "set.cuh"
#include "cpy.cuh"
void ggml_cuda_op_set(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
GGML_ASSERT((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32));
GGML_ASSERT(src1->type == src0->type);
GGML_ASSERT(dst ->type == src0->type);
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
const size_t nb1 = ((int32_t *) dst->op_params)[0];
const size_t nb2 = ((int32_t *) dst->op_params)[1];
const size_t nb3 = ((int32_t *) dst->op_params)[2];
const size_t offset = ((int32_t *) dst->op_params)[3];
const bool inplace= (bool) ((int32_t *) dst->op_params)[4];
if (!inplace) {
ggml_cuda_cpy(ctx, src0, dst);
}
ggml_tensor dst_view = *dst;
dst_view.data = (void *)((char *)dst->data + offset);
dst_view.ne[0] = src1->ne[0];
dst_view.ne[1] = src1->ne[1];
dst_view.ne[2] = src1->ne[2];
dst_view.ne[3] = src1->ne[3];
dst_view.nb[0] = ggml_element_size(dst);
dst_view.nb[1] = nb1;
dst_view.nb[2] = nb2;
dst_view.nb[3] = nb3;
ggml_cuda_cpy(ctx, src1, &dst_view);
}

View File

@@ -1,7 +0,0 @@
#pragma once
#include "common.cuh"
#define CUDA_SET_BLOCK_SIZE 256
void ggml_cuda_op_set(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-tile.cuh"
DECL_FATTN_TILE_CASE(72, 72);

View File

@@ -3,7 +3,7 @@
from glob import glob
import os
HEAD_SIZES_KQ = [40, 64, 72, 80, 96, 112, 128, 256, 576]
HEAD_SIZES_KQ = [40, 64, 80, 96, 112, 128, 256, 576]
TYPES_KV = ["GGML_TYPE_F16", "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0"]
@@ -81,8 +81,6 @@ for ncols in [8, 16, 32, 64]:
for head_size_kq in HEAD_SIZES_KQ:
if head_size_kq == 40:
continue
if head_size_kq == 72:
continue
if head_size_kq != 576 and ncols2 == 16:
continue
if head_size_kq == 576 and ncols2 != 16:

View File

@@ -2,7 +2,6 @@
#include "ggml.h"
#include "topk-moe.cuh"
#include <cmath>
#include <initializer_list>
// Warp-local softmax used for both the pre-top-k logits and the post-top-k delayed path.
@@ -64,8 +63,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
float * weights,
int32_t * ids,
const int n_rows,
const int n_expert_used,
const float clamp_val) {
const int n_expert_used) {
const int row = blockIdx.x * blockDim.y + threadIdx.y;
if (row >= n_rows) {
return;
@@ -141,7 +139,6 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
if constexpr (with_norm) {
wt_sum = warp_reduce_sum(wt_sum);
wt_sum = max(wt_sum, clamp_val);
const float inv_sum = 1.0f / wt_sum;
for (int i = 0; i < experts_per_thread; i++) {
@@ -160,10 +157,6 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
weights[idx] = output_weights[i];
}
}
if (!with_norm) {
GGML_UNUSED(clamp_val);
}
}
template <bool with_norm, bool delayed_softmax = false>
@@ -173,9 +166,9 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
int32_t * ids,
const int n_rows,
const int n_expert,
const int n_expert_used,
const float clamp_val) {
const int n_expert_used) {
static_assert(!(with_norm && delayed_softmax), "delayed softmax is not supported with weight normalization");
const int rows_per_block = 4;
dim3 grid_dims((n_rows + rows_per_block - 1) / rows_per_block, 1, 1);
dim3 block_dims(WARP_SIZE, rows_per_block, 1);
@@ -184,43 +177,43 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
switch (n_expert) {
case 1:
topk_moe_cuda<1, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 2:
topk_moe_cuda<2, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 4:
topk_moe_cuda<4, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 8:
topk_moe_cuda<8, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 16:
topk_moe_cuda<16, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 32:
topk_moe_cuda<32, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 64:
topk_moe_cuda<64, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 128:
topk_moe_cuda<128, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 256:
topk_moe_cuda<256, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 512:
topk_moe_cuda<512, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
default:
GGML_ASSERT(false && "fatal error");
@@ -233,8 +226,7 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
ggml_tensor * weights,
ggml_tensor * ids,
const bool with_norm,
const bool delayed_softmax,
ggml_tensor * clamp) {
const bool delayed_softmax) {
GGML_ASSERT(logits->type == GGML_TYPE_F32);
GGML_ASSERT(weights->type == GGML_TYPE_F32);
GGML_ASSERT(ids->type == GGML_TYPE_I32);
@@ -250,25 +242,18 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
const int n_expert_used = weights->ne[1];
float clamp_val = -INFINITY;
if (with_norm) {
if (clamp) {
clamp_val = ggml_get_op_params_f32(clamp, 0);
}
launch_topk_moe_cuda<true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used, clamp_val);
launch_topk_moe_cuda<true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used);
} else {
GGML_ASSERT(clamp == nullptr);
if (delayed_softmax) {
launch_topk_moe_cuda<false, true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used,
clamp_val);
launch_topk_moe_cuda<false, true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used);
} else {
launch_topk_moe_cuda<false, false>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used,
clamp_val);
launch_topk_moe_cuda<false, false>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used);
}
}
}
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights, const ggml_tensor * clamp) {
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights) {
float scale = 1.0f;
float max_bias = 0.0f;
@@ -294,26 +279,13 @@ bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tenso
return false;
}
if (clamp) {
if (clamp->op != GGML_OP_CLAMP) {
return false;
}
float max_val = ggml_get_op_params_f32(clamp, 1);
if (max_val != INFINITY) {
return false;
}
}
return true;
}
std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool norm, bool delayed_softmax) {
static std::initializer_list<enum ggml_op> norm_ops = { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
GGML_OP_SUM_ROWS, GGML_OP_CLAMP, GGML_OP_DIV,
GGML_OP_RESHAPE };
GGML_OP_SUM_ROWS, GGML_OP_DIV, GGML_OP_RESHAPE };
static std::initializer_list<enum ggml_op> no_norm_ops = { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
GGML_OP_VIEW, GGML_OP_GET_ROWS };

View File

@@ -8,9 +8,8 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
ggml_tensor * weights,
ggml_tensor * ids,
const bool with_norm,
const bool delayed_softmax = false,
ggml_tensor * weight_clamp = nullptr);
const bool delayed_softmax = false);
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights, const ggml_tensor * clamp = nullptr);
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights);
std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool with_norm, bool delayed_softmax = false);

View File

@@ -18,7 +18,10 @@ static __device__ __forceinline__ float op_step(float x) {
}
static __device__ __forceinline__ float op_gelu(float x) {
return ggml_cuda_op_gelu_single(x);
const float GELU_COEF_A = 0.044715f;
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}
static __device__ __forceinline__ float op_gelu_erf(float x) {
@@ -34,7 +37,7 @@ static __device__ __forceinline__ float op_gelu_quick(float x) {
}
static __device__ __forceinline__ float op_silu(float x) {
return ggml_cuda_op_silu_single(x);
return x / (1.0f + expf(-x));
}
static __device__ __forceinline__ float op_tanh(float x) {
@@ -85,22 +88,6 @@ static __device__ __forceinline__ float op_elu(float x) {
return (x > 0.f) ? x : expm1f(x);
}
static __device__ __forceinline__ float op_floor(float x) {
return floorf(x);
}
static __device__ __forceinline__ float op_ceil(float x) {
return ceilf(x);
}
static __device__ __forceinline__ float op_round(float x) {
return round(x);
}
static __device__ __forceinline__ float op_trunc(float x) {
return trunc(x);
}
template <float (*op)(float), typename T>
static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -217,22 +204,6 @@ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_unary<op_elu>(ctx, dst);
}
void ggml_cuda_op_floor(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_unary<op_floor>(ctx, dst);
}
void ggml_cuda_op_ceil(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_unary<op_ceil>(ctx, dst);
}
void ggml_cuda_op_round(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_unary<op_round>(ctx, dst);
}
void ggml_cuda_op_trunc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_unary<op_trunc>(ctx, dst);
}
/* gated ops */
template <float (*op)(float), typename T>
@@ -346,8 +317,13 @@ static __global__ void swiglu_oai_kernel(const T * x, const T * g, T * dst, cons
float xi = x[j0];
float gi = g[j1];
xi = fminf(xi, limit);
gi = fmaxf(fminf(gi, limit), -limit);
dst[i] = ggml_cuda_op_swiglu_oai_single(xi, gi, alpha, limit);
float out_glu = xi / (1.0f + expf(-xi * alpha));
out_glu = out_glu * (1.0f + gi);
dst[i] = out_glu;
}
template <typename T>

View File

@@ -1,4 +1,3 @@
#pragma once
#include "common.cuh"
#define CUDA_NEG_BLOCK_SIZE 256
@@ -63,14 +62,6 @@ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_floor(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_ceil(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_round(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_trunc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -84,23 +75,3 @@ void ggml_cuda_op_geglu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
__device__ __forceinline__ float ggml_cuda_op_silu_single(float x) {
return x / (1.0f + expf(-x));
}
__device__ __forceinline__ float ggml_cuda_op_gelu_single(float x) {
const float GELU_COEF_A = 0.044715f;
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
return 0.5f * x * (1.0f + tanhf(SQRT_2_OVER_PI * x * (1.0f + GELU_COEF_A * x * x)));
}
__device__ __forceinline__ float ggml_cuda_op_swiglu_oai_single(float x, float g, float alpha = 1.702f, float limit = 7.0f) {
x = fminf(x, limit);
g = fmaxf(fminf(g, limit), -limit);
float out_glu = x / (1.0f + expf(-x * alpha));
out_glu = out_glu * (1.0f + g);
return out_glu;
}

View File

@@ -81,70 +81,6 @@ static __global__ void upscale_f32_bilinear(const float * x, float * dst,
dst[index] = result;
}
namespace bicubic_interpolation {
// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
__device__ const float a = -0.75f; // use alpha = -0.75 (same as PyTorch)
static __device__ float weight1(float x) { return ((a + 2) * x - (a + 3)) * x * x + 1; };
static __device__ float weight2(float x) { return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a; };
static __device__ float bicubic(float p0, float p1, float p2, float p3, float x) {
const float w0 = weight2(x + 1);
const float w1 = weight1(x + 0);
const float w2 = weight1(1 - x);
const float w3 = weight2(2 - x);
return p0 * w0 + p1 * w1 + p2 * w2 + p3 * w3;
};
} // namespace bicubic_interpolation
static __global__ void upscale_f32_bicubic(const float * x, float * dst,
const int nb00, const int nb01, const int nb02, const int nb03,
const int ne00_src, const int ne01_src,
const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst,
const float sf0, const float sf1, const float sf2, const float sf3,
const float pixel_offset) {
using bicubic_interpolation::bicubic;
const int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
const int64_t dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst;
if (index >= dst_total_elements) {
return;
}
const int i10_dst = index % ne10_dst;
const int i11_dst = (index / ne10_dst) % ne11_dst;
const int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst;
const int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst);
const int i02_src = (int)(i12_dst / sf2);
const int i03_src = (int)(i13_dst / sf3);
const float y_src_f = ((float)i11_dst + pixel_offset) / sf1 - pixel_offset;
const int y0_src = (int)floorf(y_src_f);
const float dy = y_src_f - (float)y0_src;
const float x_src_f = ((float)i10_dst + pixel_offset) / sf0 - pixel_offset;
const int x0_src = (int)floorf(x_src_f);
const float dx = x_src_f - (float)x0_src;
const char * x_base = (const char *)x + (int64_t)i02_src * nb02 + (int64_t)i03_src * nb03;
auto load = [=](int x_off, int y_off) -> float {
int i00_src = max(0, min(x0_src + x_off, ne00_src - 1));
int i01_src = max(0, min(y0_src + y_off, ne01_src - 1));
return *(const float *)(x_base + (int64_t)i00_src * nb00 + (int64_t)i01_src * nb01);
};
const float result = bicubic(
bicubic(load(-1,-1), load(0,-1), load(1,-1), load(2,-1), dx),
bicubic(load(-1, 0), load(0, 0), load(1, 0), load(2, 0), dx),
bicubic(load(-1, 1), load(0, 1), load(1, 1), load(2, 1), dx),
bicubic(load(-1, 2), load(0, 2), load(1, 2), load(2, 2), dx), dy);
dst[index] = result;
}
static void upscale_f32_cuda(const float * x, float * dst,
const int nb00, const int nb01, const int nb02, const int nb03,
const int ne10, const int ne11, const int ne12, const int ne13,
@@ -168,18 +104,6 @@ static void upscale_f32_bilinear_cuda(const float * x, float * dst,
upscale_f32_bilinear<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset);
}
static void upscale_f32_bicubic_cuda(const float * x, float * dst,
const int nb00, const int nb01, const int nb02, const int nb03,
const int ne00_src, const int ne01_src,
const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst,
const float sf0, const float sf1, const float sf2, const float sf3,
const float pixel_offset, cudaStream_t stream) {
const int64_t dst_size = ne10_dst * ne11_dst * ne12_dst * ne13_dst;
const int64_t num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
upscale_f32_bicubic<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset);
}
void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
@@ -197,22 +121,17 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float sf2 = (float)dst->ne[2]/src0->ne[2];
const float sf3 = (float)dst->ne[3]/src0->ne[3];
float pixel_offset = 0.5f;
if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
sf0 = dst->ne[0] > 1 && src0->ne[0] > 1 ? (float)(dst->ne[0] - 1) / (src0->ne[0] - 1) : sf0;
sf1 = dst->ne[1] > 1 && src0->ne[1] > 1 ? (float)(dst->ne[1] - 1) / (src0->ne[1] - 1) : sf1;
pixel_offset = 0.0f;
}
if (mode == GGML_SCALE_MODE_NEAREST) {
upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
float pixel_offset = 0.5f;
if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
sf0 = (float)(dst->ne[0] - 1) / (src0->ne[0] - 1);
sf1 = (float)(dst->ne[1] - 1) / (src0->ne[1] - 1);
pixel_offset = 0.0f;
}
upscale_f32_bilinear_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
sf0, sf1, sf2, sf3, pixel_offset, stream);
} else if (mode == GGML_SCALE_MODE_BICUBIC) {
upscale_f32_bicubic_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
sf0, sf1, sf2, sf3, pixel_offset, stream);
}
}

View File

@@ -211,15 +211,12 @@ static inline void hex_format_op_names(char * str, const struct ggml_tensor * t)
// ** backend sessions
struct ggml_hexagon_session {
ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false);
ggml_hexagon_session(int dev_id) noexcept(false);
~ggml_hexagon_session() noexcept(true);
void allocate(int dev_id) noexcept(false);
void release() noexcept(true);
void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false);
void flush();
ggml_backend_buffer_type buffer_type;
ggml_backend_buffer_type repack_buffer_type;
@@ -240,37 +237,15 @@ struct ggml_hexagon_session {
uint32_t prof_pkts;
};
void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
// Bump pending flag (cleared in the session::flush once we get the responce)
this->op_pending++; // atomic inc
int err = dspqueue_write(this->queue,
0, // flags - the framework will autoset this
n_bufs, // number of buffers
bufs, // buffer references
sizeof(req),
(const uint8_t *) &req, // Message
1000000 // Timeout
);
if (err != 0) {
GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->name.c_str(), (unsigned) err);
}
if (sync) {
flush();
}
}
// Flush HTP response queue i.e wait for all outstanding requests to complete
void ggml_hexagon_session::flush() {
dspqueue_t q = this->queue;
// Packet callback
static void htp_packet_callback(dspqueue_t queue, AEEResult error, void * context) {
auto sess = static_cast<ggml_hexagon_session *>(context);
// Repeatedly read packets from the queue until it's empty. We don't
// necessarily get a separate callback for each packet, and new packets
// may arrive while we're processing the previous one.
while (this->op_pending) {
while (1) {
struct htp_general_rsp rsp;
uint32_t rsp_size;
uint32_t flags;
@@ -278,23 +253,22 @@ void ggml_hexagon_session::flush() {
struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
uint32_t n_bufs;
// Read response packet from queue
int err = dspqueue_read(q, &flags,
HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
&n_bufs, // Number of buffer references
bufs, // Buffer references
sizeof(rsp), // Max message length
&rsp_size, // Message length
(uint8_t *) &rsp,
1000000); // Timeout
// Read packet from queue
int err = dspqueue_read_noblock(queue, &flags,
HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
&n_bufs, // Number of buffer references
bufs, // Buffer references
sizeof(rsp), // Max message length
&rsp_size, // Message length
(uint8_t *) &rsp);
if (err == AEE_EEXPIRED) {
// TODO: might need to bail out if the HTP is stuck on something
continue;
if (err == AEE_EWOULDBLOCK) {
// Consumed all packets available for now
return;
}
if (err != 0) {
GGML_ABORT("ggml-hex: dspqueue_read failed: 0x%08x\n", (unsigned) err);
GGML_ABORT("ggml-hex: dspqueue_read_noblock failed: 0x%08x\n", (unsigned) err);
}
// Basic sanity checks
@@ -307,15 +281,21 @@ void ggml_hexagon_session::flush() {
// TODO: handle errors
}
// TODO: update profiling implementation, currently only works for opt_opsync mode
this->prof_usecs = rsp.prof_usecs;
this->prof_cycles = rsp.prof_cycles;
this->prof_pkts = rsp.prof_pkts;
// FIXME: update profiling implementation
sess->prof_usecs = rsp.prof_usecs;
sess->prof_cycles = rsp.prof_cycles;
sess->prof_pkts = rsp.prof_pkts;
this->op_pending--; // atomic dec
sess->op_pending--; // atomic dec
}
}
// Error callback - simply terminates with an error. Used where we don't
// expect errors.
[[noreturn]] static void htp_error_callback(dspqueue_t queue, AEEResult error, void * context) {
GGML_ABORT("ggml-hex: dspcall general error 0x%x: for queue %p\n", error, (void *) queue);
}
// ** backend buffers
struct ggml_backend_hexagon_buffer_type_context {
@@ -367,13 +347,7 @@ struct ggml_backend_hexagon_buffer_context {
ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) {
size += 4 * 1024; // extra page for padding
if (rpcmem_alloc2) {
this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
} else {
GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str());
this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
}
this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
if (!this->base) {
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size);
throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)");
@@ -682,15 +656,6 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
// Ensure we don't try to read more data than is available in the source buffer 'data'
// or write more than the tensor can hold.
const size_t total_tensor_size = (size_t)nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
// Calculate how many full rows and how many remaining bytes we need to process.
const int64_t n_full_rows = n_bytes_to_copy / row_size;
const size_t n_rem_bytes = n_bytes_to_copy % row_size;
void * buf_pd = ggml_aligned_malloc(row_size_pd);
GGML_ASSERT(buf_pd != NULL);
@@ -702,8 +667,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros
// 1. Process all the full rows
for (int64_t i = 0; i < n_full_rows; i++) {
for (int64_t i = 0; i < nrows; i++) {
const uint8_t * src = (const uint8_t *) data + (i * row_size);
uint8_t * dst = (uint8_t *) t->data + (i * row_size);
@@ -712,25 +676,6 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
memcpy(dst, buf_rp, row_size);
}
// 2. Process the final, potentially partial, row
if (n_rem_bytes > 0) {
const int64_t i = n_full_rows;
const uint8_t * src = (const uint8_t *) data + (i * row_size);
uint8_t * dst = (uint8_t *) t->data + (i * row_size);
// re-init the row because we are potentially copying a partial row
init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]);
// Copy only the remaining bytes from the source.
memcpy(buf_pd, src, n_rem_bytes);
// Repack the entire buffer
repack_row_q4x4x2((uint8_t *) buf_rp, (const block_q4_0 *) buf_pd, t->ne[0]);
// Write only the corresponding remaining bytes to the destination tensor.
memcpy(dst, buf_rp, n_rem_bytes);
}
ggml_aligned_free(buf_pd, row_size_pd);
ggml_aligned_free(buf_rp, row_size_rp);
}
@@ -743,14 +688,6 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
// Ensure we don't try to copy more data than the tensor actually contains.
const size_t total_tensor_size = (size_t)nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
// Calculate how many full rows and how many remaining bytes we need to process.
const int64_t n_full_rows = n_bytes_to_copy / row_size;
const size_t n_rem_bytes = n_bytes_to_copy % row_size;
void * buf_pd = ggml_aligned_malloc(row_size_pd);
GGML_ASSERT(buf_pd != NULL);
@@ -762,8 +699,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros
// 1. Process all the full rows
for (int64_t i = 0; i < n_full_rows; i++) {
for (int64_t i = 0; i < nrows; i++) {
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
uint8_t * dst = (uint8_t *) data + (i * row_size);
@@ -772,20 +708,6 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
memcpy(dst, buf_rp, row_size);
}
// 2. Process the final, potentially partial, row
if (n_rem_bytes > 0) {
const int64_t i = n_full_rows;
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
uint8_t * dst = (uint8_t *) data + (i * row_size);
// We still need to read and unpack the entire source row because quantization is block-based.
memcpy(buf_pd, src, row_size);
unpack_row_q4x4x2((block_q4_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
// But we only copy the remaining number of bytes to the destination.
memcpy(dst, buf_rp, n_rem_bytes);
}
ggml_aligned_free(buf_pd, row_size_pd);
ggml_aligned_free(buf_rp, row_size_rp);
}
@@ -1008,15 +930,6 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
// Ensure we don't try to read more data than is available in the source buffer 'data'
// or write more than the tensor can hold.
const size_t total_tensor_size = (size_t)nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
// Calculate how many full rows and how many remaining bytes we need to process.
const int64_t n_full_rows = n_bytes_to_copy / row_size;
const size_t n_rem_bytes = n_bytes_to_copy % row_size;
void * buf_pd = ggml_aligned_malloc(row_size_pd);
GGML_ASSERT(buf_pd != NULL);
@@ -1028,8 +941,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros
// 1. Process all the full rows
for (int64_t i = 0; i < n_full_rows; i++) {
for (int64_t i = 0; i < nrows; i++) {
const uint8_t * src = (const uint8_t *) data + (i * row_size);
uint8_t * dst = (uint8_t *) t->data + (i * row_size);
@@ -1038,25 +950,6 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
memcpy(dst, buf_rp, row_size);
}
// 2. Process the final, potentially partial, row
if (n_rem_bytes > 0) {
const int64_t i = n_full_rows;
const uint8_t * src = (const uint8_t *) data + (i * row_size);
uint8_t * dst = (uint8_t *) t->data + (i * row_size);
// re-init the row because we are potentially copying a partial row
init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]);
// Copy only the remaining bytes from the source.
memcpy(buf_pd, src, n_rem_bytes);
// Repack the entire buffer
repack_row_q8x4x2((uint8_t *) buf_rp, (const block_q8_0 *) buf_pd, t->ne[0]);
// Write only the corresponding remaining bytes to the destination tensor.
memcpy(dst, buf_rp, n_rem_bytes);
}
ggml_aligned_free(buf_pd, row_size_pd);
ggml_aligned_free(buf_rp, row_size_rp);
}
@@ -1069,14 +962,6 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
// Ensure we don't try to copy more data than the tensor actually contains.
const size_t total_tensor_size = (size_t)nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
// Calculate how many full rows and how many remaining bytes we need to process.
const int64_t n_full_rows = n_bytes_to_copy / row_size;
const size_t n_rem_bytes = n_bytes_to_copy % row_size;
void * buf_pd = ggml_aligned_malloc(row_size_pd);
GGML_ASSERT(buf_pd != NULL);
@@ -1088,8 +973,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros
// 1. Process all the full rows
for (int64_t i = 0; i < n_full_rows; i++) {
for (int64_t i = 0; i < nrows; i++) {
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
uint8_t * dst = (uint8_t *) data + (i * row_size);
@@ -1098,20 +982,6 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
memcpy(dst, buf_rp, row_size);
}
// 2. Process the final, potentially partial, row
if (n_rem_bytes > 0) {
const int64_t i = n_full_rows;
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
uint8_t * dst = (uint8_t *) data + (i * row_size);
// We still need to read and unpack the entire source row because quantization is block-based.
memcpy(buf_pd, src, row_size);
unpack_row_q8x4x2((block_q8_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
// But we only copy the remaining number of bytes to the destination.
memcpy(dst, buf_rp, n_rem_bytes);
}
ggml_aligned_free(buf_pd, row_size_pd);
ggml_aligned_free(buf_rp, row_size_rp);
}
@@ -1359,15 +1229,6 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
// Ensure we don't try to read more data than is available in the source buffer 'data'
// or write more than the tensor can hold.
const size_t total_tensor_size = (size_t)nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
// Calculate how many full rows and how many remaining bytes we need to process.
const int64_t n_full_rows = n_bytes_to_copy / row_size;
const size_t n_rem_bytes = n_bytes_to_copy % row_size;
void * buf_pd = ggml_aligned_malloc(row_size_pd);
GGML_ASSERT(buf_pd != NULL);
@@ -1379,8 +1240,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros
// 1. Process all the full rows
for (int64_t i = 0; i < n_full_rows; i++) {
for (int64_t i = 0; i < nrows; i++) {
const uint8_t * src = (const uint8_t *) data + (i * row_size);
uint8_t * dst = (uint8_t *) t->data + (i * row_size);
@@ -1389,25 +1249,6 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
memcpy(dst, buf_rp, row_size);
}
// 2. Process the final, potentially partial, row
if (n_rem_bytes > 0) {
const int64_t i = n_full_rows;
const uint8_t * src = (const uint8_t *) data + (i * row_size);
uint8_t * dst = (uint8_t *) t->data + (i * row_size);
// re-init the row because we are potentially copying a partial row
init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]);
// Copy only the remaining bytes from the source.
memcpy(buf_pd, src, n_rem_bytes);
// Repack the entire buffer (partial data + zero padding).
repack_row_mxfp4x4x2((uint8_t *) buf_rp, (const block_mxfp4 *) buf_pd, t->ne[0]);
// Write only the corresponding remaining bytes to the destination tensor.
memcpy(dst, buf_rp, n_rem_bytes);
}
ggml_aligned_free(buf_pd, row_size_pd);
ggml_aligned_free(buf_rp, row_size_rp);
}
@@ -1420,14 +1261,6 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
// Ensure we don't try to copy more data than the tensor actually contains.
const size_t total_tensor_size = (size_t)nrows * row_size;
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
// Calculate how many full rows and how many remaining bytes we need to process.
const int64_t n_full_rows = n_bytes_to_copy / row_size;
const size_t n_rem_bytes = n_bytes_to_copy % row_size;
void * buf_pd = ggml_aligned_malloc(row_size_pd);
GGML_ASSERT(buf_pd != NULL);
@@ -1439,8 +1272,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros
// 1. Process all the full rows
for (int64_t i = 0; i < n_full_rows; i++) {
for (int64_t i = 0; i < nrows; i++) {
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
uint8_t * dst = (uint8_t *) data + (i * row_size);
@@ -1449,20 +1281,6 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
memcpy(dst, buf_rp, row_size);
}
// 2. Process the final, potentially partial, row
if (n_rem_bytes > 0) {
const int64_t i = n_full_rows;
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
uint8_t * dst = (uint8_t *) data + (i * row_size);
// We still need to read and unpack the entire source row because the format is block-based.
memcpy(buf_pd, src, row_size);
unpack_row_mxfp4x4x2((block_mxfp4 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);
// But we only copy the remaining number of bytes to the destination to respect the size limit.
memcpy(dst, buf_rp, n_rem_bytes);
}
ggml_aligned_free(buf_pd, row_size_pd);
ggml_aligned_free(buf_rp, row_size_rp);
}
@@ -1481,19 +1299,19 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
switch (tensor->type) {
case GGML_TYPE_Q4_0:
GGML_ASSERT(offset == 0);
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
GGML_ASSERT(size == ggml_nbytes(tensor));
repack_q4_0_q4x4x2(tensor, data, size);
break;
case GGML_TYPE_Q8_0:
GGML_ASSERT(offset == 0);
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
GGML_ASSERT(size == ggml_nbytes(tensor));
repack_q8_0_q8x4x2(tensor, data, size);
break;
case GGML_TYPE_MXFP4:
GGML_ASSERT(offset == 0);
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
GGML_ASSERT(size == ggml_nbytes(tensor));
repack_mxfp4_mxfp4x4x2(tensor, data, size);
break;
@@ -1517,19 +1335,19 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
switch (tensor->type) {
case GGML_TYPE_Q4_0:
GGML_ASSERT(offset == 0);
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
GGML_ASSERT(size == ggml_nbytes(tensor));
repack_q4x4x2_q4_0(data, tensor, size);
break;
case GGML_TYPE_Q8_0:
GGML_ASSERT(offset == 0);
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
GGML_ASSERT(size == ggml_nbytes(tensor));
repack_q8x4x2_q8_0(data, tensor, size);
break;
case GGML_TYPE_MXFP4:
GGML_ASSERT(offset == 0);
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
GGML_ASSERT(size == ggml_nbytes(tensor));
repack_mxfp4x4x2_mxfp4(data, tensor, size);
break;
@@ -1685,13 +1503,12 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
}
// Get session URI
char htp_uri[256];
sprintf(htp_uri, "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", opt_arch);
char session_uri[256];
{
char htp_uri[256];
snprintf(htp_uri, sizeof(htp_uri), "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", opt_arch);
struct remote_rpc_get_uri u = {};
struct remote_rpc_get_uri u;
u.session_id = this->session_id;
u.domain_name = const_cast<char *>(CDSP_DOMAIN_NAME);
u.domain_name_len = strlen(CDSP_DOMAIN_NAME);
@@ -1702,12 +1519,8 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
int err = remote_session_control(FASTRPC_GET_URI, (void *) &u, sizeof(u));
if (err != AEE_SUCCESS) {
// fallback to single session uris
int htp_URI_domain_len = strlen(htp_uri) + MAX_DOMAIN_NAMELEN;
snprintf(session_uri, htp_URI_domain_len, "%s%s", htp_uri, my_domain->uri);
GGML_LOG_WARN("ggml-hex: failed to get URI for session %d : error 0x%x. Falling back to single session URI: %s\n", dev_id, err, session_uri);
GGML_LOG_ERROR("ggml-hex: failed to get URI for session %d : error 0x%x\n", dev_id, err);
throw std::runtime_error("ggml-hex: remote_session_control(get-uri) failed (see log for details)");
}
}
@@ -1751,8 +1564,7 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
0, // Flags
128 * 1024, // Request queue size (in bytes)
64 * 1024, // Response queue size (in bytes)
nullptr, // Read packet callback (we handle reads explicitly)
nullptr, // Error callback (we handle errors during reads)
htp_packet_callback, htp_error_callback,
(void *) this, // Callback context
&queue);
if (err != 0) {
@@ -1819,13 +1631,10 @@ void ggml_hexagon_session::release() noexcept(true) {
}
}
ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false) {
ggml_hexagon_session::ggml_hexagon_session(int dev_id) noexcept(false) {
buffer_type.context = nullptr;
repack_buffer_type.context = nullptr;
buffer_type.device = dev;
repack_buffer_type.device = dev;
try {
allocate(dev_id);
@@ -2393,7 +2202,7 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
bufs[0].ptr = src0->data;
bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
bufs[0].size = ggml_nbytes(src0);
bufs[0].flags = 0;
bufs[0].flags = DSPQUEUE_BUFFER_FLAG_REF;
// Second buffer Input Activations. This is a buffer that the CPU
// writes and the DSP reads, so we'll need to flush CPU caches and
@@ -2403,7 +2212,8 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
bufs[1].ptr = src1->data;
bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
bufs[1].size = ggml_nbytes(src1);
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
// Third buffer Output Activations. We'll handle DSP
@@ -2414,7 +2224,7 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
bufs[2].ptr = dst->data;
bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
bufs[2].size = ggml_nbytes(dst);
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
// Primary DSP session from the src0 (normally weight) tensor
auto sess = src0_buf->sess;
@@ -2442,7 +2252,27 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
}
if ((opt_opmask & HTP_OPMASK_QUEUE)) {
sess->enqueue(req, bufs, 3, opt_opsync);
// Bump pending flag (cleared in the callback once we get the responce)
sess->op_pending++; // atomic inc
int err = dspqueue_write(sess->queue,
0, // flags - the framework will autoset this
3, // number of buffers
bufs, // buffer references
sizeof(req),
(const uint8_t *) &req, // Message
1000000 // Timeout
);
if (err != 0) {
GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err);
}
}
if (opt_opsync) {
while (sess->op_pending) {
;
}
}
t2 = ggml_time_us();
@@ -2498,7 +2328,7 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
bufs[0].ptr = src0->data;
bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
bufs[0].size = ggml_nbytes(src0);
bufs[0].flags = 0;
bufs[0].flags = DSPQUEUE_BUFFER_FLAG_REF;
// Second buffer Input Activations. This is a buffer that the CPU
// writes and the DSP reads, so we'll need to flush CPU caches and
@@ -2508,7 +2338,8 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
bufs[1].ptr = src1->data;
bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
bufs[1].size = ggml_nbytes(src1);
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
// Third buffer expert IDs. This is a buffer that the CPU
@@ -2519,7 +2350,8 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
bufs[2].ptr = src2->data;
bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
bufs[2].size = ggml_nbytes(src2);
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
// Forth buffer Output Activations. We'll handle DSP
@@ -2530,7 +2362,7 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
bufs[3].ptr = dst->data;
bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
bufs[3].size = ggml_nbytes(dst);
bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
// Primary DSP session from the src0 (normally weight) tensor
auto sess = src0_buf->sess;
@@ -2559,7 +2391,27 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
}
if ((opt_opmask & HTP_OPMASK_QUEUE)) {
sess->enqueue(req, bufs, 4, opt_opsync);
// Bump pending flag (cleared in the callback once we get the responce)
sess->op_pending++; // atomic inc
int err = dspqueue_write(sess->queue,
0, // flags - the framework will autoset this
4, // number of buffers
bufs, // buffer references
sizeof(req),
(const uint8_t *) &req, // Message
1000000 // Timeout
);
if (err != 0) {
GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err);
}
}
if (opt_opsync) {
while (sess->op_pending) {
;
}
}
t2 = ggml_time_us();
@@ -2632,7 +2484,8 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
bufs[0].ptr = src0->data;
bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
bufs[0].size = ggml_nbytes(src0);
bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
// Second buffer = Second Operand of Binary op
@@ -2644,7 +2497,8 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
bufs[1].ptr = src1->data;
bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
bufs[1].size = ggml_nbytes(src1);
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
// Third buffer = Output Activations. We'll handle DSP
@@ -2655,7 +2509,7 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
bufs[2].ptr = dst->data;
bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
bufs[2].size = ggml_nbytes(dst);
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
// Primary DSP session from the src0 tensor
ggml_hexagon_session * sess = src0_buf->sess;
@@ -2683,7 +2537,26 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
}
if ((opt_opmask & HTP_OPMASK_QUEUE)) {
sess->enqueue(req, bufs, 3, opt_opsync);
// Bump pending flag (cleared in the callback once we get the responce)
sess->op_pending++; // atomic inc
int err = dspqueue_write(sess->queue,
0, // flags - the framework will autoset this
3, // number of buffers
bufs, // buffer references
sizeof(req),
(const uint8_t *) &req, // Message
1000000); // Timeout
if (0 != err) {
GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err);
}
}
if (opt_opsync) {
while (sess->op_pending) {
;
}
}
t2 = ggml_time_us();
@@ -2748,7 +2621,8 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
bufs[0].ptr = src0->data;
bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
bufs[0].size = ggml_nbytes(src0);
bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
// Second buffer = experts bias
@@ -2756,7 +2630,8 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
bufs[1].ptr = src1->data;
bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
bufs[1].size = ggml_nbytes(src1);
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
// Third buffer = activated experts
@@ -2764,7 +2639,8 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
bufs[2].ptr = src2->data;
bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
bufs[2].size = ggml_nbytes(src2);
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
// Forth buffer = output activations
@@ -2772,7 +2648,7 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
bufs[3].ptr = dst->data;
bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
bufs[3].size = ggml_nbytes(dst);
bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
// Primary DSP session from the src0 tensor
ggml_hexagon_session * sess = src0_buf->sess;
@@ -2802,7 +2678,26 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
}
if ((opt_opmask & HTP_OPMASK_QUEUE)) {
sess->enqueue(req, bufs, 4, opt_opsync);
// Bump pending flag (cleared in the callback once we get the responce)
sess->op_pending++; // atomic inc
int err = dspqueue_write(sess->queue,
0, // flags - the framework will autoset this
4, // number of buffers
bufs, // buffer references
sizeof(req),
(const uint8_t *) &req, // Message
1000000); // Timeout
if (0 != err) {
GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err);
}
}
if (opt_opsync) {
while (sess->op_pending) {
;
}
}
t2 = ggml_time_us();
@@ -2900,7 +2795,8 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
bufs[n_bufs].ptr = src0->data;
bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
bufs[n_bufs].size = ggml_nbytes(src0);
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
++n_bufs;
@@ -2915,7 +2811,8 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
bufs[n_bufs].ptr = src1->data;
bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
bufs[n_bufs].size = ggml_nbytes(src1);
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
++n_bufs;
}
@@ -2930,7 +2827,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
bufs[n_bufs].ptr = dst->data;
bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
bufs[n_bufs].size = ggml_nbytes(dst);
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
++n_bufs;
// Primary DSP session from the src0 tensor
@@ -2963,7 +2860,26 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
}
if ((opt_opmask & HTP_OPMASK_QUEUE)) {
sess->enqueue(req, bufs, n_bufs, opt_opsync);
// Bump pending flag (cleared in the callback once we get the responce)
sess->op_pending++; // atomic inc
int err = dspqueue_write(sess->queue,
0, // flags - the framework will autoset this
n_bufs, // number of buffers
bufs, // buffer references
sizeof(req),
(const uint8_t *) &req, // Message
1000000); // Timeout
if (0 != err) {
GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err);
}
}
if (opt_opsync) {
while (sess->op_pending) {
;
}
}
t2 = ggml_time_us();
@@ -3037,7 +2953,8 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
bufs[n_bufs].ptr = src0->data;
bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
bufs[n_bufs].size = ggml_nbytes(src0);
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
++n_bufs;
@@ -3051,7 +2968,8 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
bufs[n_bufs].ptr = src1->data;
bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
bufs[n_bufs].size = ggml_nbytes(src1);
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
++n_bufs;
@@ -3066,7 +2984,8 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
bufs[n_bufs].ptr = src2->data;
bufs[n_bufs].offset = (uint8_t *) src2->data - src2_buf->base;
bufs[n_bufs].size = ggml_nbytes(src2);
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | // Take a reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
++n_bufs;
}
@@ -3081,7 +3000,7 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
bufs[n_bufs].ptr = dst->data;
bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
bufs[n_bufs].size = ggml_nbytes(dst);
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
++n_bufs;
// Primary DSP session from the src0 tensor
@@ -3114,7 +3033,26 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
}
if ((opt_opmask & HTP_OPMASK_QUEUE)) {
sess->enqueue(req, bufs, n_bufs, opt_opsync);
// Bump pending flag (cleared in the callback once we get the responce)
sess->op_pending++; // atomic inc
int err = dspqueue_write(sess->queue,
0, // flags - the framework will autoset this
n_bufs, // number of buffers
bufs, // buffer references
sizeof(req),
(const uint8_t *) &req, // Message
1000000); // Timeout
if (0 != err) {
GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err);
}
}
if (opt_opsync) {
while (sess->op_pending) {
;
}
}
t2 = ggml_time_us();
@@ -3259,7 +3197,9 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
}
// Wait until all pending ops complete
sess->flush();
while (sess->op_pending) {
;
}
return GGML_STATUS_SUCCESS;
}
@@ -3270,7 +3210,9 @@ static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) {
HEX_VERBOSE("ggml-hex: %s synchronize\n", sess->name.c_str());
// Wait until all pending ops complete
sess->flush();
while (sess->op_pending) {
;
}
}
struct node_info {
@@ -3679,11 +3621,6 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
}
}
if(opt_arch < 75) {
opt_ndev = 1;
GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
}
GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
// Create devices / sessions
@@ -3691,7 +3628,7 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
devices[i].iface = ggml_backend_hexagon_device_i;
devices[i].reg = reg;
try {
devices[i].context = new ggml_hexagon_session(i, &devices[i]);
devices[i].context = new ggml_hexagon_session(i);
} catch (std::exception const &exc) {
GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
devices[i].context = nullptr;

View File

@@ -64,7 +64,6 @@ extern "C" {
# pragma weak remote_handle64_control
# pragma weak fastrpc_mmap
# pragma weak fastrpc_munmap
# pragma weak rpcmem_alloc2
#endif
#if !defined(_WINDOWS)

View File

@@ -395,14 +395,28 @@ static void proc_matmul_req(struct htp_context * ctx,
struct htp_general_req * req,
struct dspqueue_buffer * bufs,
size_t n_bufs) {
struct dspqueue_buffer rsp_bufs[1];
// Prep response buffer structs (needed for error responses, etc)
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
memset(rsp_bufs, 0, sizeof(rsp_bufs));
rsp_bufs[0].fd = bufs[0].fd;
rsp_bufs[0].ptr = bufs[0].ptr;
rsp_bufs[0].size = bufs[0].size;
rsp_bufs[0].offset = bufs[0].offset;
rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
rsp_bufs[1].fd = bufs[1].fd;
rsp_bufs[1].ptr = bufs[1].ptr;
rsp_bufs[1].size = bufs[1].size;
rsp_bufs[1].offset = bufs[1].offset;
rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
// We had written to the output buffer, we'd also need to flush it
rsp_bufs[0].fd = bufs[2].fd;
rsp_bufs[0].ptr = bufs[2].ptr;
rsp_bufs[0].size = bufs[2].size;
rsp_bufs[0].offset = bufs[2].offset;
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
rsp_bufs[2].fd = bufs[2].fd;
rsp_bufs[2].ptr = bufs[2].ptr;
rsp_bufs[2].size = bufs[2].size;
rsp_bufs[2].offset = bufs[2].offset;
rsp_bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
@@ -430,21 +444,41 @@ static void proc_matmul_req(struct htp_context * ctx,
}
profile_stop(&prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 3, &prof);
}
static void proc_matmul_id_req(struct htp_context * ctx,
struct htp_general_req * req,
struct dspqueue_buffer * bufs,
size_t n_bufs) {
struct dspqueue_buffer rsp_bufs[1];
// Prep response buffer structs (needed for error responses, etc)
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
memset(rsp_bufs, 0, sizeof(rsp_bufs));
rsp_bufs[0].fd = bufs[0].fd;
rsp_bufs[0].ptr = bufs[0].ptr;
rsp_bufs[0].size = bufs[0].size;
rsp_bufs[0].offset = bufs[0].offset;
rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
rsp_bufs[1].fd = bufs[1].fd;
rsp_bufs[1].ptr = bufs[1].ptr;
rsp_bufs[1].size = bufs[1].size;
rsp_bufs[1].offset = bufs[1].offset;
rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
rsp_bufs[2].fd = bufs[2].fd;
rsp_bufs[2].ptr = bufs[2].ptr;
rsp_bufs[2].size = bufs[2].size;
rsp_bufs[2].offset = bufs[2].offset;
rsp_bufs[2].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
// We had written to the output buffer, we'd also need to flush it
rsp_bufs[0].fd = bufs[3].fd;
rsp_bufs[0].ptr = bufs[3].ptr;
rsp_bufs[0].size = bufs[3].size;
rsp_bufs[0].offset = bufs[3].offset;
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
rsp_bufs[3].fd = bufs[3].fd;
rsp_bufs[3].ptr = bufs[3].ptr;
rsp_bufs[3].size = bufs[3].size;
rsp_bufs[3].offset = bufs[3].offset;
rsp_bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
@@ -474,18 +508,32 @@ static void proc_matmul_id_req(struct htp_context * ctx,
}
profile_stop(&prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 4, &prof);
}
static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
struct dspqueue_buffer rsp_bufs[1];
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
memset(rsp_bufs, 0, sizeof(rsp_bufs));
rsp_bufs[0].fd = bufs[0].fd;
rsp_bufs[0].ptr = bufs[0].ptr;
rsp_bufs[0].offset = bufs[0].offset;
rsp_bufs[0].size = bufs[0].size;
rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
rsp_bufs[1].fd = bufs[1].fd;
rsp_bufs[1].ptr = bufs[1].ptr;
rsp_bufs[1].offset = bufs[1].offset;
rsp_bufs[1].size = bufs[1].size;
rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
// We had written to the output buffer, we'd also need to flush it
rsp_bufs[0].fd = bufs[2].fd;
rsp_bufs[0].ptr = bufs[2].ptr;
rsp_bufs[0].offset = bufs[2].offset;
rsp_bufs[0].size = bufs[2].size;
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
rsp_bufs[2].fd = bufs[2].fd;
rsp_bufs[2].ptr = bufs[2].ptr;
rsp_bufs[2].offset = bufs[2].offset;
rsp_bufs[2].size = bufs[2].size;
rsp_bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
@@ -513,18 +561,38 @@ static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * r
}
profile_stop(&prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 3, &prof);
}
static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
struct dspqueue_buffer rsp_bufs[1];
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
memset(rsp_bufs, 0, sizeof(rsp_bufs));
rsp_bufs[0].fd = bufs[0].fd;
rsp_bufs[0].ptr = bufs[0].ptr;
rsp_bufs[0].offset = bufs[0].offset;
rsp_bufs[0].size = bufs[0].size;
rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
rsp_bufs[1].fd = bufs[1].fd;
rsp_bufs[1].ptr = bufs[1].ptr;
rsp_bufs[1].offset = bufs[1].offset;
rsp_bufs[1].size = bufs[1].size;
rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
rsp_bufs[2].fd = bufs[2].fd;
rsp_bufs[2].ptr = bufs[2].ptr;
rsp_bufs[2].offset = bufs[2].offset;
rsp_bufs[2].size = bufs[2].size;
rsp_bufs[2].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
// We had written to the output buffer, we'd also need to flush it
rsp_bufs[0].fd = bufs[3].fd;
rsp_bufs[0].ptr = bufs[3].ptr;
rsp_bufs[0].offset = bufs[3].offset;
rsp_bufs[0].size = bufs[3].size;
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
rsp_bufs[3].fd = bufs[3].fd;
rsp_bufs[3].ptr = bufs[3].ptr;
rsp_bufs[3].offset = bufs[3].offset;
rsp_bufs[3].size = bufs[3].size;
rsp_bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
@@ -554,18 +622,26 @@ static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * r
}
profile_stop(&prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 4, &prof);
}
static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
memset(rsp_bufs, 0, sizeof(rsp_bufs));
rsp_bufs[0].fd = bufs[0].fd;
rsp_bufs[0].ptr = bufs[0].ptr;
rsp_bufs[0].offset = bufs[0].offset;
rsp_bufs[0].size = bufs[0].size;
rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
// We had written to the output buffer, we'd also need to flush it
rsp_bufs[0].fd = bufs[1].fd;
rsp_bufs[0].ptr = bufs[1].ptr;
rsp_bufs[0].offset = bufs[1].offset;
rsp_bufs[0].size = bufs[1].size;
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
rsp_bufs[1].fd = bufs[1].fd;
rsp_bufs[1].ptr = bufs[1].ptr;
rsp_bufs[1].offset = bufs[1].offset;
rsp_bufs[1].size = bufs[1].size;
rsp_bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
@@ -593,7 +669,7 @@ static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * re
}
profile_stop(&prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 2, &prof);
}
static void proc_activations_req(struct htp_context * ctx,
@@ -601,16 +677,33 @@ static void proc_activations_req(struct htp_context * ctx,
struct dspqueue_buffer * bufs,
uint32_t n_bufs) {
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
memset(rsp_bufs, 0, sizeof(rsp_bufs));
int write_idx = (n_bufs == 3) ? 2 : 1;
rsp_bufs[0].fd = bufs[0].fd;
rsp_bufs[0].ptr = bufs[0].ptr;
rsp_bufs[0].offset = bufs[0].offset;
rsp_bufs[0].size = bufs[0].size;
rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
int write_idx = 1;
if (3 == n_bufs) {
rsp_bufs[1].fd = bufs[1].fd;
rsp_bufs[1].ptr = bufs[1].ptr;
rsp_bufs[1].offset = bufs[1].offset;
rsp_bufs[1].size = bufs[1].size;
rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
write_idx = 2;
}
// We had written to the output buffer, we'd also need to flush it
rsp_bufs[0].fd = bufs[write_idx].fd;
rsp_bufs[0].ptr = bufs[write_idx].ptr;
rsp_bufs[0].offset = bufs[write_idx].offset;
rsp_bufs[0].size = bufs[write_idx].size;
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
rsp_bufs[write_idx].fd = bufs[write_idx].fd;
rsp_bufs[write_idx].ptr = bufs[write_idx].ptr;
rsp_bufs[write_idx].offset = bufs[write_idx].offset;
rsp_bufs[write_idx].size = bufs[write_idx].size;
rsp_bufs[write_idx].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
struct htp_ops_context octx = { 0 };
@@ -649,7 +742,7 @@ static void proc_activations_req(struct htp_context * ctx,
}
profile_stop(&prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, n_bufs, &prof);
}
static void proc_rope_req(struct htp_context * ctx,
@@ -657,16 +750,39 @@ static void proc_rope_req(struct htp_context * ctx,
struct dspqueue_buffer * bufs,
uint32_t n_bufs) {
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
memset(rsp_bufs, 0, sizeof(rsp_bufs));
int write_idx = (n_bufs == 4) ? 3 : 2;
rsp_bufs[0].fd = bufs[0].fd;
rsp_bufs[0].ptr = bufs[0].ptr;
rsp_bufs[0].offset = bufs[0].offset;
rsp_bufs[0].size = bufs[0].size;
rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
rsp_bufs[1].fd = bufs[1].fd;
rsp_bufs[1].ptr = bufs[1].ptr;
rsp_bufs[1].offset = bufs[1].offset;
rsp_bufs[1].size = bufs[1].size;
rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
int write_idx = 2;
if (4 == n_bufs) {
rsp_bufs[write_idx].fd = bufs[write_idx].fd;
rsp_bufs[write_idx].ptr = bufs[write_idx].ptr;
rsp_bufs[write_idx].offset = bufs[write_idx].offset;
rsp_bufs[write_idx].size = bufs[write_idx].size;
rsp_bufs[write_idx].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
write_idx++;
}
// We had written to the output buffer, we'd also need to flush it
rsp_bufs[0].fd = bufs[write_idx].fd;
rsp_bufs[0].ptr = bufs[write_idx].ptr;
rsp_bufs[0].offset = bufs[write_idx].offset;
rsp_bufs[0].size = bufs[write_idx].size;
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
rsp_bufs[write_idx].fd = bufs[write_idx].fd;
rsp_bufs[write_idx].ptr = bufs[write_idx].ptr;
rsp_bufs[write_idx].offset = bufs[write_idx].offset;
rsp_bufs[write_idx].size = bufs[write_idx].size;
rsp_bufs[write_idx].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
// Setup Op context
struct htp_ops_context octx = { 0 };
@@ -703,7 +819,7 @@ static void proc_rope_req(struct htp_context * ctx,
}
profile_stop(&prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, n_bufs, &prof);
}
static void htp_packet_callback(dspqueue_t queue, int error, void * context) {

View File

@@ -43,46 +43,46 @@ static inline int32_t htp_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_s
}
static inline void htp_dump_int8_line(char * pref, const int8_t * x, int n) {
char str[1024], *p = str, *p_end = str + sizeof(str);
p += snprintf(p, p_end - p, "%s: ", pref);
for (int i = 0; i < n && p < p_end; i++) {
p += snprintf(p, p_end - p, "%d, ", x[i]);
char str[1024], *p = str;
p += sprintf(p, "%s: ", pref);
for (int i = 0; i < 16; i++) {
p += sprintf(p, "%d, ", x[i]);
}
FARF(HIGH, "%s\n", str);
}
static inline void htp_dump_uint8_line(char * pref, const uint8_t * x, uint32_t n) {
char str[1024], *p = str, *p_end = str + sizeof(str);
p += snprintf(p, p_end - p, "%s: ", pref);
for (int i = 0; i < n && p < p_end; i++) {
p += snprintf(p, p_end - p, "%d, ", x[i]);
char str[1024], *p = str;
p += sprintf(p, "%s: ", pref);
for (int i = 0; i < n; i++) {
p += sprintf(p, "%d, ", x[i]);
}
FARF(HIGH, "%s\n", str);
}
static inline void htp_dump_int32_line(char * pref, const int32_t * x, uint32_t n) {
char str[1024], *p = str, *p_end = str + sizeof(str);
p += snprintf(p, p_end - p, "%s: ", pref);
char str[1024], *p = str;
p += sprintf(p, "%s: ", pref);
for (int i = 0; i < n; i++) {
p += snprintf(p, p_end - p, "%d, ", (int) x[i]);
p += sprintf(p, "%d, ", (int) x[i]);
}
FARF(HIGH, "%s\n", str);
}
static inline void htp_dump_fp16_line(char * pref, const __fp16 * x, uint32_t n) {
char str[1024], *p = str, *p_end = str + sizeof(str);
p += snprintf(p, p_end - p, "%s: ", pref);
char str[1024], *p = str;
p += sprintf(p, "%s: ", pref);
for (int i = 0; i < n; i++) {
p += snprintf(p, p_end - p, "%.6f, ", (float) x[i]);
p += sprintf(p, "%.6f, ", (float) x[i]);
}
FARF(HIGH, "%s\n", str);
}
static inline void htp_dump_fp32_line(char * pref, const float * x, uint32_t n) {
char str[1024], *p = str, *p_end = str + sizeof(str);
p += snprintf(p, p_end - p, "%s: ", pref);
char str[1024], *p = str;
p += sprintf(p, "%s: ", pref);
for (int i = 0; i < n; i++) {
p += snprintf(p, p_end - p, "%.6f, ", x[i]);
p += sprintf(p, "%.6f, ", x[i]);
}
FARF(HIGH, "%s\n", str);
}

View File

@@ -29,11 +29,10 @@ if (CXX_IS_HIPCC)
endif()
else()
# Forward (AMD)GPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
if(AMDGPU_TARGETS AND NOT GPU_TARGETS)
set(GPU_TARGETS ${AMDGPU_TARGETS})
endif()
if(GPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
set(CMAKE_HIP_ARCHITECTURES ${GPU_TARGETS})
elseif(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
endif()
cmake_minimum_required(VERSION 3.21)
enable_language(HIP)

View File

@@ -682,7 +682,6 @@ static inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph * cgraph,
#endif
#ifdef __cplusplus
#include <array>
#include <initializer_list>
#include <vector>
@@ -698,21 +697,6 @@ inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph * cgraph,
return ggml_can_fuse_subgraph(cgraph, start_idx, ops.size(), ops.begin(), outputs.begin(), outputs.size());
}
// Return true if the edges in the graph match expectations.
inline bool ggml_check_edges(const struct ggml_cgraph * cgraph,
int start_idx,
std::initializer_list<std::array<int, 3>> edges) {
for (const auto & edge : edges) {
int dst_node = edge[0];
int src_idx = edge[1];
int src_node = edge[2];
if (cgraph->nodes[start_idx + dst_node]->src[src_idx] != cgraph->nodes[start_idx + src_node]) {
return false;
}
}
return true;
}
// expose GGUF internals for test code
GGML_API size_t gguf_type_size(enum gguf_type type);
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);

View File

@@ -35,6 +35,7 @@ struct ggml_metal {
// additional, inference-time compiled pipelines
ggml_metal_pipelines_t pipelines_ext;
bool use_bfloat;
bool use_fusion;
bool use_concurrency;
bool use_graph_optimize;
@@ -120,10 +121,11 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
}
}
//const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
res->use_bfloat = props_dev->has_bfloat;
res->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil;
res->use_concurrency = getenv("GGML_METAL_CONCURRENCY_DISABLE") == nil;
@@ -145,6 +147,7 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
memset(res->fuse_cnt, 0, sizeof(res->fuse_cnt));
GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, res->use_bfloat ? "true" : "false");
GGML_LOG_INFO("%s: use fusion = %s\n", __func__, res->use_fusion ? "true" : "false");
GGML_LOG_INFO("%s: use concurrency = %s\n", __func__, res->use_concurrency ? "true" : "false");
GGML_LOG_INFO("%s: use graph optimize = %s\n", __func__, res->use_graph_optimize ? "true" : "false");
@@ -289,7 +292,7 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
// queue the copy operation into the queue of the Metal context
// this will be queued at the end, after any currently ongoing GPU operations
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
[encoder copyFromBuffer:buf_src
@@ -300,7 +303,6 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
[encoder endEncoding];
[cmd_buf commit];
[buf_src release];
// do not wait here for completion
//[cmd_buf waitUntilCompleted];
@@ -331,7 +333,7 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
// queue the copy operation into the queue of the Metal context
// this will be queued at the end, after any currently ongoing GPU operations
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
[encoder copyFromBuffer:bid_src.metal
@@ -342,7 +344,6 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
[encoder endEncoding];
[cmd_buf commit];
[buf_dst release];
// do not wait here for completion
//[cmd_buf waitUntilCompleted];

View File

@@ -677,7 +677,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mm_id_map0(ggml_metal_
char name[256];
snprintf(base, 256, "kernel_mul_mm_id_map0_ne20_%d", ne20);
snprintf(name, 256, "%s_ne02=%d", base, ne02);
snprintf(name, 256, "%s", base);
ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
if (res) {
@@ -1332,12 +1332,11 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rope(ggml_metal_library_t
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
if (is_neox) {
snprintf(base, 256, "kernel_rope_neox_%s", ggml_type_name(op->src[0]->type));
} else if ((is_mrope || is_imrope) && !is_vision) {
} else if (is_mrope && !is_vision) {
GGML_ASSERT(op->src[1]->ne[0]*4 >= op->src[0]->ne[2]); // need at least 4 pos per token
snprintf(base, 256, "kernel_rope_multi_%s", ggml_type_name(op->src[0]->type));
} else if (is_vision) {
@@ -1347,20 +1346,14 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rope(ggml_metal_library_t
snprintf(base, 256, "kernel_rope_norm_%s", ggml_type_name(op->src[0]->type));
}
snprintf(name, 256, "%s_imrope=%d", base, is_imrope ? 1 : 0);
snprintf(name, 256, "%s", base);
ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
if (res) {
return res;
}
ggml_metal_cv_t cv = ggml_metal_cv_init();
ggml_metal_cv_set_bool(cv, is_imrope, FC_ROPE + 0);
res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
ggml_metal_cv_free(cv);
res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
return res;
}

View File

@@ -95,9 +95,7 @@ void ggml_metal_encoder_end_encoding(ggml_metal_encoder_t encoder);
typedef struct ggml_metal_library * ggml_metal_library_t;
ggml_metal_library_t ggml_metal_library_init (ggml_metal_device_t dev);
ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev, const char * source, bool verbose);
ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev);
void ggml_metal_library_free(ggml_metal_library_t lib);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline (ggml_metal_library_t lib, const char * name);
@@ -195,7 +193,6 @@ struct ggml_metal_device_props {
bool has_simdgroup_mm;
bool has_unified_memory;
bool has_bfloat;
bool has_tensor;
bool use_residency_sets;
bool use_shared_buffers;

View File

@@ -21,9 +21,8 @@
#define GGML_METAL_HAS_RESIDENCY_SETS 1
#endif
// overload of MTLGPUFamilyMetalX (not available in some environments)
// overload of MTLGPUFamilyMetal3 (not available in some environments)
static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
static const NSInteger MTLGPUFamilyMetal4_GGML = 5002;
// virtual address for GPU memory allocations
static atomic_uintptr_t g_addr_device = 0x000000400ULL;
@@ -262,10 +261,6 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
[prep setObject:@"1" forKey:@"GGML_METAL_HAS_BF16"];
}
if (ggml_metal_device_get_props(dev)->has_tensor) {
[prep setObject:@"1" forKey:@"GGML_METAL_HAS_TENSOR"];
}
#if GGML_METAL_EMBED_LIBRARY
[prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
#endif
@@ -303,72 +298,6 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
return res;
}
ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev, const char * source, bool verbose) {
if (source == NULL) {
GGML_LOG_ERROR("%s: source is NULL\n", __func__);
return NULL;
}
id<MTLDevice> device = ggml_metal_device_get_obj(dev);
id<MTLLibrary> library = nil;
NSError * error = nil;
const int64_t t_start = ggml_time_us();
NSString * src = [[NSString alloc] initWithBytes:source
length:strlen(source)
encoding:NSUTF8StringEncoding];
if (!src) {
GGML_LOG_ERROR("%s: failed to create NSString from source\n", __func__);
return NULL;
}
@autoreleasepool {
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
MTLCompileOptions * options = [MTLCompileOptions new];
options.preprocessorMacros = prep;
library = [device newLibraryWithSource:src options:options error:&error];
if (error) {
if (verbose) {
GGML_LOG_ERROR("%s: error compiling source: %s\n", __func__, [[error description] UTF8String]);
} else {
GGML_LOG_ERROR("%s: error compiling source\n", __func__);
}
library = nil;
}
[options release];
}
[src release];
if (!library) {
if (verbose) {
GGML_LOG_ERROR("%s: failed to create Metal library from source\n", __func__);
}
return NULL;
}
if (verbose) {
GGML_LOG_INFO("%s: compiled in %.3f sec\n", __func__, (ggml_time_us() - t_start) / 1e6);
}
ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));
if (!res) {
GGML_LOG_ERROR("%s: calloc failed\n", __func__);
return NULL;
}
res->obj = library;
res->device = device;
res->pipelines = ggml_metal_pipelines_init();
return res;
}
void ggml_metal_library_free(ggml_metal_library_t lib) {
if (!lib) {
return;
@@ -416,9 +345,9 @@ ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t l
if (!mtl_function) {
ggml_critical_section_end();
GGML_LOG_ERROR("%s: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
GGML_LOG_ERROR("%s: error: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
if (error) {
GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
}
return nil;
@@ -426,21 +355,13 @@ ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t l
res->obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
ggml_metal_pipelines_add(lib->pipelines, name, res);
[mtl_function release];
GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name, (void *) res->obj,
(int) res->obj.maxTotalThreadsPerThreadgroup,
(int) res->obj.threadExecutionWidth);
if (res->obj.maxTotalThreadsPerThreadgroup == 0 || res->obj.threadExecutionWidth == 0) {
ggml_critical_section_end();
GGML_LOG_ERROR("%s: incompatible pipeline %s\n", __func__, name);
return nil;
}
ggml_metal_pipelines_add(lib->pipelines, name, res);
}
ggml_critical_section_end();
@@ -548,128 +469,6 @@ ggml_metal_device_t ggml_metal_device_init(void) {
dev->props.has_bfloat = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
dev->props.has_bfloat |= [dev->mtl_device supportsFamily:MTLGPUFamilyApple6];
if (getenv("GGML_METAL_BF16_DISABLE") != NULL) {
dev->props.has_bfloat = false;
}
dev->props.has_tensor = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal4_GGML];
if (getenv("GGML_METAL_TENSOR_DISABLE") != NULL) {
dev->props.has_tensor = false;
}
// note: disable the tensor API by default for old chips because with the current implementation it is not useful
// - M2 Ultra: ~5% slower
// - M4, M4 Max: no significant difference
//
// TODO: try to update the tensor API kernels to at least match the simdgroup performance
if (getenv("GGML_METAL_TENSOR_ENABLE") == NULL &&
![[dev->mtl_device name] containsString:@"M5"] &&
![[dev->mtl_device name] containsString:@"M6"] &&
![[dev->mtl_device name] containsString:@"A19"] &&
![[dev->mtl_device name] containsString:@"A20"]) {
GGML_LOG_WARN("%s: tensor API disabled for pre-M5 and pre-A19 devices\n", __func__);
dev->props.has_tensor = false;
}
// double-check that the tensor API compiles
if (dev->props.has_tensor) {
const char * src_tensor_f16 = "\n"
"#include <metal_stdlib> \n"
"#include <metal_tensor> \n"
"#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h> \n"
" \n"
"using namespace metal; \n"
"using namespace mpp::tensor_ops; \n"
" \n"
"kernel void dummy_kernel( \n"
" tensor<device half, dextents<int32_t, 2>> A [[buffer(0)]], \n"
" tensor<device half, dextents<int32_t, 2>> B [[buffer(1)]], \n"
" device float * C [[buffer(2)]], \n"
" uint2 tgid [[threadgroup_position_in_grid]]) \n"
"{ \n"
" auto tA = A.slice(0, (int)tgid.y); \n"
" auto tB = B.slice((int)tgid.x, 0); \n"
" \n"
" matmul2d< \n"
" matmul2d_descriptor(8, 8, dynamic_extent), \n"
" execution_simdgroups<4>> mm; \n"
" \n"
" auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
" \n"
" auto sA = tA.slice(0, 0); \n"
" auto sB = tB.slice(0, 0); \n"
" mm.run(sB, sA, cT); \n"
" \n"
" auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
" \n"
" cT.store(tC); \n"
"}";
GGML_LOG_INFO("%s: testing tensor API for f16 support\n", __func__);
ggml_metal_library_t lib = ggml_metal_library_init_from_source(dev, src_tensor_f16, false);
if (lib == NULL) {
GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
dev->props.has_tensor = false;
} else {
ggml_metal_pipeline_t ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
if (!ppl) {
GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
dev->props.has_tensor = false;
}
ggml_metal_library_free(lib);
}
}
// try to compile a dummy kernel to determine if the tensor API is supported for bfloat
if (dev->props.has_tensor && dev->props.has_bfloat) {
const char * src_tensor_bf16 = "\n"
"#include <metal_stdlib> \n"
"#include <metal_tensor> \n"
"#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h> \n"
" \n"
"using namespace metal; \n"
"using namespace mpp::tensor_ops; \n"
" \n"
"kernel void dummy_kernel( \n"
" tensor<device bfloat, dextents<int32_t, 2>> A [[buffer(0)]], \n"
" tensor<device bfloat, dextents<int32_t, 2>> B [[buffer(1)]], \n"
" device float * C [[buffer(2)]], \n"
" uint2 tgid [[threadgroup_position_in_grid]]) \n"
"{ \n"
" auto tA = A.slice(0, (int)tgid.y); \n"
" auto tB = B.slice((int)tgid.x, 0); \n"
" \n"
" matmul2d< \n"
" matmul2d_descriptor(8, 8, dynamic_extent), \n"
" execution_simdgroups<4>> mm; \n"
" \n"
" auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
" \n"
" auto sA = tA.slice(0, 0); \n"
" auto sB = tB.slice(0, 0); \n"
" mm.run(sB, sA, cT); \n"
" \n"
" auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
" \n"
" cT.store(tC); \n"
"}";
GGML_LOG_INFO("%s: testing tensor API for bfloat support\n", __func__);
ggml_metal_library_t lib = ggml_metal_library_init_from_source(dev, src_tensor_bf16, false);
if (lib == NULL) {
GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
dev->props.has_bfloat = false;
} else {
ggml_metal_pipeline_t ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
if (!ppl) {
GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
dev->props.has_bfloat = false;
}
ggml_metal_library_free(lib);
}
}
dev->props.use_residency_sets = true;
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
@@ -677,6 +476,7 @@ ggml_metal_device_t ggml_metal_device_init(void) {
#endif
dev->props.use_shared_buffers = dev->props.has_unified_memory;
if (getenv("GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) {
dev->props.use_shared_buffers = false;
}
@@ -729,7 +529,6 @@ ggml_metal_device_t ggml_metal_device_init(void) {
GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, dev->props.has_simdgroup_mm ? "true" : "false");
GGML_LOG_INFO("%s: has unified memory = %s\n", __func__, dev->props.has_unified_memory ? "true" : "false");
GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, dev->props.has_bfloat ? "true" : "false");
GGML_LOG_INFO("%s: has tensor = %s\n", __func__, dev->props.has_tensor ? "true" : "false");
GGML_LOG_INFO("%s: use residency sets = %s\n", __func__, dev->props.use_residency_sets ? "true" : "false");
GGML_LOG_INFO("%s: use shared buffers = %s\n", __func__, dev->props.use_shared_buffers ? "true" : "false");
@@ -908,7 +707,6 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
if (op->src[0]->ne[0] != 32 &&
op->src[0]->ne[0] != 40 &&
op->src[0]->ne[0] != 64 &&
op->src[0]->ne[0] != 72 &&
op->src[0]->ne[0] != 80 &&
op->src[0]->ne[0] != 96 &&
op->src[0]->ne[0] != 112 &&

View File

@@ -76,7 +76,6 @@
#define FC_FLASH_ATTN_EXT_VEC_REDUCE 500
#define FC_MUL_MV 600
#define FC_MUL_MM 700
#define FC_ROPE 800
// op-specific constants
#define OP_FLASH_ATTN_EXT_NQPTG 8

View File

@@ -1036,11 +1036,6 @@ int ggml_metal_op_set_rows(ggml_metal_op_t ctx, int idx) {
nth = std::min(nth, nk0);
if (nth*nrptg > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
nth = ggml_metal_pipeline_max_theads_per_threadgroup(pipeline);
nrptg = 1;
}
ggml_metal_kargs_set_rows args = {
/*.nk0 =*/ nk0,
/*.ne01 =*/ ne01,

Some files were not shown because too many files have changed in this diff Show More