Compare commits


2 Commits

Author           SHA1        Message                         Date
Georgi Gerganov  35df147d80  cont : remove /api/tags         2026-04-20 15:45:42 +03:00
Georgi Gerganov  c1891fd6eb  server : remove /api endpoints  2026-04-20 15:34:18 +03:00
404 changed files with 16719 additions and 35103 deletions

View File

@@ -1,4 +1,4 @@
ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
ARG ONEAPI_VERSION=2025.3.2-0-devel-ubuntu24.04
## Build Image

View File

@@ -2,19 +2,7 @@ ARG OPENVINO_VERSION_MAJOR=2026.0
ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
ARG UBUNTU_VERSION=24.04
# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
ARG IGC_VERSION=v2.30.1
ARG IGC_VERSION_FULL=2_2.30.1+20950
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
ARG IGDGMM_VERSION=22.9.0
# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
ARG NPU_DRIVER_VERSION=v1.32.0
ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947
ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
# Optional proxy build arguments
# Optional proxy build arguments - empty by default
ARG http_proxy=
ARG https_proxy=
@@ -90,47 +78,13 @@ ARG http_proxy
ARG https_proxy
RUN apt-get update \
&& apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
&& apt-get install -y libgomp1 libtbb12 curl \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
# Install GPU drivers
ARG IGC_VERSION
ARG IGC_VERSION_FULL
ARG COMPUTE_RUNTIME_VERSION
ARG COMPUTE_RUNTIME_VERSION_FULL
ARG IGDGMM_VERSION
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& dpkg --install *.deb \
&& rm -rf /tmp/neo/
# Install NPU drivers
ARG NPU_DRIVER_VERSION
ARG NPU_DRIVER_FULL
ARG LIBZE1_VERSION
RUN mkdir /tmp/npu/ && cd /tmp/npu/ \
&& wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
&& tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
&& dpkg --install *.deb \
&& rm -rf /tmp/npu/
RUN cd /tmp \
&& wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \
&& dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \
&& rm libze1_${LIBZE1_VERSION}_amd64.deb
COPY --from=build /app/lib/ /app/
### Full (all binaries)

View File

@@ -6,7 +6,7 @@
<!-- You can provide more details and link related discussions here. Delete this section if not applicable -->
## Requirements
# Requirements
<!-- IMPORTANT: Please do NOT delete this section, otherwise your PR may be rejected -->

View File

@@ -1,116 +0,0 @@
name: CI (snapdragon)
on:
workflow_dispatch:
push:
branches:
- master
paths:
- '.github/workflows/build-and-test-snapdragon.yml'
- 'ggml/include/ggml-hexagon.h'
- 'ggml/src/ggml-hexagon/**'
- 'docs/backend/snapdragon/**'
- 'scripts/snapdragon/**'
- 'CMakePresets.json'
pull_request:
types: [opened, synchronize, reopened]
paths:
- '.github/workflows/build-and-test-snapdragon.yml'
- 'ggml/include/ggml-hexagon.h'
- 'ggml/src/ggml-hexagon/**'
- 'docs/backend/snapdragon/**'
- 'scripts/snapdragon/**'
- 'CMakePresets.json'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs:
android-ndk-snapdragon:
runs-on: ubuntu-latest
container:
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
defaults:
run:
shell: bash
steps:
- name: Clone
uses: actions/checkout@v6
with:
fetch-depth: 0
lfs: false
- name: Build Llama.CPP for Snapdragon Android
id: build_llama_cpp_snapdragon_android
run: |
cp docs/backend/snapdragon/CMakeUserPresets.json .
cmake --preset arm64-android-snapdragon-release -B build
cmake --build build
cmake --install build --prefix pkg-snapdragon/llama.cpp
- name: Upload Llama.CPP Snapdragon Android Build Artifact
if: ${{ always() && steps.build_llama_cpp_snapdragon_android.outcome == 'success' }}
uses: actions/upload-artifact@v6
with:
name: llama-cpp-android-arm64-snapdragon
path: pkg-snapdragon/llama.cpp
test-snapdragon-qdc:
name: Test on QDC Android Device (${{ matrix.device }})
needs: [android-ndk-snapdragon]
runs-on: ubuntu-slim
strategy:
fail-fast: false
matrix:
device: [SM8750, SM8650, SM8850]
steps:
- name: Checkout
uses: actions/checkout@v6
- name: Download build artifact
uses: actions/download-artifact@v7
with:
name: llama-cpp-android-arm64-snapdragon
path: pkg-snapdragon/llama.cpp
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.x'
cache: pip
- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y curl unzip
- name: Install QDC SDK wheel
run: |
curl -fSL -o qdc_sdk.zip https://softwarecenter.qualcomm.com/api/download/software/tools/Qualcomm_Device_Cloud_SDK/All/0.2.3/qualcomm_device_cloud_sdk-0.2.3.zip
unzip qdc_sdk.zip -d qdc_sdk
pip install qdc_sdk/qualcomm_device_cloud_sdk-0.2.3-py3-none-any.whl
- name: Check QDC API key
id: check_secret
env:
QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
run: echo "has-qdc-key=${{ env.QDC_API_KEY != '' }}" >> "$GITHUB_OUTPUT"
- name: Run QDC tests (${{ matrix.device }})
if: steps.check_secret.outputs.has-qdc-key == 'true'
run: |
python scripts/snapdragon/qdc/run_qdc_jobs.py \
--test all \
--pkg-dir pkg-snapdragon/llama.cpp \
--model-url "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf" \
--device ${{ matrix.device }}
env:
QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
- name: Cleanup
if: always()
run: rm -rf pkg-snapdragon qdc_sdk qdc_sdk.zip

View File

@@ -1,24 +1,26 @@
name: CI (android)
on:
workflow_dispatch:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths:
- '.github/workflows/build-android.yml'
- '**/CMakeLists.txt'
- '**/.cmake'
- '**/*.h'
- '**/*.hpp'
- '**/*.c'
- '**/*.cpp'
paths: [
'.github/workflows/build-android.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp'
]
pull_request:
types: [opened, synchronize, reopened]
paths:
- '.github/workflows/build-android.yml'
- 'examples/llama.android/**'
paths: [
'.github/workflows/build-android.yml',
'examples/llama.android/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -65,24 +67,35 @@ jobs:
defaults:
run:
shell: bash
strategy:
matrix:
include:
- build: 'arm64-cpu'
defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
- build: 'arm64-snapdragon'
defines: '--preset arm64-android-snapdragon-release'
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
lfs: false
- name: Build
id: ndk_build
- name: Build Llama.CPP for Hexagon Android
id: build_llama_cpp_hexagon_android
run: |
cmake -D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF -B build
if [[ "${{ matrix.build }}" == "arm64-snapdragon" ]]; then
cp docs/backend/snapdragon/CMakeUserPresets.json .
fi
cmake ${{ matrix.defines }} -B build
cmake --build build
cmake --install build --prefix pkg-adb/llama.cpp
- name: Upload Android Build Artifact
if: ${{ always() && steps.ndk_build.outcome == 'success' }}
- name: Upload Llama.CPP Hexagon Android Build Artifact
if: ${{ always() && steps.build_llama_cpp_hexagon_android.outcome == 'success' }}
uses: actions/upload-artifact@v6
with:
name: llama-cpp-android-arm64-cpu
name: llama-cpp-android-${{ matrix.build }}
path: pkg-adb/llama.cpp

View File

@@ -1,120 +0,0 @@
name: CI (openvino)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-openvino.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-openvino.yml',
'ggml/src/ggml-openvino/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
ubuntu-24-openvino:
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
concurrency:
group: openvino-${{ matrix.variant }}-${{ github.head_ref || github.ref }}
cancel-in-progress: false
strategy:
matrix:
include:
- variant: cpu
runner: '"ubuntu-24.04"'
openvino_device: "CPU"
- variant: gpu
runner: '["self-hosted","Linux","Intel","OpenVINO"]'
openvino_device: "GPU"
runs-on: ${{ fromJSON(matrix.runner) }}
env:
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
if: runner.environment == 'github-hosted'
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
- name: Use OpenVINO Toolkit Cache
if: runner.environment == 'github-hosted'
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
- name: Install OpenVINO dependencies
run: |
cd ./openvino_toolkit
chmod +x ./install_dependencies/install_openvino_dependencies.sh
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
- name: Build
id: cmake_build
run: |
source ./openvino_toolkit/setupvars.sh
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
time cmake --build build/ReleaseOV --config Release -j $(nproc)
- name: Test
id: cmake_test
# TODO: fix and re-enable the `test-llama-archs` test below
run: |
cd ${{ github.workspace }}
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
export GGML_OPENVINO_DEVICE=GPU
fi
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000

View File

@@ -265,10 +265,6 @@ jobs:
ggml-ci-intel-openvino-gpu-low-perf:
runs-on: [self-hosted, Linux, Intel, OpenVINO]
concurrency:
group: openvino-gpu-${{ github.head_ref || github.ref }}
cancel-in-progress: false
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"

View File

@@ -1,142 +0,0 @@
name: CI (sycl)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-sycl.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp'
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-sycl.yml',
'ggml/src/ggml-sycl/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
ubuntu-24-sycl:
strategy:
matrix:
build: [fp32, fp16]
include:
- build: fp32
fp16: OFF
- build: fp16
fp16: ON
runs-on: ubuntu-24.04
env:
ONEAPI_ROOT: /opt/intel/oneapi/
ONEAPI_INSTALLER_VERSION: "2025.3.3"
continue-on-error: true
steps:
- uses: actions/checkout@v6
- name: Use oneAPI Installation Cache
uses: actions/cache@v5
id: cache-sycl
with:
path: ${{ env.ONEAPI_ROOT }}
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
- name: Download & Install oneAPI
shell: bash
if: steps.cache-sycl.outputs.cache-hit != 'true'
run: |
cd /tmp
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-sycl-${{ matrix.build }}
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DLLAMA_OPENSSL=OFF \
-DGGML_NATIVE=OFF \
-DGGML_SYCL_F16=${{ matrix.fp16 }}
time cmake --build build --config Release -j $(nproc)
windows-latest-sycl:
runs-on: windows-2022
defaults:
run:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
ONEAPI_INSTALLER_VERSION: "2025.3.3"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Use oneAPI Installation Cache
uses: actions/cache@v5
id: cache-sycl
with:
path: ${{ env.ONEAPI_ROOT }}
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
- name: Download & Install oneAPI
shell: bash
if: steps.cache-sycl.outputs.cache-hit != 'true'
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: windows-latest-sycl
variant: ccache
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
- name: Build
id: cmake_build
run: examples/sycl/win-build-sycl.bat

View File

@@ -555,6 +555,186 @@ jobs:
-DGGML_MUSA=ON
time cmake --build build --config Release -j $(nproc)
ubuntu-22-sycl:
runs-on: ubuntu-22.04
continue-on-error: true
steps:
- uses: actions/checkout@v6
- name: add oneAPI to apt
shell: bash
run: |
cd /tmp
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
- name: install oneAPI dpcpp compiler
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
- name: install oneAPI MKL library
shell: bash
run: |
sudo apt install intel-oneapi-mkl-devel
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-22-sycl
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx
time cmake --build build --config Release -j $(nproc)
ubuntu-22-sycl-fp16:
runs-on: ubuntu-22.04
continue-on-error: true
steps:
- uses: actions/checkout@v6
- name: add oneAPI to apt
shell: bash
run: |
cd /tmp
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
- name: install oneAPI dpcpp compiler
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev ninja-build
- name: install oneAPI MKL library
shell: bash
run: |
sudo apt install intel-oneapi-mkl-devel
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-22-sycl-fp16
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DGGML_SYCL_F16=ON
time cmake --build build --config Release -j $(nproc)
ubuntu-24-openvino:
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
strategy:
matrix:
include:
- variant: cpu
runner: '"ubuntu-24.04"'
openvino_device: "CPU"
- variant: gpu
runner: '["self-hosted","Linux","X64","Intel"]'
openvino_device: "GPU"
runs-on: ${{ fromJSON(matrix.runner) }}
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
if: runner.environment == 'github-hosted'
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
- name: Use OpenVINO Toolkit Cache
if: runner.environment == 'github-hosted'
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
- name: Install OpenVINO dependencies
run: |
cd ./openvino_toolkit
chmod +x ./install_dependencies/install_openvino_dependencies.sh
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
- name: Build
id: cmake_build
run: |
source ./openvino_toolkit/setupvars.sh
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
time cmake --build build/ReleaseOV --config Release -j $(nproc)
- name: Test
id: cmake_test
# TODO: fix and re-enable the `test-llama-archs` test below
run: |
cd ${{ github.workspace }}
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
export GGML_OPENVINO_DEVICE=GPU
fi
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
windows-latest:
runs-on: windows-2025
@@ -763,6 +943,39 @@ jobs:
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
windows-latest-sycl:
runs-on: windows-2022
defaults:
run:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: windows-latest-sycl
variant: ccache
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Install
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
- name: Build
id: cmake_build
run: examples/sycl/win-build-sycl.bat
windows-latest-hip:
runs-on: windows-2022

View File

@@ -31,7 +31,7 @@ jobs:
uses: actions/setup-python@v6
with:
python-version: "3.11"
pip-install: -r requirements/requirements-all.txt ty==0.0.33
pip-install: -r requirements/requirements-all.txt ty==0.0.26
# - name: Type-check with Pyright
# uses: jakebailey/pyright-action@v2
# with:

View File

@@ -598,29 +598,15 @@ jobs:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
ONEAPI_INSTALLER_VERSION: "2025.3.3"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Use oneAPI Installation Cache
uses: actions/cache@v5
id: cache-sycl
with:
path: ${{ env.ONEAPI_ROOT }}
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
- name: Download & Install oneAPI
shell: bash
if: steps.cache-sycl.outputs.cache-hit != 'true'
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
@@ -628,6 +614,10 @@ jobs:
variant: ccache
evict-old-files: 1d
- name: Install
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- name: Build
id: cmake_build
shell: cmd
@@ -680,82 +670,6 @@ jobs:
path: llama-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip
ubuntu-24-sycl:
strategy:
matrix:
build: [fp32, fp16]
include:
- build: fp32
fp16: OFF
- build: fp16
fp16: ON
runs-on: ubuntu-24.04
env:
ONEAPI_ROOT: /opt/intel/oneapi/
ONEAPI_INSTALLER_VERSION: "2025.3.3"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Use oneAPI Installation Cache
uses: actions/cache@v5
id: cache-sycl
with:
path: ${{ env.ONEAPI_ROOT }}
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
- name: Download & Install oneAPI
shell: bash
if: steps.cache-sycl.outputs.cache-hit != 'true'
run: |
cd /tmp
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-sycl-${{ matrix.build }}
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DLLAMA_OPENSSL=OFF \
-DGGML_NATIVE=OFF \
-DGGML_SYCL_F16=${{ matrix.fp16 }}
time cmake --build build --config Release -j $(nproc)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
ubuntu-22-rocm:
runs-on: ubuntu-22.04
@@ -1131,7 +1045,6 @@ jobs:
- ubuntu-cpu
- ubuntu-vulkan
- ubuntu-24-openvino
- ubuntu-24-sycl
- android-arm64
- macOS-cpu
- ios-xcode-build
@@ -1220,8 +1133,6 @@ jobs:
- [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
- [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
- [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)
**Android:**
- [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)

.gitignore
View File

@@ -34,6 +34,7 @@
/.vscode/
/nppBackup
# Coverage
/gcovr-report/
@@ -73,7 +74,6 @@
!/models/templates
# Zig
/zig-out/
/zig-cache/
@@ -93,7 +93,6 @@
!/examples/sycl/*.sh
# Server Web UI temporary files
/tools/server/webui/node_modules
/tools/server/webui/dist
# we no longer use gz for index.html
@@ -107,11 +106,9 @@ __pycache__/
poetry.toml
# Nix
/result
# Test binaries
/tests/test-backend-ops
/tests/test-double-float
/tests/test-grad0
@@ -127,7 +124,6 @@ poetry.toml
/tests/test-tokenizer-1-spm
# Scripts
!/scripts/install-oneapi.bat
# Generated by scripts
@@ -136,24 +132,16 @@ poetry.toml
/wikitext-2-raw/
# Test models for lora adapters
/lora-tests
# Local scripts
/run-vim.sh
/run-chat.sh
/run-spec.sh
/.ccache/
# IDE
/*.code-workspace
/.windsurf/
# emscripten
a.out.*
# AGENTS
AGENTS.local.md
.pi/SYSTEM.md

View File

@@ -1,33 +0,0 @@
You are a coding agent. Here are some very important rules that you must follow:
General:
- By very precise and concise when writing code, comments, explanations, etc.
- PR and commit titles format: `<module> : <title>`. Lookup recents for examples
- Don't try to build or run the code unless you are explicitly asked to do so
Coding:
- When in doubt, always refer to the CONTRIBUTING.md file of the project
- When referencing issues or PRs in comments, use the format:
- C/C++ code: `// ref: <url>`
- Other (CMake, etc.): `# ref: <url>`
Pull requests (PRs):
- New branch names are prefixed with "gg/"
- Before opening a pull request, ask the user to confirm the description
- When creating a pull request, look for the repository's PR template and follow it
- For the AI usage disclosure section, write "YES. llama.cpp + pi"
- Always create the pull requests in draft mode
Commits:
- On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag
- Do not explicitly set the git author in commits - rely on the default git config
Resources (read on demand):
- [CONTRIBUTING.md](CONTRIBUTING.md)
- [Build documentation](docs/build.md)
- [Server usage documentation](tools/server/README.md)
- [Server development documentation](tools/server/README-dev.md)
- [PEG parser](docs/development/parsing.md)
- [Auto parser](docs/autoparser.md)
- [Jinja engine](common/jinja/README.md)
- [PR template](.github/pull_request_template.md)

View File

@@ -23,7 +23,6 @@
/ci/ @ggerganov
/cmake/ @ggerganov
/common/ @ggml-org/llama-common
/common/fit.* @JohannesGaessler
/common/jinja/ @CISC
/common/ngram-map.* @srogmann
/convert_*.py @CISC
@@ -53,29 +52,28 @@
/examples/speculative/ @ggerganov
/ggml/cmake/ @ggerganov
/ggml/include/ @ggerganov
/ggml/src/ggml-backend-meta.cpp @JohannesGaessler
/ggml/src/ggml-cann/ @ggml-org/ggml-cann
/ggml/src/ggml-common.h @ggerganov
/ggml/src/ggml-cpu/ @ggerganov
/ggml/src/ggml-cpu/spacemit/ @alex-spacemit
/ggml/src/ggml-cuda/ @ggml-org/ggml-cuda
/ggml/src/ggml-cuda/vendors/hip.h @IMbackK
/ggml/src/ggml-cuda/fattn-wmma* @IMbackK
/ggml/src/ggml-hexagon/ @ggml-org/ggml-hexagon
/ggml/src/ggml-hip/ @IMbackK
/ggml/src/ggml-cuda/vendors/hip.h @IMbackK
/ggml/src/ggml-impl.h @ggerganov
/ggml/src/ggml-metal/ @ggml-org/ggml-metal
/ggml/src/ggml-opencl/ @ggml-org/ggml-opencl
/ggml/src/ggml-openvino/ @cavusmustafa @wine99
/ggml/src/ggml-hexagon/ @ggml-org/ggml-hexagon
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/ggml-quants.* @ggerganov
/ggml/src/ggml-rpc/ @ggml-org/ggml-rpc
/ggml/src/ggml-sycl/ @ggml-org/ggml-sycl
/ggml/src/ggml-threading.* @ggerganov
/ggml/src/ggml-virtgpu/ @kpouget
/ggml/src/ggml-vulkan/ @ggml-org/ggml-vulkan
/ggml/src/ggml-virtgpu/ @kpouget
/ggml/src/ggml-webgpu/ @ggml-org/ggml-webgpu
/ggml/src/ggml-zdnn/ @ggml-org/ggml-zdnn @Andreas-Krebbel @AlekseiNikiforovIBM
/ggml/src/ggml-openvino/ @cavusmustafa @wine99
/ggml/src/ggml.c @ggerganov
/ggml/src/ggml.cpp @ggerganov
/ggml/src/gguf.cpp @JohannesGaessler @Green-Sky

View File

@@ -73,8 +73,6 @@ add_library(${TARGET}
debug.h
download.cpp
download.h
fit.cpp
fit.h
hf-cache.cpp
hf-cache.h
http.h

File diff suppressed because it is too large

View File

@@ -25,8 +25,7 @@ struct common_arg {
const char * value_hint_2 = nullptr; // for second arg value
const char * env = nullptr;
std::string help;
bool is_sampling = false; // is current arg a sampling param?
bool is_spec = false; // is current arg a speculative decoding param?
bool is_sparam = false; // is current arg a sampling param?
bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
void (*handler_void) (common_params & params) = nullptr;
void (*handler_string) (common_params & params, const std::string &) = nullptr;
@@ -75,8 +74,7 @@ struct common_arg {
common_arg & set_examples(std::initializer_list<enum llama_example> examples);
common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
common_arg & set_env(const char * env);
common_arg & set_sampling();
common_arg & set_spec();
common_arg & set_sparam();
common_arg & set_preset_only();
bool in_example(enum llama_example ex);
bool is_exclude(enum llama_example ex);

View File

@@ -296,7 +296,7 @@ void analyze_reasoning::compare_reasoning_presence() {
return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())) + p.rest());
});
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.tag("post", (p.space() + p.marker() + p.space())) + p.rest();
return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
});
// try the more aggressive parse first, if it fails, fall back to the delimiter one
auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
@@ -306,11 +306,11 @@ void analyze_reasoning::compare_reasoning_presence() {
if (result.result.success()) {
if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
mode = reasoning_mode::TAG_BASED;
start = result.tags["pre"];
end = result.tags["post"];
start = trim_leading_whitespace(result.tags["pre"]);
end = trim_trailing_whitespace(result.tags["post"]);
} else if (!result.tags["post"].empty()) {
mode = reasoning_mode::TAG_BASED;
end = result.tags["post"];
end = trim_trailing_whitespace(result.tags["post"]);
}
}
}

View File

@@ -397,25 +397,6 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
return render_message_to_json(msgs, c);
}
json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
if (tools.empty()) {
return json();
}
auto result = json::array();
for (const auto & tool : tools) {
result.push_back({
{ "type", "function" },
{ "function", {
{ "name", tool.name },
{ "description", tool.description },
{ "parameters", json::parse(tool.parameters) },
}},
});
}
return result;
}
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & tools) {
std::vector<common_chat_tool> result;
@@ -451,6 +432,56 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
return result;
}
json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
if (tools.empty()) {
return json();
}
auto result = json::array();
for (const auto & tool : tools) {
result.push_back({
{ "type", "function" },
{ "function",
{
{ "name", tool.name },
{ "description", tool.description },
{ "parameters", json::parse(tool.parameters) },
} },
});
}
return result;
}
json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
json delta = json::object();
if (!diff.reasoning_content_delta.empty()) {
delta["reasoning_content"] = diff.reasoning_content_delta;
}
if (!diff.content_delta.empty()) {
delta["content"] = diff.content_delta;
}
if (diff.tool_call_index != std::string::npos) {
json tool_call;
tool_call["index"] = diff.tool_call_index;
if (!diff.tool_call_delta.id.empty()) {
tool_call["id"] = diff.tool_call_delta.id;
tool_call["type"] = "function";
}
if (!diff.tool_call_delta.name.empty() || !diff.tool_call_delta.arguments.empty()) {
json function = json::object();
if (!diff.tool_call_delta.name.empty()) {
function["name"] = diff.tool_call_delta.name;
}
if (!diff.tool_call_delta.arguments.empty()) {
function["arguments"] = diff.tool_call_delta.arguments;
}
tool_call["function"] = function;
}
delta["tool_calls"] = json::array({ tool_call });
}
return delta;
}
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
if (use_jinja) {
try {
@@ -544,26 +575,6 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
return tmpls->has_explicit_template;
}
// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
static bool is_lfm2_template(const std::string & src) {
return src.find("<|tool_list_start|>") != std::string::npos &&
src.find("<|tool_list_end|>") != std::string::npos;
}
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates) {
common_chat_prompt_preset asr_preset;
asr_preset.system = "";
asr_preset.user = "Transcribe audio to text";
if (chat_templates && chat_templates->template_default && is_lfm2_template(chat_templates->template_default->source())) {
asr_preset.system = "Perform ASR.";
asr_preset.user = "";
}
return asr_preset;
}
std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
if (!variant.empty()) {
if (variant == "tool_use") {
@@ -2073,7 +2084,10 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
return common_chat_params_init_kimi_k2(tmpl, params);
}
if (is_lfm2_template(src)) {
// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
if (src.find("<|tool_list_start|>") != std::string::npos &&
src.find("<|tool_list_end|>") != std::string::npos) {
LOG_DBG("Using specialized template: LFM2\n");
return common_chat_params_init_lfm2(tmpl, params);
}
@@ -2382,3 +2396,4 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem
GGML_ASSERT(chat_templates->template_default != nullptr);
return chat_templates->template_default->caps.to_map();
}
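For reference, a minimal sketch of how the relocated common_chat_msg_diff_to_json_oaicompat helper above could be exercised. The field names follow this diff; the main() scaffolding, the "chat.h" include path, and the example values are assumptions for illustration only:

// Illustrative only: build a hypothetical streaming tool-call delta and print
// the OpenAI-compatible JSON produced by common_chat_msg_diff_to_json_oaicompat.
#include "chat.h"   // assumed location of common_chat_msg_diff and the helper
#include <iostream>

int main() {
    common_chat_msg_diff diff;
    diff.tool_call_index           = 0;              // first tool call of the message
    diff.tool_call_delta.id        = "call_0";       // illustrative id
    diff.tool_call_delta.name      = "get_weather";  // illustrative function name
    diff.tool_call_delta.arguments = "{\"city\":\""; // partial argument chunk

    // Per the implementation above, this prints something like:
    // {"tool_calls":[{"index":0,"id":"call_0","type":"function",
    //                 "function":{"name":"get_weather","arguments":"{\"city\":\""}}]}
    std::cout << common_chat_msg_diff_to_json_oaicompat(diff).dump() << std::endl;
    return 0;
}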

View File

@@ -256,13 +256,14 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates *
// Parses a JSON array of messages in OpenAI's chat completion API format.
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
// DEPRECATED: only used in tests
nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
nlohmann::ordered_json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
nlohmann::ordered_json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
// get template caps, useful for reporting to server /props endpoint
std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates);
@@ -274,11 +275,3 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
const common_chat_template & tmpl,
const std::string & src,
autoparser::generation_params & params);
// specialized per-task preset
struct common_chat_prompt_preset {
std::string system;
std::string user;
};
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);

View File

@@ -3,7 +3,6 @@
#include "build-info.h"
#include "common.h"
#include "fit.h"
#include "log.h"
#include "llama.h"
#include "sampling.h"
@@ -70,7 +69,7 @@ common_time_meas::~common_time_meas() {
// CPU utils
//
int32_t common_cpu_get_num_physical_cores() {
int32_t cpu_get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
std::unordered_set<std::string> siblings;
@@ -185,11 +184,11 @@ static int cpu_count_math_cpus(int n_cpu) {
/**
* Returns number of CPUs on system that are useful for math.
*/
int32_t common_cpu_get_num_math() {
int32_t cpu_get_num_math() {
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
if (n_cpu < 1) {
return common_cpu_get_num_physical_cores();
return cpu_get_num_physical_cores();
}
if (is_hybrid_cpu()) {
cpu_set_t affinity;
@@ -202,7 +201,7 @@ int32_t common_cpu_get_num_math() {
}
}
#endif
return common_cpu_get_num_physical_cores();
return cpu_get_num_physical_cores();
}
// Helper for setting process priority
@@ -263,7 +262,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
//
void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model) {
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
int32_t n_set = 0;
if (cpuparams.n_threads < 0) {
@@ -271,7 +270,7 @@ void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_para
if (role_model != nullptr) {
cpuparams = *role_model;
} else {
cpuparams.n_threads = common_cpu_get_num_math();
cpuparams.n_threads = cpu_get_num_math();
}
}
@@ -1148,7 +1147,7 @@ common_init_result::common_init_result(common_params & params) :
if (params.fit_params) {
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
common_fit_params(params.model.path.c_str(), &mparams, &cparams,
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
params.tensor_split,
params.tensor_buft_overrides.data(),
params.fit_params_target.data(),
@@ -1521,7 +1520,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
return cparams;
}
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params) {
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
struct ggml_threadpool_params tpp;
ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults

View File

@@ -54,7 +54,7 @@ struct common_control_vector_load_info;
// CPU utils
//
struct common_cpu_params {
struct cpu_params {
int n_threads = -1;
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
bool mask_valid = false; // Default: any CPU
@@ -63,8 +63,8 @@ struct common_cpu_params {
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
};
int32_t common_cpu_get_num_physical_cores();
int32_t common_cpu_get_num_math();
int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();
//
// Common params
@@ -274,7 +274,6 @@ struct common_params_sampling {
std::vector<llama_token> reasoning_budget_start; // start tag token sequence
std::vector<llama_token> reasoning_budget_end; // end tag token sequence
std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
bool backend_sampling = false;
@@ -297,19 +296,34 @@ struct common_params_model {
struct common_ngram_mod;
// draft-model-based speculative decoding parameters
struct common_params_speculative_draft {
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
struct common_params_speculative {
common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
// general-purpose speculative decoding parameters
common_params_model mparams;
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
llama_model * model = nullptr; // a llama_model that can be shared by multiple speculative contexts
// ngram-based speculative decoding
llama_context_params cparams; // these are the parameters for the draft llama_context
uint16_t ngram_size_n = 12; // ngram size for lookup
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
std::shared_ptr<common_ngram_mod> ngram_mod;
std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT
// draft-model speculative decoding
struct common_params_model mparams_dft;
llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts
llama_context_params cparams_dft; // these are the parameters for the draft llama_context
int32_t n_ctx = 0; // draft context size
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
@@ -317,60 +331,25 @@ struct common_params_speculative_draft {
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
common_cpu_params cpuparams;
common_cpu_params cpuparams_batch;
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
};
struct common_params_speculative_ngram_mod {
int32_t n_match = 24;
int32_t n_max = 64;
int32_t n_min = 48;
// shared instance of the ngram container for all speculative decoding contexts
std::shared_ptr<common_ngram_mod> obj;
};
struct common_params_speculative_ngram_map {
uint16_t size_n = 12; // ngram size for lookup
uint16_t size_m = 48; // mgram size for speculative tokens
uint16_t min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
};
struct common_params_speculative_ngram_cache {
std::string lookup_cache_static; // path of static ngram cache file for lookup decoding
std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding
};
struct common_params_speculative {
// TODO: become a vector in order to support "chains of speculators"
common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE;
common_params_speculative_draft draft;
common_params_speculative_ngram_mod ngram_mod;
common_params_speculative_ngram_map ngram_simple;
common_params_speculative_ngram_map ngram_map_k;
common_params_speculative_ngram_map ngram_map_k4v;
common_params_speculative_ngram_cache ngram_cache;
bool has_dft() const {
return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty();
return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
}
};
struct common_params_vocoder {
struct common_params_model model;
std::string speaker_file; // speaker file path
std::string speaker_file = ""; // speaker file path // NOLINT
bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy
bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
};
struct common_params_diffusion {
@@ -441,20 +420,19 @@ struct common_params {
// offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
bool fit_params_print = false; // print the estimated required memory to run the model
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
// margin per device in bytes for fitting parameters to free memory:
std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
common_cpu_params cpuparams;
common_cpu_params cpuparams_batch;
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;
@@ -602,6 +580,8 @@ struct common_params {
bool force_pure_content_parser = false;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
int reasoning_budget = -1;
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time
@@ -698,7 +678,7 @@ std::string common_params_get_system_info(const common_params & params);
bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model = nullptr);
void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
bool set_process_priority(enum ggml_sched_priority prio);
//
@@ -766,11 +746,6 @@ inline bool string_starts_with(std::string_view str, std::string_view prefix) {
str.compare(0, prefix.size(), prefix) == 0;
}
// remove when moving to c++20
inline bool string_starts_with(std::string_view str, char prefix) {
return !str.empty() && str.front() == prefix;
}
// remove when moving to c++20
inline bool string_ends_with(std::string_view str, std::string_view suffix) {
return str.size() >= suffix.size() &&
@@ -866,7 +841,7 @@ common_init_result_ptr common_init_from_params(common_params & params);
struct llama_model_params common_model_params_to_llama ( common_params & params);
struct llama_context_params common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
// clear LoRA adapters from context, then apply new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
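With the flattened common_params_speculative layout above, draft-model settings live directly on the struct instead of the removed draft sub-struct. A minimal sketch, assuming a common_params instance exposes a speculative member and using the field names shown in this diff; the model path and numeric values are illustrative, and the appropriate common_speculative_type value for a draft model is not shown in this diff:

// Illustrative only: configure draft-model speculative decoding via the
// flattened common_params_speculative struct from this diff.
common_params params;

params.speculative.mparams_dft.path = "models/draft-q4_0.gguf"; // illustrative path
params.speculative.n_max            = 16;    // max tokens drafted per step
params.speculative.n_min            = 0;     // min draft tokens to attempt speculation
params.speculative.p_min            = 0.75f; // min probability for greedy acceptance

// has_dft() now inspects mparams_dft directly instead of draft.mparams:
if (params.speculative.has_dft()) {
    // ... create the draft context from params.speculative.cparams_dft ...
}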

View File

@@ -1,38 +1,9 @@
#include "debug.h"
#include "common.h"
#include "log.h"
#include <cmath>
#include <regex>
#include <string>
#include <vector>
struct common_debug_cb_user_data::impl {
std::vector<uint8_t> data;
std::vector<std::regex> tensor_filters;
bool abort_on_nan{false};
};
common_debug_cb_user_data::common_debug_cb_user_data() : pimpl(std::make_unique<impl>()) {}
common_debug_cb_user_data::~common_debug_cb_user_data() = default;
common_debug_cb_user_data::common_debug_cb_user_data(common_params & params, const std::vector<std::string> & filter_patterns, bool abort_on_nan)
: pimpl(std::make_unique<impl>())
{
for (const auto & pattern : filter_patterns) {
try {
std::string anchored_pattern = "^" + pattern;
pimpl->tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
} catch (const std::regex_error & e) {
throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
}
}
pimpl->abort_on_nan = abort_on_nan;
params.cb_eval = common_debug_cb_eval;
params.cb_eval_user_data = this;
}
static std::string common_ggml_ne_string(const ggml_tensor * t) {
std::string str;
@@ -76,7 +47,8 @@ static float common_ggml_get_float_value(const uint8_t * data,
#define INDENT " "
static void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n, bool abort_on_nan) {
template <bool abort>
void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
GGML_ASSERT(n > 0);
float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
@@ -122,7 +94,7 @@ static void common_debug_print_tensor(uint8_t * data, ggml_type type, const int6
LOG(INDENT "sum = %f\n", sum);
}
if (abort_on_nan) {
if constexpr (abort) {
if (std::isnan(sum)) {
LOG("encountered NaN - aborting\n");
exit(0);
@@ -140,9 +112,8 @@ static void common_debug_print_tensor(uint8_t * data, ggml_type type, const int6
* @param user_data user data to pass at each call back
* @return true to receive data or continue the graph, false otherwise
*/
bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (common_debug_cb_user_data *) user_data;
auto * pimpl = cb_data->pimpl.get();
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (base_callback_data *) user_data;
const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];
@@ -151,10 +122,10 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
return true; // Always retrieve data
}
bool matches_filter = pimpl->tensor_filters.empty();
bool matches_filter = cb_data->tensor_filters.empty();
if (!matches_filter) {
for (const auto & filter : pimpl->tensor_filters) {
for (const auto & filter : cb_data->tensor_filters) {
if (std::regex_search(t->name, filter)) {
matches_filter = true;
break;
@@ -177,14 +148,20 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
if (!is_host) {
auto n_bytes = ggml_nbytes(t);
pimpl->data.resize(n_bytes);
ggml_backend_tensor_get(t, pimpl->data.data(), 0, n_bytes);
cb_data->data.resize(n_bytes);
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
}
if (!ggml_is_quantized(t->type) && matches_filter) {
uint8_t * data = is_host ? (uint8_t *) t->data : pimpl->data.data();
common_debug_print_tensor(data, t->type, t->ne, t->nb, 3, pimpl->abort_on_nan);
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
}
return true;
}
// Explicit template instantiations
template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);

View File

@@ -1,31 +1,43 @@
#pragma once
#include <memory>
#include "common.h"
#include <string>
#include <vector>
#include <regex>
// common debug functions and structs
struct common_params;
// Print a tensor's detailed data
// data - the tensor's data in byte format
// type - the tensor's quantization type
// ne - the tensor dimensions array
// nb - the tensor strides array
// n - the number of rows/columns to fully print
template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);
// Intended to use as callback for ggml_backend_sched_eval_callback
// prints tensors that are processed in the computation graph
// by default prints all tensors, but can be configured by creating a `common_debug_cb_user_data` instance with
// non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
// `common_debug_cb_user_data` contains `abort_on_nan` flag that determines whether an error should be thrown whenever a NaN is encountered
// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
// non-empty filter_patterns. See examples/debug.ccp for possible usage patterns
// The template parameter determines whether an error should be thrown whenever a NaN is encountered
// in a tensor (useful for stopping debug sessions on first erroneous tensor)
// The callback data will be passed as the third parameter (user_data)
bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
struct base_callback_data {
std::vector<uint8_t> data;
std::vector<std::regex> tensor_filters;
struct common_debug_cb_user_data {
struct impl;
std::unique_ptr<impl> pimpl;
base_callback_data() = default;
common_debug_cb_user_data();
~common_debug_cb_user_data();
common_debug_cb_user_data(const common_debug_cb_user_data &) = delete;
common_debug_cb_user_data & operator=(const common_debug_cb_user_data &) = delete;
common_debug_cb_user_data(common_params & params, const std::vector<std::string> & filter_patterns, bool abort_on_nan = false);
base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
for (const auto & pattern : filter_patterns) {
try {
std::string anchored_pattern = "^" + pattern;
tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
} catch (const std::regex_error & e) {
throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
}
}
params.cb_eval = common_debug_cb_eval<false>;
params.cb_eval_user_data = this;
}
};
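A minimal usage sketch for the header-only base_callback_data shown above. The common_params instance and the filter pattern are illustrative; the wiring mirrors what the constructor in this diff already does (install common_debug_cb_eval<false> and point cb_eval_user_data at the object):

// Illustrative only: print every tensor whose name starts with "ffn_" during
// graph evaluation, using the templated debug callback from this diff.
#include "debug.h"

static void enable_ffn_debugging(common_params & params) {
    // The constructor anchors each pattern with '^' and installs
    // common_debug_cb_eval<false> plus cb_eval_user_data for us.
    static base_callback_data cb_data(params, { "ffn_" });

    // To abort on the first NaN instead, swap in the <true> instantiation:
    params.cb_eval           = common_debug_cb_eval<true>;
    params.cb_eval_user_data = &cb_data;
}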

View File

@@ -627,7 +627,7 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
if (!tag.empty()) {
tags.push_back(tag);
} else {
tags = {"Q4_K_M", "Q8_0"};
tags = {"Q4_K_M", "Q4_0"};
}
for (const auto & t : tags) {

View File

@@ -1,951 +0,0 @@
#include "fit.h"
#include "log.h"
#include "../src/llama-ext.h"
#include <array>
#include <cassert>
#include <stdexcept>
#include <cinttypes>
#include <set>
#include <string>
#include <vector>
// this enum is only used in common_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
// enum to identify part of a layer for distributing its tensors:
enum common_layer_fraction_t {
LAYER_FRACTION_NONE = 0, // nothing
LAYER_FRACTION_ATTN = 1, // attention
LAYER_FRACTION_UP = 2, // attention + up
LAYER_FRACTION_GATE = 3, // attention + up + gate
LAYER_FRACTION_MOE = 4, // everything but sparse MoE weights
};
class common_params_fit_exception : public std::runtime_error {
using std::runtime_error::runtime_error;
};
static std::vector<llama_device_memory_data> common_get_device_memory_data(
const char * path_model,
const llama_model_params * mparams,
const llama_context_params * cparams,
std::vector<ggml_backend_dev_t> & devs,
uint32_t & hp_ngl,
uint32_t & hp_n_ctx_train,
uint32_t & hp_n_expert,
ggml_log_level log_level) {
struct user_data_t {
struct {
ggml_log_callback callback;
void * user_data;
} original_logger;
ggml_log_level min_level; // prints below this log level go to debug log
};
user_data_t ud;
llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
ud.min_level = log_level;
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
const user_data_t * ud = (const user_data_t *) user_data;
const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
}, &ud);
llama_model_params mparams_copy = *mparams;
mparams_copy.no_alloc = true;
mparams_copy.use_mmap = false;
mparams_copy.use_mlock = false;
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
if (model == nullptr) {
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
throw std::runtime_error("failed to load model");
}
llama_context * ctx = llama_init_from_model(model, *cparams);
if (ctx == nullptr) {
llama_model_free(model);
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
throw std::runtime_error("failed to create llama_context from model");
}
const size_t nd = llama_model_n_devices(model);
std::vector<llama_device_memory_data> ret(nd + 1);
llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);
for (const auto & [buft, mb] : memory_breakdown) {
if (ggml_backend_buft_is_host(buft)) {
ret.back().mb.model += mb.model;
ret.back().mb.context += mb.context;
ret.back().mb.compute += mb.compute;
continue;
}
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
if (!dev) {
continue;
}
for (size_t i = 0; i < nd; i++) {
if (dev == llama_model_get_device(model, i)) {
ret[i].mb.model += mb.model;
ret[i].mb.context += mb.context;
ret[i].mb.compute += mb.compute;
break;
}
}
}
{
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (cpu_dev == nullptr) {
throw std::runtime_error("no CPU backend found");
}
size_t free;
size_t total;
ggml_backend_dev_memory(cpu_dev, &free, &total);
ret.back().free = free;
ret.back().total = total;
}
for (size_t i = 0; i < nd; i++) {
size_t free;
size_t total;
ggml_backend_dev_memory(llama_model_get_device(model, i), &free, &total);
// devices can return 0 bytes for free and total memory if they do not
// have any to report. In this case, we will use the host memory as a fallback
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
if (free == 0 && total == 0) {
free = ret.back().free;
total = ret.back().total;
}
ret[i].free = free;
ret[i].total = total;
}
devs.clear();
for (int i = 0; i < llama_model_n_devices(model); i++) {
devs.push_back(llama_model_get_device(model, i));
}
hp_ngl = llama_model_n_layer(model);
hp_n_ctx_train = llama_model_n_ctx_train(model);
hp_n_expert = llama_model_n_expert(model);
common_memory_breakdown_print(ctx);
llama_free(ctx);
llama_model_free(model);
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
return ret;
}
static void common_params_fit_impl(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
if (mparams->split_mode == LLAMA_SPLIT_MODE_TENSOR) {
throw common_params_fit_exception("llama_params_fit is not implemented for SPLIT_MODE_TENSOR, abort");
}
constexpr int64_t MiB = 1024*1024;
typedef std::vector<llama_device_memory_data> dmds_t;
const llama_model_params default_mparams = llama_model_default_params();
std::vector<ggml_backend_dev_t> devs;
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
uint32_t hp_nct = 0; // hparams.n_ctx_train
uint32_t hp_nex = 0; // hparams.n_expert
// step 1: get data for default parameters and check whether any changes are necessary in the first place
LOG_INF("%s: getting device memory data for initial parameters:\n", __func__);
const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
const size_t nd = devs.size(); // number of devices
std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
margins.reserve(nd);
if (nd == 0) {
margins.push_back(margins_s[0]);
} else {
for (size_t id = 0; id < nd; id++) {
margins.push_back(margins_s[id]);
}
}
std::vector<std::string> dev_names;
{
dev_names.reserve(nd);
size_t max_length = 0;
for (const auto & dev : devs) {
std::string name = ggml_backend_dev_name(dev);
name += " (";
name += ggml_backend_dev_description(dev);
name += ")";
dev_names.push_back(name);
max_length = std::max(max_length, name.length());
}
for (std::string & dn : dev_names) {
dn.insert(dn.end(), max_length - dn.length(), ' ');
}
}
int64_t sum_free = 0;
int64_t sum_projected_free = 0;
int64_t sum_projected_used = 0;
int64_t sum_projected_model = 0;
std::vector<int64_t> projected_free_per_device;
projected_free_per_device.reserve(nd);
if (nd == 0) {
sum_projected_used = dmds_full.back().mb.total();
sum_free = dmds_full.back().total;
sum_projected_free = sum_free - sum_projected_used;
LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
__func__, sum_projected_used/MiB, sum_free/MiB);
if (sum_projected_free >= margins[0]) {
LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
__func__, sum_projected_free/MiB, margins[0]/MiB);
return;
}
} else {
if (nd > 1) {
LOG_INF("%s: projected memory use with initial parameters [MiB]:\n", __func__);
}
for (size_t id = 0; id < nd; id++) {
const llama_device_memory_data & dmd = dmds_full[id];
const int64_t projected_used = dmd.mb.total();
const int64_t projected_free = dmd.free - projected_used;
projected_free_per_device.push_back(projected_free);
sum_free += dmd.free;
sum_projected_used += projected_used;
sum_projected_free += projected_free;
sum_projected_model += dmd.mb.model;
if (nd > 1) {
LOG_INF("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
}
}
assert(sum_free >= 0 && sum_projected_used >= 0);
LOG_INF("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
__func__, sum_projected_used/MiB, sum_free/MiB);
if (nd == 1) {
if (projected_free_per_device[0] >= margins[0]) {
LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
__func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
return;
}
} else {
bool changes_needed = false;
for (size_t id = 0; id < nd; id++) {
if (projected_free_per_device[id] < margins[id]) {
changes_needed = true;
break;
}
}
if (!changes_needed) {
LOG_INF("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
return;
}
}
}
// step 2: try reducing memory use by reducing the context size
{
int64_t global_surplus = sum_projected_free;
if (nd == 0) {
global_surplus -= margins[0];
} else {
for (size_t id = 0; id < nd; id++) {
global_surplus -= margins[id];
}
}
if (global_surplus < 0) {
if (nd <= 1) {
LOG_INF("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
__func__, margins[0]/MiB, -global_surplus/MiB);
} else {
LOG_INF(
"%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
__func__, -global_surplus/MiB);
}
if (cparams->n_ctx == 0) {
if (hp_nct > n_ctx_min) {
int64_t sum_used_target = sum_free;
if (nd == 0) {
sum_used_target -= margins[0];
} else {
for (size_t id = 0; id < nd; id++) {
sum_used_target -= margins[id];
}
}
if (nd > 1) {
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
// - for dense models only whole layers can be assigned to devices
// - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
// - on average we expect a waste of 0.5 layers/tensors per device
// - use slightly more than the expected average for nd devices to be safe
const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
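// Worked example with illustrative numbers: a dense model (hp_nex == 0) split across nd = 2 devices
// with sum_projected_model = 48000 MiB over 32 offloaded layers gives model_per_layer = 1500 MiB,
// so the reservation is (2 + 1) * 1500 / 2 = 2250 MiB, i.e. roughly 1.5 layers' worth of expected
// per-device packing waste subtracted from the usable memory budget.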
}
int64_t sum_projected_used_min_ctx = 0;
cparams->n_ctx = n_ctx_min;
const dmds_t dmds_min_ctx = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
if (nd == 0) {
sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
} else {
for (size_t id = 0; id < nd; id++) {
sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
}
}
if (sum_used_target > sum_projected_used_min_ctx) {
// linear interpolation between minimum and maximum context size:
cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
/ (sum_projected_used - sum_projected_used_min_ctx);
cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
if (nd <= 1) {
LOG_INF("%s: entire model can be fit by reducing context\n", __func__);
return;
}
LOG_INF("%s: entire model should be fit across devices by reducing context\n", __func__);
} else {
const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
}
} else {
if (n_ctx_min == UINT32_MAX) {
LOG_INF("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
} else {
LOG_INF("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
__func__, hp_nct, n_ctx_min);
}
}
} else {
LOG_INF("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
}
}
}
if (nd == 0) {
throw common_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
}
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
throw common_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
}
if (nd > 1) {
if (!tensor_split) {
throw common_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
}
if (mparams->tensor_split) {
for (size_t id = 0; id < nd; id++) {
if (mparams->tensor_split[id] != 0.0f) {
throw common_params_fit_exception("model_params::tensor_split already set by user, abort");
}
}
}
if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
throw common_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
}
}
if (!tensor_buft_overrides) {
throw common_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
}
if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
throw common_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
}
// step 3: iteratively fill the back to front with "dense" layers
// - for a dense model simply fill full layers, giving each device a contiguous slice of the model
// - for a MoE model, same as dense model but with all MoE tensors in system memory
// utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
auto get_overflow_pattern = [&](const size_t il, const common_layer_fraction_t lf) -> const char * {
constexpr size_t n_strings = 1000;
if (il >= n_strings) {
throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
}
switch (lf) {
case LAYER_FRACTION_ATTN: {
static std::array<std::string, n_strings> patterns;
if (patterns[il].empty()) {
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|up|gate_up|down).*";
}
return patterns[il].c_str();
}
case LAYER_FRACTION_UP: {
static std::array<std::string, n_strings> patterns;
if (patterns[il].empty()) {
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|gate_up|down).*";
}
return patterns[il].c_str();
}
case LAYER_FRACTION_GATE: {
static std::array<std::string, n_strings> patterns;
if (patterns[il].empty()) {
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
}
return patterns[il].c_str();
}
case LAYER_FRACTION_MOE: {
static std::array<std::string, n_strings> patterns;
if (patterns[il].empty()) {
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate_up|gate)_(ch|)exps";
}
return patterns[il].c_str();
}
default:
GGML_ABORT("fatal error");
}
};
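// Example of the resulting patterns (layer index 7 chosen for illustration):
// get_overflow_pattern(7, LAYER_FRACTION_UP) yields "blk\.7\.ffn_(gate|gate_up|down).*", so the
// attention and ffn_up tensors of layer 7 stay on the device while the matched gate/down tensors
// overflow to the override buffer type; LAYER_FRACTION_MOE yields
// "blk\.7\.ffn_(up|down|gate_up|gate)_(ch|)exps", which overrides only the sparse expert tensors.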
struct ngl_t {
uint32_t n_layer = 0; // number of total layers
uint32_t n_part = 0; // number of partial layers, <= n_layer
// for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
common_layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
uint32_t n_full() const {
assert(n_layer >= n_part);
return n_layer - n_part;
}
};
const size_t ntbo = llama_max_tensor_buft_overrides();
// utility function to set n_gpu_layers and tensor_split
auto set_ngl_tensor_split_tbo = [&](
const std::vector<ngl_t> & ngl_per_device,
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
llama_model_params & mparams) {
mparams.n_gpu_layers = 0;
for (size_t id = 0; id < nd; id++) {
mparams.n_gpu_layers += ngl_per_device[id].n_layer;
if (nd > 1) {
tensor_split[id] = ngl_per_device[id].n_layer;
}
}
assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
mparams.tensor_split = tensor_split;
size_t itbo = 0;
for (size_t id = 0; id < nd; id++) {
il0 += ngl_per_device[id].n_full();
for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
if (itbo + 1 >= ntbo) {
tensor_buft_overrides[itbo].pattern = nullptr;
tensor_buft_overrides[itbo].buft = nullptr;
itbo++;
mparams.tensor_buft_overrides = tensor_buft_overrides;
throw common_params_fit_exception("llama_max_tensor_buft_overrides() == "
+ std::to_string(ntbo) + " is insufficient for model");
}
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
itbo++;
}
il0 += ngl_per_device[id].n_part;
}
tensor_buft_overrides[itbo].pattern = nullptr;
tensor_buft_overrides[itbo].buft = nullptr;
itbo++;
mparams.tensor_buft_overrides = tensor_buft_overrides;
};
// utility function that returns the memory use per device for given numbers of layers per device
auto get_memory_for_layers = [&](
const char * func_name,
const std::vector<ngl_t> & ngl_per_device,
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
llama_model_params mparams_copy = *mparams;
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
const dmds_t dmd_nl = common_get_device_memory_data(
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
LOG_INF("%s: memory for test allocation by device:\n", func_name);
for (size_t id = 0; id < nd; id++) {
const ngl_t & n = ngl_per_device[id];
LOG_INF(
"%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
}
std::vector<int64_t> ret;
ret.reserve(nd);
for (size_t id = 0; id < nd; id++) {
ret.push_back(dmd_nl[id].mb.total());
}
return ret;
};
int64_t global_surplus_cpu_moe = 0;
if (hp_nex > 0) {
const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate_up|gate)_(ch|)exps"; // matches all MoE tensors
ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
tensor_buft_overrides[1] = {nullptr, nullptr};
mparams->tensor_buft_overrides = tensor_buft_overrides;
LOG_INF("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
const dmds_t dmds_cpu_moe = common_get_device_memory_data(
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
for (size_t id = 0; id < nd; id++) {
global_surplus_cpu_moe += dmds_cpu_moe[id].free;
global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
}
if (global_surplus_cpu_moe > 0) {
LOG_INF("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
__func__, global_surplus_cpu_moe/MiB);
} else {
LOG_INF("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
__func__, -global_surplus_cpu_moe/MiB);
}
// reset
tensor_buft_overrides[0] = {nullptr, nullptr};
mparams->tensor_buft_overrides = tensor_buft_overrides;
}
std::vector<int64_t> targets; // maximum acceptable memory use per device
targets.reserve(nd);
for (size_t id = 0; id < nd; id++) {
targets.push_back(dmds_full[id].free - margins[id]);
LOG_INF("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
}
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
overflow_bufts.reserve(nd);
for (size_t id = 0; id < nd; id++) {
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
}
std::vector<ngl_t> ngl_per_device(nd);
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
// optimize the number of layers per device using the method of false position:
// - ngl_per_device has 0 layers for each device, lower bound
// - try a "high" configuration where a device is given all unassigned layers
// - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
// - check memory use of our guess, replace either the low or high bound
// - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
// - the last device has the output layer, which cannot be a partial layer
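// Illustrative false-position step (numbers are hypothetical): with a lower bound of 0 layers using
// mem = 2000 MiB, an upper bound of 40 layers using mem_high = 42000 MiB, and a target of 22000 MiB,
// the interpolated step is 40 * (22000 - 2000) / (42000 - 2000) = 20 layers; the test allocation at
// 20 layers then replaces either the lower or the upper bound, and the interpolation repeats until
// the bounds differ by a single layer.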
if (hp_nex == 0) {
LOG_INF("%s: filling dense layers back-to-front:\n", __func__);
} else {
LOG_INF("%s: filling dense-only layers back-to-front:\n", __func__);
}
for (int id = nd - 1; id >= 0; id--) {
uint32_t n_unassigned = hp_ngl + 1;
for (size_t jd = id + 1; jd < nd; ++jd) {
assert(n_unassigned >= ngl_per_device[jd].n_layer);
n_unassigned -= ngl_per_device[jd].n_layer;
}
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
ngl_per_device_high[id].n_layer = n_unassigned;
if (hp_nex > 0) {
ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
}
if (ngl_per_device_high[id].n_layer > 0) {
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
LOG_INF("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
step_size = std::max(step_size, uint32_t(1));
step_size = std::min(step_size, delta - 1);
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
ngl_per_device_test[id].n_layer += step_size;
if (hp_nex) {
ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
step_size - 1 : step_size; // the first layer is the output layer which must always be full
}
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test;
mem = mem_test;
LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
} else {
ngl_per_device_high = ngl_per_device_test;
mem_high = mem_test;
LOG_INF("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
}
delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
}
} else {
assert(ngl_per_device_high[id].n_layer == n_unassigned);
ngl_per_device = ngl_per_device_high;
mem = mem_high;
LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
}
}
const int64_t projected_margin = dmds_full[id].free - mem[id];
LOG_INF(
"%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
}
if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
return;
}
// step 4: for a MoE model where all dense tensors fit,
// convert the dense-only layers in the back to full layers in the front until all devices are full
// essentially the same procedure as for the dense-only layers except front-to-back
// also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM
size_t id_dense_start = nd;
for (int id = nd - 1; id >= 0; id--) {
if (ngl_per_device[id].n_layer > 0) {
id_dense_start = id;
continue;
}
break;
}
assert(id_dense_start < nd);
LOG_INF("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
for (size_t jd = id_dense_start; jd < nd; jd++) {
const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
ngl_per_device_high[id].n_layer += n_layer_move;
ngl_per_device_high[jd].n_layer -= n_layer_move;
ngl_per_device_high[jd].n_part = 0;
}
size_t id_dense_start_high = nd - 1;
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
step_size = std::max(step_size, uint32_t(1));
step_size = std::min(step_size, delta - 1);
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
size_t id_dense_start_test = id_dense_start;
uint32_t n_converted_test = 0;
for (;id_dense_start_test < nd; id_dense_start_test++) {
const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
ngl_per_device_test[id].n_layer += n_convert_jd;
n_converted_test += n_convert_jd;
if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
break;
}
}
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
} else {
ngl_per_device_high = ngl_per_device_test;
mem_high = mem_test;
id_dense_start_high = id_dense_start_test;
LOG_INF("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
}
assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
}
} else {
ngl_per_device = ngl_per_device_high;
mem = mem_high;
id_dense_start = id_dense_start_high;
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
}
// try to fit at least part of one more layer
if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
size_t id_dense_start_test = id_dense_start;
ngl_per_device_test[id_dense_start_test].n_layer--;
ngl_per_device_test[id_dense_start_test].n_part--;
ngl_per_device_test[id].n_layer++;
ngl_per_device_test[id].n_part++;
if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
id_dense_start_test++;
}
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
if (id < nd - 1) {
overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
}
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
}
} else {
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
ngl_per_device = ngl_per_device_test;
overflow_bufts = overflow_bufts_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
}
}
}
const int64_t projected_margin = dmds_full[id].free - mem[id];
LOG_INF(
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
}
// print info for devices that were not changed during the conversion from dense only to full layers:
for (size_t id = id_dense_start + 1; id < nd; id++) {
const int64_t projected_margin = dmds_full[id].free - mem[id];
LOG_INF(
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
}
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
}
enum common_params_fit_status common_fit_params(
const char * path_model,
llama_model_params * mparams,
llama_context_params * cparams,
float * tensor_split,
llama_model_tensor_buft_override * tensor_buft_overrides,
size_t * margins,
uint32_t n_ctx_min,
ggml_log_level log_level) {
const int64_t t0_us = llama_time_us();
common_params_fit_status status = COMMON_PARAMS_FIT_STATUS_SUCCESS;
try {
common_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
LOG_INF("%s: successfully fit params to free device memory\n", __func__);
} catch (const common_params_fit_exception & e) {
LOG_WRN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
status = COMMON_PARAMS_FIT_STATUS_FAILURE;
} catch (const std::runtime_error & e) {
LOG_ERR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
status = COMMON_PARAMS_FIT_STATUS_ERROR;
}
const int64_t t1_us = llama_time_us();
LOG_INF("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
return status;
}
void common_memory_breakdown_print(const struct llama_context * ctx) {
//const auto & devices = ctx->get_model().devices;
const auto * model = llama_get_model(ctx);
std::vector<ggml_backend_dev_t> devices;
for (int i = 0; i < llama_model_n_devices(model); i++) {
devices.push_back(llama_model_get_device(model, i));
}
llama_memory_breakdown memory_breakdown = llama_get_memory_breakdown(ctx);
std::vector<std::array<std::string, 9>> table_data;
table_data.reserve(devices.size());
const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
constexpr size_t MiB = 1024 * 1024;
const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
// track seen buffer types to avoid double counting:
std::set<ggml_backend_buffer_type_t> seen_buffer_types;
// accumulative memory breakdown for each device and for host:
std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
llama_memory_breakdown_data mb_host;
for (const auto & buft_mb : memory_breakdown) {
ggml_backend_buffer_type_t buft = buft_mb.first;
const llama_memory_breakdown_data & mb = buft_mb.second;
if (ggml_backend_buft_is_host(buft)) {
mb_host.model += mb.model;
mb_host.context += mb.context;
mb_host.compute += mb.compute;
seen_buffer_types.insert(buft);
continue;
}
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
if (dev) {
int i_dev = -1;
for (size_t i = 0; i < devices.size(); i++) {
if (devices[i] == dev) {
i_dev = i;
break;
}
}
if (i_dev != -1) {
mb_dev[i_dev].model += mb.model;
mb_dev[i_dev].context += mb.context;
mb_dev[i_dev].compute += mb.compute;
seen_buffer_types.insert(buft);
continue;
}
}
}
// print memory breakdown for each device:
for (size_t i = 0; i < devices.size(); i++) {
ggml_backend_dev_t dev = devices[i];
llama_memory_breakdown_data mb = mb_dev[i];
const std::string name = ggml_backend_dev_name(dev);
std::string desc = ggml_backend_dev_description(dev);
for (const std::string & prefix : desc_prefixes_strip) {
if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
desc = desc.substr(prefix.length());
}
}
size_t free, total;
ggml_backend_dev_memory(dev, &free, &total);
const size_t self = mb.model + mb.context + mb.compute;
const int64_t unaccounted = static_cast<int64_t>(total) - static_cast<int64_t>(free) - static_cast<int64_t>(self);
table_data.push_back({
template_gpu,
" - " + name + " (" + desc + ")",
std::to_string(total / MiB),
std::to_string(free / MiB),
std::to_string(self / MiB),
std::to_string(mb.model / MiB),
std::to_string(mb.context / MiB),
std::to_string(mb.compute / MiB),
std::to_string(unaccounted / static_cast<int64_t>(MiB))});
}
// print memory breakdown for host:
{
const size_t self = mb_host.model + mb_host.context + mb_host.compute;
table_data.push_back({
template_other,
" - Host",
"", // total
"", // free
std::to_string(self / MiB),
std::to_string(mb_host.model / MiB),
std::to_string(mb_host.context / MiB),
std::to_string(mb_host.compute / MiB),
""}); // unaccounted
}
// print memory breakdown for all remaining buffer types:
for (const auto & buft_mb : memory_breakdown) {
ggml_backend_buffer_type_t buft = buft_mb.first;
const llama_memory_breakdown_data & mb = buft_mb.second;
if (seen_buffer_types.count(buft) == 1) {
continue;
}
const std::string name = ggml_backend_buft_name(buft);
const size_t self = mb.model + mb.context + mb.compute;
table_data.push_back({
template_other,
" - " + name,
"", // total
"", // free
std::to_string(self / MiB),
std::to_string(mb.model / MiB),
std::to_string(mb.context / MiB),
std::to_string(mb.compute / MiB),
""}); // unaccounted
seen_buffer_types.insert(buft);
}
for (size_t j = 1; j < table_data[0].size(); j++) {
size_t max_len = 0;
for (const auto & td : table_data) {
max_len = std::max(max_len, td[j].length());
}
for (auto & td : table_data) {
td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
}
}
for (const auto & td : table_data) {
LOG_INF(td[0].c_str(),
__func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
td[6].c_str(), td[7].c_str(), td[8].c_str());
}
}
void common_fit_print(
const char * path_model,
llama_model_params * mparams,
llama_context_params * cparams) {
std::vector<ggml_backend_dev_t> devs;
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
uint32_t hp_nct = 0; // hparams.n_ctx_train
uint32_t hp_nex = 0; // hparams.n_expert
auto dmd = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR);
GGML_ASSERT(dmd.size() == devs.size() + 1);
for (size_t id = 0; id < devs.size(); id++) {
printf("%s ", ggml_backend_dev_name(devs[id]));
printf("%zu ", dmd[id].mb.model/1024/1024);
printf("%zu ", dmd[id].mb.context/1024/1024);
printf("%zu ", dmd[id].mb.compute/1024/1024);
printf("\n");
}
printf("Host ");
printf("%zu ", dmd.back().mb.model/1024/1024);
printf("%zu ", dmd.back().mb.context/1024/1024);
printf("%zu ", dmd.back().mb.compute/1024/1024);
printf("\n");
}

View File

@@ -1,32 +0,0 @@
#pragma once
#include "ggml.h"
enum common_params_fit_status {
COMMON_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
COMMON_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
COMMON_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occurred, e.g. because no model could be found at the specified path
};
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
// - returns COMMON_PARAMS_FIT_STATUS_SUCCESS if the parameters could be successfully modified to fit device memory
// - this function is NOT thread safe because it modifies the global llama logger state
// - only parameters that have the same value as in llama_default_model_params are modified
// with the exception of the context size which is modified if and only if equal to 0
enum common_params_fit_status common_fit_params(
const char * path_model,
struct llama_model_params * mparams,
struct llama_context_params * cparams,
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
size_t * margins, // margins of memory to leave per device in bytes
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
// print estimated memory to stdout
void common_fit_print(
const char * path_model,
struct llama_model_params * mparams,
struct llama_context_params * cparams);
void common_memory_breakdown_print(const struct llama_context * ctx);
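A minimal caller sketch for this API, using the buffer sizes prescribed by the comments above; the model path, the 1 GiB margins, and the 4096-token minimum context are illustrative values:

llama_model_params   mparams = llama_model_default_params();
llama_context_params cparams = llama_context_default_params();
std::vector<float>  tensor_split(llama_max_devices(), 0.0f);
std::vector<llama_model_tensor_buft_override> tbo(llama_max_tensor_buft_overrides());
std::vector<size_t> margins(llama_max_devices(), size_t(1024) * 1024 * 1024); // leave ~1 GiB per device
if (common_fit_params("model.gguf", &mparams, &cparams,
                      tensor_split.data(), tbo.data(), margins.data(),
                      /*n_ctx_min=*/4096, GGML_LOG_LEVEL_INFO) == COMMON_PARAMS_FIT_STATUS_SUCCESS) {
    // mparams and cparams now describe an allocation that is projected to fit free device memory
}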

View File

@@ -57,7 +57,7 @@ static fs::path get_cache_directory() {
#ifndef _WIN32
const struct passwd * pw = getpwuid(getuid());
if (pw && pw->pw_dir && *pw->pw_dir) {
if (pw->pw_dir && *pw->pw_dir) {
return fs::path(pw->pw_dir) / ".cache" / "huggingface" / "hub";
}
#endif

View File

@@ -1,3 +1,4 @@
#include "log.h"
#include "value.h"
#include "runtime.h"
#include "caps.h"

View File

@@ -106,16 +106,10 @@ struct statement {
size_t pos; // position in source, for debugging
virtual ~statement() = default;
virtual std::string type() const { return "Statement"; }
// execute_impl must be overridden by derived classes
virtual value execute_impl(context &) { throw_exec_error(); }
virtual value execute_impl(context &) { throw std::runtime_error("cannot exec " + type()); }
// execute is the public method to execute a statement with error handling
value execute(context &);
private:
[[noreturn]] void throw_exec_error() const {
throw std::runtime_error("cannot exec " + type());
}
};
// Type Checking Utilities
@@ -149,7 +143,7 @@ struct program : public statement {
program() = default;
explicit program(statements && body) : body(std::move(body)) {}
std::string type() const override { return "Program"; }
[[noreturn]] value execute_impl(context &) override {
value execute_impl(context &) override {
throw std::runtime_error("Cannot execute program directly, use jinja::runtime instead");
}
};
@@ -201,7 +195,7 @@ struct break_statement : public statement {
}
};
[[noreturn]] value execute_impl(context &) override {
value execute_impl(context &) override {
throw break_statement::signal();
}
};
@@ -215,7 +209,7 @@ struct continue_statement : public statement {
}
};
[[noreturn]] value execute_impl(context &) override {
value execute_impl(context &) override {
throw continue_statement::signal();
}
};
@@ -515,7 +509,7 @@ struct slice_expression : public expression {
chk_type<expression>(this->step_expr);
}
std::string type() const override { return "SliceExpression"; }
[[noreturn]] value execute_impl(context &) override {
value execute_impl(context &) override {
throw std::runtime_error("must be handled by MemberExpression");
}
};

View File

@@ -590,10 +590,6 @@ static bool string_endswith(const std::string & str, const std::string & suffix)
return str.compare(str.length() - suffix.length(), suffix.length(), suffix) == 0;
}
[[noreturn]] static value string_join_not_implemented(const func_args &) {
throw not_implemented_exception("String join builtin not implemented");
}
const func_builtins & value_string_t::get_builtins() const {
static const func_builtins builtins = {
{"default", default_value},
@@ -855,7 +851,9 @@ const func_builtins & value_string_t::get_builtins() const {
res->val_str.mark_input_based_on(val_input->as_string());
return res;
}},
{"join", string_join_not_implemented},
{"join", [](const func_args &) -> value {
throw not_implemented_exception("String join builtin not implemented");
}},
};
return builtins;
}
@@ -886,9 +884,6 @@ const func_builtins & value_bool_t::get_builtins() const {
return builtins;
}
[[noreturn]] static value array_unique_not_implemented(const func_args &) {
throw not_implemented_exception("Array unique builtin not implemented");
}
const func_builtins & value_array_t::get_builtins() const {
static const func_builtins builtins = {
@@ -1089,14 +1084,13 @@ const func_builtins & value_array_t::get_builtins() const {
std::reverse(arr.begin(), arr.end());
return is_val<value_tuple>(val) ? mk_val<value_tuple>(std::move(arr)) : mk_val<value_array>(std::move(arr));
}},
{"unique", array_unique_not_implemented},
{"unique", [](const func_args &) -> value {
throw not_implemented_exception("Array unique builtin not implemented");
}},
};
return builtins;
}
[[noreturn]] static value object_join_not_implemented(const func_args &) {
throw not_implemented_exception("object join not implemented");
}
const func_builtins & value_object_t::get_builtins() const {
if (!has_builtins) {
@@ -1189,7 +1183,9 @@ const func_builtins & value_object_t::get_builtins() const {
});
return result;
}},
{"join", object_join_not_implemented},
{"join", [](const func_args &) -> value {
throw not_implemented_exception("object join not implemented");
}},
};
return builtins;
}

View File

@@ -129,25 +129,27 @@ struct value_t {
// Note: only for debugging and error reporting purposes
virtual std::string type() const { return ""; }
virtual int64_t as_int() const { throw_type_error("is not an int value"); }
virtual double as_float() const { throw_type_error("is not a float value"); }
virtual string as_string() const { throw_type_error("is not a string value"); }
virtual bool as_bool() const { throw_type_error("is not a bool value"); }
virtual const std::vector<value> & as_array() const { throw_type_error("is not an array value"); }
virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw_type_error("is not an object value"); }
virtual value invoke(const func_args &) const { throw_type_error("is not a function value"); }
virtual int64_t as_int() const { throw std::runtime_error(type() + " is not an int value"); }
virtual double as_float() const { throw std::runtime_error(type() + " is not a float value"); }
virtual string as_string() const { throw std::runtime_error(type() + " is not a string value"); }
virtual bool as_bool() const { throw std::runtime_error(type() + " is not a bool value"); }
virtual const std::vector<value> & as_array() const { throw std::runtime_error(type() + " is not an array value"); }
virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw std::runtime_error(type() + " is not an object value"); }
virtual value invoke(const func_args &) const { throw std::runtime_error(type() + " is not a function value"); }
virtual bool is_none() const { return false; }
virtual bool is_undefined() const { return false; }
virtual const func_builtins & get_builtins() const { throw_type_error("has no builtins"); }
virtual const func_builtins & get_builtins() const {
throw std::runtime_error("No builtins available for type " + type());
}
virtual bool has_key(const value &) { throw_type_error("is not an object value"); }
virtual void insert(const value & /* key */, const value & /* val */) { throw_type_error("is not an object value"); }
virtual value & at(const value & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
virtual value & at(const value & /* key */) { throw_type_error("is not an object value"); }
virtual value & at(const std::string & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
virtual value & at(const std::string & /* key */) { throw_type_error("is not an object value"); }
virtual value & at(int64_t /* idx */, value & /* default_val */) { throw_type_error("is not an array value"); }
virtual value & at(int64_t /* idx */) { throw_type_error("is not an array value"); }
virtual bool has_key(const value &) { throw std::runtime_error(type() + " is not an object value"); }
virtual void insert(const value & /* key */, const value & /* val */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const value & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const value & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const std::string & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(const std::string & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
virtual value & at(int64_t /* idx */, value & /* default_val */) { throw std::runtime_error(type() + " is not an array value"); }
virtual value & at(int64_t /* idx */) { throw std::runtime_error(type() + " is not an array value"); }
virtual bool is_numeric() const { return false; }
virtual bool is_hashable() const { return false; }
@@ -161,11 +163,6 @@ struct value_t {
// Note: only for debugging purposes
virtual std::string as_repr() const { return as_string().str(); }
private:
[[noreturn]] void throw_type_error(const char* expected) const {
throw std::runtime_error(type() + " " + expected);
}
protected:
virtual bool equivalent(const value_t &) const = 0;
virtual bool nonequal(const value_t & other) const { return !equivalent(other); }

View File

@@ -49,7 +49,7 @@ enum common_log_col : int {
};
// disable colors by default
static const char* g_col[] = {
static std::vector<const char *> g_col = {
"",
"",
"",
@@ -247,6 +247,7 @@ public:
entries = std::move(new_entries);
}
cv.notify_one();
}
@@ -264,6 +265,7 @@ public:
{
std::unique_lock<std::mutex> lock(mtx);
cv.wait(lock, [this]() { return head != tail; });
cur = entries[head];
head = (head + 1) % entries.size();
@@ -299,6 +301,7 @@ public:
tail = (tail + 1) % entries.size();
}
cv.notify_one();
}
@@ -335,7 +338,7 @@ public:
g_col[COMMON_LOG_COL_CYAN] = LOG_COL_CYAN;
g_col[COMMON_LOG_COL_WHITE] = LOG_COL_WHITE;
} else {
for (size_t i = 0; i < std::size(g_col); i++) {
for (size_t i = 0; i < g_col.size(); i++) {
g_col[i] = "";
}
}
@@ -365,20 +368,14 @@ struct common_log * common_log_init() {
}
struct common_log * common_log_main() {
// We intentionally leak (i.e. do not delete) the logger singleton because
// common_log destructor called at DLL teardown phase will cause hanging on Windows.
// OS will release resources anyway so it should not be a significant issue,
// though this design may cause logs to be lost if not flushed before the program exits.
// Refer to https://github.com/ggml-org/llama.cpp/issues/22142 for details.
static struct common_log * log;
static struct common_log log;
static std::once_flag init_flag;
std::call_once(init_flag, [&]() {
log = new common_log;
// Set default to auto-detect colors
log->set_colors(tty_can_use_colors());
log.set_colors(tty_can_use_colors());
});
return log;
return &log;
}
void common_log_pause(struct common_log * log) {

View File

@@ -49,11 +49,7 @@ void common_log_default_callback(enum ggml_log_level level, const char * text, v
struct common_log;
struct common_log * common_log_init();
// Singleton, intentionally leaked to avoid Windows teardown hangs.
// Call common_log_flush() before exit if you want to ensure all logs are flushed.
struct common_log * common_log_main();
struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
void common_log_pause (struct common_log * log); // pause the worker thread, not thread-safe
void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
void common_log_free (struct common_log * log);

View File

@@ -43,7 +43,7 @@ static std::set<std::string> get_remote_preset_whitelist(const std::map<std::str
for (const auto & it : key_to_opt) {
const std::string & key = it.first;
const common_arg & opt = it.second;
if (allowed_options.find(key) != allowed_options.end() || opt.is_sampling) {
if (allowed_options.find(key) != allowed_options.end() || opt.is_sparam) {
allowed_keys.insert(key);
// also add variant keys (args without leading dashes and env vars)
for (const auto & arg : opt.get_args()) {

View File

@@ -122,20 +122,6 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
}
break;
case REASONING_BUDGET_DONE:
// Re-arm on a new start tag: some models emit multiple <think> blocks
// per response, and each should get a fresh budget window.
if (ctx->start_matcher.advance(token)) {
ctx->state = REASONING_BUDGET_COUNTING;
ctx->remaining = ctx->budget;
ctx->end_matcher.reset();
LOG_INF("reasoning-budget: re-activated on new start tag, budget=%d tokens\n", ctx->budget);
if (ctx->remaining <= 0) {
ctx->state = REASONING_BUDGET_FORCING;
ctx->force_pos = 0;
LOG_INF("reasoning-budget: budget=0, forcing immediately\n");
}
}
break;
}
}
@@ -232,6 +218,34 @@ static struct llama_sampler * common_reasoning_budget_init_state(
);
}
struct llama_sampler * common_reasoning_budget_init(
const struct llama_vocab * vocab,
const std::vector<llama_token> & start_tokens,
const std::vector<llama_token> & end_tokens,
const std::vector<llama_token> & forced_tokens,
int32_t budget,
const std::vector<llama_token> & prefill_tokens) {
// Determine initial state from prefill: COUNTING if the prefill begins with
// the start sequence but does not also contain the end sequence after it.
common_reasoning_budget_state initial_state = REASONING_BUDGET_IDLE;
if (!prefill_tokens.empty() && !start_tokens.empty() &&
prefill_tokens.size() >= start_tokens.size() &&
std::equal(start_tokens.begin(), start_tokens.end(), prefill_tokens.begin())) {
initial_state = REASONING_BUDGET_COUNTING;
// If the end sequence also follows the start in the prefill, reasoning
// was opened and immediately closed — stay IDLE.
if (!end_tokens.empty() &&
prefill_tokens.size() >= start_tokens.size() + end_tokens.size()) {
auto end_start = prefill_tokens.end() - (ptrdiff_t) end_tokens.size();
if (end_start >= prefill_tokens.begin() + (ptrdiff_t) start_tokens.size() &&
std::equal(end_tokens.begin(), end_tokens.end(), end_start)) {
initial_state = REASONING_BUDGET_IDLE;
}
}
}
return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state);
}
struct llama_sampler * common_reasoning_budget_init(
const struct llama_vocab * vocab,
const std::vector<llama_token> & start_tokens,

View File

@@ -29,7 +29,10 @@ enum common_reasoning_budget_state {
// end_tokens - token sequence for natural deactivation
// forced_tokens - token sequence forced when budget expires
// budget - max tokens allowed in the reasoning block
// initial_state - initial state
// prefill_tokens - tokens already present in the prompt (generation prompt);
// used to determine the initial state: COUNTING if they begin
// with start_tokens (but don't also end with end_tokens),
// IDLE otherwise. COUNTING with budget <= 0 is promoted to FORCING.
//
struct llama_sampler * common_reasoning_budget_init(
const struct llama_vocab * vocab,
@@ -37,6 +40,16 @@ struct llama_sampler * common_reasoning_budget_init(
const std::vector<llama_token> & end_tokens,
const std::vector<llama_token> & forced_tokens,
int32_t budget,
common_reasoning_budget_state initial_state = REASONING_BUDGET_IDLE);
const std::vector<llama_token> & prefill_tokens = {});
// Variant that takes an explicit initial state (used by tests and clone).
// COUNTING with budget <= 0 is promoted to FORCING.
struct llama_sampler * common_reasoning_budget_init(
const struct llama_vocab * vocab,
const std::vector<llama_token> & start_tokens,
const std::vector<llama_token> & end_tokens,
const std::vector<llama_token> & forced_tokens,
int32_t budget,
common_reasoning_budget_state initial_state);
common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl);
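A minimal usage sketch of the prefill-aware overload, assuming a model whose reasoning block is delimited by <think>/</think>, that vocab and generation_prompt are already available, and that the forced sequence is simply the closing tag:

std::vector<llama_token> start_toks  = common_tokenize(vocab, "<think>",  /*add_special=*/false, /*parse_special=*/true);
std::vector<llama_token> end_toks    = common_tokenize(vocab, "</think>", false, true);
std::vector<llama_token> forced_toks = end_toks; // forced when the budget expires
std::vector<llama_token> prefill     = common_tokenize(vocab, generation_prompt, false, true);
llama_sampler * rbudget = common_reasoning_budget_init(vocab, start_toks, end_toks, forced_toks,
                                                       /*budget=*/256, prefill);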

View File

@@ -1,12 +1,10 @@
#include "sampling.h"
#include "common.h"
#include "fit.h"
#include "ggml.h"
#include "log.h"
#include "reasoning-budget.h"
#include "ggml.h"
#include <algorithm>
#include <cctype>
#include <climits>
@@ -260,35 +258,32 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
}
}
// Compute prefill tokens from the generation prompt
std::vector<llama_token> prefill_tokens;
if (!params.generation_prompt.empty()) {
GGML_ASSERT(vocab != nullptr);
auto tokens = common_tokenize(vocab, params.generation_prompt, false, true);
for (size_t i = 0; i < tokens.size(); i++) {
std::string piece = common_token_to_piece(vocab, tokens[i], true);
if (i == 0 && std::isspace(piece[0]) && !std::isspace(params.generation_prompt[0])) {
// Some tokenizers will add a space before the first special token, need to exclude
continue;
}
LOG_DBG("%s: prefill token: %d = %s\n", __func__, tokens[i], piece.c_str());
prefill_tokens.push_back(tokens[i]);
}
}
// Feed generation prompt tokens to the grammar sampler so it advances past
// tokens the template already placed in the prompt.
// Only applies to output-format and tool-call grammars; user-supplied grammars must not be prefilled.
if (grmr && !params.grammar_lazy && common_grammar_needs_prefill(params.grammar)) {
try {
for (const auto & token : prefill_tokens) {
llama_sampler_accept(grmr, token);
LOG_DBG("%s: grammar accepted prefill token (%d)\n", __func__, token);
std::vector<llama_token> prefill_tokens;
if (!params.generation_prompt.empty() && common_grammar_needs_prefill(params.grammar)) {
GGML_ASSERT(vocab != nullptr);
prefill_tokens = common_tokenize(vocab, params.generation_prompt, false, true);
if (!prefill_tokens.empty()) {
std::string first_token = common_token_to_piece(vocab, prefill_tokens[0], true);
if (std::isspace(first_token[0]) && !std::isspace(params.generation_prompt[0])) {
// Some tokenizers will add a space before the first special token, need to remove
prefill_tokens = std::vector<llama_token>(prefill_tokens.begin() + 1, prefill_tokens.end());
}
}
if (grmr && !params.grammar_lazy) {
try {
for (const auto & token : prefill_tokens) {
llama_sampler_accept(grmr, token);
LOG_DBG("%s: accepted prefill token (%d)\n", __func__, token);
}
} catch (std::exception &e) {
LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__,
common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str());
throw e;
}
} catch (std::exception &e) {
LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__,
common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str());
throw e;
}
}
@@ -299,12 +294,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
params.reasoning_budget_start,
params.reasoning_budget_end,
params.reasoning_budget_forced,
params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens);
for (const auto & token : prefill_tokens) {
llama_sampler_accept(rbudget, token);
LOG_DBG("%s: reasoning-budget accepted prefill token (%d)\n", __func__, token);
}
params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens,
prefill_tokens);
}
if (params.has_logit_bias()) {
@@ -438,7 +429,7 @@ static bool grammar_should_apply(struct common_sampler * gsmpl) {
return true;
}
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool is_generated) {
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
if (!gsmpl) {
return;
}
@@ -446,11 +437,9 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
const auto tm = gsmpl->tm();
// grammar_should_apply() checks the reasoning budget state, so calculate this before we accept
const auto accept_grammar = is_generated && grammar_should_apply(gsmpl);
accept_grammar = accept_grammar && grammar_should_apply(gsmpl);
if (gsmpl->rbudget && is_generated) {
llama_sampler_accept(gsmpl->rbudget, token);
}
llama_sampler_accept(gsmpl->rbudget, token);
if (gsmpl->grmr && accept_grammar) {
llama_sampler_accept(gsmpl->grmr, token);
@@ -522,7 +511,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused);
common_memory_breakdown_print(ctx);
llama_memory_breakdown_print(ctx);
}
}

View File

@@ -41,8 +41,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
void common_sampler_free(struct common_sampler * gsmpl);
// if is_generated is true, the token is accepted by the sampling chain, the reasoning budget sampler, and the grammar sampler
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool is_generated);
// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
void common_sampler_reset (struct common_sampler * gsmpl);
struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);

View File

@@ -61,26 +61,18 @@ static bool common_speculative_are_compatible(
LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
if (vocab_type_tgt != vocab_type_dft) {
LOG_WRN("%s: draft model vocab type must match target model to use speculation but "
"vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
LOG_DBG("%s: draft model vocab type must match target model to use speculation but ", __func__);
LOG_DBG("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
return false;
}
if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
(llama_vocab_get_add_bos(vocab_tgt) && llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft))) {
LOG_WRN("%s: draft model bos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
__func__,
llama_vocab_get_add_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_dft),
llama_vocab_bos(vocab_tgt), llama_vocab_bos(vocab_dft));
return false;
}
if (llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
(llama_vocab_get_add_eos(vocab_tgt) && llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft))) {
LOG_WRN("%s: draft model eos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
__func__,
llama_vocab_get_add_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_dft),
llama_vocab_eos(vocab_tgt), llama_vocab_eos(vocab_dft));
if (
llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
) {
LOG_DBG("%s: draft model special tokens must match target model to use speculation\n", __func__);
return false;
}
@@ -151,9 +143,6 @@ struct common_speculative_state {
llama_tokens & result) = 0;
virtual void accept(uint16_t n_accepted) = 0;
virtual int32_t n_max(const common_params_speculative & params) const = 0;
virtual int32_t n_min(const common_params_speculative & params) const = 0;
};
struct common_speculative_checkpoint {
@@ -167,6 +156,8 @@ struct common_speculative_checkpoint {
size_t size() const {
return data.size();
}
size_t ckpt_size = 0;
};
struct common_speculative_state_draft : public common_speculative_state {
@@ -174,7 +165,7 @@ struct common_speculative_state_draft : public common_speculative_state {
llama_context * ctx_dft;
bool use_ckpt = false;
common_speculative_checkpoint ckpt;
struct common_speculative_checkpoint ckpt;
common_sampler * smpl;
@@ -247,16 +238,26 @@ struct common_speculative_state_draft : public common_speculative_state {
llama_batch_free(batch);
}
void begin(const llama_tokens & /*prompt*/) override {
void begin(const llama_tokens & prompt) override {
if (use_ckpt && ckpt.size() > 0) {
// delete checkpoint
LOG_DBG("%s: delete checkpoint, prompt.size=%zu, pos_min=%d, pos_max=%d, n_tokens=%" PRId64 ", size=%.3f MiB\n",
__func__, prompt.size(), ckpt.pos_min, ckpt.pos_max, ckpt.n_tokens, (float) ckpt.data.size() / 1024 / 1024);
ckpt.pos_min = 0;
ckpt.pos_max = 0;
ckpt.n_tokens = 0;
ckpt.ckpt_size = 0;
ckpt.data.clear();
}
}
size_t create_checkpoint(int n_tokens_prompt) {
size_t draft_create_checkpoint(int n_tokens_prompt, int n_tokens_batch) {
int slot_id = 0;
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
ckpt.pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_dft), slot_id);
ckpt.pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), slot_id);
ckpt.n_tokens = n_tokens_prompt;
ckpt.n_tokens = n_tokens_prompt - n_tokens_batch;
ckpt.data.resize(checkpoint_size);
const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
@@ -269,13 +270,13 @@ struct common_speculative_state_draft : public common_speculative_state {
return n;
}
size_t restore_checkpoint() {
size_t draft_restore_checkpoint(size_t ckpt_size_part_expected) {
int slot_id = 0;
LOG_DBG("%s: pos_min = %d, pos_max = %d\n", __func__, ckpt.pos_min, ckpt.pos_max);
const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
if (n != ckpt.size()) {
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu",
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size());
if (n != ckpt_size_part_expected) {
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu",
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt_size_part_expected, n);
}
llama_memory_seq_rm(llama_get_memory(ctx_dft), slot_id, ckpt.pos_max + 1, -1);
@@ -287,8 +288,6 @@ struct common_speculative_state_draft : public common_speculative_state {
const llama_tokens & prompt_tgt,
llama_token id_last,
llama_tokens & result) override {
const auto & sparams = params.draft;
auto * spec = this;
auto & batch = spec->batch;
@@ -302,7 +301,7 @@ struct common_speculative_state_draft : public common_speculative_state {
int reuse_i = 0; // index of part to be reused in prompt_dft
int reuse_n = 0; // length of part to be reused in prompt_dft
const int n_ctx = llama_n_ctx(ctx_dft) - sparams.n_max;
const int n_ctx = llama_n_ctx(ctx_dft) - params.n_max;
llama_tokens prompt_cnv;
if (!spec->vocab_cmpt) {
@@ -334,18 +333,13 @@ struct common_speculative_state_draft : public common_speculative_state {
const int i_start = std::max<int>(0, (int) prompt_cur.size() - n_ctx);
if (use_ckpt && i_start > 0) {
LOG_WRN("%s: context shift is not supported with checkpoint-based contexts - skipping\n", __func__);
return;
}
// reuse as much as possible from the old draft context
// ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
for (int i = 0; i < (int) prompt_dft.size(); ++i) {
int cur = 0;
while (i_start + cur < (int) prompt_cur.size() &&
i + cur < (int) prompt_dft.size() &&
prompt_cur[i_start + cur] == prompt_dft[i + cur]) {
i + cur < (int) prompt_dft.size() &&
prompt_cur[i_start + cur] == prompt_dft[i + cur]) {
cur++;
}
@@ -353,26 +347,21 @@ struct common_speculative_state_draft : public common_speculative_state {
reuse_i = i;
reuse_n = cur;
}
if (use_ckpt) {
break;
}
}
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, #prompt_dft = %zu, #prompt_cur = %zu\n",
__func__, reuse_i, reuse_n, prompt_dft.size(), prompt_cur.size());
if (use_ckpt && ckpt.n_tokens > reuse_n) {
LOG_DBG("%s: checkpoint (n_tokens = %d) is outdated -> delete it\n", __func__, (int) ckpt.n_tokens);
if (use_ckpt && ckpt.ckpt_size == 0 && reuse_n > 0) {
LOG_DBG("%s: no checkpoint available, no reuse, (reuse_i=%d, reuse_n=%d) -> (0, 0)\n",
__func__, reuse_i, reuse_n);
reuse_i = 0;
reuse_n = 0;
ckpt = {};
}
result.clear();
result.reserve(sparams.n_max);
result.reserve(params.n_max);
bool needs_ckpt = use_ckpt && prompt_dft.size() > 0;
if (reuse_n == 0 || (use_ckpt && reuse_i > 0)) {
llama_memory_clear(mem_dft, false);
prompt_dft.clear();
@@ -383,7 +372,7 @@ struct common_speculative_state_draft : public common_speculative_state {
for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) {
result.push_back(prompt_dft[i]);
if (sparams.n_max <= (int) result.size()) {
if (params.n_max <= (int) result.size()) {
break;
}
}
@@ -391,38 +380,50 @@ struct common_speculative_state_draft : public common_speculative_state {
return;
}
if (reuse_i > 0) {
GGML_ASSERT(!use_ckpt);
bool do_restore = false;
if (prompt_dft.size() > prompt_cur.size() && reuse_i + reuse_n < (int64_t) prompt_dft.size()) {
// This can happen after a partial acceptance (speculative decoding with checkpoints)
LOG_DBG("%s: #prompt_dft=%zu, #prompt_cur=%zu, shorten draft\n",
__func__, prompt_dft.size(), prompt_cur.size());
prompt_dft.resize(prompt_cur.size());
do_restore = true;
}
if (reuse_i > 0) {
bool is_removed = llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
if (!is_removed) {
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_i=%d\n", __func__, reuse_i);
return;
}
llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);
prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
}
if (reuse_n < (int) prompt_dft.size()) {
if (reuse_n < (int) prompt_dft.size() || do_restore) {
if (use_ckpt) {
if (ckpt.n_tokens > 0) {
LOG_DBG("%s: restoring checkpoint, reuse_n=%d, prompt_dft.size=%zu\n", __func__, reuse_n, prompt_dft.size());
restore_checkpoint();
reuse_n = ckpt.n_tokens;
prompt_dft.resize(reuse_n);
if (ckpt.n_tokens > (int64_t) prompt_dft.size()) {
LOG_INF("%s: checkpoint is too large, prompt_tgt.size=%zu, ckpt.n_tokens=%" PRId64 ", reuse_n=%d, prompt_dft.size=%zu\n",
__func__, prompt_tgt.size(), ckpt.n_tokens, reuse_n, prompt_dft.size());
}
draft_restore_checkpoint(ckpt.ckpt_size);
reuse_n = ckpt.n_tokens;
prompt_dft.resize(reuse_n);
needs_ckpt = false;
} else {
const bool is_removed = llama_memory_seq_rm(mem_dft, 0, reuse_n, -1);
bool is_removed = llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
if (!is_removed) {
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_n=%d, prompt_dft.size=%zu\n", __func__, reuse_n, prompt_dft.size());
return;
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_n=%d, prompt_dft.size=%zu\n",
__func__, reuse_n, prompt_dft.size());
}
prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
}
}
}
if (needs_ckpt) {
ckpt.ckpt_size = draft_create_checkpoint(prompt_dft.size(), batch.n_tokens);
}
// prepare a batch to evaluate any new tokens in the prompt
common_batch_clear(batch);
@@ -436,17 +437,12 @@ struct common_speculative_state_draft : public common_speculative_state {
// we should rarely end up here during normal decoding
if (batch.n_tokens > 0) {
//LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
LOG_DBG("%s: draft prompt batch: %d tokens\n", __func__, batch.n_tokens);
int ret = llama_decode(ctx_dft, batch);
if (ret != 0 && ret != 1) {
LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu\n",
__func__, ret, prompt_cur.size());
}
if (use_ckpt) {
create_checkpoint(prompt_dft.size());
}
}
const llama_pos n_past = prompt_dft.size();
@@ -458,7 +454,7 @@ struct common_speculative_state_draft : public common_speculative_state {
prompt_dft.push_back(id_last);
//LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
int ret = llama_decode(ctx_dft, batch);
if (ret != 0 && ret != 1) {
@@ -469,7 +465,7 @@ struct common_speculative_state_draft : public common_speculative_state {
common_sampler_reset(smpl);
// sample n_draft tokens from the draft model
for (int i = 0; i < sparams.n_max; ++i) {
for (int i = 0; i < params.n_max; ++i) {
common_batch_clear(batch);
common_sampler_sample(smpl, ctx_dft, 0, true);
@@ -486,14 +482,14 @@ struct common_speculative_state_draft : public common_speculative_state {
common_sampler_accept(smpl, id, true);
// only collect very high-confidence draft tokens
if (cur_p->data[0].p < sparams.p_min) {
result.push_back(id);
if (params.n_max <= (int) result.size()) {
break;
}
result.push_back(id);
if (sparams.n_max <= (int) result.size()) {
// only collect very high-confidence draft tokens
if (cur_p->data[0].p < params.p_min) {
break;
}
@@ -514,14 +510,10 @@ struct common_speculative_state_draft : public common_speculative_state {
detokenized = replace_to_tgt(detokenized);
LOG_DBG("draft->main detokenized string: '%s'\n", detokenized.c_str());
result = common_tokenize(ctx_tgt, detokenized, false, true);
if (result.size() > (size_t) sparams.n_max) {
result.resize(sparams.n_max);
if (result.size() > (size_t)params.n_max) {
result.resize(params.n_max);
}
}
if (result.size() < (size_t) sparams.n_min) {
result.clear();
}
}
void accept(uint16_t n_accepted) override {
@@ -529,14 +521,6 @@ struct common_speculative_state_draft : public common_speculative_state {
GGML_UNUSED(n_accepted);
}
int32_t n_max(const common_params_speculative & params) const override {
return params.draft.n_max;
}
int32_t n_min(const common_params_speculative & params) const override {
return params.draft.n_min;
}
std::string replace_to_dft(const std::string & input) const {
std::string result = input;
@@ -589,14 +573,6 @@ struct common_speculative_state_eagle3 : public common_speculative_state {
// noop
GGML_UNUSED(n_accepted);
}
int32_t n_max(const common_params_speculative & params) const override {
return params.draft.n_max;
}
int32_t n_min(const common_params_speculative & params) const override {
return params.draft.n_min;
}
};
// state of self-speculation (simple implementation, not ngram-map)
@@ -626,27 +602,19 @@ struct common_speculative_state_ngram_simple : public common_speculative_state {
// noop
GGML_UNUSED(n_accepted);
}
int32_t n_max(const common_params_speculative & /*params*/) const override {
return config.size_mgram;
}
int32_t n_min(const common_params_speculative & /*params*/) const override {
return config.size_mgram;
}
};
struct common_speculative_state_ngram_map_k : public common_speculative_state {
// draft ngram map for speculative decoding without draft model
common_ngram_map config;
common_ngram_map map;
common_speculative_state_ngram_map_k(
enum common_speculative_type type,
common_ngram_map config)
: common_speculative_state(type), config(std::move(config)) {}
common_ngram_map map)
: common_speculative_state(type), map(std::move(map)) {}
void begin(const llama_tokens & prompt) override {
common_ngram_map_begin(config, prompt);
common_ngram_map_begin(map, prompt);
}
void draft(
@@ -654,20 +622,12 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state {
const llama_tokens & prompt_tgt,
llama_token id_last,
llama_tokens & result) override {
common_ngram_map_draft(config, prompt_tgt, id_last, result);
common_ngram_map_draft(map, prompt_tgt, id_last, result);
GGML_UNUSED(params);
}
void accept(uint16_t n_accepted) override {
common_ngram_map_accept(config, n_accepted);
}
int32_t n_max(const common_params_speculative & /*params*/) const override {
return config.size_value;
}
int32_t n_min(const common_params_speculative & /*params*/) const override {
return config.size_value;
common_ngram_map_accept(map, n_accepted);
}
};
@@ -724,7 +684,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
const llama_tokens & prompt_tgt,
llama_token id_last,
llama_tokens & result) override {
const auto & sparams = params.ngram_mod;
GGML_UNUSED(params);
n_draft_last = 0;
@@ -744,16 +704,16 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
i_last = cur_len - n;
}
result.resize(n + sparams.n_max);
result.resize(n + params.n_max);
for (size_t i = 0; i < n - 1; ++i) {
result[i] = prompt_tgt[cur_len - n + 1 + i];
}
result[n - 1] = id_last;
for (int i = 0; i < sparams.n_max; ++i) {
for (int i = 0; i < params.n_max; ++i) {
const llama_token token = mod.get(result.data() + i);
if (token == common_ngram_mod::EMPTY) {
if (i < sparams.n_min) {
if (i < params.n_min) {
result.clear();
return;
}
@@ -775,33 +735,26 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
}
void accept(uint16_t n_accepted) override {
if (verbose) {
LOG_INF("%s: accepted %d tokens from %zu drafted tokens\n", __func__, n_accepted, n_draft_last);
}
// compute acceptance fraction if we have a recorded draft length
if (n_draft_last > 0) {
const double f_acc = (double)n_accepted / (double)n_draft_last;
if (f_acc < 0.5) {
n_low++;
if (n_low >= 3) {
if (verbose) {
LOG_WRN("%s: low acceptance streak (%d) resetting ngram_mod\n", __func__, n_low);
}
LOG_WRN("%s: low acceptance streak (%d) resetting ngram_mod\n", __func__, n_low);
mod.reset();
n_low = 0;
i_last = 0;
}
} else {
n_low = 0;
}
}
}
int32_t n_max(const common_params_speculative & params) const override {
return params.ngram_mod.n_max;
}
int32_t n_min(const common_params_speculative & params) const override {
return params.ngram_mod.n_min;
}
};
struct common_speculative_state_ngram_cache : public common_speculative_state {
@@ -895,14 +848,6 @@ struct common_speculative_state_ngram_cache : public common_speculative_state {
// TODO: noop
GGML_UNUSED(n_accepted);
}
int32_t n_max(const common_params_speculative & /*params*/) const override {
return n_draft;
}
int32_t n_min(const common_params_speculative & /*params*/) const override {
return 0;
}
};
struct common_speculative {
@@ -911,13 +856,11 @@ struct common_speculative {
common_speculative_state * curr_impl = nullptr; // current implementation in use (for stats)
};
static common_ngram_map get_common_ngram_map(
common_speculative_type type,
const common_params_speculative_ngram_map & config) {
uint16_t size_key = config.size_n;
uint16_t size_value = config.size_m;
bool key_only = type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
uint16_t min_hits = config.min_hits;
static common_ngram_map get_common_ngram_map(const common_speculative_config & config) {
uint16_t size_key = config.params.ngram_size_n;
uint16_t size_value = config.params.ngram_size_m;
bool key_only = (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K);
uint16_t min_hits = config.params.ngram_min_hits;
return common_ngram_map(size_key, size_value, key_only, min_hits);
}
@@ -975,8 +918,8 @@ common_speculative * common_speculative_init(
common_params_speculative & params,
llama_context * ctx_tgt) {
llama_context * ctx_dft = nullptr;
if (params.draft.model) {
ctx_dft = llama_init_from_model(params.draft.model, params.draft.cparams);
if (params.model_dft) {
ctx_dft = llama_init_from_model(params.model_dft, params.cparams_dft);
if (ctx_dft == nullptr) {
LOG_ERR("%s", "failed to create draft context\n");
return nullptr;
@@ -986,7 +929,7 @@ common_speculative * common_speculative_init(
// Compute the implementations to use based on the config and their order of preference
std::vector<common_speculative_config> configs = {}; // list of speculative configs to try
{
bool has_draft = !params.draft.mparams.path.empty();
bool has_draft = !params.mparams_dft.path.empty();
bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3
bool has_ngram_cache = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_CACHE);
@@ -1009,17 +952,16 @@ common_speculative * common_speculative_init(
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, params));
}
if (has_ngram_mod) {
auto & sparams = params.ngram_mod;
// shared instance for all speculative decoding contexts
if (!params.ngram_mod) {
params.ngram_mod = std::make_shared<common_ngram_mod>(params.ngram_size_n, 4*1024*1024);
if (!sparams.obj) {
sparams.obj = std::make_shared<common_ngram_mod>(sparams.n_match, 4*1024*1024);
LOG_INF("%s: initialized ngram_mod with n=%d, size=%zu (%.3f MB)\n", __func__,
params.ngram_size_n, params.ngram_mod->size(),
(float)(params.ngram_mod->size_bytes())/1024/1024);
LOG_INF("%s: initialized ngram_mod with n_match=%d, size=%zu (%.3f MB)\n", __func__,
sparams.n_match, sparams.obj->size(), (float)(sparams.obj->size_bytes())/1024/1024);
if (sparams.n_match < 16) {
LOG_WRN("%s: ngram_mod n_match=%d is too small - poor quality is possible, "
"see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, sparams.n_match);
if (params.ngram_size_n < 16) {
LOG_WRN("%s: ngram_mod n=%d is too small - poor quality is possible, see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, params.ngram_size_n);
}
}
@@ -1049,7 +991,7 @@ common_speculative * common_speculative_init(
impls.push_back(std::make_unique<common_speculative_state_draft>(config.type,
/* .ctx_tgt = */ ctx_tgt,
/* .ctx_dft = */ ctx_dft,
/* .replacements = */ params.draft.replacements,
/* .replacements = */ params.replacements,
/* .use_ckpt = */ use_ckpt
));
break;
@@ -1059,18 +1001,18 @@ common_speculative * common_speculative_init(
break;
}
case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: {
common_ngram_map ngram_map = get_common_ngram_map(config.type, config.params.ngram_simple);
common_ngram_map ngram_map = get_common_ngram_map(config);
uint16_t ngram_size_key = ngram_map.size_key;
uint16_t mgram_size_value = ngram_map.size_value;
auto config_simple = common_ngram_simple_config {
/* .size_ngram = */ ngram_size_key,
/* .size_mgram = */ mgram_size_value
/* .size_ngram = */ ngram_size_key,
/* .size_mgram = */ mgram_size_value
};
auto state = std::make_unique<common_speculative_state_ngram_simple>(
/* .type = */ config.type,
/* .state = */ config_simple
/* .type = */ config.type,
/* .state = */ config_simple
);
impls.push_back(std::move(state));
break;
@@ -1079,17 +1021,18 @@ common_speculative * common_speculative_init(
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: {
impls.push_back(std::make_unique<common_speculative_state_ngram_map_k>(
(config.type),
get_common_ngram_map(config.type, config.params.ngram_map_k)
get_common_ngram_map(config)
));
break;
}
case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: {
GGML_ASSERT(config.params.ngram_mod.obj);
impls.push_back(std::make_unique<common_speculative_state_ngram_mod>(config.type, *config.params.ngram_mod.obj));
GGML_ASSERT(config.params.ngram_mod);
impls.push_back(std::make_unique<common_speculative_state_ngram_mod>(config.type, *config.params.ngram_mod));
break;
}
case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: {
auto state = create_state_ngram_cache(params.ngram_cache.lookup_cache_static, params.ngram_cache.lookup_cache_dynamic, config);
auto state = create_state_ngram_cache(
params.lookup_cache_static, params.lookup_cache_dynamic, config);
impls.push_back(std::make_unique<common_speculative_state_ngram_cache>(state));
break;
}
@@ -1147,15 +1090,6 @@ llama_tokens common_speculative_draft(
impl->n_call_draft++;
}
{
const int n_min = impl->n_min(params);
if (!result.empty() && (int) result.size() < n_min) {
LOG_DBG("%s: ignoring small draft: %d < %d\n", __func__, (int) result.size(), n_min);
result.clear();
}
}
if (!result.empty()) {
LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__,
common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(),
@@ -1165,7 +1099,7 @@ llama_tokens common_speculative_draft(
impl->n_gen_drafts++;
impl->n_gen_tokens += result.size();
break; // we have a draft, so break out of the loop and return it.
break; // We have a draft, so break out of the loop and return it.
}
}
@@ -1193,32 +1127,6 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) {
}
}
int32_t common_speculative_n_max(const common_speculative * spec, const common_params_speculative & params) {
if (spec == nullptr) {
return 0;
}
int32_t n_max = 0;
for (const auto & impl : spec->impls) {
n_max = std::max(n_max, impl->n_max(params));
}
return n_max;
}
int32_t common_speculative_n_min(const common_speculative * spec, const common_params_speculative & params) {
if (spec == nullptr) {
return 0;
}
int32_t n_min = 0;
for (const auto & impl : spec->impls) {
n_min = std::max(n_min, impl->n_min(params));
}
return n_min;
}
void common_speculative_print_stats(const common_speculative * spec) {
if (spec == nullptr) {
return;

View File

@@ -33,9 +33,6 @@ llama_tokens common_speculative_draft(
// informs the speculative decoder that n_accepted tokens were accepted by the target model
void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);
int32_t common_speculative_n_max(const common_speculative * spec, const common_params_speculative & params);
int32_t common_speculative_n_min(const common_speculative * spec, const common_params_speculative & params);
// print statistics about the speculative decoding
void common_speculative_print_stats(const common_speculative * spec);

View File

@@ -272,22 +272,6 @@ class ModelBase:
return tensors
@staticmethod
def _scale_is_trivial(scale: Tensor) -> bool:
return scale.numel() <= 1 and abs(float(scale.float().sum()) - 1.0) < 1e-6
def _write_scale_tensor(self, scale_name: str, scale: Tensor):
if not self._scale_is_trivial(scale):
scale_f32 = scale.float().numpy().flatten()
logger.info(f" + {scale_name} (per-tensor scale, shape [{scale_f32.size}])")
self.gguf_writer.add_tensor(scale_name, scale_f32)
def _write_scales_tensor(self, scale_name: str, scales: list[float]):
if not np.allclose(scales, 1.0, atol=1e-6):
scale_vals = np.array(scales, dtype=np.float32)
logger.info(f" + {scale_name} (per-expert scale, shape [{len(scales)}])")
self.gguf_writer.add_tensor(scale_name, scale_vals)
def dequant_model(self):
# If all quantized tensors were already handled (e.g. pure NVFP4), skip
if self._is_nvfp4 and not any(k.endswith((".weight_scale", ".weight_scale_inv")) for k in self.model_tensors):
@@ -510,7 +494,7 @@ class ModelBase:
s = self.model_tensors[name]
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None)
tensors_to_remove.append(name)
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
if name.endswith((".k_scale", ".v_scale")):
tensors_to_remove.append(name)
elif quant_method is not None:
raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
@@ -618,6 +602,10 @@ class ModelBase:
raw = np.concatenate([d_grouped, qs_grouped], axis=-1).reshape(out_features, n_super * 36)
return raw, [out_features, n_super * 64]
@staticmethod
def _nvfp4_scale2_is_trivial(scale2: Tensor) -> bool:
return scale2.numel() <= 1 and abs(float(scale2.float().sum()) - 1.0) < 1e-6
def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor):
if "language_model." in name:
name = name.replace("language_model.", "")
@@ -628,8 +616,19 @@ class ModelBase:
logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4")
self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
self._write_scale_tensor(new_name.replace(".weight", ".scale"), scale2)
self._write_scale_tensor(new_name.replace(".weight", ".input_scale"), input_scale)
# Emit per-tensor scale2 as a separate F32 tensor when non-trivial
if not self._nvfp4_scale2_is_trivial(scale2):
scale2_f32 = scale2.float().numpy().flatten()
scale_name = new_name.replace(".weight", ".scale")
logger.info(f" + {scale_name} (per-tensor NVFP4 scale2, shape [{scale2_f32.size}])")
self.gguf_writer.add_tensor(scale_name, scale2_f32)
# Emit per-tensor input_scale as a separate F32 tensor when non-trivial
if not self._nvfp4_scale2_is_trivial(input_scale):
input_scale_f32 = input_scale.float().numpy().flatten()
input_scale_name = new_name.replace(".weight", ".input_scale")
logger.info(f" + {input_scale_name} (per-tensor NVFP4 input_scale, shape [{input_scale_f32.size}])")
self.gguf_writer.add_tensor(input_scale_name, input_scale_f32)
def _generate_nvfp4_tensors(self):
# Per-layer expert merging to avoid holding all experts in memory
@@ -720,17 +719,24 @@ class ModelBase:
logger.info(f"Repacked {new_name} with shape [{len(experts)}, {shape[0]}, {shape[1]}] and quantization NVFP4")
self.gguf_writer.add_tensor(new_name, merged, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
# Emit per-expert scale2 tensor if any expert has non-trivial scale2
scales.sort(key=lambda x: x[0])
self._write_scales_tensor(new_name.replace(".weight", ".scale"), [s[1] for s in scales])
scale_vals = np.array([s[1] for s in scales], dtype=np.float32)
if not np.allclose(scale_vals, 1.0, atol=1e-6):
scale_name = new_name.replace(".weight", ".scale")
logger.info(f" + {scale_name} (per-expert NVFP4 scale2, shape [{len(scales)}])")
self.gguf_writer.add_tensor(scale_name, scale_vals)
# Emit per-expert input_scale tensor if any expert has non-trivial input_scale
input_scales.sort(key=lambda x: x[0])
self._write_scales_tensor(new_name.replace(".weight", ".input_scale"), [s[1] for s in input_scales])
input_scale_vals = np.array([s[1] for s in input_scales], dtype=np.float32)
if not np.allclose(input_scale_vals, 1.0, atol=1e-6):
input_scale_name = new_name.replace(".weight", ".input_scale")
logger.info(f" + {input_scale_name} (per-expert NVFP4 input_scale, shape [{len(input_scales)}])")
self.gguf_writer.add_tensor(input_scale_name, input_scale_vals)
del experts, merged
def _needs_nvfp4_processing(self) -> bool:
return True
def prepare_tensors(self):
# detect NVFP4 quantization (ModelOpt format)
quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
@@ -740,12 +746,7 @@ class ModelBase:
if (not quant_algo or not quant_layers) and quant_config_file.is_file():
with open(quant_config_file, "r", encoding="utf-8") as f:
hf_quant_config = json.load(f)
quant_config = hf_quant_config.get("quantization") or {}
producer = hf_quant_config.get("producer") or {}
producer_name = (producer.get("name") or "").lower()
if quant_method is None:
self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name
quant_config = json.load(f).get("quantization") or {}
quant_algo = quant_config.get("quant_algo", quant_algo)
quant_layers = quant_config.get("quantized_layers", quant_layers) or {}
@@ -761,7 +762,7 @@ class ModelBase:
# NVFP4 weights are repacked and written directly to gguf_writer.
# This must run before dequant_model so NVFP4 tensors are removed
# from model_tensors, leaving only non-NVFP4 (e.g. FP8) for dequant.
if self._is_nvfp4 and self._needs_nvfp4_processing():
if self._is_nvfp4:
self._generate_nvfp4_tensors()
self.dequant_model()
@@ -2193,10 +2194,6 @@ class MmprojModel(ModelBase):
# merge configs
self.preprocessor_config = {**self.preprocessor_config, **cfg}
def _needs_nvfp4_processing(self) -> bool:
# nvfp4 quantization applies to the text model only.
return False
def get_vision_config(self) -> dict[str, Any] | None:
config_name = "vision_config" if not self.is_mistral_format else "vision_encoder"
return self.global_config.get(config_name)
@@ -4457,12 +4454,6 @@ class NemotronNanoV2VLModel(MmprojModel):
}
return vision_config
def dequant_model(self):
if self._is_nvfp4:
# Skip nvfp4 quantization for vision/audio model.
return
super().dequant_model()
def set_gguf_parameters(self):
if "image_mean" not in self.preprocessor_config:
self.preprocessor_config["image_mean"] = [0.485, 0.456, 0.406]
@@ -4486,10 +4477,6 @@ class NemotronNanoV2VLModel(MmprojModel):
if "input_conditioner" in name:
return
# mtmd does not support video yet so skip tensors related to video.
if "radio_model.model.patch_generator.video_embedder" in name:
return
# RADIO's pos_embed doesn't have .weight suffix, but clip.cpp expects it
if "patch_generator.pos_embed" in name:
if not name.endswith(".weight"):
@@ -6658,7 +6645,7 @@ class BertModel(TextModel):
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size # ty: ignore[invalid-assignment]
if isinstance(tokenizer, SentencePieceProcessor):
for token_id in range(tokenizer.vocab_size()):
@@ -10837,11 +10824,7 @@ class NemotronHModel(GraniteHybridModel):
# uses self.model_arch to build the tensor name map, and all MoE-specific
# mappings would be missed if it were called with the default non-MoE arch.
hparams = ModelBase.load_hparams(args[0], self.is_mistral_format)
has_moe_params = (
"num_experts_per_tok" in hparams
or (isinstance(hparams.get("llm_config"), dict) and "num_experts_per_tok" in hparams["llm_config"])
)
if has_moe_params:
if "num_experts_per_tok" in hparams:
self.model_arch = gguf.MODEL_ARCH.NEMOTRON_H_MOE
self.is_moe = True
@@ -10988,11 +10971,6 @@ class NemotronHModel(GraniteHybridModel):
if name.startswith(("vision_model.", "mlp1.")):
return
if name.startswith(("sound_encoder.")):
return
if name.startswith(("sound_projection.")):
return
# Strip language_model. prefix for VLM models (e.g., Nemotron Nano 12B v2 VL)
if name.startswith("language_model."):
name = name[len("language_model."):]
@@ -11877,7 +11855,7 @@ class LLaDAMoEModel(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("HunYuanDenseV1ForCausalLM")
@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
class HunYuanModel(TextModel):
model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
@@ -12016,58 +11994,28 @@ class HunYuanModel(TextModel):
@ModelBase.register("HunYuanVLForConditionalGeneration")
class HunyuanVLVisionModel(MmprojModel):
# Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name
# "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout.
# Each variant maps to a different projector type in clip.cpp so image
# preprocessing follows the correct code path.
class HunyuanOCRVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
# HunyuanOCR / HunyuanVL uses max_image_size instead of image_size
# HunyuanOCR uses max_image_size instead of image_size
if "image_size" not in self.hparams_vision:
self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
@staticmethod
def is_ocr_variant(hparams: dict) -> bool:
"""Return True for HunyuanOCR, False for HunyuanVL.
The projector's output dim must equal the text model's hidden_size by
construction (that's what "projector" means). HunyuanOCR pairs a 1B text
backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the
ViT -> LLM projection dim is a hard architectural signature, not a
magic number.
"""
vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0))
return vision_out == 1024
def set_gguf_parameters(self):
super().set_gguf_parameters()
assert self.hparams_vision is not None
vcfg = self.hparams_vision
if self.is_ocr_variant(self.global_config):
# --- HunyuanOCR ---
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
self.gguf_writer.add_vision_use_gelu(True)
self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
return
# --- HunyuanVL ---
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL)
self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu")
self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"]))
self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"]))
self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"]))
self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"]))
hparams = self.hparams_vision
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
self.gguf_writer.add_vision_use_gelu(True)
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5))
self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if not name.startswith("vit."):
return
return # skip text tensors
# strip CLS token (row 0) from position embeddings so resize_position_embeddings works
if "position_embedding" in name:
data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd]
@@ -12075,66 +12023,11 @@ class HunyuanVLVisionModel(MmprojModel):
def tensor_force_quant(self, name, new_name, bid, n_dims):
# force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
# Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)
@ModelBase.register("HunYuanVLForConditionalGeneration")
class HunyuanVLTextModel(HunYuanModel):
# The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR
# and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE),
# while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from
# the config and pick the matching GGUF architecture.
model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
@staticmethod
def _is_ocr_config(hparams: dict) -> bool:
# OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that
# outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with
# HunyuanVLVisionModel.is_ocr_variant.
return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024
def __init__(self, dir_model: Path, *args, **kwargs):
raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False)
if self._is_ocr_config(raw_hparams):
self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
else:
self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
super().__init__(dir_model, *args, **kwargs)
def set_gguf_parameters(self):
super().set_gguf_parameters()
# Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses
# the HunYuan-Dense arch which already handles standard rope in super().
if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL:
return
if self.rope_parameters.get("rope_type") != "xdrope":
return
# defaults for HunyuanVL. The C++ side later computes:
# freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2))
self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"]))
self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"]))
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_rope_scaling_factor(float(self.rope_parameters.get("factor", 1)))
ctx_len = int(self.hparams["max_position_embeddings"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(ctx_len)
self.gguf_writer.add_context_length(ctx_len)
self.gguf_writer.add_rope_dimension_sections(list(self.rope_parameters["xdrope_section"]))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Skip vision tensors — they are written by HunyuanVLVisionModel
if name.startswith("vit."):
return
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("SmolLM3ForCausalLM")
class SmolLM3Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.SMOLLM3

View File

@@ -244,6 +244,7 @@ build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf"
- `-fa 1` is required when running llama-bench with the OpenVINO backend.
- `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1`
- `llama-server` with the OpenVINO backend supports only one chat session/thread when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
- For Intel GPU and NPU detection in containers, the GPU and NPU user-space drivers/libraries must be present inside the image. This will be included in a future PR; until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile) (see the run sketch after the note below).
> [!NOTE]
> The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved.
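Since the current image does not yet bundle the GPU/NPU user-space drivers, the sketch below shows one way to run a container with the Intel devices exposed. It is only an illustration: the image name `llama-cpp-openvino` and the device nodes (`/dev/dri` for the GPU, `/dev/accel/accel0` for the NPU) are assumptions and may differ on your system.
```bash
# Illustrative only: expose the Intel GPU/NPU device nodes and mount the models directory.
# The image name and device paths are assumptions, not part of the official instructions.
docker run -it --rm \
  --device /dev/dri \
  --device /dev/accel/accel0 \
  -v ~/models:/models \
  llama-cpp-openvino \
  llama-cli -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf -p "Hello"
```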
@@ -273,6 +274,8 @@ docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_p
Run the llama.cpp OpenVINO backend Docker container.
Save sample models in `~/models` as [shown above](#3-download-sample-model); they will be mounted into the container in the examples below.
> [!NOTE]
> Intel GPU and NPU detection in containers will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile).
```bash
# Run Docker container

View File

@@ -31,8 +31,6 @@ SYCL cross-platform capabilities enable support for other vendor GPUs as well.
## Recommended Release
### Windows
The following releases are verified and recommended:
|Commit ID|Tag|Release|Verified Platform| Update date|
@@ -41,22 +39,9 @@ The following releases are verified and recommended:
|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
### Ubuntu 24.04
The release packages for Ubuntu 24.04 x64 (FP32/FP16) only include the binary files of the llama.cpp SYCL backend. They require the target machine to have the Intel GPU drivers and oneAPI packages pre-installed, with the same version as the one used to build the package. For the version and installation info, refer to release.yml: ubuntu-24-sycl -> Download & Install oneAPI.
It is recommended to use them with Intel Docker.
The FP32 and FP16 packages may differ in accuracy and performance on LLMs. Please choose according to your test results.
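To check that the target machine's oneAPI installation matches the version used to build the package, something like the following can be used (a sketch assuming the default install location under `/opt/intel/oneapi`):
```sh
# Load the oneAPI environment, then print the DPC++ compiler version and the visible SYCL devices.
source /opt/intel/oneapi/setvars.sh
icpx --version
sycl-ls
```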
## News
- 2026.04
- Optimize mul_mat via the reorder feature for the data types: Q4_K, Q5_K, Q_K, Q8_0.
- Fused MoE.
- Upgrade CI and the built package to oneAPI 2025.3.3; support the Ubuntu 24.04 built package.
- 2026.03
- Support Flash-Attention: less memory usage, performance impact depends on LLM.
@@ -244,7 +229,6 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
|Verified release|
|-|
|2025.3.3 |
|2025.2.1|
|2025.1|
|2024.1|
@@ -355,12 +339,6 @@ Choose one of following methods to run.
./examples/sycl/test.sh
```
- Run llama-server:
```sh
./examples/sycl/start-svr.sh -m PATH/MODEL_FILE
```
2. Command line
Launch inference
@@ -649,18 +627,10 @@ Choose one of following methods to run.
1. Script
- Run test:
```
examples\sycl\win-test.bat
```
- Run llama-server:
```
examples\sycl\win-start-svr.bat -m PATH\MODEL_FILE
```
2. Command line
Launch inference

View File

@@ -249,27 +249,18 @@ build: 6a8cf8914 (6733)
```
- `GGML_HEXAGON_PROFILE=1`
Enables Op profiling:
Generates a host-side profile for the ggml-hexagon Ops.
- `1` Basic profile with per-op `usecs` and `cycles` counters
- `2` Extended profile with per-op `usecs`, `cycles` and default PMU counter data
- `0x1,...,0x8` Extended profile with per-op `usecs`, `cycles` and custom PMU counter data
The logging output can either be saved to a file for post-processing or piped directly into the post-processing tool to generate the report.
Examples:
`GGML_HEXAGON_PROFILE=1 llama-completion ... |& ./scripts/snapdragon/ggml-hexagon-profile.py -`
- `GGML_HEXAGON_OPSTAGE=0x0`
Allows enabling specific stages of the Op processing pipeline:
- `GGML_HEXAGON_OPMASK=0x0`
Allows enabling specific stages of the processing pipeline:
- `0x1` Enable Op Queue (i.e., queuing Ops into NPU)
- `0x2` Enable Op Compute (MUL_MAT, etc.)
Examples:
`GGML_HEXAGON_OPSTAGE=0x1 llama-completion ...` - Ops are enqueued to the NPU but dma & compute are disabled
`GGML_HEXAGON_OPSTAGE=0x3 llama-completion ...` - Full queuing and processing of Ops (default)
`GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out
`GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - Full queuing and processing of Ops (default)
- `GGML_HEXAGON_OPFILTER=regex`
Allows filtering (disabling) Ops that match the regex pattern:
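For example (illustrative only; `MUL_MAT` is one of the Op names mentioned above, and the exact names depend on the build):
`GGML_HEXAGON_OPFILTER=MUL_MAT llama-completion ...` - Ops whose name matches `MUL_MAT` are disabled (filtered out)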

View File

@@ -26,7 +26,7 @@ Legend:
| CLAMP | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | | ❌ | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | | ❌ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
@@ -60,7 +60,7 @@ Legend:
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ❌ | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ❌ | ❌ |
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
@@ -105,7 +105,7 @@ Legend:
| SQR | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | | ❌ | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SUM | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | 🟡 | ❌ | ❌ |

File diff suppressed because it is too large

View File

@@ -202,14 +202,10 @@ static bool run(llama_context * ctx, const common_params & params) {
print_tokenized_prompt(ctx, tokens, params.prompt);
if (params.save_logits) {
try {
output_data output {ctx, model, params};
std::filesystem::path model_path{params.model.path};
std::string model_name{model_path.stem().string()};
save_output_data(output, model_name, params.logits_output_dir);
} catch (const std::exception & e) {
LOG_ERR("%s : error saving logits: %s\n", __func__, e.what());
}
output_data output {ctx, model, params};
std::filesystem::path model_path{params.model.path};
std::string model_name{model_path.stem().string()};
save_output_data(output, model_name, params.logits_output_dir);
}
return true;
@@ -227,7 +223,7 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
std::optional<common_debug_cb_user_data> cb_data;
std::optional<base_callback_data> cb_data;
if (!params.save_logits) {
cb_data.emplace(params, params.tensor_filter);
}

View File

@@ -3,6 +3,7 @@
#include "debug.h"
#include "log.h"
#include "llama.h"
#include "llama-cpp.h"
#include <clocale>
#include <string>
@@ -37,7 +38,7 @@ static bool run(llama_context * ctx, const common_params & params) {
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_debug_cb_user_data cb_data;
base_callback_data cb_data;
common_params params;
@@ -52,7 +53,7 @@ int main(int argc, char ** argv) {
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
params.cb_eval = common_debug_cb_eval;
params.cb_eval = common_debug_cb_eval<false>;
params.cb_eval_user_data = &cb_data;
params.warmup = false;

View File

@@ -73,12 +73,12 @@ static void write_help(std::ostringstream & ss, const md_file & md) {
auto ctx_arg = common_params_parser_init(params, md.ex);
std::vector<common_arg *> common_options;
std::vector<common_arg *> sampling_options;
std::vector<common_arg *> sparam_options;
std::vector<common_arg *> specific_options;
for (auto & opt : ctx_arg.options) {
// in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
if (opt.is_sampling) {
sampling_options.push_back(&opt);
if (opt.is_sparam) {
sparam_options.push_back(&opt);
} else if (opt.in_example(ctx_arg.ex)) {
specific_options.push_back(&opt);
} else {
@@ -93,7 +93,7 @@ static void write_help(std::ostringstream & ss, const md_file & md) {
ss << "### Common params\n\n";
write_table(ss, common_options);
ss << "\n\n### Sampling params\n\n";
write_table(ss, sampling_options);
write_table(ss, sparam_options);
ss << "\n\n### " << md.specific_section_header << "\n\n";
write_table(ss, specific_options);

View File

@@ -37,9 +37,9 @@ int main(int argc, char ** argv){
common_ngram_cache ngram_cache;
common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.speculative.ngram_cache.lookup_cache_static.c_str());
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.speculative.lookup_cache_static.c_str());
common_ngram_cache_save(ngram_cache, params.speculative.ngram_cache.lookup_cache_static);
common_ngram_cache_save(ngram_cache, params.speculative.lookup_cache_static);
return 0;
}

View File

@@ -24,7 +24,7 @@ int main(int argc, char ** argv){
return 1;
}
const int n_draft = params.speculative.draft.n_max;
const int n_draft = params.speculative.n_max;
// init llama.cpp
llama_backend_init();
@@ -49,18 +49,18 @@ int main(int argc, char ** argv){
{
const int64_t t_start_draft_us = ggml_time_us();
if (!params.speculative.ngram_cache.lookup_cache_static.empty()) {
if (!params.speculative.lookup_cache_static.empty()) {
try {
ngram_cache_static = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_static);
ngram_cache_static = common_ngram_cache_load(params.speculative.lookup_cache_static);
} catch (std::ifstream::failure const &) {
LOG_ERR("failed to open static lookup cache: %s", params.speculative.ngram_cache.lookup_cache_static.c_str());
LOG_ERR("failed to open static lookup cache: %s", params.speculative.lookup_cache_static.c_str());
exit(1);
}
}
if (!params.speculative.ngram_cache.lookup_cache_dynamic.empty()) {
if (!params.speculative.lookup_cache_dynamic.empty()) {
try {
ngram_cache_dynamic = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_dynamic);
ngram_cache_dynamic = common_ngram_cache_load(params.speculative.lookup_cache_dynamic);
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
}

View File

@@ -25,7 +25,7 @@ int main(int argc, char ** argv){
}
// max. number of additional tokens to draft if match is found
const int n_draft = params.speculative.draft.n_max;
const int n_draft = params.speculative.n_max;
// init llama.cpp
llama_backend_init();
@@ -54,18 +54,18 @@ int main(int argc, char ** argv){
const int64_t t_start_draft_us = ggml_time_us();
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
if (!params.speculative.ngram_cache.lookup_cache_static.empty()) {
if (!params.speculative.lookup_cache_static.empty()) {
try {
ngram_cache_static = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_static);
ngram_cache_static = common_ngram_cache_load(params.speculative.lookup_cache_static);
} catch (std::ifstream::failure const &) {
LOG_ERR("failed to open static lookup cache: %s", params.speculative.ngram_cache.lookup_cache_static.c_str());
LOG_ERR("failed to open static lookup cache: %s", params.speculative.lookup_cache_static.c_str());
exit(1);
}
}
if (!params.speculative.ngram_cache.lookup_cache_dynamic.empty()) {
if (!params.speculative.lookup_cache_dynamic.empty()) {
try {
ngram_cache_dynamic = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_dynamic);
ngram_cache_dynamic = common_ngram_cache_load(params.speculative.lookup_cache_dynamic);
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
}
@@ -213,7 +213,7 @@ int main(int argc, char ** argv){
// Update dynamic ngram cache with context ngram cache and save it to disk:
common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
common_ngram_cache_save(ngram_cache_dynamic, params.speculative.ngram_cache.lookup_cache_dynamic);
common_ngram_cache_save(ngram_cache_dynamic, params.speculative.lookup_cache_dynamic);
LOG("\n\n");

View File

@@ -25,11 +25,7 @@ MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
TYPE="${OUTTYPE:-f16}"
METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
if [[ -n "$MMPROJ" ]]; then
CONVERTED_MODEL="${OUTPUT_DIR}/mmproj-${MODEL_NAME}.gguf"
else
CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
fi
CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
echo "Model path: ${MODEL_PATH}"
echo "Model name: ${MODEL_NAME}"
@@ -42,7 +38,6 @@ if [[ -n "$DEBUG" ]]; then
else
CMD_ARGS=("python")
fi
CMD_ARGS+=("../../convert_hf_to_gguf.py" "--verbose")
CMD_ARGS+=("${MODEL_PATH}")
CMD_ARGS+=("--outfile" "${CONVERTED_MODEL}")
@@ -55,3 +50,7 @@ CMD_ARGS+=("--outtype" "${TYPE}")
echo ""
echo "The environment variable CONVERTED_MODEL can be set to this path using:"
echo "export CONVERTED_MODEL=$(realpath ${CONVERTED_MODEL})"
if [[ -n "$MMPROJ" ]]; then
mmproj_file="${OUTPUT_DIR}/mmproj-$(basename "${CONVERTED_MODEL}")"
echo "The mmproj model was created in $(realpath "$mmproj_file")"
fi

View File

@@ -8,24 +8,8 @@
#include <clocale>
#include <cstdio>
#include <cstring>
#include <cinttypes>
#include <string>
#include <vector>
#include <utility>
struct spec_checkpoint {
int64_t n_tokens = 0;
std::vector<uint8_t> data;
size_t size() const {
return data.size();
}
bool empty() const {
return data.empty();
}
};
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
@@ -43,7 +27,7 @@ int main(int argc, char ** argv) {
return 1;
}
if (params.speculative.draft.mparams.path.empty()) {
if (params.speculative.mparams_dft.path.empty()) {
LOG_ERR("%s: --model-draft is required\n", __func__);
return 1;
}
@@ -62,14 +46,6 @@ int main(int argc, char ** argv) {
model_tgt = llama_init_tgt->model();
ctx_tgt = llama_init_tgt->context();
// check if the context supports partial sequence removal
const auto ctx_seq_rm = common_context_can_seq_rm(ctx_tgt);
const bool use_ckpt = (ctx_seq_rm == COMMON_CONTEXT_SEQ_RM_TYPE_FULL);
if (use_ckpt) {
LOG_INF("speculative decoding will use checkpoints (context does not support partial sequence removal)\n");
}
const llama_vocab * vocab = llama_model_get_vocab(model_tgt);
// load the draft model
@@ -77,7 +53,7 @@ int main(int argc, char ** argv) {
// TODO: simplify this logic
{
const auto & params_spec = params.speculative.draft;
const auto & params_spec = params.speculative;
auto params_dft = params;
@@ -85,15 +61,15 @@ int main(int argc, char ** argv) {
params_dft.n_ctx = params_spec.n_ctx;
params_dft.n_batch = llama_n_ctx_seq(ctx_tgt);
params_dft.devices = params_spec.devices;
params_dft.model = params_spec.mparams;
params_dft.model = params_spec.mparams_dft;
params_dft.n_gpu_layers = params_spec.n_gpu_layers;
if (params_spec.cpuparams.n_threads > 0) {
params_dft.cpuparams.n_threads = params.speculative.draft.cpuparams.n_threads;
params_dft.cpuparams_batch.n_threads = params.speculative.draft.cpuparams_batch.n_threads;
params_dft.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
params_dft.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
}
params_dft.tensor_buft_overrides = params.speculative.draft.tensor_buft_overrides;
params_dft.tensor_buft_overrides = params.speculative.tensor_buft_overrides;
auto mparams_dft = common_model_params_to_llama(params_dft);
@@ -103,8 +79,8 @@ int main(int argc, char ** argv) {
return 1;
}
params.speculative.draft.model = model_dft.get();
params.speculative.draft.cparams = common_context_params_to_llama(params_dft);
params.speculative.model_dft = model_dft.get();
params.speculative.cparams_dft = common_context_params_to_llama(params_dft);
}
// Tokenize the prompt
@@ -143,7 +119,7 @@ int main(int argc, char ** argv) {
const auto t_enc_start = ggml_time_us();
// target model sampling context
common_sampler_ptr smpl(common_sampler_init(model_tgt, params.sampling));
struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
// eval the prompt
llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
@@ -166,49 +142,21 @@ int main(int argc, char ** argv) {
llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);
size_t n_draft = 0;
llama_tokens draft;
spec_checkpoint spec_ckpt;
const auto t_enc_end = ggml_time_us();
const auto t_dec_start = ggml_time_us();
while (true) {
// generate or reuse draft tokens
// optionally, generate draft tokens that can be appended to the target batch
//
// this is the most important part of the speculation. the more probable tokens that are provided here
// the better the performance will be. in theory, this computation can be performed asynchronously and even
// offloaded to a remote device. it doesn't even have to be based on an LLM. instead, it can provide tokens
// from a cache or lookup tables.
//
if (draft.empty()) {
// generate a new draft
draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);
llama_tokens draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);
// save the original draft size
n_draft = draft.size();
// save a checkpoint of the target context before evaluating the draft
// this allows us to restore the state if partial draft acceptance occurs
if (!draft.empty() && use_ckpt) {
const size_t ckpt_size = llama_state_seq_get_size_ext(ctx_tgt, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
spec_ckpt.data.resize(ckpt_size);
const size_t n = llama_state_seq_get_data_ext(ctx_tgt, spec_ckpt.data.data(), ckpt_size, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
GGML_ASSERT(n == ckpt_size);
spec_ckpt.n_tokens = (int64_t) prompt_tgt.size();
LOG_DBG("created speculative checkpoint (n_tokens = %" PRId64 ", size = %.3f MiB)\n",
spec_ckpt.n_tokens, (float) spec_ckpt.data.size() / 1024 / 1024);
}
} else {
// we have a previous (partial) draft to reuse from checkpoint restoration
if (use_ckpt) {
GGML_ASSERT(!spec_ckpt.empty());
}
}
//LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str());
// always have a token to evaluate from before - id_last
common_batch_clear(batch_tgt);
@@ -216,6 +164,11 @@ int main(int argc, char ** argv) {
// evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
{
// do not waste time on small drafts
if (draft.size() < (size_t) params_spec.n_min) {
draft.clear();
}
for (size_t i = 0; i < draft.size(); ++i) {
common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
}
@@ -225,12 +178,6 @@ int main(int argc, char ** argv) {
llama_decode(ctx_tgt, batch_tgt);
}
// only save the sampler sampler state if we use checkpoints
common_sampler_ptr smpl_save;
if (use_ckpt) {
smpl_save.reset(common_sampler_clone(smpl.get()));
}
// sample from the full target batch and return the accepted tokens based on the target sampler
//
// for each token to be accepted, the sampler would have to sample that same token
@@ -238,38 +185,14 @@ int main(int argc, char ** argv) {
// available logits from the batch and sample the next token until we run out of logits or the sampler
// disagrees with the draft
//
auto ids = common_sampler_sample_and_accept_n(smpl.get(), ctx_tgt, draft);
const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);
//LOG_DBG("ids: %s\n", string_from(ctx_tgt, ids).c_str());
GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token
// check for partial draft acceptance:
// if the context doesn't support partial sequence removal, restore the checkpoint
// and make the accepted tokens the new partial draft for the next iteration
if (use_ckpt && ids.size() - 1 < draft.size()) {
LOG_DBG("partial acceptance: %zu < %zu, restoring checkpoint\n", ids.size() - 1, draft.size());
draft = std::move(ids);
const size_t n = llama_state_seq_set_data_ext(ctx_tgt, spec_ckpt.data.data(), spec_ckpt.size(), 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
GGML_ASSERT(n == spec_ckpt.size());
llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, spec_ckpt.n_tokens, -1);
prompt_tgt.resize(spec_ckpt.n_tokens);
smpl = std::move(smpl_save);
n_past = (int) prompt_tgt.size();
continue;
}
common_speculative_accept(spec, ids.size() - 1);
// full acceptance: consume the draft and commit accepted tokens
n_past += ids.size() - 1;
n_drafted += n_draft; // note: we ignore the discarded small drafts
n_drafted += draft.size(); // note: we ignore the discarded small drafts
n_accept += ids.size() - 1;
n_predict += ids.size();
@@ -299,9 +222,6 @@ int main(int argc, char ** argv) {
LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last);
// clear the draft since it has been consumed
draft.clear();
{
LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
@@ -323,7 +243,7 @@ int main(int argc, char ** argv) {
LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_INF("\n");
LOG_INF("n_draft = %d\n", params_spec.draft.n_max);
LOG_INF("n_draft = %d\n", params_spec.n_max);
LOG_INF("n_predict = %d\n", n_predict);
LOG_INF("n_drafted = %d\n", n_drafted);
LOG_INF("n_accept = %d\n", n_accept);
@@ -334,10 +254,11 @@ int main(int argc, char ** argv) {
LOG_INF("\n");
LOG_INF("target:\n\n");
common_perf_print(ctx_tgt, smpl.get());
common_perf_print(ctx_tgt, smpl);
llama_batch_free(batch_tgt);
common_sampler_free(smpl);
common_speculative_free(spec);
llama_backend_free();
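The checkpoint logic removed in this file boils down to a save/restore pattern on top of the sequence-state calls visible above (llama_state_seq_get_size_ext / llama_state_seq_get_data_ext / llama_state_seq_set_data_ext with LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY). A minimal sketch of that pattern, assuming those signatures, a single sequence id 0, and an n_past counter as in the example above — illustrative only, not the deleted code verbatim:

// Save: snapshot the partial (uncommitted) state of sequence 0 before evaluating the draft.
std::vector<uint8_t> ckpt(llama_state_seq_get_size_ext(ctx_tgt, 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY));
const size_t n_written = llama_state_seq_get_data_ext(ctx_tgt, ckpt.data(), ckpt.size(), 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
GGML_ASSERT(n_written == ckpt.size());
const int64_t n_tokens_ckpt = n_past; // tokens committed at save time

// Restore: on partial draft acceptance, roll sequence 0 back to the checkpoint.
const size_t n_read = llama_state_seq_set_data_ext(ctx_tgt, ckpt.data(), ckpt.size(), 0, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
GGML_ASSERT(n_read == ckpt.size());
llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, n_tokens_ckpt, -1); // drop anything past the checkpoint
n_past = (int) n_tokens_ckpt;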

View File

@@ -49,7 +49,7 @@ int main(int argc, char ** argv) {
return 1;
}
if (params.speculative.draft.mparams.path.empty()) {
if (params.speculative.mparams_dft.path.empty()) {
LOG_ERR("%s: --model-draft is required\n", __func__);
return 1;
}
@@ -58,7 +58,7 @@ int main(int argc, char ** argv) {
const int n_seq_dft = params.n_parallel;
// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
const float p_draft_split = params.speculative.draft.p_split;
const float p_draft_split = params.speculative.p_split;
std::default_random_engine rng(params.sampling.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sampling.seed);
std::uniform_real_distribution<> u_dist;
@@ -80,15 +80,15 @@ int main(int argc, char ** argv) {
ctx_tgt = llama_init_tgt->context();
// load the draft model
params.devices = params.speculative.draft.devices;
params.model = params.speculative.draft.mparams;
params.n_gpu_layers = params.speculative.draft.n_gpu_layers;
if (params.speculative.draft.cpuparams.n_threads > 0) {
params.cpuparams.n_threads = params.speculative.draft.cpuparams.n_threads;
params.devices = params.speculative.devices;
params.model = params.speculative.mparams_dft;
params.n_gpu_layers = params.speculative.n_gpu_layers;
if (params.speculative.cpuparams.n_threads > 0) {
params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
}
params.cpuparams_batch.n_threads = params.speculative.draft.cpuparams_batch.n_threads;
params.tensor_buft_overrides = params.speculative.draft.tensor_buft_overrides;
params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
params.tensor_buft_overrides = params.speculative.tensor_buft_overrides;
auto llama_init_dft = common_init_from_params(params);
@@ -110,21 +110,13 @@ int main(int argc, char ** argv) {
return 1;
}
if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
(llama_vocab_get_add_bos(vocab_tgt) && llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft))) {
LOG_ERR("%s: draft model bos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
__func__,
llama_vocab_get_add_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_dft),
llama_vocab_bos(vocab_tgt), llama_vocab_bos(vocab_dft));
return 1;
}
if (llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
(llama_vocab_get_add_eos(vocab_tgt) && llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft))) {
LOG_ERR("%s: draft model eos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
__func__,
llama_vocab_get_add_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_dft),
llama_vocab_eos(vocab_tgt), llama_vocab_eos(vocab_dft));
if (
llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
) {
LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
return 1;
}
@@ -145,12 +137,11 @@ int main(int argc, char ** argv) {
for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
common_token_to_piece(vocab_tgt, i).c_str(),
common_token_to_piece(vocab_dft, i).c_str());
common_token_to_piece(ctx_tgt, i).c_str(),
common_token_to_piece(ctx_dft, i).c_str());
return 1;
}
}
@@ -192,7 +183,7 @@ int main(int argc, char ** argv) {
//GGML_ASSERT(n_vocab == llama_vocab_n_tokens(model_dft));
// how many tokens to draft each time
int n_draft = params.speculative.draft.n_max;
int n_draft = params.speculative.n_max;
int n_predict = 0;
int n_drafted = 0;

View File

@@ -1,124 +0,0 @@
#!/bin/bash
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
Help() {
cat << EOF
Usage: $(basename "$0") [OPTIONS]
This script processes files with specified options.
Options:
-h, --help Display this help message and exit.
-c, --context <value> Set context length. Larger values need more memory.
-p, --promote <value> Prompt to start generation with.
-m, --model <value> Full model file path.
-mg,--main-gpu <value> Set main GPU ID (0 - n) for single GPU mode.
-sm,--split-mode <value> How to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs
-ngl,--n-gpu-layers <value> Max. number of layers to store in VRAM (default: -1)
-lv,--log-verbosity <value> Set the verbosity threshold. Messages with a higher verbosity will be
ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: debug
EOF
}
BIN_FILE=./build/bin/llama-server
SEED=0
GPUS_SETTING=""
MODEL_FILE=../models/Qwen3.5-4B-Q4_0.gguf
NGL=99
CONTEXT=4096
GGML_SYCL_DEVICE=-1
SPLIT_MODE=layer
LOG_VERBOSE=3
while [[ $# -gt 0 ]]; do
case "$1" in
-c|--context)
CONTEXT=$2
# Shift twice to consume both the option flag and its value
shift
shift
;;
-m|--model)
MODEL_FILE="$2"
# Shift twice to consume both the option flag and its value
shift
shift
;;
-mg|--main-gpu)
GGML_SYCL_DEVICE=$2
SPLIT_MODE=none
# Shift twice to consume both the option flag and its value
shift
shift
;;
-sm|--split-mode)
SPLIT_MODE=$2
# Shift twice to consume both the option flag and its value
shift
shift
;;
-ngl|--n-gpu-layers)
NGL=$2
# Shift twice to consume both the option flag and its value
shift
shift
;;
-lv|--log-verbosity)
LOG_VERBOSE=$2
# Shift twice to consume both the option flag and its value
shift
shift
;;
-h|--help)
Help
exit 0
;;
*)
# Handle unknown options or stop processing options
echo "Invalid option: $1"
# Optional: exit script or shift to treat remaining as positional args
exit 1
;;
esac
done
source /opt/intel/oneapi/setvars.sh
#export GGML_SYCL_DEBUG=1
#ZES_ENABLE_SYSMAN=1 enables reading the GPU's free memory via sycl::aspect::ext_intel_free_memory; recommended when --split-mode = layer.
#Support allocating more than 4 GB of device memory.
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
echo "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=${UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS}"
if [ $GGML_SYCL_DEVICE -ne -1 ]; then
echo "Use $GGML_SYCL_DEVICE as main GPU"
#use single GPU only
GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
export ONEAPI_DEVICE_SELECTOR="level_zero:${$GGML_SYCL_DEVICE}"
echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
else
echo "Use all Intel GPUs, including iGPU & dGPU"
GPUS_SETTING="-sm ${SPLIT_MODE}"
fi
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap --host 0.0.0.0 --port 8000

View File

@@ -38,7 +38,7 @@ SEED=0
GPUS_SETTING=""
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
MODEL_FILE=../models/llama-2-7b.Q4_0.gguf
MODEL_FILE=models/llama-2-7b.Q4_0.gguf
NGL=99
CONTEXT=4096
GGML_SYCL_DEVICE=-1
@@ -122,10 +122,9 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then
export ONEAPI_DEVICE_SELECTOR="level_zero:${$GGML_SYCL_DEVICE}"
echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
else
echo "Use all Intel GPUs, including iGPU & dGPU"
GPUS_SETTING="-sm ${SPLIT_MODE}"
echo "Use all Intel GPUs, including iGPU & dGPU"
fi
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap

View File

@@ -1,179 +0,0 @@
:: MIT license
:: Copyright (C) 2024 Intel Corporation
:: SPDX-License-Identifier: MIT
@echo off
setlocal EnableExtensions EnableDelayedExpansion
set "BIN_FILE=.\build\bin\llama-server.exe"
set "SEED=0"
set "GPUS_SETTING="
set "MODEL_FILE=..\models\Qwen3.5-4B-Q4_0.gguf"
set "NGL=99"
set "CONTEXT=4096"
set "GGML_SYCL_DEVICE=-1"
set "SPLIT_MODE=layer"
set "LOG_VERBOSE=3"
if "%~1"=="" goto after_args
:parse_args
if "%~1"=="" goto after_args
if /I "%~1"=="-c" (
if "%~2"=="" goto missing_value
set "CONTEXT=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--context" (
if "%~2"=="" goto missing_value
set "CONTEXT=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-m" (
if "%~2"=="" goto missing_value
set "MODEL_FILE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--model" (
if "%~2"=="" goto missing_value
set "MODEL_FILE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-mg" (
if "%~2"=="" goto missing_value
set "GGML_SYCL_DEVICE=%~2"
set "SPLIT_MODE=none"
shift
shift
goto parse_args
)
if /I "%~1"=="--main-gpu" (
if "%~2"=="" goto missing_value
set "GGML_SYCL_DEVICE=%~2"
set "SPLIT_MODE=none"
shift
shift
goto parse_args
)
if /I "%~1"=="-sm" (
if "%~2"=="" goto missing_value
set "SPLIT_MODE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--split-mode" (
if "%~2"=="" goto missing_value
set "SPLIT_MODE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-ngl" (
if "%~2"=="" goto missing_value
set "NGL=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--n-gpu-layers" (
if "%~2"=="" goto missing_value
set "NGL=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-lv" (
if "%~2"=="" goto missing_value
set "LOG_VERBOSE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--log-verbosity" (
if "%~2"=="" goto missing_value
set "LOG_VERBOSE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-h" goto help
if /I "%~1"=="--help" goto help
echo Invalid option: %~1
exit /b 1
:missing_value
echo Missing value for option: %~1
exit /b 1
:help
echo Usage: %~n0 [OPTIONS]
echo.
echo This script processes files with specified options.
echo.
echo Options:
echo -h, --help Display this help message and exit.
echo -c, --context ^<value^> Set context length. Larger values need more memory.
echo -m, --model ^<value^> Full model file path.
echo -mg,--main-gpu ^<value^> Set main GPU ID (0 - n) for single GPU mode.
echo -sm,--split-mode ^<value^> How to split the model across multiple GPUs, one of:
echo - none: use one GPU only
echo - layer (default): split layers and KV across GPUs
echo - row: split rows across GPUs
echo -ngl,--n-gpu-layers ^<value^> Max. number of layers to store in VRAM (default: -1)
echo -lv,--log-verbosity ^<value^> Set the verbosity threshold. Messages with a higher verbosity will be
echo ignored. Values:
echo - 0: generic output
echo - 1: error
echo - 2: warning
echo - 3: info
echo - 4: debug
exit /b 0
:after_args
REM In Windows CMD, source is not available; call oneAPI setvars if present.
if exist "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" (
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" >nul
) else (
echo Warning: oneAPI setvars.bat not found. Continuing without environment setup.
)
REM Support allocating more than 4 GB of device memory.
set "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1"
echo UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=%UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS%
if not "%GGML_SYCL_DEVICE%"=="-1" (
echo Use %GGML_SYCL_DEVICE% as main GPU
REM Use single GPU only.
set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%"
set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%"
echo ONEAPI_DEVICE_SELECTOR=%ONEAPI_DEVICE_SELECTOR%
) else (
echo Use all Intel GPUs, including iGPU ^& dGPU
set "GPUS_SETTING=-sm %SPLIT_MODE%"
)
echo run cmd: ZES_ENABLE_SYSMAN=1 %BIN_FILE% -m "%MODEL_FILE%" -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap --host 0.0.0.0 --port 8000
set "ZES_ENABLE_SYSMAN=1"
%BIN_FILE% -m "%MODEL_FILE%" -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap --host 0.0.0.0 --port 8000
endlocal

View File

@@ -2,200 +2,10 @@
:: Copyright (C) 2024 Intel Corporation
:: SPDX-License-Identifier: MIT
set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
@echo off
setlocal EnableExtensions EnableDelayedExpansion
REM MIT license
REM Copyright (C) 2024 Intel Corporation
REM SPDX-License-Identifier: MIT
set "BIN_FILE=.\build\bin\llama-completion.exe"
set "SEED=0"
set "GPUS_SETTING="
set "INPUT_PROMPT=Building a website can be done in 10 simple steps:^nStep 1:"
set "MODEL_FILE=..\models\llama-2-7b.Q4_0.gguf"
set "NGL=99"
set "CONTEXT=4096"
set "GGML_SYCL_DEVICE=-1"
set "SPLIT_MODE=layer"
set "LOG_VERBOSE=3"
if "%~1"=="" goto after_args
:parse_args
if "%~1"=="" goto after_args
if /I "%~1"=="-c" (
if "%~2"=="" goto missing_value
set "CONTEXT=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--context" (
if "%~2"=="" goto missing_value
set "CONTEXT=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-p" (
if "%~2"=="" goto missing_value
set "INPUT_PROMPT=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--promote" (
if "%~2"=="" goto missing_value
set "INPUT_PROMPT=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-m" (
if "%~2"=="" goto missing_value
set "MODEL_FILE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--model" (
if "%~2"=="" goto missing_value
set "MODEL_FILE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-mg" (
if "%~2"=="" goto missing_value
set "GGML_SYCL_DEVICE=%~2"
set "SPLIT_MODE=none"
shift
shift
goto parse_args
)
if /I "%~1"=="--main-gpu" (
if "%~2"=="" goto missing_value
set "GGML_SYCL_DEVICE=%~2"
set "SPLIT_MODE=none"
shift
shift
goto parse_args
)
if /I "%~1"=="-sm" (
if "%~2"=="" goto missing_value
set "SPLIT_MODE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--split-mode" (
if "%~2"=="" goto missing_value
set "SPLIT_MODE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-ngl" (
if "%~2"=="" goto missing_value
set "NGL=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--n-gpu-layers" (
if "%~2"=="" goto missing_value
set "NGL=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-lv" (
if "%~2"=="" goto missing_value
set "LOG_VERBOSE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="--log-verbosity" (
if "%~2"=="" goto missing_value
set "LOG_VERBOSE=%~2"
shift
shift
goto parse_args
)
if /I "%~1"=="-h" goto help
if /I "%~1"=="--help" goto help
echo Invalid option: %~1
exit /b 1
:missing_value
echo Missing value for option: %~1
exit /b 1
:help
echo Usage: %~n0 [OPTIONS]
echo.
echo This script processes files with specified options.
echo.
echo Options:
echo -h, --help Display this help message and exit.
echo -c, --context ^<value^> Set context length. Larger values need more memory.
echo -p, --promote ^<value^> Prompt to start generation with.
echo -m, --model ^<value^> Full model file path.
echo -mg,--main-gpu ^<value^> Set main GPU ID (0 - n) for single GPU mode.
echo -sm,--split-mode ^<value^> How to split the model across multiple GPUs, one of:
echo - none: use one GPU only
echo - layer (default): split layers and KV across GPUs
echo - row: split rows across GPUs
echo -ngl,--n-gpu-layers ^<value^> Max. number of layers to store in VRAM (default: -1)
echo -lv,--log-verbosity ^<value^> Set the verbosity threshold. Messages with a higher verbosity will be
echo ignored. Values:
echo - 0: generic output
echo - 1: error
echo - 2: warning
echo - 3: info
echo - 4: debug
exit /b 0
:after_args
REM In Windows CMD, source is not available; call oneAPI setvars if present.
if exist "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" (
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" >nul
) else (
echo Warning: oneAPI setvars.bat not found. Continuing without environment setup.
)
REM Support allocating more than 4 GB of device memory.
set "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1"
echo UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=%UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS%
if not "%GGML_SYCL_DEVICE%"=="-1" (
echo Use %GGML_SYCL_DEVICE% as main GPU
REM Use single GPU only.
set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%"
set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%"
echo ONEAPI_DEVICE_SELECTOR=%ONEAPI_DEVICE_SELECTOR%
) else (
echo Use all Intel GPUs, including iGPU ^& dGPU
set "GPUS_SETTING=-sm %SPLIT_MODE%"
)
echo run cmd: ZES_ENABLE_SYSMAN=1 %BIN_FILE% -m %MODEL_FILE% -no-cnv -p "%INPUT_PROMPT%" -n 200 -e -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap
set "ZES_ENABLE_SYSMAN=1"
%BIN_FILE% -m "%MODEL_FILE%" -no-cnv -p "%INPUT_PROMPT%" -n 200 -e -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap
endlocal
:: Support allocating more than 4 GB of device memory.
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
set LOAD_MODE="--mmap"
.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 %LOAD_MODE%

View File

@@ -4,8 +4,8 @@ project("ggml" C CXX ASM)
### GGML Version
set(GGML_VERSION_MAJOR 0)
set(GGML_VERSION_MINOR 10)
set(GGML_VERSION_PATCH 1)
set(GGML_VERSION_MINOR 9)
set(GGML_VERSION_PATCH 11)
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@@ -213,7 +213,7 @@ set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")
option(GGML_HIP "ggml: use HIP" OFF)
option(GGML_HIP_GRAPHS "ggml: use HIP graph" ON)
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
option(GGML_HIP_RCCL "ggml: use ROCm Collective Comm. Library" OFF)
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)

View File

@@ -470,10 +470,11 @@ endforeach()
target_link_libraries(ggml-base PRIVATE Threads::Threads)
if (DEFINED MATH_LIBRARY)
target_link_libraries(ggml-base PRIVATE ${MATH_LIBRARY})
elseif (NOT WIN32 AND NOT DEFINED ENV{ONEAPI_ROOT})
target_link_libraries(ggml-base PRIVATE m)
find_library(MATH_LIBRARY m)
if (MATH_LIBRARY)
if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
target_link_libraries(ggml-base PRIVATE m)
endif()
endif()
if (CMAKE_SYSTEM_NAME MATCHES "Android")

View File

@@ -1133,7 +1133,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
if (t_ij->view_src != nullptr && ggml_backend_buffer_is_meta(t_ij->view_src->buffer)) {
t_ij->view_src = ggml_backend_meta_buffer_simple_tensor(tensor->view_src, j);
if (t_ij->view_offs > 0 && split_dim >= 0 && split_dim < GGML_MAX_DIMS) {
GGML_ASSERT(tensor->ne[split_dim] != 0);
GGML_ASSERT(ne[split_dim] != 0 && tensor->ne[split_dim] != 0);
const int split_dim_view_src = ggml_backend_meta_get_split_state(tensor->view_src, /*assume_sync =*/ true).axis;
GGML_ASSERT(split_dim_view_src >= 0 && split_dim_view_src < GGML_MAX_DIMS);
@@ -1170,28 +1170,6 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
simple_tensors.push_back(t_ij);
}
// If one of the sources has a zero-sized slice, disable the computation:
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (tensor->src[i] == nullptr || !ggml_backend_buffer_is_meta(tensor->src[i]->buffer)) {
continue;
}
const ggml_backend_meta_split_state split_state_src = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true);
if (split_state_src.axis < 0 || split_state_src.axis >= GGML_MAX_DIMS) {
continue;
}
for (size_t j = 0; j < n_simple_bufs; j++) {
int64_t ne_sum = 0;
for (size_t s = 0; s < split_state_src.n_segments; s++) {
ne_sum += split_state_src.ne[s*n_simple_bufs + j];
}
if (ne_sum == 0) {
simple_tensors[j]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
}
}
}
buf_ctx->simple_tensors[tensor] = simple_tensors;
return GGML_STATUS_SUCCESS;
@@ -1205,57 +1183,40 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
if (split_state.n_segments != 1) {
GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
GGML_ASSERT(tensor->ne[3] == 1);
size_t offset_data = 0;
std::vector<size_t> simple_offsets(n_bufs, 0);
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) {
GGML_ASSERT(tensor->ne[2] == 1);
const size_t row_stride = tensor->nb[1];
GGML_ASSERT(offset % row_stride == 0);
GGML_ASSERT(size % row_stride == 0);
const int64_t r_start = offset / row_stride;
const int64_t r_count = size / row_stride;
GGML_ASSERT(r_start + r_count <= tensor->ne[1]);
const int64_t blck_size = ggml_blck_size(tensor->type);
for (size_t s = 0; s < split_state.n_segments; s++) {
for (size_t j = 0; j < n_bufs; j++) {
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes,
r_count, simple_tensor->nb[1], tensor->nb[1]);
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data, simple_offsets[j], nbytes,
tensor->ne[1], simple_tensor->nb[1], tensor->nb[1]);
offset_data += nbytes;
simple_offsets[j] += nbytes;
}
}
GGML_ASSERT(offset_data*r_count == size);
GGML_ASSERT(offset_data*tensor->ne[1] == size);
return;
}
GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
const size_t row_stride = tensor->nb[2];
GGML_ASSERT(offset % row_stride == 0);
GGML_ASSERT(size % row_stride == 0);
const int64_t r_start = offset / row_stride;
const int64_t r_count = size / row_stride;
GGML_ASSERT(r_start + r_count <= tensor->ne[2]);
for (size_t s = 0; s < split_state.n_segments; s++) {
for (size_t j = 0; j < n_bufs; j++) {
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes,
r_count, simple_tensor->nb[2], tensor->nb[2]);
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data, simple_offsets[j], nbytes,
tensor->ne[2], simple_tensor->nb[2], tensor->nb[2]);
offset_data += nbytes;
simple_offsets[j] += nbytes;
}
}
GGML_ASSERT(offset_data*r_count == size);
GGML_ASSERT(offset_data*tensor->ne[2] == size);
return;
}
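The new set_tensor path above only handles whole-tensor writes (offset == 0, size == ggml_nbytes(tensor)) and walks the split segments in order, advancing a running source offset plus one destination offset per simple buffer. A simplified CPU-only sketch of that scatter order for a row split, with made-up names (split_ne stands in for split_state.ne) and none of ggml's buffer or 2-D copy machinery — an illustration of the traversal, not the real implementation:

#include <cstdint>
#include <cstring>
#include <vector>

// Scatter the rows of a contiguous [n_rows x n_cols] float matrix across n_bufs
// destination buffers: segment s contributes split_ne[s*n_bufs + j] rows to buffer j.
void scatter_rows(const float * src, size_t n_cols,
                  const std::vector<int64_t> & split_ne, size_t n_segments, size_t n_bufs,
                  std::vector<std::vector<float>> & dst_bufs) {
    size_t src_row = 0;                       // running offset into the source
    std::vector<size_t> dst_rows(n_bufs, 0);  // per-buffer destination offsets
    for (size_t s = 0; s < n_segments; s++) {
        for (size_t j = 0; j < n_bufs; j++) {
            const size_t n_rows = (size_t) split_ne[s*n_bufs + j];
            std::memcpy(dst_bufs[j].data() + dst_rows[j]*n_cols,
                        src + src_row*n_cols,
                        n_rows*n_cols*sizeof(float));
            src_row     += n_rows;
            dst_rows[j] += n_rows;
        }
    }
}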
@@ -1312,57 +1273,40 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
if (split_state.n_segments != 1) {
GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));
GGML_ASSERT(tensor->ne[3] == 1);
size_t offset_data = 0;
std::vector<size_t> simple_offsets(n_bufs, 0);
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) {
GGML_ASSERT(tensor->ne[2] == 1);
const size_t row_stride = tensor->nb[1];
GGML_ASSERT(offset % row_stride == 0);
GGML_ASSERT(size % row_stride == 0);
const int64_t r_start = offset / row_stride;
const int64_t r_count = size / row_stride;
GGML_ASSERT(r_start + r_count <= tensor->ne[1]);
const int64_t blck_size = ggml_blck_size(tensor->type);
for (size_t s = 0; s < split_state.n_segments; s++) {
for (size_t j = 0; j < n_bufs; j++) {
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes,
r_count, simple_tensor->nb[1], tensor->nb[1]);
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
tensor->ne[1], simple_tensor->nb[1], tensor->nb[1]);
offset_data += nbytes;
simple_offsets[j] += nbytes;
}
}
GGML_ASSERT(offset_data*r_count == size);
GGML_ASSERT(offset_data*tensor->ne[1] == size);
return;
}
GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
const size_t row_stride = tensor->nb[2];
GGML_ASSERT(offset % row_stride == 0);
GGML_ASSERT(size % row_stride == 0);
const int64_t r_start = offset / row_stride;
const int64_t r_count = size / row_stride;
GGML_ASSERT(r_start + r_count <= tensor->ne[2]);
for (size_t s = 0; s < split_state.n_segments; s++) {
for (size_t j = 0; j < n_bufs; j++) {
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes,
r_count, simple_tensor->nb[2], tensor->nb[2]);
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
tensor->ne[2], simple_tensor->nb[2], tensor->nb[2]);
offset_data += nbytes;
simple_offsets[j] += nbytes;
}
}
GGML_ASSERT(offset_data*r_count == size);
GGML_ASSERT(offset_data*tensor->ne[2] == size);
return;
}
@@ -1498,20 +1442,17 @@ struct ggml_backend_meta_context {
struct backend_config {
ggml_backend_t backend;
std::vector<cgraph_config> cgraphs;
std::vector<ggml_tensor *> nodes;
std::vector<ggml_backend_buffer_ptr> bufs;
std::vector<cgraph_config> cgraphs;
std::vector<ggml_tensor *> nodes;
ggml_backend_buffer_ptr buf;
backend_config(ggml_backend_t backend, const size_t n_reduce_steps) : backend(backend) {
bufs.resize(n_reduce_steps);
}
backend_config(ggml_backend_t backend) : backend(backend) {}
};
std::string name;
std::vector<backend_config> backend_configs;
ggml_context_ptr ctx;
std::vector<ggml_cgraph *> cgraphs_aux;
std::vector<ggml_tensor *> nodes_aux;
size_t n_reduce_steps;
int max_nnodes = 0;
size_t max_tmp_size = 0;
size_t max_subgraphs = 0;
@@ -1523,7 +1464,6 @@ struct ggml_backend_meta_context {
ggml_backend_meta_context(ggml_backend_dev_t meta_dev, const char * params) {
const size_t n_devs = ggml_backend_meta_dev_n_devs(meta_dev);
n_reduce_steps = std::ceil(std::log2(n_devs));
name = "Meta(";
std::vector<ggml_backend_t> simple_backends;
backend_configs.reserve(n_devs);
@@ -1535,7 +1475,7 @@ struct ggml_backend_meta_context {
}
name += ggml_backend_dev_name(simple_dev);
simple_backends.push_back(ggml_backend_dev_init(simple_dev, params));
backend_configs.emplace_back(simple_backends.back(), n_reduce_steps);
backend_configs.emplace_back(simple_backends.back());
}
name += ")";
@@ -1565,6 +1505,10 @@ struct ggml_backend_meta_context {
ggml_backend_free(bc.backend);
}
}
size_t n_reduce_steps() const {
return std::ceil(std::log2(backend_configs.size()));
}
};
static const char * ggml_backend_meta_get_name(ggml_backend_t backend) {
@@ -1717,36 +1661,6 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
ggml_tensor * node = cgraph->nodes[id];
int32_t n_used = ggml_node_get_use_count(cgraph, id);
// Skip MIRRORED nodes that don't consume node
auto skip_unrelated = [&]() {
while (id + 1 < cgraph->n_nodes) {
ggml_tensor * next = cgraph->nodes[id+1];
if (ggml_backend_meta_get_split_state(next, false).axis != GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
break;
}
bool safe = true;
for (int s = 0; s < GGML_MAX_SRC; s++) {
if (next->src[s] == nullptr) {
continue;
}
if (next->src[s] == node) {
safe = false;
break;
}
if (ggml_backend_meta_get_split_state(next->src[s], false).axis != GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
safe = false;
break;
}
}
if (!safe) {
break;
}
id++;
}
};
skip_unrelated();
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
@@ -1761,12 +1675,10 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
n_used = ggml_node_get_use_count(cgraph, id);
}
}
// Chain of MULs with MIRRORED src[1]
while (true) {
skip_unrelated();
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
if (id + 1 >= cgraph->n_nodes) {
return idr;
}
{
ggml_tensor * next = cgraph->nodes[id+1];
if (next->op == GGML_OP_MUL && next->src[0] == node &&
ggml_backend_meta_get_split_state(next->src[1], false).axis == GGML_BACKEND_SPLIT_AXIS_MIRRORED) {
@@ -1774,8 +1686,6 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
id++;
idr = id;
n_used = ggml_node_get_use_count(cgraph, id);
} else {
break;
}
}
@@ -1826,24 +1736,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
continue;
}
const int i_delayed = get_i_delayed(i);
// If we can delay the AllReduce we need to consider the interaction with zero-sized tensor slices.
// A backend with such a slice would normally have valid data after participating in the AllReduce with a node that has
// its compute flag disabled and thus gets its data zeroed out.
// If the AllReduce is delayed then the nodes until that point also need to have their compute flag disabled.
if (i_delayed > i) {
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
if ((bcj.nodes[i]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
for (int ii = i + 1; ii <= i_delayed; ii++) {
bcj.nodes[ii]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
}
}
}
}
i = i_delayed;
i = get_i_delayed(i);
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
@@ -1861,17 +1754,16 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
if (max_tmp_size > backend_ctx->max_tmp_size) {
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
for (size_t i = 0; i < backend_ctx->n_reduce_steps; i++) {
bcj.bufs[i].reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
}
bcj.buf.reset(ggml_backend_alloc_buffer(bcj.backend, max_tmp_size));
}
backend_ctx->max_tmp_size = max_tmp_size;
}
if (max_nnodes_raised || n_subgraphs > backend_ctx->max_subgraphs) {
backend_ctx->max_subgraphs = std::max(backend_ctx->max_subgraphs, n_subgraphs);
const size_t n_nodes_per_device = 3 * backend_ctx->n_reduce_steps; // tmp + ADD (+zeroing) graph per step and device
const size_t n_cgraphs_per_device = 2 * backend_ctx->n_reduce_steps; // ADD ( + zeroing) graph per step and device
const size_t n_reduce_steps = backend_ctx->n_reduce_steps();
const size_t n_nodes_per_device = 2 * n_reduce_steps; // tmp + ADD per step
const size_t n_cgraphs_per_device = n_reduce_steps; // 1 ADD graph per step
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
@@ -1920,6 +1812,11 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
size_t iga = 0; // i graph aux
size_t ina = 0; // i node aux
// FIXME usage_counts
auto get_cgraph_aux = [&]() -> ggml_cgraph * {
ggml_cgraph * ret = backend_ctx->cgraphs_aux[iga++];
return ret;
};
auto get_node_aux = [&](ggml_tensor * t) -> ggml_tensor * {
ggml_tensor * ret = backend_ctx->nodes_aux[ina++];
memset(ret, 0, sizeof(ggml_tensor));
@@ -1931,110 +1828,75 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
}
return ret;
};
auto set_tmp_data = [&](ggml_tensor * tensor, const size_t j, const size_t i_buf) {
auto & bcj = backend_ctx->backend_configs[j];
ggml_backend_buffer_ptr & buf_ptr = bcj.bufs[i_buf];
if (!buf_ptr || ggml_backend_buffer_get_size(buf_ptr.get()) < backend_ctx->max_tmp_size) {
buf_ptr.reset(ggml_backend_alloc_buffer(bcj.backend, backend_ctx->max_tmp_size));
}
tensor->buffer = buf_ptr.get();
tensor->data = ggml_backend_buffer_get_base(buf_ptr.get());
};
// FIXME usage_counts
auto get_cgraph_aux = [&]() -> ggml_cgraph * {
ggml_cgraph * ret = backend_ctx->cgraphs_aux[iga++];
return ret;
};
// Preferentially use backend-specific allreduce_tensor_async (e.g. NCCL for CUDA), use a generic fallback if unavailable:
auto allreduce_fallback = [&](size_t i) -> ggml_status {
std::vector<ggml_cgraph *> step_cgraphs(n_backends, nullptr);
// Zero out nodes that were disabled due to having a zero-sized slice:
for (size_t j = 0; j < n_backends; j++) {
auto & bcj = backend_ctx->backend_configs[j];
ggml_tensor * node = bcj.cgraphs[i].cgraph_main->nodes[bcj.cgraphs[i].cgraph_main->n_nodes - 1];
if (node->flags & GGML_TENSOR_FLAG_COMPUTE) {
continue;
}
ggml_tensor * node_zero = get_node_aux(node);
node_zero->op = GGML_OP_SCALE; // FIXME 0.0f * NaN == NaN
node_zero->src[0] = node;
ggml_set_op_params_f32(node_zero, 0, 0.0f);
node_zero->data = node->data;
node_zero->flags |= GGML_TENSOR_FLAG_COMPUTE;
step_cgraphs[j] = get_cgraph_aux();
step_cgraphs[j]->nodes[0] = node_zero;
step_cgraphs[j]->n_nodes = 1;
const ggml_status status = ggml_backend_graph_compute_async(bcj.backend, step_cgraphs[j]);
if (status != GGML_STATUS_SUCCESS) {
return status;
}
}
std::fill(step_cgraphs.begin(), step_cgraphs.end(), nullptr);
auto push_data = [&](const size_t j_src, const size_t j_dst, const size_t i_buf) {
assert(step_cgraphs[j_dst] == nullptr);
auto & bcj_src = backend_ctx->backend_configs[j_src];
auto & bcj_dst = backend_ctx->backend_configs[j_dst];
ggml_tensor * node_src = bcj_src.cgraphs[i].cgraph_main->nodes[bcj_src.cgraphs[i].cgraph_main->n_nodes - 1];
ggml_tensor * node_dst = bcj_dst.cgraphs[i].cgraph_main->nodes[bcj_dst.cgraphs[i].cgraph_main->n_nodes - 1];
GGML_ASSERT(ggml_is_contiguous(node_src));
GGML_ASSERT(ggml_is_contiguous(node_dst));
ggml_tensor * node_tmp = get_node_aux(node_dst);
set_tmp_data(node_tmp, j_dst, i_buf);
ggml_backend_tensor_copy_async(bcj_src.backend, bcj_dst.backend, node_src, node_tmp);
ggml_tensor * node_red = get_node_aux(node_dst);
node_red->view_src = node_dst->view_src == nullptr ? node_dst : node_dst->view_src;
node_red->view_offs = node_dst->view_offs;
node_red->op = GGML_OP_ADD;
node_red->src[0] = node_dst;
node_red->src[1] = node_tmp;
node_red->flags |= GGML_TENSOR_FLAG_COMPUTE;
ggml_backend_view_init(node_red);
ggml_cgraph * cgraph_aux = get_cgraph_aux();
cgraph_aux->nodes[0] = node_red;
cgraph_aux->n_nodes = 1;
step_cgraphs[j_dst] = cgraph_aux;
};
size_t offset_j = n_backends/2;
while ((offset_j & (offset_j - 1)) != 0) {
offset_j--;
}
const size_t offset_j_max = offset_j;
size_t i_buf = 0;
// If n_backends is not a power of 2, fold in the excess prior to butterfly reduction:
for (size_t j_src = 2*offset_j_max; j_src < n_backends; j_src++) {
const size_t j_dst = j_src - 2*offset_j_max;
push_data(j_src, j_dst, i_buf);
const ggml_status status = ggml_backend_graph_compute_async(backend_ctx->backend_configs[j_dst].backend, step_cgraphs[j_dst]);
if (status != GGML_STATUS_SUCCESS) {
return status;
}
i_buf = 1;
}
// Butterfly reduction:
for (; offset_j >= 1; offset_j /= 2) {
for (size_t offset_j = 1; offset_j < n_backends; offset_j *= 2) {
std::fill(step_cgraphs.begin(), step_cgraphs.end(), nullptr);
for (size_t j = 0; j < 2*offset_j_max; j++) {
for (size_t j = 0; j < n_backends; j++) {
const size_t j_other = j ^ offset_j;
if (j_other >= n_backends) {
if (j_other > j) {
continue;
}
push_data(j, j_other, i_buf);
auto & bcj1 = backend_ctx->backend_configs[j];
auto & bcj2 = backend_ctx->backend_configs[j_other];
ggml_tensor * node1 = bcj1.cgraphs[i].cgraph_main->nodes[bcj1.cgraphs[i].cgraph_main->n_nodes - 1];
ggml_tensor * node2 = bcj2.cgraphs[i].cgraph_main->nodes[bcj2.cgraphs[i].cgraph_main->n_nodes - 1];
GGML_ASSERT(ggml_is_contiguous(node1));
GGML_ASSERT(ggml_is_contiguous(node2));
// Tmp tensors to receive P2P copies
ggml_tensor * node_tmp_1 = get_node_aux(node1);
node_tmp_1->buffer = bcj1.buf.get();
node_tmp_1->data = ggml_backend_buffer_get_base(bcj1.buf.get());
ggml_tensor * node_tmp_2 = get_node_aux(node2);
node_tmp_2->buffer = bcj2.buf.get();
node_tmp_2->data = ggml_backend_buffer_get_base(bcj2.buf.get());
// 2 P2P copies: exchange full buffers
ggml_backend_tensor_copy_async(bcj1.backend, bcj2.backend, node1, node_tmp_2);
ggml_backend_tensor_copy_async(bcj2.backend, bcj1.backend, node2, node_tmp_1);
// Local ADD: node1 += tmp1 (in-place via view)
ggml_tensor * node_red_1 = get_node_aux(node1);
node_red_1->view_src = node1->view_src == nullptr ? node1 : node1->view_src;
node_red_1->view_offs = node1->view_offs;
node_red_1->op = GGML_OP_ADD;
node_red_1->src[0] = node1;
node_red_1->src[1] = node_tmp_1;
node_red_1->flags |= GGML_TENSOR_FLAG_COMPUTE;
ggml_backend_view_init(node_red_1);
// Local ADD: node2 += tmp2 (in-place via view)
ggml_tensor * node_red_2 = get_node_aux(node2);
node_red_2->view_src = node2->view_src == nullptr ? node2 : node2->view_src;
node_red_2->view_offs = node2->view_offs;
node_red_2->op = GGML_OP_ADD;
node_red_2->src[0] = node2;
node_red_2->src[1] = node_tmp_2;
node_red_2->flags |= GGML_TENSOR_FLAG_COMPUTE;
ggml_backend_view_init(node_red_2);
// Build 1-node cgraphs for the ADD ops
ggml_cgraph * cgraph_aux_1 = get_cgraph_aux();
cgraph_aux_1->nodes[0] = node_red_1;
cgraph_aux_1->n_nodes = 1;
step_cgraphs[j] = cgraph_aux_1;
ggml_cgraph * cgraph_aux_2 = get_cgraph_aux();
cgraph_aux_2->nodes[0] = node_red_2;
cgraph_aux_2->n_nodes = 1;
step_cgraphs[j_other] = cgraph_aux_2;
}
for (size_t j = 0; j < 2*offset_j_max; j++) {
// Execute local ADDs for this step
for (size_t j = 0; j < n_backends; j++) {
if (step_cgraphs[j] == nullptr) {
continue;
}
@@ -2044,20 +1906,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
return status;
}
}
i_buf++;
}
assert(i_buf == backend_ctx->n_reduce_steps);
// If n_backends is not a power of 2, copy back the reduced tensors to the excess:
for (size_t j = 2*offset_j_max; j < n_backends; j++) {
auto & bcj_src = backend_ctx->backend_configs[j - 2*offset_j_max];
auto & bcj_dst = backend_ctx->backend_configs[j];
ggml_tensor * node_src = bcj_src.cgraphs[i].cgraph_main->nodes[bcj_src.cgraphs[i].cgraph_main->n_nodes - 1];
ggml_tensor * node_dst = bcj_dst.cgraphs[i].cgraph_main->nodes[bcj_dst.cgraphs[i].cgraph_main->n_nodes - 1];
ggml_backend_tensor_copy_async(bcj_src.backend, bcj_dst.backend, node_src, node_dst);
}
return GGML_STATUS_SUCCESS;
};
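The butterfly loop above pairs backend j with j ^ offset_j and doubles offset_j each step, so every backend holds the full sum after ceil(log2(n_backends)) steps — the same count that n_reduce_steps() returns. A small standalone sketch of just the pairing schedule, assuming a power-of-two backend count (the names here are illustrative, nothing from ggml):

#include <cstdio>

int main() {
    const size_t n_backends = 4; // assumed power of two for this sketch
    size_t n_steps = 0;
    for (size_t offset = 1; offset < n_backends; offset *= 2, n_steps++) {
        for (size_t j = 0; j < n_backends; j++) {
            const size_t j_other = j ^ offset;
            if (j_other > j) { // print each exchanging pair once
                printf("step %zu: backend %zu <-> backend %zu\n", n_steps, j, j_other);
            }
        }
    }
    // prints step 0: (0,1) (2,3); step 1: (0,2) (1,3); n_steps == ceil(log2(4)) == 2
    printf("steps: %zu\n", n_steps);
    return 0;
}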
@@ -2100,8 +1949,8 @@ static const ggml_backend_i ggml_backend_meta_i = {
/* .free = */ ggml_backend_meta_free,
/* .set_tensor_async = */ ggml_backend_meta_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_meta_get_tensor_async,
/* .set_tensor_2d_async = */ nullptr,
/* .get_tensor_2d_async = */ nullptr,
/* .set_tensor_2d_async = */ nullptr,
/* .cpy_tensor_async = */ nullptr,
/* .synchronize = */ ggml_backend_meta_synchronize,
/* .graph_plan_create = */ nullptr,

View File

@@ -181,12 +181,6 @@ struct ggml_backend_registry {
return;
}
for (auto & entry : backends) {
if (entry.reg == reg) {
return;
}
}
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
@@ -198,12 +192,6 @@ struct ggml_backend_registry {
}
void register_device(ggml_backend_dev_t device) {
for (auto & dev : devices) {
if (dev == device) {
return;
}
}
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
#endif

View File

@@ -262,9 +262,9 @@ static struct ggml_backend_i blas_backend_i = {
/* .get_name = */ ggml_backend_blas_get_name,
/* .free = */ ggml_backend_blas_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,

View File

@@ -25,7 +25,6 @@
#include "ggml-impl.h"
#include "ggml.h"
#include <aclnnop/aclnn_add.h>
#include <aclnnop/aclnn_add_rms_norm.h>
#include <aclnnop/aclnn_addcdiv.h>
@@ -46,9 +45,7 @@
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
#include <aclnnop/aclnn_ger.h>
#include <aclnnop/aclnn_group_norm.h>
#include <aclnnop/aclnn_gather_v2.h>
#include <aclnnop/aclnn_grouped_matmul_v3.h>
#include <aclnnop/aclnn_scatter.h>
#include <aclnnop/aclnn_gt_scalar.h>
#include <aclnnop/aclnn_im2col.h>
#include <aclnnop/aclnn_index_copy.h>
@@ -65,7 +62,6 @@
#include <aclnnop/aclnn_permute.h>
#include <aclnnop/aclnn_pow.h>
#include <aclnnop/aclnn_pow_tensor_tensor.h>
#include <aclnnop/aclnn_recurrent_gated_delta_rule.h>
#include <aclnnop/aclnn_reduce_sum.h>
#include <aclnnop/aclnn_reflection_pad1d.h>
#include <aclnnop/aclnn_repeat.h>
@@ -73,15 +69,11 @@
#include <aclnnop/aclnn_rms_norm.h>
#include <aclnnop/aclnn_roll.h>
#include <aclnnop/aclnn_softmax.h>
#include <aclnnop/aclnn_softmax_cross_entropy_with_logits.h>
#include <aclnnop/aclnn_sub.h>
#include <aclnnop/aclnn_sum.h>
#include <aclnnop/aclnn_threshold.h>
#include <aclnnop/aclnn_tril.h>
#include <aclnnop/aclnn_triangular_solve.h>
#include <aclnnop/aclnn_triu.h>
#include <aclnnop/aclnn_logical_not.h>
#include <aclnnop/aclnn_masked_fill_scalar.h>
#include <aclnnop/aclnn_upsample_nearest_2d.h>
#include <aclnnop/aclnn_weight_quant_batch_matmul_v2.h>
#include <aclnnop/aclnn_zero.h>
@@ -159,107 +151,6 @@ void ggml_cann_op_unary_gated(std::function<void(ggml_backend_cann_context &, ac
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_dst.get(), acl_src1.get());
}
// Fused SwiGLU using aclnnSwiGlu: splits input along innermost dim, applies
// SiLU to left half, multiplies by right half.
//
// Falls back to the generic two-kernel path when src[1] != nullptr (two
// independent halves) or swapped != 0 (reversed activation order), as
// aclnnSwiGlu only handles the single interleaved tensor in standard order.
//
// CANN tiling for SwiGlu requires (storageShapeDim + viewDims) to be even.
// aclCreateTensor always uses storageShapeDim=1, so viewDims must be odd.
// We use a 3D view (1+3=4, even) to satisfy this constraint while preserving
// correct split semantics along the innermost (ne[0]) dimension.
void ggml_cann_swiglu(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
auto silu_fn = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
GGML_CANN_CALL_ACLNN_OP(ctx, Silu, acl_src, acl_dst);
};
const int32_t swapped = ggml_get_op_params_i32(dst, 1);
if (dst->src[1] != nullptr || swapped != 0) {
ggml_cann_op_unary_gated(silu_fn, ctx, dst);
return;
}
// aclnnSwiGlu requires the split dim (src->ne[0]) to be even; fall back otherwise.
if (dst->src[0]->ne[0] % 2 != 0) {
ggml_cann_op_unary_gated(silu_fn, ctx, dst);
return;
}
ggml_tensor * src0 = dst->src[0];
size_t elem_size = ggml_element_size(src0);
// src0 GGML: [2*ne0, ne1, ne2, ne3] → 3D view [2*ne0, ne1, ne2*ne3]
// CANN reversed: [ne2*ne3, ne1, 2*ne0], split along CANN dim 2 (last).
int64_t ne0_x2 = src0->ne[0];
int64_t ne1 = src0->ne[1];
int64_t ne23 = src0->ne[2] * src0->ne[3];
int64_t src3d_ne[] = { ne0_x2, ne1, ne23 };
size_t src3d_nb[] = { (size_t)src0->nb[0], (size_t)src0->nb[1], (size_t)src0->nb[2] };
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0->data, ggml_cann_type_mapping(src0->type),
elem_size, src3d_ne, src3d_nb, 3);
// dst GGML: [ne0, ne1, ne2, ne3] → 3D view [ne0, ne1, ne2*ne3]
int64_t ne0 = dst->ne[0];
int64_t dst3d_ne[] = { ne0, ne1, ne23 };
size_t dst3d_nb[] = { (size_t)dst->nb[0], (size_t)dst->nb[1], (size_t)dst->nb[2] };
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst->data, ggml_cann_type_mapping(dst->type),
elem_size, dst3d_ne, dst3d_nb, 3);
// CANN tensor [ne23, ne1, 2*ne0]: split along CANN dim 2 (last) = 2*ne0.
GGML_CANN_CALL_ACLNN_OP(ctx, SwiGlu, acl_src.get(), (int64_t)2, acl_dst.get());
}
// Fused GeGLU using aclnnGeGluV3: splits input along ne[0] (CANN last dim),
// activates the LEFT half with GELU, multiplies by right half.
// approximate: 0=tanh, 1=none(erf). activateLeft=true matches GGML convention.
// outGelu is a required-but-discard output buffer.
//
// Falls back to the generic two-kernel path when src[1] != nullptr (two
// independent halves) or swapped != 0 (reversed activation order), as
// aclnnGeGluV3 only handles the single interleaved tensor in standard order.
void ggml_cann_geglu(ggml_backend_cann_context & ctx, ggml_tensor * dst, int64_t approximate) {
auto gelu_fn = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
GGML_CANN_CALL_ACLNN_OP(ctx, Gelu, acl_src, acl_dst);
};
const int32_t swapped = ggml_get_op_params_i32(dst, 1);
if (dst->src[1] != nullptr || swapped != 0) {
ggml_cann_op_unary_gated(gelu_fn, ctx, dst);
return;
}
// aclnnGeGluV3 requires the split dim (src->ne[0]) to be even; fall back otherwise.
if (dst->src[0]->ne[0] % 2 != 0) {
ggml_cann_op_unary_gated(gelu_fn, ctx, dst);
return;
}
ggml_tensor * src0 = dst->src[0];
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
// Allocate a temporary buffer for the required outGelu output (same shape as dst).
// Build contiguous strides since the pool allocation is a fresh buffer.
size_t elem_size = ggml_element_size(dst);
int64_t ne[GGML_MAX_DIMS] = { dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3] };
size_t nb[GGML_MAX_DIMS];
nb[0] = elem_size;
for (int i = 1; i < GGML_MAX_DIMS; i++) {
nb[i] = nb[i - 1] * ne[i - 1];
}
size_t gelu_out_size = nb[GGML_MAX_DIMS - 1] * ne[GGML_MAX_DIMS - 1];
ggml_cann_pool_alloc gelu_out_alloc(ctx.pool(), gelu_out_size);
acl_tensor_ptr acl_gelu_out = ggml_cann_create_tensor(
gelu_out_alloc.get(), ggml_cann_type_mapping(dst->type), elem_size, ne, nb, GGML_MAX_DIMS);
// V3 adds activateLeft param; true → Gelu(left)*right, matching GGML convention.
// GGML dim 0 → CANN last dim (index GGML_MAX_DIMS-1 = 3 for 4D tensor).
GGML_CANN_CALL_ACLNN_OP(ctx, GeGluV3, acl_src.get(), (int64_t)(GGML_MAX_DIMS - 1), approximate, true,
acl_dst.get(), acl_gelu_out.get());
}
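Both fused paths above lean on the same convention spelled out in their comments: ggml_cann_create_tensor presents a GGML tensor with its dimensions reversed, so the innermost GGML dimension ne[0] becomes the last CANN dimension. A one-liner illustrating that index mapping, assuming plain dense tensors (this only illustrates the convention, it is not CANN API code):

// GGML dim d of an n_dims-dimensional tensor maps to CANN dim (n_dims - 1 - d).
// E.g. for 4-D tensors GGML dim 0 -> CANN dim 3, which is why the GeGLU call above
// passes GGML_MAX_DIMS - 1 as the split dimension, and the 3-D SwiGLU view splits along CANN dim 2.
static int ggml_dim_to_cann_dim(int n_dims, int ggml_dim) {
    return n_dims - 1 - ggml_dim;
}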
/**
* @brief Repeats elements of a tensor along each dimension according to the
* specified repeat array.
@@ -554,33 +445,28 @@ void ggml_cann_l2_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes);
void * buffer = temp_buffer_allocator.get();
int64_t norm_ne[] = { 1, src->ne[1], src->ne[2], src->ne[3] };
size_t norm_nb[GGML_MAX_DIMS];
norm_nb[0] = sizeof(float);
int64_t div_ne[] = { 1, src->ne[1], src->ne[2], src->ne[3] };
size_t div_nb[GGML_MAX_DIMS];
div_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
norm_nb[i] = norm_nb[i - 1] * norm_ne[i - 1];
div_nb[i] = div_nb[i - 1] * div_ne[i - 1];
}
acl_tensor_ptr acl_norm = ggml_cann_create_tensor(buffer, ACL_FLOAT, sizeof(float), norm_ne, norm_nb, GGML_MAX_DIMS);
acl_tensor_ptr acl_div = ggml_cann_create_tensor(buffer, ACL_FLOAT, type_size, div_ne, div_nb, GGML_MAX_DIMS);
std::vector<int64_t> norm_dims = { 3 };
acl_int_array_ptr dims_array = ggml_cann_create_int_array(norm_dims.data(), norm_dims.size());
float p_value = 2.0f;
acl_scalar_ptr p_scalar = ggml_cann_create_scalar(&p_value, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, Norm, acl_src.get(), p_scalar.get(), dims_array.get(), true, acl_norm.get());
GGML_CANN_CALL_ACLNN_OP(ctx, Norm, acl_src.get(), p_scalar.get(), dims_array.get(), true, acl_div.get());
ggml_cann_pool_alloc clamp_buffer_allocator(ctx.pool());
acl_tensor_ptr acl_clamped;
// Clamp norm to at least eps: scale = 1/fmaxf(norm, eps)
acl_scalar_ptr acl_min = ggml_cann_create_scalar(&eps, aclDataType::ACL_FLOAT);
float flt_max = FLT_MAX;
acl_scalar_ptr acl_max = ggml_cann_create_scalar(&flt_max, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, Clamp, acl_div.get(), acl_min.get(), acl_max.get(), acl_div.get());
if (eps > 0.0f) {
void * clamp_buf = clamp_buffer_allocator.alloc(n_bytes);
acl_clamped = ggml_cann_create_tensor(clamp_buf, ACL_FLOAT, sizeof(float), norm_ne, norm_nb, GGML_MAX_DIMS);
acl_scalar_ptr eps_scalar = ggml_cann_create_scalar(&eps, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, ClampMin, acl_norm.get(), eps_scalar.get(), acl_clamped.get());
}
aclTensor * acl_div_input = acl_clamped ? acl_clamped.get() : acl_norm.get();
GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src.get(), acl_div_input, acl_dst.get());
GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src.get(), acl_div.get(), acl_dst.get());
}
void ggml_cann_cross_entropy_loss(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
@@ -596,30 +482,56 @@ void ggml_cann_cross_entropy_loss(ggml_backend_cann_context & ctx, ggml_tensor *
logits_nb[1] = logits_nb[0] * logits_ne[0];
acl_tensor_ptr acl_logits = ggml_cann_create_tensor(src0->data, ACL_FLOAT, sizeof(float), logits_ne, logits_nb, 2);
size_t log_softmax_type_size = sizeof(float);
int64_t log_softmax_n_bytes = nr * nc * log_softmax_type_size;
ggml_cann_pool_alloc log_softmax_allocator(ctx.pool(), log_softmax_n_bytes);
void * log_softmax_buffer = log_softmax_allocator.get();
int64_t log_softmax_ne[] = { nc, nr };
size_t log_softmax_nb[2];
log_softmax_nb[0] = log_softmax_type_size;
log_softmax_nb[1] = log_softmax_nb[0] * log_softmax_ne[0];
acl_tensor_ptr acl_log_softmax = ggml_cann_create_tensor(log_softmax_buffer, ACL_FLOAT, log_softmax_type_size,
log_softmax_ne, log_softmax_nb, 2);
GGML_CANN_CALL_ACLNN_OP(ctx, LogSoftmax, acl_logits.get(), 1, acl_log_softmax.get());
int64_t labels_ne[] = { nc, nr };
size_t labels_nb[2];
labels_nb[0] = ggml_type_size(src1->type);
labels_nb[1] = labels_nb[0] * labels_ne[0];
acl_tensor_ptr acl_labels = ggml_cann_create_tensor(src1->data, ACL_FLOAT, sizeof(float), labels_ne, labels_nb, 2);
size_t loss_per_sample_type_size = sizeof(float);
int64_t loss_per_sample_n_bytes = nr * loss_per_sample_type_size;
ggml_cann_pool_alloc loss_per_sample_allocator(ctx.pool(), loss_per_sample_n_bytes);
void * loss_per_sample_buffer = loss_per_sample_allocator.get();
size_t mul_type_size = sizeof(float);
int64_t mul_n_bytes = nr * nc * mul_type_size;
ggml_cann_pool_alloc mul_allocator(ctx.pool(), mul_n_bytes);
void * mul_buffer = mul_allocator.get();
int64_t loss_per_sample_ne[] = { nr };
size_t loss_per_sample_nb[1];
loss_per_sample_nb[0] = loss_per_sample_type_size;
acl_tensor_ptr acl_loss_per_sample = ggml_cann_create_tensor(
loss_per_sample_buffer, ACL_FLOAT, loss_per_sample_type_size, loss_per_sample_ne, loss_per_sample_nb, 1);
int64_t mul_ne[] = { nc, nr };
size_t mul_nb[2];
mul_nb[0] = mul_type_size;
mul_nb[1] = mul_nb[0] * mul_ne[0];
acl_tensor_ptr acl_mul_result = ggml_cann_create_tensor(mul_buffer, ACL_FLOAT, mul_type_size, mul_ne, mul_nb, 2);
size_t backprop_n_bytes = nr * nc * sizeof(float);
ggml_cann_pool_alloc backprop_allocator(ctx.pool(), backprop_n_bytes);
void * backprop_buffer = backprop_allocator.get();
acl_tensor_ptr acl_backprop = ggml_cann_create_tensor(backprop_buffer, ACL_FLOAT, sizeof(float), logits_ne, logits_nb, 2);
GGML_CANN_CALL_ACLNN_OP(ctx, Mul, acl_log_softmax.get(), acl_labels.get(), acl_mul_result.get());
GGML_CANN_CALL_ACLNN_OP(ctx, SoftmaxCrossEntropyWithLogits, acl_logits.get(), acl_labels.get(),
acl_loss_per_sample.get(), acl_backprop.get());
size_t sum_per_sample_type_size = sizeof(float);
int64_t sum_per_sample_n_bytes = nr * sum_per_sample_type_size;
ggml_cann_pool_alloc sum_per_sample_allocator(ctx.pool(), sum_per_sample_n_bytes);
void * sum_per_sample_buffer = sum_per_sample_allocator.get();
int64_t sum_per_sample_ne[] = { nr };
size_t sum_per_sample_nb[1];
sum_per_sample_nb[0] = sum_per_sample_type_size;
acl_tensor_ptr acl_sum_per_sample = ggml_cann_create_tensor(
sum_per_sample_buffer, ACL_FLOAT, sum_per_sample_type_size, sum_per_sample_ne, sum_per_sample_nb, 1);
std::vector<int64_t> sum_dims = { 1 };
acl_int_array_ptr dims_array = ggml_cann_create_int_array(sum_dims.data(), sum_dims.size());
bool keep_dims = false;
GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_mul_result.get(), dims_array.get(), keep_dims, ACL_FLOAT,
acl_sum_per_sample.get());
size_t total_sum_type_size = sizeof(float);
int64_t total_sum_n_bytes = 1 * total_sum_type_size;
@@ -635,12 +547,11 @@ void ggml_cann_cross_entropy_loss(ggml_backend_cann_context & ctx, ggml_tensor *
std::vector<int64_t> total_sum_dims = { 0 };
acl_int_array_ptr total_sum_dims_array = ggml_cann_create_int_array(total_sum_dims.data(), total_sum_dims.size());
bool keep_dims = false;
GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_loss_per_sample.get(), total_sum_dims_array.get(), keep_dims, ACL_FLOAT,
GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_sum_per_sample.get(), total_sum_dims_array.get(), keep_dims, ACL_FLOAT,
acl_total_sum.get());
float value = 1.0f / static_cast<float>(nr);
float value = -1.0f / static_cast<float>(nr);
acl_scalar_ptr scale_factor = ggml_cann_create_scalar(&value, aclDataType::ACL_FLOAT);
acl_tensor_ptr acl_dst =
ggml_cann_create_tensor(dst->data, ACL_FLOAT, sizeof(float), total_sum_ne, total_sum_nb, 1);
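// Reference formula for the value produced above (illustrative note, not part of the
// kernel): with nr rows and nc classes,
//     loss = -(1/nr) * sum_i sum_c labels[i][c] * log_softmax(logits[i])[c]
// The LogSoftmax + Mul + ReduceSum path reaches it by scaling the summed products with
// -1/nr, while the SoftmaxCrossEntropyWithLogits path already yields positive
// per-sample losses and therefore scales by +1/nr; both compute the same mean loss.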
@@ -678,33 +589,6 @@ void ggml_cann_group_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
acl_mean_out.get(), acl_rstd_out.get());
}
void ggml_cann_set(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0];
ggml_tensor * src1 = dst->src[1];
size_t nb1 = ((int32_t *) dst->op_params)[0];
size_t nb2 = ((int32_t *) dst->op_params)[1];
size_t nb3 = ((int32_t *) dst->op_params)[2];
size_t offset = ((int32_t *) dst->op_params)[3];
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
size_t param_nb[] = { ggml_element_size(src0), nb1, nb2, nb3 };
// Create a view of dst at the target offset with src1's dimensions
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
acl_tensor_ptr acl_src1 = ggml_cann_create_tensor(src1);
if (!inplace) {
// First copy src0 to dst entirely
size_t cpy_size = ggml_nbytes(dst);
ACL_CHECK(
aclrtMemcpyAsync(dst->data, cpy_size, src0->data, cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
}
// Copy src1 into the target region of dst
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_dst.get(), acl_src1.get());
}
void ggml_cann_acc(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0];
ggml_tensor * src1 = dst->src[1];
@@ -768,113 +652,6 @@ void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
aclnn_reduce_sum(ctx, dst, reduce_dims, 4);
}
void ggml_cann_cumsum(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src = dst->src[0];
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
// GGML cumsum operates along dim 0 (innermost / ne[0]).
// ggml_cann_create_tensor reverses dimensions to [ne3,ne2,ne1,ne0],
// so GGML dim 0 maps to CANN dim 3 (the last dim of the 4-D tensor).
GGML_CANN_CALL_ACLNN_OP(ctx, Cumsum, acl_src.get(), (int64_t)3,
ggml_cann_type_mapping(dst->type), acl_dst.get());
}
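// Illustrative sketch of the dim mapping used above (assumes GGML_MAX_DIMS == 4):
// ggml_cann_create_tensor stores dims reversed, so a ggml dim d maps to ACL dim
// (GGML_MAX_DIMS - 1 - d); cumsum over ggml dim 0 therefore runs over ACL dim 3.
static inline int64_t ggml_dim_to_acl_dim(int64_t ggml_dim, int64_t n_dims) {
    return n_dims - 1 - ggml_dim;   // e.g. ggml_dim_to_acl_dim(0, 4) == 3
}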
void ggml_cann_solve_tri(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0]; // A: [N, N, B2, B3] lower triangular
ggml_tensor * src1 = dst->src[1]; // B: [K, N, B2, B3]
acl_tensor_ptr acl_a = ggml_cann_create_tensor(src0);
acl_tensor_ptr acl_b = ggml_cann_create_tensor(src1);
acl_tensor_ptr acl_x = ggml_cann_create_tensor(dst);
// mOut: triangular copy of A (required output), same shape as A.
const size_t a_bytes = ggml_nbytes(src0);
ggml_cann_pool_alloc m_alloc(ctx.pool(), a_bytes);
acl_tensor_ptr acl_m = ggml_cann_create_tensor(
m_alloc.get(), ggml_cann_type_mapping(src0->type),
ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
// Solve AX = B: upper=false (lower tri), transpose=false, unitriangular=false.
GGML_CANN_CALL_ACLNN_OP(ctx, TriangularSolve,
acl_b.get(), acl_a.get(), false, false, false,
acl_x.get(), acl_m.get());
}
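// Reference semantics of TriangularSolve for one N x N lower-triangular system
// (illustrative sketch, row-major buffers, no batching; helper name is hypothetical):
// solves A * x = b by forward substitution, which is what the op computes for every
// column of B in every batch slice.
static void solve_lower_tri_ref(const float * A, const float * b, float * x, int64_t N) {
    for (int64_t i = 0; i < N; i++) {
        float acc = b[i];
        for (int64_t j = 0; j < i; j++) {
            acc -= A[i*N + j] * x[j];   // subtract already-solved components
        }
        x[i] = acc / A[i*N + i];        // diagonal assumed non-zero
    }
}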
void ggml_cann_diag(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src = dst->src[0];
GGML_ASSERT(src->ne[1] == 1);
const int64_t N = src->ne[0];
const int64_t n_batch = src->ne[2] * src->ne[3];
const size_t nb_f32 = sizeof(float);
// Fill dst with zeros.
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
{
float zero = 0.0f;
acl_scalar_ptr acl_zero = ggml_cann_create_scalar(&zero, ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst.get(), acl_zero.get());
}
// Copy src vector onto the diagonal of dst via strided views.
// src viewed as [N, n_batch], contiguous strides.
int64_t ne_vec[2] = { N, n_batch };
size_t nb_src_vec[2] = { nb_f32, N * nb_f32 };
// dst diagonal view: byte stride (N+1)*sizeof(float) along the diagonal.
size_t nb_dst_diag[2] = { (N + 1) * nb_f32, N * N * nb_f32 };
acl_tensor_ptr acl_src_vec = ggml_cann_create_tensor(src->data, ACL_FLOAT, nb_f32, ne_vec, nb_src_vec, 2);
acl_tensor_ptr acl_dst_diag = ggml_cann_create_tensor(dst->data, ACL_FLOAT, nb_f32, ne_vec, nb_dst_diag, 2);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_dst_diag.get(), acl_src_vec.get());
}
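// Offset arithmetic behind the strided diagonal view above (illustrative, helper name
// hypothetical): element k of batch b lands at byte offset (b*N*N + k*(N+1)) * sizeof(float)
// in dst, which is exactly what nb_dst_diag = { (N+1)*sizeof(float), N*N*sizeof(float) } encodes.
static inline size_t diag_elem_offset(int64_t b, int64_t k, int64_t N) {
    return (size_t) (b * N * N + k * (N + 1)) * sizeof(float);
}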
void ggml_cann_fill(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
float c = ggml_get_op_params_f32(dst, 0);
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
acl_scalar_ptr acl_c = ggml_cann_create_scalar(&c, ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst.get(), acl_c.get());
}
void ggml_cann_tri(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src = dst->src[0];
const int64_t S = src->ne[0];
const int64_t n_batch = src->ne[2] * src->ne[3];
const size_t nb_f32 = sizeof(float);
int64_t ne3d[3] = { S, S, n_batch };
size_t nb3d[3] = { nb_f32, S * nb_f32, S * S * nb_f32 };
const ggml_tri_type ttype = (ggml_tri_type) ggml_get_op_params_i32(dst, 0);
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src->data, ACL_FLOAT, nb_f32, ne3d, nb3d, 3);
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst->data, ACL_FLOAT, nb_f32, ne3d, nb3d, 3);
switch (ttype) {
case GGML_TRI_TYPE_LOWER:
// Tril(-1): preserve row > col (strict lower), zero upper + diagonal.
GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_src.get(), (int64_t)-1, acl_dst.get());
break;
case GGML_TRI_TYPE_UPPER_DIAG:
// Triu(0): preserve row <= col (upper + diagonal), zero strict lower.
GGML_CANN_CALL_ACLNN_OP(ctx, Triu, acl_src.get(), (int64_t)0, acl_dst.get());
break;
case GGML_TRI_TYPE_UPPER:
// Triu(1): preserve row < col (strict upper), zero lower + diagonal.
GGML_CANN_CALL_ACLNN_OP(ctx, Triu, acl_src.get(), (int64_t)1, acl_dst.get());
break;
case GGML_TRI_TYPE_LOWER_DIAG:
// Tril(0): preserve row >= col (lower + diagonal), zero strict upper.
GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_src.get(), (int64_t)0, acl_dst.get());
break;
default:
GGML_ABORT("unsupported tri type");
}
}
void ggml_cann_upsample_nearest2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src = dst->src[0];
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
@@ -1918,90 +1695,152 @@ void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
aclnn_softmax(ctx, softmax_tensor.get(), 3, acl_dst.get());
}
/**
* @brief Performs index select operation on a 4D tensor using the CANN backend.
*
* This function applies the `IndexSelect` operation along a specific dimension
* of the source tensor (`src_buffer`) using the indices from the index tensor (`index`).
* It iterates over the last two dimensions of the source tensor, creates the corresponding
* CANN tensors for the source, index, and output slices, and executes the `IndexSelect`
* operation for each slice.
*
* @param ctx The context for CANN backend operations.
* @param src_buffer The source buffer containing the 4D input tensor data.
* @param src_ne The dimensions of the source tensor.
* @param src_nb The strides (byte offsets) of the source tensor.
* @param dst_buffer The destination buffer where the output tensor data will be written.
* @param dst_ne The dimensions of the destination tensor.
* @param dst_nb The strides (byte offsets) of the destination tensor.
* @param index The index tensor specifying the indices to select from the source tensor.
* @param type The data type of the source and destination tensors.
*/
static void aclnn_index_select_4d(ggml_backend_cann_context & ctx,
void * src_buffer,
int64_t * src_ne,
size_t * src_nb,
void * dst_buffer,
int64_t * dst_ne,
size_t * dst_nb,
ggml_tensor * index,
ggml_type type) {
for (int64_t i = 0; i < src_ne[3]; i++) {
for (int64_t j = 0; j < src_ne[2]; j++) {
// src
acl_tensor_ptr acl_src_tensor =
ggml_cann_create_tensor((char *) src_buffer + i * src_nb[3] + j * src_nb[2],
ggml_cann_type_mapping(type), ggml_type_size(type), src_ne, src_nb, 2);
// index
acl_tensor_ptr acl_index = ggml_cann_create_tensor(
(char *) index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
ggml_cann_type_mapping(index->type), ggml_element_size(index), index->ne, index->nb, 1);
// out
acl_tensor_ptr acl_out =
ggml_cann_create_tensor((char *) dst_buffer + i * dst_nb[3] + j * dst_nb[2],
ggml_cann_type_mapping(type), ggml_type_size(type), dst_ne, dst_nb, 2);
GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, acl_src_tensor.get(), 0, acl_index.get(), acl_out.get());
}
}
}
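// Per-slice semantics of the IndexSelect call above (illustrative sketch assuming
// float data and int64 indices; helper name is hypothetical): for every (i, j) batch
// slice, output row r is a copy of source row index[r].
static void index_select_rows_ref(const float * src, const int64_t * index,
                                  float * dst, int64_t n_rows_out, int64_t row_len) {
    for (int64_t r = 0; r < n_rows_out; r++) {
        const float * src_row = src + index[r] * row_len;
        for (int64_t c = 0; c < row_len; c++) {
            dst[r * row_len + c] = src_row[c];
        }
    }
}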
/**
* @brief Performs inplace index copy operation on a 4D tensor using the CANN backend.
*
* This function applies the `IndexCopy` operation along a specific dimension of the
* destination tensor (`dst_buffer`) by copying elements from the source tensor (`src_buffer`)
* to positions specified by the index tensor (`index`).
* It iterates over the last two dimensions of the tensors, creates the corresponding
* CANN tensors for source, index, and destination slices, and performs the index copy
* operation for each slice.
*
* @param ctx The context for CANN backend operations.
* @param src_buffer The source buffer containing the 4D input tensor data to be copied.
* @param src_ne The dimensions of the source tensor.
* @param src_nb The strides (byte offsets) of the source tensor.
* @param dst_buffer The destination buffer where values will be copied to.
* @param dst_ne The dimensions of the destination tensor.
* @param dst_nb The strides (byte offsets) of the destination tensor.
* @param index The index tensor specifying target positions in the destination tensor.
* @param type The data type of the source and destination tensors.
*/
static void aclnn_index_copy_4d(ggml_backend_cann_context & ctx,
void * src_buffer,
int64_t * src_ne,
size_t * src_nb,
void * dst_buffer,
int64_t * dst_ne,
size_t * dst_nb,
ggml_tensor * index,
ggml_type type) {
for (int64_t i = 0; i < src_ne[3]; i++) {
for (int64_t j = 0; j < src_ne[2]; j++) {
// src
acl_tensor_ptr acl_src_tensor =
ggml_cann_create_tensor((char *) src_buffer + i * src_nb[3] + j * src_nb[2],
ggml_cann_type_mapping(type), ggml_type_size(type), src_ne, src_nb, 2);
// index
acl_tensor_ptr acl_index = ggml_cann_create_tensor(
(char *) index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
ggml_cann_type_mapping(index->type), ggml_element_size(index), index->ne, index->nb, 1);
// out
acl_tensor_ptr acl_out =
ggml_cann_create_tensor((char *) dst_buffer + i * dst_nb[3] + j * dst_nb[2],
ggml_cann_type_mapping(type), ggml_type_size(type), dst_ne, dst_nb, 2);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexCopy, acl_out.get(), 0, acl_index.get(), acl_src_tensor.get());
}
}
}
void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0]; // weight
ggml_tensor * src0 = dst->src[0]; // src
ggml_tensor * src1 = dst->src[1]; // index
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16
|| dst->type == GGML_TYPE_BF16);
// n_idx: number of row indices per (i2, i3) batch slice.
// ggml guarantees: src0->ne[2] == src1->ne[1], src0->ne[3] == src1->ne[2], src1->ne[3] == 1.
const int64_t n_idx = src1->ne[0];
// Gather all (i2, i3) batch slices from src into dst.
// ggml_cann_create_tensor reverses dims, so ACL sees [ne1, ne0].
// GatherV2 with dim=0 gathers along ACL dim-0 == ggml ne[1] (the vocabulary / row axis).
// nb: the 4 strides of the source buffer (nb[0..1] for the 2D slice shape,
// nb[2..3] for computing per-batch-slice base pointer offsets).
auto gather_batched = [&](void * src_base, aclDataType acl_type, size_t type_size,
const size_t * nb) {
int64_t src_ne[2] = { src0->ne[0], src0->ne[1] };
size_t src_nb_2d[2] = { nb[0], nb[1] };
int64_t dst_ne[2] = { src0->ne[0], n_idx };
size_t dst_nb_2d[2] = { dst->nb[0], dst->nb[1] };
int64_t idx_ne[1] = { n_idx };
size_t idx_nb[1] = { (size_t)ggml_element_size(src1) };
for (int64_t i3 = 0; i3 < src0->ne[3]; i3++) {
for (int64_t i2 = 0; i2 < src0->ne[2]; i2++) {
acl_tensor_ptr acl_src = ggml_cann_create_tensor(
(char *)src_base + i3 * nb[3] + i2 * nb[2],
acl_type, type_size, src_ne, src_nb_2d, 2);
acl_tensor_ptr acl_idx = ggml_cann_create_tensor(
(char *)src1->data + i3 * src1->nb[2] + i2 * src1->nb[1],
ggml_cann_type_mapping(src1->type), (size_t)ggml_element_size(src1),
idx_ne, idx_nb, 1);
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(
(char *)dst->data + i3 * dst->nb[3] + i2 * dst->nb[2],
acl_type, type_size, dst_ne, dst_nb_2d, 2);
GGML_CANN_CALL_ACLNN_OP(ctx, GatherV2, acl_src.get(), 0, acl_idx.get(), acl_dst.get());
}
}
};
switch (src0->type) {
case GGML_TYPE_BF16:
case GGML_TYPE_F16:
case GGML_TYPE_F32:
if (src0->type == dst->type) {
gather_batched(src0->data,
ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
src0->nb);
aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb, dst->data, dst->ne, dst->nb, src1,
dst->type);
} else {
// Cast src0 to dst type, then gather.
ggml_cann_pool_alloc src_cast_allocator(ctx.pool(),
ggml_nelements(src0) * ggml_element_size(dst));
size_t src_cast_nb[GGML_MAX_DIMS];
src_cast_nb[0] = ggml_type_size(dst->type);
acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * ggml_element_size(dst));
void * src_trans_buffer = src_buffer_allocator.get();
size_t src_trans_nb[GGML_MAX_DIMS];
src_trans_nb[0] = dst->nb[0];
for (int i = 1; i < GGML_MAX_DIMS; i++) {
src_cast_nb[i] = src_cast_nb[i - 1] * src0->ne[i - 1];
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
}
acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
acl_tensor_ptr acl_src_cast = ggml_cann_create_tensor(
src_cast_allocator.get(), ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
src0->ne, src_cast_nb, GGML_MAX_DIMS);
aclnn_cast(ctx, acl_src0.get(), acl_src_cast.get(), ggml_cann_type_mapping(dst->type));
gather_batched(src_cast_allocator.get(),
ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
src_cast_nb);
acl_tensor_ptr src_trans_tensor =
ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(dst->type),
ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
aclnn_cast(ctx, acl_src0.get(), src_trans_tensor.get(), ggml_cann_type_mapping(dst->type));
aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
dst->type);
}
break;
case GGML_TYPE_Q8_0:
{
// Dequantize Q8_0 to dst type, then gather.
// Add one extra dim so the fp16 scale (ne[0] = 1) broadcasts over the QK8_0 quants in the mul.
size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1], dequant_nb[GGML_MAX_DIMS + 1];
int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1], *dequant_ne;
weight_ne[0] = QK8_0;
weight_ne[1] = src0->ne[0] / QK8_0;
weight_nb[0] = sizeof(int8_t);
weight_nb[1] = weight_nb[0] * weight_ne[0];
int64_t scale_offset = 0;
// [3,4,5,64] -> [3,4,5,2,32]
weight_ne[0] = QK8_0;
weight_ne[1] = src0->ne[0] / QK8_0;
weight_nb[0] = sizeof(int8_t);
weight_nb[1] = weight_nb[0] * weight_ne[0];
for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
weight_ne[i] = src0->ne[i - 1];
weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
}
// [3,4,5,64] -> [3,4,5,2,1]
scale_ne[0] = 1;
scale_ne[1] = src0->ne[0] / QK8_0;
scale_nb[0] = sizeof(uint16_t);
@@ -2010,33 +1849,31 @@ void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
scale_ne[i] = src0->ne[i - 1];
scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
}
// [3,4,5,64] -> [3,4,5,2,32]
dequant_ne = weight_ne;
dequant_nb[0] = ggml_type_size(dst->type);
for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
}
const int64_t scale_offset = ggml_nelements(src0) * sizeof(int8_t);
ggml_cann_pool_alloc dequant_allocator(ctx.pool(),
ggml_nelements(src0) * ggml_type_size(dst->type));
acl_tensor_ptr acl_weight = ggml_cann_create_tensor(src0->data, ACL_INT8, sizeof(int8_t),
weight_ne, weight_nb, GGML_MAX_DIMS + 1);
acl_tensor_ptr acl_scale = ggml_cann_create_tensor(
src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
acl_tensor_ptr acl_dequant = ggml_cann_create_tensor(
dequant_allocator.get(), ggml_cann_type_mapping(dst->type),
ggml_type_size(dst->type), dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
aclnn_mul(ctx, acl_weight.get(), acl_scale.get(), acl_dequant.get());
// Reinterpret dequant buffer as 4D [src0->ne] with contiguous strides.
dequant_ne = src0->ne;
scale_offset = ggml_nelements(src0) * sizeof(int8_t);
ggml_cann_pool_alloc dequant_buffer_allocator(ctx.pool(),
ggml_nelements(src0) * ggml_type_size(dst->type));
acl_tensor_ptr acl_weight_tensor = ggml_cann_create_tensor(src0->data, ACL_INT8, sizeof(int8_t),
weight_ne, weight_nb, GGML_MAX_DIMS + 1);
acl_tensor_ptr acl_scale_tensor =
ggml_cann_create_tensor(src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
acl_tensor_ptr dequant_tensor =
ggml_cann_create_tensor(dequant_buffer_allocator.get(), ggml_cann_type_mapping(dst->type),
ggml_type_size(dst->type), dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
aclnn_mul(ctx, acl_weight_tensor.get(), acl_scale_tensor.get(), dequant_tensor.get());
dequant_nb[0] = ggml_type_size(dst->type);
dequant_ne = src0->ne;
for (int i = 1; i < GGML_MAX_DIMS; i++) {
dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
}
gather_batched(dequant_allocator.get(),
ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
dequant_nb);
aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(), dequant_ne, dequant_nb, dst->data, dst->ne,
dst->nb, src1, dst->type);
break;
}
default:
@@ -2046,70 +1883,31 @@ void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
}
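// Dequantization performed by the broadcast mul in the Q8_0 branch above (illustrative
// sketch, helper name hypothetical): a Q8_0 block stores QK8_0 = 32 int8 quants plus one
// fp16 scale, and each element dequantizes to scale * q. The extra 5th dim in the
// weight/scale views lets a single aclnn Mul broadcast the scale over its 32 quants.
static void dequant_q8_0_block_ref(const int8_t * q, float scale, float * out) {
    for (int i = 0; i < 32; i++) {   // 32 == QK8_0
        out[i] = scale * (float) q[i];
    }
}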
void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0]; // source values
ggml_tensor * src1 = dst->src[1]; // row indices
// n_idx: number of source rows to scatter per batch slice.
// ggml guarantees: src0->ne[1] == src1->ne[0].
const int64_t n_idx = src1->ne[0];
// Copy n_idx rows of src [ne0, n_idx] into dst [ne0, ne1] at positions given by a 1D index.
// ggml_cann_create_tensor reverses dims, so ACL sees [ne1, ne0] for dst.
// InplaceIndexCopy with dim=0 copies along ACL dim-0 == ggml ne[1] (the row axis).
// src_nb: the 4 strides of the source buffer (nb[0..1] for the 2D slice shape,
// nb[2..3] for computing per-batch-slice base pointer offsets).
auto scatter_batched = [&](void * src_base, aclDataType acl_type, size_t type_size,
const size_t * src_nb) {
int64_t d_ne[2] = { dst->ne[0], dst->ne[1] };
size_t d_nb[2] = { dst->nb[0], dst->nb[1] };
int64_t s_ne[2] = { dst->ne[0], n_idx };
size_t s_nb_2d[2] = { src_nb[0], src_nb[1] };
int64_t i_ne[1] = { n_idx };
size_t i_nb[1] = { (size_t)ggml_element_size(src1) };
for (int64_t i3 = 0; i3 < dst->ne[3]; i3++) {
for (int64_t i2 = 0; i2 < dst->ne[2]; i2++) {
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(
(char *)dst->data + i3 * dst->nb[3] + i2 * dst->nb[2],
acl_type, type_size, d_ne, d_nb, 2);
acl_tensor_ptr acl_idx = ggml_cann_create_tensor(
(char *)src1->data + (i3 % src1->ne[2]) * src1->nb[2] + (i2 % src1->ne[1]) * src1->nb[1],
ggml_cann_type_mapping(src1->type), (size_t)ggml_element_size(src1),
i_ne, i_nb, 1);
acl_tensor_ptr acl_src = ggml_cann_create_tensor(
(char *)src_base + i3 * src_nb[3] + i2 * src_nb[2],
acl_type, type_size, s_ne, s_nb_2d, 2);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexCopy, acl_dst.get(), 0, acl_idx.get(), acl_src.get());
}
}
};
ggml_tensor * src0 = dst->src[0]; // src
ggml_tensor * src1 = dst->src[1]; // index
switch (dst->type) {
case GGML_TYPE_F32:
scatter_batched(src0->data,
ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
src0->nb);
break;
{
aclnn_index_copy_4d(ctx, src0->data, src0->ne, src0->nb, dst->data, dst->ne, dst->nb, src1, dst->type);
break;
}
case GGML_TYPE_F16:
case GGML_TYPE_BF16:
{
// Cast src0 (F32) to dst type first.
ggml_cann_pool_alloc src_cast_allocator(ctx.pool(),
ggml_nelements(src0) * ggml_type_size(dst->type));
size_t src_cast_nb[GGML_MAX_DIMS];
src_cast_nb[0] = ggml_type_size(dst->type);
acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(uint16_t));
void * src_trans_buffer = src_buffer_allocator.get();
size_t src_trans_nb[GGML_MAX_DIMS];
src_trans_nb[0] = sizeof(uint16_t);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
src_cast_nb[i] = src_cast_nb[i - 1] * src0->ne[i - 1];
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
}
acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
acl_tensor_ptr acl_src_cast = ggml_cann_create_tensor(
src_cast_allocator.get(), ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
src0->ne, src_cast_nb, GGML_MAX_DIMS);
aclnn_cast(ctx, acl_src0.get(), acl_src_cast.get(), ggml_cann_type_mapping(dst->type));
scatter_batched(src_cast_allocator.get(),
ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
src_cast_nb);
acl_tensor_ptr src_trans_tensor = ggml_cann_create_tensor(
src_trans_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
aclnn_cast(ctx, acl_src0.get(), src_trans_tensor.get(), ggml_cann_type_mapping(dst->type));
aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
dst->type);
break;
}
default:
@@ -3470,50 +3268,29 @@ void ggml_cann_pad_reflect_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst
int64_t paddingsArray[2] = { opts[0], opts[1] };
acl_int_array_ptr paddings = ggml_cann_create_int_array(paddingsArray, 2);
// Collapsing ne[2]*ne[3] into a single batch dimension requires that dim3
// is contiguous with respect to dim2 in both src and dst.
GGML_ASSERT(src0->nb[3] == src0->nb[2] * src0->ne[2]);
GGML_ASSERT(dst->nb[3] == dst->nb[2] * dst->ne[2]);
for (int64_t i = 0; i < src0->ne[3]; i++) {
acl_tensor_ptr acl_src =
ggml_cann_create_tensor((char *) src0->data + i * src0->ne[3], ggml_cann_type_mapping(src0->type),
ggml_element_size(src0), src0->ne, src0->nb, 3);
int64_t src_ne_3d[3] = { src0->ne[0], src0->ne[1], src0->ne[2] * src0->ne[3] };
int64_t dst_ne_3d[3] = { dst->ne[0], dst->ne[1], dst->ne[2] * dst->ne[3] };
acl_tensor_ptr acl_dst =
ggml_cann_create_tensor((char *) dst->data + i * src0->ne[3], ggml_cann_type_mapping(dst->type),
ggml_element_size(dst), dst->ne, dst->nb, 3);
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0->data, ggml_cann_type_mapping(src0->type),
ggml_element_size(src0), src_ne_3d, src0->nb, 3);
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst->data, ggml_cann_type_mapping(dst->type),
ggml_element_size(dst), dst_ne_3d, dst->nb, 3);
GGML_CANN_CALL_ACLNN_OP(ctx, ReflectionPad1d, acl_src.get(), paddings.get(), acl_dst.get());
GGML_CANN_CALL_ACLNN_OP(ctx, ReflectionPad1d, acl_src.get(), paddings.get(), acl_dst.get());
}
}
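// Why the contiguity asserts above make the ne[2]*ne[3] collapse valid (illustrative,
// helper name hypothetical): when nb[3] == nb[2]*ne[2], the offset of slice (i2, i3) is
//     i3*nb[3] + i2*nb[2] == (i3*ne[2] + i2) * nb[2]
// i.e. the same as indexing one merged batch dim of size ne[2]*ne[3] with stride nb[2].
static inline size_t collapsed_batch_offset(int64_t i2, int64_t i3, int64_t ne2, size_t nb2) {
    return (size_t) (i3 * ne2 + i2) * nb2;
}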
void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0];
ggml_tensor * src1 = dst->src[1];
// Write element-wise equality (0 or 1) into a temporary buffer to avoid
// modifying src0 in-place. Use the same type as src0 so ReduceSum can
// consume it directly without a type cast.
ggml_cann_pool_alloc eq_alloc(ctx.pool(), ggml_nelements(src0) * ggml_element_size(src0));
size_t eq_nb[GGML_MAX_DIMS];
eq_nb[0] = ggml_element_size(src0);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
eq_nb[i] = eq_nb[i - 1] * src0->ne[i - 1];
}
acl_tensor_ptr acl_eq = ggml_cann_create_tensor(
eq_alloc.get(), ggml_cann_type_mapping(src0->type), ggml_element_size(src0),
src0->ne, eq_nb, GGML_MAX_DIMS);
acl_tensor_ptr acl_self = ggml_cann_create_tensor(src0);
acl_tensor_ptr acl_other = ggml_cann_create_tensor(src1);
GGML_CANN_CALL_ACLNN_OP(ctx, EqTensor, acl_self.get(), acl_other.get(), acl_eq.get());
// Sum the 0/1 values into dst.
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
int64_t dims[4] = { 0, 1, 2, 3 };
acl_int_array_ptr dims_arr = ggml_cann_create_int_array(dims, 4);
GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_eq.get(), dims_arr.get(), true,
ggml_cann_type_mapping(dst->type), acl_dst.get());
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceEqTensor, acl_self.get(), acl_other.get());
ggml_cann_sum(ctx, dst);
}
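// Reference semantics of COUNT_EQUAL that both variants above implement (illustrative
// sketch assuming int32 inputs and a scalar int64 output): the result is the number of
// positions where the two inputs match, i.e. a sum of 0/1 comparisons.
static int64_t count_equal_ref(const int32_t * a, const int32_t * b, int64_t n) {
    int64_t count = 0;
    for (int64_t i = 0; i < n; i++) {
        count += (a[i] == b[i]) ? 1 : 0;
    }
    return count;
}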
void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
@@ -3529,27 +3306,6 @@ void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
GGML_CANN_CALL_ACLNN_OP(ctx, GtScalar, acl_src.get(), alpha.get(), acl_dst.get());
}
void ggml_cann_softplus(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0];
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
float beta_val = 1.0f;
float threshold_val = 20.0f;
acl_scalar_ptr beta = ggml_cann_create_scalar(&beta_val, ACL_FLOAT);
acl_scalar_ptr threshold = ggml_cann_create_scalar(&threshold_val, ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, Softplus, acl_src.get(), beta.get(), threshold.get(), acl_dst.get());
}
void ggml_cann_geglu_quick(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
auto gelu_quick_fn = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
};
ggml_cann_op_unary_gated(gelu_quick_fn, ctx, dst);
}
/**
* @brief Performs expert-specific matrix multiplication (MoE) with
* floating-point precision using the CANN backend.
@@ -4136,65 +3892,46 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst
}
static void ggml_cann_out_prod_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0]; // weight [ne00=m, ne01=K, ne02, ne03]
ggml_tensor * src1 = dst->src[1]; // input [ne10=n, ne11=K, ne12, ne13]
ggml_tensor * src0 = dst->src[0]; // weight
ggml_tensor * src1 = dst->src[1]; // input
GGML_TENSOR_BINARY_OP_LOCALS
// dst[i,j] = sum_k src0[i,k] * src1[j,k] i.e. dst = src0 @ src1^T.
//
// ggml_cann_create_tensor reverses dimension order, so ACL sees:
// acl_src0 slice: ggml[m,K] -> ACL[K,m]
// acl_src1 slice: ggml[n,K] -> ACL[K,n]
// acl_dst slice: ggml[m,n] -> ACL[n,m]
//
// Build a transposed view of src1 by swapping ne[0]/ne[1]:
// src1_t: ggml[K,n] (swapped strides) -> ACL[n,K]
//
// Matmul(src1_t [n,K], src0 [K,m]) = [n,m] = acl_dst ✓
//
// The outer batch loop is kept because src0 may have fewer batch slices than
// dst (ne02 <= ne2, ne03 <= ne3): this is a strided-broadcast not supported
// by standard CANN Matmul broadcasting.
const aclDataType src0_acl_type = ggml_cann_type_mapping(src0->type);
const aclDataType src1_acl_type = ggml_cann_type_mapping(src1->type);
const aclDataType dst_acl_type = ggml_cann_type_mapping(dst->type);
const size_t src0_type_sz = ggml_type_size(src0->type);
const size_t src1_type_sz = ggml_type_size(src1->type);
const size_t dst_type_sz = ggml_type_size(dst->type);
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, acl_dst.get());
const int64_t dps2 = ne2 / ne02;
const int64_t dps3 = ne3 / ne03;
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) {
const int64_t i02 = i2 / dps2;
const int64_t i03 = i3 / dps3;
// src0 2D slice at [i02, i03]: ggml [m, K] -> ACL [K, m]
int64_t src0_ne[2] = { ne00, ne01 };
size_t src0_nb[2] = { nb00, nb01 };
acl_tensor_ptr acl_src0_s = ggml_cann_create_tensor(
(char *) src0->data + i02 * nb02 + i03 * nb03,
src0_acl_type, src0_type_sz, src0_ne, src0_nb, 2);
const int64_t i12 = i2;
const int64_t i13 = i3;
acl_tensor_ptr accumulator =
ggml_cann_create_tensor((char *) dst->data + i2 * nb2 + i3 * nb3, ggml_cann_type_mapping(dst->type),
ggml_type_size(dst->type), dst->ne, dst->nb, 2);
// src1 transposed 2D slice at [i2, i3]: swap ne/nb -> ggml[K,n] -> ACL[n,K]
int64_t src1_t_ne[2] = { ne11, ne10 };
size_t src1_t_nb[2] = { nb11, nb10 };
acl_tensor_ptr acl_src1_t = ggml_cann_create_tensor(
(char *) src1->data + i2 * nb12 + i3 * nb13,
src1_acl_type, src1_type_sz, src1_t_ne, src1_t_nb, 2);
// Accumulate the rank-1 outer products over the shared K dimension (i1).
for (int64_t i1 = 0; i1 < ne11; i1++) {
acl_tensor_ptr acl_input = ggml_cann_create_tensor(
(char *) src1->data + i1 * nb11 + i12 * nb12 + i13 * nb13, ggml_cann_type_mapping(src0->type),
ggml_type_size(src0->type), src1->ne, src1->nb, 1);
// dst 2D slice at [i2, i3]: ggml [m, n] -> ACL [n, m]
int64_t dst_ne[2] = { ne0, ne1 };
size_t dst_nb[2] = { nb0, nb1 };
acl_tensor_ptr acl_dst_s = ggml_cann_create_tensor(
(char *) dst->data + i2 * nb2 + i3 * nb3,
dst_acl_type, dst_type_sz, dst_ne, dst_nb, 2);
acl_tensor_ptr acl_weight = ggml_cann_create_tensor(
(char *) src0->data + i1 * nb01 + i02 * nb02 + i03 * nb03, ggml_cann_type_mapping(src0->type),
ggml_type_size(src0->type), src0->ne, src0->nb, 1);
// Matmul(src1_t [n,K], src0 [K,m]) = [n,m] = acl_dst_s ✓
GGML_CANN_CALL_ACLNN_OP(ctx, Matmul,
acl_src1_t.get(), acl_src0_s.get(), acl_dst_s.get(), (int8_t) 1);
ggml_cann_pool_alloc output_allocator(ctx.pool());
void * output_buffer = output_allocator.alloc(ggml_nbytes(dst));
acl_tensor_ptr acl_out = ggml_cann_create_tensor(output_buffer, ggml_cann_type_mapping(dst->type),
ggml_type_size(dst->type), dst->ne, dst->nb, 2);
GGML_CANN_CALL_ACLNN_OP(ctx, Ger, acl_input.get(), acl_weight.get(), acl_out.get());
float alpha_value = 1.0f;
aclScalar * alpha = aclCreateScalar(&alpha_value, ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, accumulator.get(), acl_out.get(), alpha);
}
}
}
}
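// Reference semantics both variants above implement, for one batch slice (illustrative
// sketch, row-major ggml layout, helper name hypothetical): dst = src0 @ src1^T, i.e.
//     dst[i0=i, i1=j] = sum_k src0[i0=i, i1=k] * src1[i0=j, i1=k]
// with i over ne00 (m), j over ne10 (n) and k over ne01 == ne11 (K).
static void out_prod_slice_ref(const float * src0, const float * src1, float * dst,
                               int64_t m, int64_t n, int64_t K) {
    for (int64_t j = 0; j < n; j++) {
        for (int64_t i = 0; i < m; i++) {
            float acc = 0.0f;
            for (int64_t k = 0; k < K; k++) {
                acc += src0[k*m + i] * src1[k*n + j];
            }
            dst[j*m + i] = acc;
        }
    }
}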
@@ -4433,4 +4170,3 @@ void ggml_cann_gated_linear_attn(ggml_backend_cann_context & ctx, ggml_tensor *
}
}
}

View File

@@ -32,9 +32,6 @@
#include <aclnnop/aclnn_cat.h>
#include <aclnnop/aclnn_clamp.h>
#include <aclnnop/aclnn_cos.h>
#include <aclnnop/aclnn_cumsum.h>
#include <aclnnop/aclnn_tril.h>
#include <aclnnop/aclnn_triu.h>
#include <aclnnop/aclnn_exp.h>
#include <aclnnop/aclnn_gelu.h>
#include <aclnnop/aclnn_gelu_v2.h>
@@ -50,9 +47,6 @@
#include <aclnnop/aclnn_sign.h>
#include <aclnnop/aclnn_silu.h>
#include <aclnnop/aclnn_sin.h>
#include <aclnnop/aclnn_softplus.h>
#include <aclnnop/aclnn_swi_glu.h>
#include <aclnnop/aclnn_geglu.h>
#include <aclnnop/aclnn_slice.h>
#include <aclnnop/aclnn_sqrt.h>
#include <aclnnop/aclnn_tanh.h>
@@ -75,9 +69,6 @@
*/
void ggml_cann_repeat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
void ggml_cann_swiglu(ggml_backend_cann_context & ctx, ggml_tensor * dst);
void ggml_cann_geglu(ggml_backend_cann_context & ctx, ggml_tensor * dst, int64_t approximate);
/**
* @brief Applies the Leaky ReLU activation function to a tensor using the CANN
* backend.
@@ -334,48 +325,6 @@ void ggml_cann_sum_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);
void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst);
/**
* @brief Computes the cumulative sum of a ggml tensor along dim 0 using the
* CANN backend.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor. dst->op is `GGML_OP_CUMSUM`.
*/
void ggml_cann_cumsum(ggml_backend_cann_context & ctx, ggml_tensor * dst);
/**
* @brief Computes a triangular mask (tril/triu) of a square ggml tensor
* using the CANN backend.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor. dst->op is `GGML_OP_TRI`.
*/
void ggml_cann_tri(ggml_backend_cann_context & ctx, ggml_tensor * dst);
/**
* @brief Solves a triangular linear system AX=B using the CANN backend.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor. dst->op is `GGML_OP_SOLVE_TRI`.
*/
void ggml_cann_solve_tri(ggml_backend_cann_context & ctx, ggml_tensor * dst);
/**
* @brief Creates a diagonal matrix from a vector using the CANN backend.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor. dst->op is `GGML_OP_DIAG`.
*/
void ggml_cann_diag(ggml_backend_cann_context & ctx, ggml_tensor * dst);
/**
* @brief Fills a tensor with a constant scalar value using the CANN backend.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor. dst->op is `GGML_OP_FILL`.
*/
void ggml_cann_fill(ggml_backend_cann_context & ctx, ggml_tensor * dst);
/**
* @brief Upsamples a ggml tensor using nearest neighbor interpolation using
* the CANN backend.
@@ -512,9 +461,6 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context & ctx, ggml_tensor *
// @see ggml_cann_dup.
void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst);
// @see ggml_cann_acc, but copies src1 into dst instead of adding.
void ggml_cann_set(ggml_backend_cann_context & ctx, ggml_tensor * dst);
/**
* @brief Computes the softmax activation with optional masking.
*
@@ -867,8 +813,6 @@ void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst);
* dst->op is expected to be `GGML_OP_STEP`.
*/
void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst);
void ggml_cann_softplus(ggml_backend_cann_context & ctx, ggml_tensor * dst);
void ggml_cann_geglu_quick(ggml_backend_cann_context & ctx, ggml_tensor * dst);
/**
* @brief Performs the Flash Attention extended operator using the CANN backend.

View File

@@ -1428,22 +1428,6 @@ static bool ggml_backend_cann_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
return false;
}
/**
* @brief Set a region of a tensor's device memory to a specified value.
*
* @param buffer The CANN buffer containing the tensor.
* @param tensor Pointer to the tensor whose memory will be set.
* @param value The value to which each byte in the region will be set.
* @param offset Byte offset within the tensor's data to start setting.
* @param size Number of bytes to set.
*/
static void ggml_backend_cann_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
ggml_cann_set_device(ctx->device);
ACL_CHECK(aclrtMemset((char *) tensor->data + offset, size, value, size));
}
/**
* @brief Clear a CANN buffer by setting all its memory to a specified value.
*
@@ -1470,7 +1454,7 @@ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
/* .free_buffer = */ ggml_backend_cann_buffer_free_buffer,
/* .get_base = */ ggml_backend_cann_buffer_get_base,
/* .init_tensor = */ ggml_backend_cann_buffer_init_tensor,
/* .memset_tensor = */ ggml_backend_cann_buffer_memset_tensor,
/* .memset_tensor = */ NULL,
/* .set_tensor = */ ggml_backend_cann_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cann_buffer_get_tensor,
/* .set_tensor_2d = */ NULL,
@@ -1851,9 +1835,6 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg
case GGML_UNARY_OP_STEP:
ggml_cann_step(ctx, dst);
break;
case GGML_UNARY_OP_SOFTPLUS:
ggml_cann_softplus(ctx, dst);
break;
default:
return false;
}
@@ -1864,16 +1845,20 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg
GGML_CANN_CALL_OP_UNARY_GATED(Relu);
break;
case GGML_GLU_OP_GEGLU:
ggml_cann_geglu(ctx, dst, 0); // approximate=0 → tanh
break;
case GGML_GLU_OP_GEGLU_ERF:
ggml_cann_geglu(ctx, dst, 1); // approximate=1 → erf
// aclnnGelu internally uses the erf-based approximation.
GGML_CANN_CALL_OP_UNARY_GATED(Gelu);
break;
case GGML_GLU_OP_SWIGLU:
ggml_cann_swiglu(ctx, dst);
GGML_CANN_CALL_OP_UNARY_GATED(Silu);
break;
case GGML_GLU_OP_GEGLU_QUICK:
ggml_cann_geglu_quick(ctx, dst);
{
auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
};
ggml_cann_op_unary_gated(lambda, ctx, dst);
}
break;
default:
return false;
@@ -1935,9 +1920,6 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg
case GGML_OP_CPY:
ggml_cann_cpy(ctx, dst);
break;
case GGML_OP_SET:
ggml_cann_set(ctx, dst);
break;
case GGML_OP_CONT:
ggml_cann_dup(ctx, dst);
break;
@@ -2007,21 +1989,6 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg
case GGML_OP_SSM_CONV:
ggml_cann_ssm_conv(ctx, dst);
break;
case GGML_OP_CUMSUM:
ggml_cann_cumsum(ctx, dst);
break;
case GGML_OP_TRI:
ggml_cann_tri(ctx, dst);
break;
case GGML_OP_FILL:
ggml_cann_fill(ctx, dst);
break;
case GGML_OP_DIAG:
ggml_cann_diag(ctx, dst);
break;
case GGML_OP_SOLVE_TRI:
ggml_cann_solve_tri(ctx, dst);
break;
default:
return false;
}
@@ -2357,7 +2324,6 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
if (use_cann_graph) {
// If no matching graph is found, the graph needs to be recaptured.
graph_capture_required = !cann_ctx->graph_lru_cache.find_and_move_to_front(cgraph);
if (graph_capture_required) {
// If no matching graph is found, add a new ACL graph.
ggml_cann_graph * new_graph = ggml_cann_graph::create_from_cgraph(cgraph);
@@ -2416,7 +2382,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
case GGML_UNARY_OP_SGN:
case GGML_UNARY_OP_STEP:
case GGML_UNARY_OP_GELU_ERF:
case GGML_UNARY_OP_SOFTPLUS:
return true;
default:
return false;
@@ -2607,7 +2572,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT:
case GGML_OP_ACC:
case GGML_OP_SET:
case GGML_OP_GROUP_NORM:
return true;
case GGML_OP_PAD:
@@ -2685,16 +2649,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
}
case GGML_OP_SSM_CONV:
return true;
case GGML_OP_CUMSUM:
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_TRI:
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_FILL:
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_DIAG:
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_SOLVE_TRI:
return op->src[0]->type == GGML_TYPE_F32;
default:
return false;
}
@@ -2746,8 +2700,8 @@ static const ggml_backend_i ggml_backend_cann_interface = {
/* .free = */ ggml_backend_cann_free,
/* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
/* .synchronize = */ ggml_backend_cann_synchronize,
/* .graph_plan_create = */ NULL,

View File

@@ -485,13 +485,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
if (GGML_RV_ZIHINTPAUSE)
string(APPEND MARCH_STR "_zihintpause")
endif()
if (GGML_CPU_RISCV64_SPACEMIT)
# `xsmtvdotii' is only required for GCC >= 15.
if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND
CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 15)
string(APPEND MARCH_STR "_xsmtvdotii")
endif()
endif()
list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
else()

View File

@@ -2005,12 +2005,12 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v
const int lda = KB * sizeof(TA);
//const int ldb = KB * sizeof(TB);
alignas(64) static thread_local packed_B_t Tile0[TILE_N * TILE_K];
alignas(64) static thread_local packed_B_t Tile1[TILE_N * TILE_K];
alignas(64) static thread_local int8_t Tile23[TILE_M * TILE_K];
static thread_local packed_B_t Tile0[TILE_N * TILE_K];
static thread_local packed_B_t Tile1[TILE_N * TILE_K];
static thread_local int8_t Tile23[TILE_M * TILE_K];
alignas(64) static thread_local int32_t TileC0[TILE_M * TILE_N * 4];
alignas(64) static thread_local int32_t TileC1[TILE_M * TILE_N * 4];
static thread_local int32_t TileC0[TILE_M * TILE_N * 4];
static thread_local int32_t TileC1[TILE_M * TILE_N * 4];
// double buffering C to interleave avx512 and amx
int32_t * C_cur = TileC0;
@@ -2187,21 +2187,21 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v
const int m1 = std::max(M - TILE_M, 0);
//const int lda = KB * sizeof(TA);
alignas(64) static thread_local int8_t Tile0[TILE_N * TILE_K];
alignas(64) static thread_local int8_t Tile1[TILE_N * TILE_K];
alignas(64) static thread_local int8_t Tile23[TILE_M * TILE_K];
static thread_local int8_t Tile0[TILE_N * TILE_K];
static thread_local int8_t Tile1[TILE_N * TILE_K];
static thread_local int8_t Tile23[TILE_M * TILE_K];
// mat mul result for each group
alignas(64) static thread_local int32_t Tile4[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Tile5[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Tile6[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Tile7[TILE_M * TILE_N];
static thread_local int32_t Tile4[TILE_M * TILE_N];
static thread_local int32_t Tile5[TILE_M * TILE_N];
static thread_local int32_t Tile6[TILE_M * TILE_N];
static thread_local int32_t Tile7[TILE_M * TILE_N];
// sum of each QK_K block, contains 8 groups, int32
alignas(64) static thread_local int32_t Sumi4[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Sumi5[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Sumi6[TILE_M * TILE_N];
alignas(64) static thread_local int32_t Sumi7[TILE_M * TILE_N];
static thread_local int32_t Sumi4[TILE_M * TILE_N];
static thread_local int32_t Sumi5[TILE_M * TILE_N];
static thread_local int32_t Sumi6[TILE_M * TILE_N];
static thread_local int32_t Sumi7[TILE_M * TILE_N];
const int k_group_size = std::is_same<TB, block_q6_K>::value ? 16 : 32;
for (int i = 0; i < KB; ++i) {

View File

@@ -83,6 +83,7 @@
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
// quants.c
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4

View File

@@ -151,6 +151,8 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
const block_q1_0 * GGML_RESTRICT x = vx;
const block_q8_0 * GGML_RESTRICT y = vy;
float sumf = 0.0f;
#if defined(__ARM_NEON)
float32x4_t sumv = vdupq_n_f32(0.0f);
@@ -210,13 +212,31 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
}
*s = vaddvq_f32(sumv);
sumf = vaddvq_f32(sumv);
#else
UNUSED(nb);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
// Scalar fallback
for (int i = 0; i < nb; i++) {
const float d0 = GGML_FP16_TO_FP32(x[i].d);
// Process 4 Q8_0 blocks
for (int k = 0; k < 4; k++) {
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
int sumi = 0;
for (int j = 0; j < QK8_0; j++) {
const int bit_index = k * QK8_0 + j;
const int byte_index = bit_index / 8;
const int bit_offset = bit_index % 8;
const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
sumi += xi * y[i*4 + k].qs[j];
}
sumf += d0 * d1 * sumi;
}
}
#endif
*s = sumf;
}

View File

@@ -5023,71 +5023,6 @@ void ggml_gemm_q8_0_4x8_q8_0(int n,
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
if (svcntb() * 8 == 256) {
const block_q8_0x4 * b_ptr_base = (const block_q8_0x4 *) vx;
static const uint32_t idx_arr[8] = {0, 1, 4, 5, 2, 3, 6, 7};
svuint32_t idx = svld1(svptrue_b32(), idx_arr);
static const uint32_t idx_arr1[8] = {0, 1, 2, 3, 1, 2, 3, 0};
svuint32_t idx_sc1 = svld1(svptrue_b32(), idx_arr1);
static const uint32_t idx_arr2[8] = {0, 1, 2, 3, 0, 1, 2, 3};
svuint32_t idx_sc2 = svld1(svptrue_b32(), idx_arr2);
for (int y = 0; y < nr; y += 4) {
const block_q8_0x4 * a_ptr_base = (const block_q8_0x4 *) vy + (y / 4) * nb;
for (int x = 0; x < nc; x += ncols_interleaved) {
const block_q8_0x4 * b_ptr = b_ptr_base + (x / 4) * nb;
const block_q8_0x4 * a_ptr = a_ptr_base;
svfloat32_t acc_f32_01 = svdup_f32(0);
svfloat32_t acc_f32_23 = svdup_f32(0);
for (int b = 0; b < nb; b++) {
svint32_t acc_01 = svdup_s32(0);
svint32_t acc_23 = svdup_s32(0);
// Process 4 chunks of 8 positions each
for (int chunk = 0; chunk < 4; chunk++) {
svint8_t s_a01 = svld1rq_s8(svptrue_b8(), a_ptr->qs + chunk * 32);
svint8_t s_a23 = svld1rq_s8(svptrue_b8(), a_ptr->qs + chunk * 32 + 16);
svint8_t s_b0123 = svld1_s8(svptrue_b8(), b_ptr->qs + chunk * 32);
acc_01 = svmmla_s32(acc_01, s_a01, s_b0123);
acc_23 = svmmla_s32(acc_23, s_a23, s_b0123);
}
// Reorder outputs from 2×2 tiles to row-major
// acc[01] = [r0c0, r0c1, r1c0, r1c1, r0c2, r0c3, r1c2, r1c3]
// acc[23] = [r2c0, r2c1, r3c0, r3c1, r2c2, r2c3, r3c2, r3c3]
svint32_t row01 = svtbl_s32(acc_01, idx);
svint32_t row23 = svtbl_s32(acc_23, idx);
svfloat16_t temp1 = svld1_f16(svptrue_pat_b16(SV_VL4), (const __fp16 *) a_ptr->d);
svfloat16_t temp2 = svld1_f16(svptrue_pat_b16(SV_VL4), (const __fp16 *) b_ptr->d);
svfloat32_t sv_a_d = svtbl_f32(svcvt_f32_f16_x(svptrue_b32(), svzip1_f16(temp1, temp1)), idx_sc1);
svfloat32_t sv_b_d = svtbl_f32(svcvt_f32_f16_x(svptrue_b32(), svzip1_f16(temp2, temp2)), idx_sc2);
acc_f32_01 = svmla_f32_x(svptrue_b32(), acc_f32_01, svcvt_f32_s32_x(svptrue_b32(), row01), svmul_lane_f32(sv_b_d, sv_a_d, 0));
acc_f32_23 = svmla_f32_x(svptrue_b32(), acc_f32_23, svcvt_f32_s32_x(svptrue_b32(), row23), svmul_lane_f32(sv_b_d, sv_a_d, 2));
a_ptr++;
b_ptr++;
}
svbool_t pg4 = svptrue_pat_b32(SV_VL4);
svst1_f32(pg4, s + (y+0) * bs + x, acc_f32_01);
svst1_f32(pg4, s + (y+1) * bs + x, svext_f32(acc_f32_01, acc_f32_01, 4));
svst1_f32(pg4, s + (y+2) * bs + x, acc_f32_23);
svst1_f32(pg4, s + (y+3) * bs + x, svext_f32(acc_f32_23, acc_f32_23, 4));
}
}
return;
}
#endif // SVE compile-time end
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
const block_q8_0x4 * b_ptr_base = (const block_q8_0x4 *) vx;

View File

@@ -274,18 +274,6 @@ static inline __m256 quad_mx_delta_float(const uint8_t x0, const float y0, const
}
#endif
#elif defined(__SSSE3__)
static inline __m128i bytes_from_bits_16(const uint8_t * x) {
uint16_t x16;
memcpy(&x16, x, sizeof(uint16_t));
const __m128i shuf_mask = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
__m128i bytes = _mm_shuffle_epi8(_mm_set1_epi16((short) x16), shuf_mask);
const __m128i bit_mask = _mm_set_epi64x(0x7fbfdfeff7fbfdfe, 0x7fbfdfeff7fbfdfe);
bytes = _mm_or_si128(bytes, bit_mask);
return _mm_cmpeq_epi8(bytes, _mm_set1_epi64x(-1));
}
// horizontally add 4x4 floats
static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
__m128 res_0 =_mm_hadd_ps(a, b);
@@ -552,152 +540,6 @@ static inline __m128i get_scale_shuffle(int i) {
}
#endif
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK1_0;
const int nb = n / qk;
assert(n % qk == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q1_0 * GGML_RESTRICT x = vx;
const block_q8_0 * GGML_RESTRICT y = vy;
#if defined(__AVX2__)
const __m256i ones_8 = _mm256_set1_epi8(1);
const __m256i ones_16 = _mm256_set1_epi16(1);
const __m256i byte_shuf = _mm256_setr_epi8(
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3);
const __m256i bit_masks = _mm256_setr_epi8(
1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128,
1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128);
const __m256i zero = _mm256_setzero_si256();
__m256 acc = _mm256_setzero_ps();
for (int ib = 0; ib < nb; ++ib) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
const uint32_t * GGML_RESTRICT qs32 = (const uint32_t *) x[ib].qs;
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
__m256 acc_block;
{
const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[0].qs);
const __m256i sm = _mm256_cmpeq_epi8(
_mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[0]), byte_shuf), bit_masks), zero);
const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm);
const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16);
acc_block = _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[0].d)), _mm256_cvtepi32_ps(s32));
}
for (int K = 1; K < 4; ++K) {
const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[K].qs);
const __m256i sm = _mm256_cmpeq_epi8(
_mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[K]), byte_shuf), bit_masks), zero);
const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm);
const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16);
acc_block = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), _mm256_cvtepi32_ps(s32), acc_block);
}
acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc);
}
*s = hsum_float_8(acc);
#elif defined(__AVX__)
const __m128i ones_8 = _mm_set1_epi8(1);
const __m128i ones_16 = _mm_set1_epi16(1);
const __m128i zero = _mm_setzero_si128();
__m256 acc = _mm256_setzero_ps();
for (int ib = 0; ib < nb; ++ib) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
__m256 acc_block;
{
const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[0]);
const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[0].qs[0]);
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[0].qs[16]);
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero);
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero);
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0);
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1);
const __m128i sum16_0 = _mm_maddubs_epi16(ones_8, sy_0);
const __m128i sum16_1 = _mm_maddubs_epi16(ones_8, sy_1);
const __m128i sum32_0 = _mm_madd_epi16(sum16_0, ones_16);
const __m128i sum32_1 = _mm_madd_epi16(sum16_1, ones_16);
const __m256 q = _mm256_cvtepi32_ps(MM256_SET_M128I(sum32_1, sum32_0));
acc_block = _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[0].d)), q);
}
for(int K = 1; K < 4; ++K) {
const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[(K) * 4]);
const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[(K)].qs[0]);
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[(K)].qs[16]);
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero);
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero);
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0);
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1);
const __m128i sum16_0 = _mm_maddubs_epi16(ones_8, sy_0);
const __m128i sum16_1 = _mm_maddubs_epi16(ones_8, sy_1);
const __m128i sum32_0 = _mm_madd_epi16(sum16_0, ones_16);
const __m128i sum32_1 = _mm_madd_epi16(sum16_1, ones_16);
const __m256 q = _mm256_cvtepi32_ps(MM256_SET_M128I(sum32_1, sum32_0));
acc_block = _mm256_add_ps(acc_block, _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[(K)].d)), q));
}
#undef Q1_AVX_BLOCK
acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_set1_ps(d0), acc_block));
}
*s = hsum_float_8(acc);
#elif defined(__SSSE3__)
const __m128i ones_8 = _mm_set1_epi8(1);
const __m128i ones_16 = _mm_set1_epi16(1);
const __m128i zero = _mm_setzero_si128();
__m128 acc_0 = _mm_setzero_ps();
__m128 acc_1 = _mm_setzero_ps();
__m128 acc_2 = _mm_setzero_ps();
__m128 acc_3 = _mm_setzero_ps();
for (int ib = 0; ib < nb; ++ib) {
const __m128 d0 = _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
#define Q1_SSSE3_BLOCK(QS_OFF, Y_IDX, ACC) \
{ \
const __m128i bit_mask_0 = bytes_from_bits_16(&x[ib].qs[(QS_OFF) + 0]); \
const __m128i bit_mask_1 = bytes_from_bits_16(&x[ib].qs[(QS_OFF) + 2]); \
const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y_ptr[(Y_IDX)].qs[0]); \
const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y_ptr[(Y_IDX)].qs[16]); \
const __m128i sign_mask_0 = _mm_cmpeq_epi8(bit_mask_0, zero); \
const __m128i sign_mask_1 = _mm_cmpeq_epi8(bit_mask_1, zero); \
const __m128i sy_0 = _mm_sub_epi8(_mm_xor_si128(qy_0, sign_mask_0), sign_mask_0); \
const __m128i sy_1 = _mm_sub_epi8(_mm_xor_si128(qy_1, sign_mask_1), sign_mask_1); \
const __m128i sum_0 = _mm_madd_epi16(_mm_maddubs_epi16(ones_8, sy_0), ones_16); \
const __m128i sum_1 = _mm_madd_epi16(_mm_maddubs_epi16(ones_8, sy_1), ones_16); \
const __m128 q = _mm_cvtepi32_ps(_mm_add_epi32(sum_0, sum_1)); \
(ACC) = _mm_add_ps((ACC), _mm_mul_ps(_mm_mul_ps(d0, _mm_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[(Y_IDX)].d))), q)); \
}
Q1_SSSE3_BLOCK(0, 0, acc_0)
Q1_SSSE3_BLOCK(4, 1, acc_1)
Q1_SSSE3_BLOCK(8, 2, acc_2)
Q1_SSSE3_BLOCK(12, 3, acc_3)
#undef Q1_SSSE3_BLOCK
}
*s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
#else
UNUSED(nb);
UNUSED(x);
UNUSED(y);
ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
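// The sign-select trick shared by the AVX2/AVX/SSSE3 paths above (illustrative sketch,
// helper name hypothetical): with mask = 0xFF when the weight bit is 0 and 0x00 when it
// is 1, (q ^ mask) - mask equals q for bit 1 and -q for bit 0 (two's complement negate),
// turning the 1-bit weights into +/-1 factors without a multiply.
static inline int8_t apply_q1_sign_ref(int8_t q, int bit) {
    const int8_t mask = bit ? 0 : (int8_t) 0xFF;
    return (int8_t) ((q ^ mask) - mask);
}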
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;
@@ -2300,8 +2142,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
#if defined __AVX2__
const __m256i m3 = _mm256_set1_epi8(3);
const __m256i m15 = _mm256_set1_epi8(15);
const __m256i m4 = _mm256_set1_epi8(0xF);
const __m256i m2 = _mm256_set1_epi8(3);
const __m256i m32s = _mm256_set1_epi8(32);
__m256 acc = _mm256_setzero_ps();
@@ -2313,39 +2156,13 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
const __m256i scales_16 = _mm256_cvtepi8_epi16(scales);
const __m256i q8sclsub = _mm256_slli_epi32(_mm256_madd_epi16(q8sums, scales_16), 5);
__m256i sumi = _mm256_setzero_si256();
int is = 0;
for (int j = 0; j < QK_K/128; ++j) {
const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32;
const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m3), 4);
const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, _mm256_set1_epi8(12)), 2);
const __m256i q4h_2 = _mm256_and_si256(q4bitsH, _mm256_set1_epi8(48));
const __m256i q4h_3 = _mm256_srli_epi16(_mm256_and_si256(q4bitsH, _mm256_set1_epi8(-64)), 2);
const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m15), q4h_0);
const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m15), q4h_1);
const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m15), q4h_2);
const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m15), q4h_3);
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
__m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0);
__m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1);
__m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2);
__m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3);
const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
@@ -2353,6 +2170,40 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
is += 4;
const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32;
const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4);
const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4);
const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4);
const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4);
const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1);
const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2);
const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3);
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
__m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0);
__m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1);
__m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2);
__m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3);
__m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0);
__m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1);
__m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2);
__m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3);
p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0);
p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1);
p16_2 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_2), p16_2);
@@ -2363,7 +2214,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
sumi = _mm256_sub_epi32(sumi, q8sclsub);
acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
}
View File
@@ -195,8 +195,8 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
/* .free = */ ggml_backend_cpu_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
View File
@@ -2321,9 +2321,6 @@ class tinyBLAS_Q0_PPC {
}
void matmul(int64_t m, int64_t n) {
#if defined(_AIX) || defined(__BIG_ENDIAN__)
mnpack(0, m, 0, n);
#else
const int64_t mc = 64;
const int64_t kc = 64;
int64_t nc = 64;
@@ -2337,6 +2334,7 @@ class tinyBLAS_Q0_PPC {
} else {
n_aligned = (n / 64) * 64;
}
if (n_aligned > 0) {
if (n_aligned % 64 == 0) nc = 64;
else if (n_aligned == n) nc = n;
@@ -2354,7 +2352,6 @@ class tinyBLAS_Q0_PPC {
} else {
mnpack(0, m, 0, n);
}
#endif
}
private:
@@ -3194,16 +3191,12 @@ class tinyBLAS_PPC {
}
void matmul(int64_t m, int64_t n) {
#if defined(_AIX) || defined(__BIG_ENDIAN__)
mnpack(0, m, 0, n);
#else
int64_t mc = 256; int64_t nc = 256; int64_t kc = 256;
if (m % mc == 0 && n % nc == 0 && k % kc == 0) {
matmul_tiled(m, n, mc, nc, kc);
} else {
mnpack(0, m, 0, n);
}
#endif
}
private:
View File
@@ -137,28 +137,22 @@ void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
float sumf = 0.0;
for (int i = 0; i < nb; i++) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d);
const float d0 = GGML_FP16_TO_FP32(x[i].d);
float sumi = 0.0f;
for (int k = 0; k < 4; k++) {
const block_q8_0 * GGML_RESTRICT yb = &y[i * 4 + k];
const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
int sumi_block = 0;
const uint8_t * GGML_RESTRICT bits = &x[i].qs[k * 4];
const int8_t * GGML_RESTRICT qy = yb->qs;
for (int j = 0; j < QK8_0; j++) {
const int bit_index = k * QK8_0 + j;
const int byte_index = bit_index / 8;
const int bit_offset = bit_index % 8;
for (int b = 0; b < 4; ++b, qy += 8) {
const unsigned mask = bits[b];
sumi_block += ((mask & 0x01) ? qy[0] : -qy[0])
+ ((mask & 0x02) ? qy[1] : -qy[1])
+ ((mask & 0x04) ? qy[2] : -qy[2])
+ ((mask & 0x08) ? qy[3] : -qy[3])
+ ((mask & 0x10) ? qy[4] : -qy[4])
+ ((mask & 0x20) ? qy[5] : -qy[5])
+ ((mask & 0x40) ? qy[6] : -qy[6])
+ ((mask & 0x80) ? qy[7] : -qy[7]);
const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
sumi_block += xi * y[i*4 + k].qs[j];
}
sumi += d1 * sumi_block;
View File
@@ -1036,12 +1036,12 @@ inline static float ggml_gelu_quick_f32(float x) {
return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
}
inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
const uint16_t * i16 = (const uint16_t *) x;
for (int i = 0; i < n; ++i) {
y[i] = ggml_table_gelu_quick_f16[i16[i]];
}
}
//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
// const uint16_t * i16 = (const uint16_t *) x;
// for (int i = 0; i < n; ++i) {
// y[i] = ggml_table_gelu_quick_f16[i16[i]];
// }
//}
#ifdef GGML_GELU_QUICK_FP16
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
@@ -1060,6 +1060,13 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float *
}
#endif
inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
float v = GGML_CPU_FP16_TO_FP32(x[i]);
y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
}
}
// Sigmoid Linear Unit (SiLU) function
inline static float ggml_silu_f32(float x) {
return x/(1.0f + expf(-x));
View File
@@ -830,18 +830,6 @@ static __device__ __forceinline__ float ggml_cuda_ue4m3_to_fp32(uint8_t x) {
#endif // defined(GGML_USE_HIP) && defined(CDNA3) && defined(FP8_AVAILABLE) && HIP_VERSION >= 60200000
}
static __device__ __forceinline__ uint8_t ggml_cuda_fp32_to_ue4m3(float x) {
#if defined(BLACKWELL_MMA_AVAILABLE) // This is used for NVFP4 subblock scale quantizations only
if (!(x > 0.0f)) {
return 0;
}
const __nv_fp8_e4m3 xf(x);
return xf.__x;
#else
NO_DEVICE_CODE; // Used only for NVFP4 Scales for Activations, only for Blackwell
#endif // defined(BLACKWELL_MMA_AVAILABLE)
}
__device__ __forceinline__ uint8_t ggml_cuda_float_to_fp4_e2m1(float x, float e) {
const uint8_t sign_bit = (x < 0.0f) << 3;
float ax = fabsf(x) * e;
View File
@@ -1,79 +1,96 @@
#include "concat.cuh"
// contiguous kernels
template <int dim>
static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont(const float * x,
const float * y,
float * dst,
int64_t ne00,
int64_t ne01,
int64_t ne02,
int64_t ne0,
int64_t ne1,
int64_t ne2) {
static_assert(dim >= 0 && dim <= 2, "dim must be in [0, 2]");
static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
}
const int64_t n = ne0 * ne1 * ne2;
int offset_dst =
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
for (int64_t i = (int64_t) blockIdx.x * blockDim.x + threadIdx.x; i < n; i += (int64_t) blockDim.x * gridDim.x) {
if constexpr (dim == 0) {
const int64_t row = i / ne0;
const int64_t i0 = i - row * ne0;
if (i0 < ne00) {
dst[i] = x[row * ne00 + i0];
} else {
dst[i] = y[row * (ne0 - ne00) + (i0 - ne00)];
}
} else if constexpr (dim == 1) {
const int64_t dst_plane = ne0 * ne1;
const int64_t src0_plane = ne0 * ne01;
const int64_t src1_plane = dst_plane - src0_plane;
const int64_t i2 = i / dst_plane;
const int64_t i01 = i - i2 * dst_plane;
if (i01 < src0_plane) {
dst[i] = x[i2 * src0_plane + i01];
} else {
dst[i] = y[i2 * src1_plane + (i01 - src0_plane)];
}
} else {
const int64_t src0_size = ne0 * ne1 * ne02;
if (i < src0_size) {
dst[i] = x[i];
} else {
dst[i] = y[i - src0_size];
}
}
if (nidx < ne00) { // src0
int offset_src =
nidx +
blockIdx.y * ne00 +
blockIdx.z * ne00 * gridDim.y;
dst[offset_dst] = x[offset_src];
} else {
int offset_src =
(nidx - ne00) +
blockIdx.y * (ne0 - ne00) +
blockIdx.z * (ne0 - ne00) * gridDim.y;
dst[offset_dst] = y[offset_src];
}
}
static void concat_f32_cuda(const float * x,
const float * y,
float * dst,
int64_t ne00,
int64_t ne01,
int64_t ne02,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int dim,
cudaStream_t stream) {
const int64_t n = ne0 * ne1 * ne2;
const int num_blocks = (n + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) {
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
}
int offset_dst =
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
if (blockIdx.y < (unsigned)ne01) { // src0
int offset_src =
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * ne01;
dst[offset_dst] = x[offset_src];
} else {
int offset_src =
nidx +
(blockIdx.y - ne01) * ne0 +
blockIdx.z * ne0 * (gridDim.y - ne01);
dst[offset_dst] = y[offset_src];
}
}
static __global__ void concat_f32_dim2(const float * x, const float * y, float * dst, const int ne0, const int ne02) {
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
}
int offset_dst =
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
if (blockIdx.z < (unsigned)ne02) { // src0
int offset_src =
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
dst[offset_dst] = x[offset_src];
} else {
int offset_src =
nidx +
blockIdx.y * ne0 +
(blockIdx.z - ne02) * ne0 * gridDim.y;
dst[offset_dst] = y[offset_src];
}
}
static void concat_f32_cuda(const float * x, const float * y, float * dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, cudaStream_t stream) {
int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
dim3 gridDim(num_blocks, ne1, ne2);
if (dim == 0) {
concat_f32_cont<0>
<<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
concat_f32_dim0<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne00);
return;
}
if (dim == 1) {
concat_f32_cont<1>
<<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
concat_f32_dim1<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne01);
return;
}
concat_f32_cont<2><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
concat_f32_dim2<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
}
// non-contiguous kernel (slow)
View File
@@ -66,9 +66,6 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 32, 128, 128, 128, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2, 32, 128, 128, 128, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 128, 2, 32, 128, 128, 128, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 256, 1, 32, 128, 128, 128, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 8, 64, 4, 32, 256, 256, 128, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16, 64, 4, 32, 256, 256, 128, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 128, 2, 32, 128, 128, 128, 1, false);
@@ -88,9 +85,6 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 64, 128, 128, 64, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2, 64, 128, 128, 64, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 128, 2, 32, 128, 128, 128, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 256, 1, 32, 128, 128, 128, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 8, 64, 4, 32, 96, 64, 128, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16, 64, 4, 32, 96, 64, 128, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 128, 2, 32, 128, 128, 128, 1, false);
@@ -124,9 +118,6 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 64, 128, 128, 64, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2, 64, 128, 128, 64, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 128, 2, 64, 160, 128, 64, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 128, 2, 64, 160, 128, 64, 2, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16, 64, 4, 32, 128, 128, 128, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 128, 2, 32, 128, 128, 128, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 64, 256, 1, 32, 128, 128, 128, 1, false);
@@ -1226,7 +1217,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
float KQ_max_scale[cols_per_thread];
#pragma unroll
for (int col = 0; col < cols_per_thread; ++col) {
const int jc = (threadIdx.y/np)*cols_per_warp + (cols_per_warp == 8 ? T_C_KQ::get_j(col) : T_C_KQ::get_i(2*col));
const int jc = cols_per_warp == 8 ? T_C_KQ::get_j(col) : T_C_KQ::get_i(2*col);
const float sink = sinks_f[jc % ncols2];
const float KQ_max_new = fmaxf(KQ_max[col], sink);
@@ -1834,10 +1825,6 @@ extern DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
extern DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);
// Mistral Small 4 (DKQ=320, DV=256), GQA=32-only build:
extern DECL_FATTN_MMA_F16_CASE(320, 256, 1, 32);
extern DECL_FATTN_MMA_F16_CASE(320, 256, 2, 32);
// For GLM 4.7 Flash
extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 4);
extern DECL_FATTN_MMA_F16_CASE(576, 512, 8, 4);
View File
@@ -38,10 +38,6 @@ void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor
GGML_ASSERT(V->ne[0] == K->ne[0]);
ggml_cuda_flash_attn_ext_tile_case<256, 256>(ctx, dst);
} break;
case 320: {
GGML_ASSERT(V->ne[0] == 256);
ggml_cuda_flash_attn_ext_tile_case<320, 256>(ctx, dst);
} break;
case 512: {
GGML_ASSERT(V->ne[0] == K->ne[0]);
ggml_cuda_flash_attn_ext_tile_case<512, 512>(ctx, dst);
View File
@@ -68,8 +68,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2, 64, 64)
@@ -130,8 +128,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2, 32, 64)
@@ -199,8 +195,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32, 128)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 512, 1, 128, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2, 64, 64)
@@ -270,8 +264,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 5, 32, 256)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 3, 64, 128)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 256, 2, 128, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 4, 64, 64)
@@ -1124,7 +1116,7 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_cuda_context & ctx, ggm
constexpr size_t nbytes_shared = 0;
#ifdef GGML_USE_HIP
if constexpr (DKQ <= 128) {
if constexpr (DV <= 128) {
if (Q->ne[1] > 32/ncols2) {
constexpr int cols_per_block = 64;
const int nwarps = ggml_cuda_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size;
@@ -1138,7 +1130,7 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_cuda_context & ctx, ggm
#endif // GGML_USE_HIP
#ifndef GGML_USE_HIP
if constexpr (DKQ <= 256)
if constexpr (DV <= 256)
#endif // GGML_USE_HIP
{
if (Q->ne[1] > 16/ncols2) {
@@ -1152,16 +1144,14 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_cuda_context & ctx, ggm
}
}
if constexpr (ncols2 <= 16) {
if (Q->ne[1] > 8/ncols2) {
constexpr int cols_per_block = 16;
const int nwarps = ggml_cuda_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size;
const int nbatch_fa = ggml_cuda_fattn_tile_get_nbatch_fa(DKQ, DV, cols_per_block, cc);
fattn_kernel_t fattn_kernel = flash_attn_tile<DKQ, DV, cols_per_block/ncols2, ncols2, use_logit_softcap>;
launch_fattn<DV, cols_per_block/ncols2, ncols2>
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, nbatch_fa, true, true, false, warp_size);
return;
}
if (Q->ne[1] > 8/ncols2) {
constexpr int cols_per_block = 16;
const int nwarps = ggml_cuda_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size;
const int nbatch_fa = ggml_cuda_fattn_tile_get_nbatch_fa(DKQ, DV, cols_per_block, cc);
fattn_kernel_t fattn_kernel = flash_attn_tile<DKQ, DV, cols_per_block/ncols2, ncols2, use_logit_softcap>;
launch_fattn<DV, cols_per_block/ncols2, ncols2>
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, nbatch_fa, true, true, false, warp_size);
return;
}
if constexpr (ncols2 <= 8) {
@@ -1220,25 +1210,6 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
const int gqa_limit = nvidia && gqa_ratio <= 4 && DV <= 256 ? 16 : INT_MAX;
const bool use_gqa_opt = mask && max_bias == 0.0f && Q->ne[1] <= gqa_limit && K->ne[1] % FATTN_KQ_STRIDE == 0;
if constexpr (DKQ == 320) {
// This branch is only used for Mistral Small 4, which has a GQA ratio of 32.
// On AMD, simply use that GQA ratio with 32 columns / block since there is always enough SRAM.
// On NVIDIA, however, the tile kernel is only used for GPUs that can't use the mma kernel (Pascal and older).
// Therefore, use a GQA ratio of 16 with 16 columns / block to stay below 48 kiB of SRAM / block.
#ifdef GGML_USE_HIP
if (use_gqa_opt && gqa_ratio % 32 == 0) {
launch_fattn_tile_switch_ncols1<DKQ, DV, 32, use_logit_softcap>(ctx, dst);
return;
}
#else
if (use_gqa_opt && gqa_ratio % 16 == 0) {
launch_fattn_tile_switch_ncols1<DKQ, DV, 16, use_logit_softcap>(ctx, dst);
return;
}
#endif // GGML_USE_HIP
GGML_ABORT("flash-attn tile (320/256): expected GQA ratio multiple of 32");
}
if constexpr (DKQ == 576) {
if (use_gqa_opt && gqa_ratio % 16 == 0) {
launch_fattn_tile_switch_ncols1<DKQ, DV, 16, use_logit_softcap>(ctx, dst);
@@ -1250,7 +1221,7 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
}
}
if constexpr (DKQ <= 512 && DKQ != 320) {
if constexpr (DKQ <= 512) {
if (use_gqa_opt && gqa_ratio % 8 == 0) {
launch_fattn_tile_switch_ncols1<DKQ, DV, 8, use_logit_softcap>(ctx, dst);
return;
@@ -1304,6 +1275,5 @@ extern DECL_FATTN_TILE_CASE( 96, 96);
extern DECL_FATTN_TILE_CASE(112, 112);
extern DECL_FATTN_TILE_CASE(128, 128);
extern DECL_FATTN_TILE_CASE(256, 256);
extern DECL_FATTN_TILE_CASE(320, 256);
extern DECL_FATTN_TILE_CASE(512, 512);
extern DECL_FATTN_TILE_CASE(576, 512);
View File
@@ -143,22 +143,6 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
GGML_ASSERT(V->ne[0] == 256);
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<256, 256>(ctx, dst);
break;
case 320:
// For Mistral Small 4, go straight to the ncols1 switch (ncols2=32-only build).
GGML_ASSERT(V->ne[0] == 256);
{
float max_bias = 0.0f;
memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));
const bool use_gqa_opt = mask && max_bias == 0.0f;
GGML_ASSERT(use_gqa_opt);
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
const int gqa_ratio = Q->ne[2] / K->ne[2];
GGML_ASSERT(gqa_ratio % 32 == 0);
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<320, 256, 32>(ctx, dst);
}
break;
case 512:
GGML_ASSERT(V->ne[0] == 512);
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<512, 512>(ctx, dst);
@@ -368,14 +352,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
return BEST_FATTN_KERNEL_NONE;
}
break;
case 320:
if (V->ne[0] != 256 || !gqa_opt_applies) {
return BEST_FATTN_KERNEL_NONE;
}
if (gqa_ratio % 32 != 0) {
return BEST_FATTN_KERNEL_NONE;
}
break;
case 512:
if (V->ne[0] != K->ne[0]) {
return BEST_FATTN_KERNEL_NONE;
View File
@@ -368,21 +368,15 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
}
~ggml_cuda_pool_leg() {
clear_pool();
GGML_ASSERT(pool_size == 0);
}
void clear_pool() {
ggml_cuda_set_device(device);
for (int i = 0; i < MAX_BUFFERS; ++i) {
ggml_cuda_buffer & b = buffer_pool[i];
if (b.ptr != nullptr) {
CUDA_CHECK(cudaFree(b.ptr));
pool_size -= b.size;
b.ptr = nullptr;
b.size = 0;
}
}
GGML_ASSERT(pool_size == 0);
}
void * alloc(size_t size, size_t * actual_size) override {
@@ -427,20 +421,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
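// over-allocate by 5% and round the request up to a multiple of 256 bytes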
size_t look_ahead_size = (size_t) (1.05 * size);
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
ggml_cuda_set_device(device);
cudaError_t err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
if (err == cudaErrorMemoryAllocation) {
(void)cudaGetLastError();
const size_t cached_bytes = pool_size;
GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, flushing %.2f MiB of cached buffers and retrying\n",
device, look_ahead_size/1024.0/1024.0, cached_bytes/1024.0/1024.0);
CUDA_CHECK(cudaDeviceSynchronize());
clear_pool();
err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
if (err == cudaSuccess) {
GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded\n", device);
}
}
CUDA_CHECK(err);
CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
*actual_size = look_ahead_size;
pool_size += look_ahead_size;
#ifdef DEBUG_CUDA_MALLOC
@@ -1222,13 +1203,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
// For small tensors, simply reduce them as FP32.
// The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
for (size_t i = 0; i < n_backends; ++i) {
if ((tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
ggml_cuda_set_device(cuda_ctx->device);
CUDA_CHECK(cudaMemsetAsync(tensors[i]->data, 0, ggml_nbytes(tensors[i]), cuda_ctx->stream()));
}
}
NCCL_CHECK(ncclGroupStart());
for (size_t i = 0; i < n_backends; ++i) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
@@ -1250,11 +1224,7 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
tmp[i].alloc(ne);
ggml_cuda_set_device(cuda_ctx->device);
if (tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) {
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
} else {
CUDA_CHECK(cudaMemsetAsync(tmp[i].get(), 0, ne * sizeof(nv_bfloat16), cuda_ctx->stream()));
}
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
CUDA_CHECK(cudaGetLastError());
}
@@ -3556,9 +3526,6 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
&& unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_SILU) {
const ggml_tensor * ssm_conv = cgraph->nodes[node_idx];
const ggml_tensor * silu = cgraph->nodes[node_idx+1];
if (ggml_get_unary_op(silu) != unary_ops.begin()[0]) {
return false;
}
if (ssm_conv->type != GGML_TYPE_F32 || silu->type != GGML_TYPE_F32) {
return false;
@@ -3567,31 +3534,6 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
return true;
}
if (ops.size() == 3 && ops.begin()[0] == GGML_OP_SSM_CONV && ops.begin()[1] == GGML_OP_ADD
&& ops.begin()[2] == GGML_OP_UNARY && unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_SILU) {
const ggml_tensor * ssm_conv = cgraph->nodes[node_idx];
const ggml_tensor * add = cgraph->nodes[node_idx+1];
const ggml_tensor * silu = cgraph->nodes[node_idx+2];
if (ggml_get_unary_op(silu) != unary_ops.begin()[0]) {
return false;
}
if (ssm_conv->type != GGML_TYPE_F32 || add->type != GGML_TYPE_F32 || silu->type != GGML_TYPE_F32) {
return false;
}
// ADD must consume ssm_conv's output and broadcast a 1-D channel-wise bias.
const ggml_tensor * bias = (add->src[0] == ssm_conv) ? add->src[1] : add->src[0];
if (bias->type != GGML_TYPE_F32 || !ggml_is_contiguous(bias)) {
return false;
}
if (ggml_nelements(bias) != ssm_conv->ne[0] || bias->ne[0] != ssm_conv->ne[0]) {
return false;
}
return true;
}
if (ops.size() == 2 && ops.begin()[0] == GGML_OP_UNARY && ops.begin()[1] == GGML_OP_MUL
&& unary_ops.size() == 1 && (unary_ops.begin()[0] == GGML_UNARY_OP_SILU || unary_ops.begin()[0] == GGML_UNARY_OP_SIGMOID || unary_ops.begin()[0] == GGML_UNARY_OP_SOFTPLUS)) {
const ggml_tensor * unary = cgraph->nodes[node_idx];
@@ -3620,30 +3562,6 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
return true;
}
if (ops.size() == 2 && ops.begin()[0] == GGML_OP_UNARY && ops.begin()[1] == GGML_OP_SQR
&& unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_RELU) {
const ggml_tensor * unary = cgraph->nodes[node_idx];
const ggml_tensor * sqr = cgraph->nodes[node_idx+1];
if (ggml_get_unary_op(unary) != GGML_UNARY_OP_RELU) {
return false;
}
if (unary->type != GGML_TYPE_F32 && unary->type != GGML_TYPE_F16) {
return false;
}
if (unary->type != sqr->type) {
return false;
}
if (!ggml_is_contiguous(unary->src[0])) {
return false;
}
return true;
}
if (ops.size() == 3 && ops.begin()[0] == GGML_OP_SCALE && ops.begin()[1] == GGML_OP_UNARY && ops.begin()[2] == GGML_OP_SCALE
&& unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_TANH) {
const ggml_tensor *scale = cgraph->nodes[node_idx];
@@ -3668,362 +3586,6 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
return false;
}
// try to fuse nodes and return the number of nodes to skip
static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, int i) {
static bool disable_fusion = getenv("GGML_CUDA_DISABLE_FUSION") != nullptr && std::atoi(getenv("GGML_CUDA_DISABLE_FUSION"));
if (disable_fusion) {
return 0;
}
ggml_tensor * node = cgraph->nodes[i];
//topk-moe
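// (fuses the gating softmax/sigmoid + argsort + get_rows chain, plus optional bias add, norm and scale, into a single kernel)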
if (cgraph->nodes[i]->op == GGML_OP_UNARY || cgraph->nodes[i]->op == GGML_OP_SOFT_MAX ||
cgraph->nodes[i]->op == GGML_OP_ARGSORT) {
ggml_cuda_topk_moe_args args;
const bool can_fuse = ggml_cuda_topk_moe_fusion(cgraph, i, args);
std::vector<ggml_op> ops;
if (can_fuse) {
const ggml_tensor * logits = node->src[0];
ggml_tensor * weights = nullptr;
ggml_tensor * ids = nullptr;
const ggml_tensor * bias = nullptr;
const ggml_tensor * clamp = nullptr;
const ggml_tensor * scale = nullptr;
if (!args.delayed_softmax) {
ggml_op gating_op = args.sigmoid ? GGML_OP_UNARY : GGML_OP_SOFT_MAX;
int out_nodes[2]; // nodes which can't be elided
if (args.prob_bias) {
bias = cgraph->nodes[i + 2]->src[1];
ops.insert(ops.end(), { gating_op, GGML_OP_RESHAPE, GGML_OP_ADD, GGML_OP_ARGSORT, GGML_OP_VIEW,
GGML_OP_GET_ROWS });
out_nodes[0] = i + 4;
ids = cgraph->nodes[i + 4];
} else {
ops.insert(ops.end(),
{ gating_op, GGML_OP_RESHAPE, GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS });
out_nodes[0] = i + 3;
ids = cgraph->nodes[i + 3];
}
if (args.norm) {
ops.insert(ops.end(),
{ GGML_OP_RESHAPE, GGML_OP_SUM_ROWS, GGML_OP_CLAMP, GGML_OP_DIV, GGML_OP_RESHAPE });
clamp = cgraph->nodes[i + ops.size() - 3];
}
if (args.scale) {
ops.insert(ops.end(), { GGML_OP_SCALE });
scale = cgraph->nodes[i + ops.size() - 1];
}
weights = cgraph->nodes[i + ops.size() - 1];
out_nodes[1] = i + ops.size() - 1;
if (ggml_can_fuse_subgraph(cgraph, i, ops.size(), ops.data(), out_nodes, 2) &&
ggml_cuda_should_use_topk_moe(node, logits, weights, ids) &&
ggml_cuda_check_fusion_memory_ranges(cgraph, i, ops.size(), out_nodes, 2, /*is_topk_moe=*/true)) {
ggml_cuda_op_topk_moe(*cuda_ctx, logits, weights, ids, clamp, scale, bias, args);
return ops.size() - 1;
}
} else if (!args.norm && !args.prob_bias) {
// special case: gpt-oss, no norm, no bias.
ops.insert(ops.end(), { GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
GGML_OP_SOFT_MAX, GGML_OP_RESHAPE });
weights = cgraph->nodes[i + 5];
ids = cgraph->nodes[i + 1];
const ggml_tensor * softmax = cgraph->nodes[i + 4];
int out_nodes[2] = { i + 1, i + 5 };
if (ggml_can_fuse_subgraph(cgraph, i, ops.size(), ops.data(), out_nodes, 2) &&
ggml_cuda_should_use_topk_moe(softmax, logits, weights, ids) &&
ggml_cuda_check_fusion_memory_ranges(cgraph, i, ops.size(), out_nodes, 2, /*is_topk_moe=*/true)) {
ggml_cuda_op_topk_moe(*cuda_ctx, logits, weights, ids, clamp, scale, bias, args);
return ops.size() - 1;
}
}
}
}
//RoPE + view + set-rows
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, {})) {
ggml_tensor * rope = cgraph->nodes[i];
ggml_tensor * set_rows = cgraph->nodes[i + 2];
ggml_cuda_op_rope_fused(*cuda_ctx, rope, set_rows);
return 2;
}
// multi-(add or mul)
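// (fuses a run of up to 8 consecutive ADD or MUL nodes, where each node feeds the next and the extra operands share the same layout, into one fused op)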
if (node->op == GGML_OP_ADD || node->op == GGML_OP_MUL) {
int n_fuse = 0;
ggml_op ops[8];
std::fill(ops, ops + 8, node->op);
for (; n_fuse <= 6; ++n_fuse) {
if (!ggml_can_fuse(cgraph, i + n_fuse, ops + n_fuse, 2)) {
break;
}
if (cgraph->nodes[i + n_fuse] != cgraph->nodes[i + n_fuse + 1]->src[0]) {
break;
}
if (!ggml_are_same_layout(cgraph->nodes[i + n_fuse]->src[1], cgraph->nodes[i + n_fuse + 1]->src[1])) {
break;
}
}
n_fuse++;
if (n_fuse > 1) {
ggml_tensor fused_node;
memcpy(&fused_node, node, sizeof(ggml_tensor));
for (int j = 0; j < n_fuse - 1; ++j) {
fused_node.src[j + 2] = cgraph->nodes[i + j + 1]->src[1];
}
fused_node.data = cgraph->nodes[i + n_fuse - 1]->data;
if (node->op == GGML_OP_ADD) {
ggml_cuda_op_fused_add(*cuda_ctx, &fused_node, n_fuse);
} else {
ggml_cuda_op_fused_mul(*cuda_ctx, &fused_node, n_fuse);
}
return n_fuse - 1;
}
}
bool fused_mul_mat_vec = false;
int fused_node_count = 0;
// gate + glu + up
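// (fuses a MUL_MAT/MUL_MAT_ID gate/up pair, with optional bias adds, feeding a GLU node into a single mul_mat_vec launch)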
for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
if (ggml_cuda_can_fuse(cgraph, i, { op, bias_op, op, bias_op, GGML_OP_GLU }, {})) {
ggml_tensor * glu = cgraph->nodes[i + 4];
ggml_tensor * gate_bias_n = glu->src[0];
ggml_tensor * up_bias_n = glu->src[1];
// we don't assume the order of {gate, up}; instead, infer it from the bias tensor
ggml_tensor * gate_n = nullptr;
ggml_tensor * up_n = nullptr;
if (gate_bias_n->src[0] == cgraph->nodes[i] || gate_bias_n->src[1] == cgraph->nodes[i]) {
gate_n = cgraph->nodes[i];
up_n = cgraph->nodes[i + 2];
} else if (gate_bias_n->src[0] == cgraph->nodes[i + 2] || gate_bias_n->src[1] == cgraph->nodes[i + 2]) {
gate_n = cgraph->nodes[i + 2];
up_n = cgraph->nodes[i];
} else {
continue;
}
auto get_bias_tensor = [](const ggml_tensor * bias_node, const ggml_tensor * mul_node, ggml_op op_bias) {
if (op_bias == GGML_OP_ADD) {
if (bias_node->src[0] == mul_node) {
return bias_node->src[1];
}
if (bias_node->src[1] == mul_node) {
return bias_node->src[0];
}
return (ggml_tensor *) nullptr;
}
GGML_ASSERT(op_bias == GGML_OP_ADD_ID);
GGML_ASSERT(bias_node->src[0] == mul_node);
return bias_node->src[1];
};
ggml_tensor * up_bias_tensor = get_bias_tensor(up_bias_n, up_n, bias_op);
ggml_tensor * gate_bias_tensor = get_bias_tensor(gate_bias_n, gate_n, bias_op);
if (!up_bias_tensor || !gate_bias_tensor) {
continue;
}
// we don't support repeating adds
if (bias_op == GGML_OP_ADD && (!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) ||
!ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) {
continue;
}
const ggml_tensor * src0 = up_n->src[0];
const ggml_tensor * src1 = up_n->src[1];
const ggml_tensor * ids = up_n->src[2];
if (ggml_cuda_should_fuse_mul_mat_vec_f(up_n)) {
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.gate = gate_n->src[0];
fusion_data.x_bias = up_bias_tensor;
fusion_data.gate_bias = gate_bias_tensor;
fusion_data.glu_op = ggml_get_glu_op(glu);
ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 5;
break;
}
if (ggml_cuda_should_fuse_mul_mat_vec_q(up_n)) {
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.gate = gate_n->src[0];
fusion_data.x_bias = up_bias_tensor;
fusion_data.gate_bias = gate_bias_tensor;
fusion_data.glu_op = ggml_get_glu_op(glu);
ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 5;
break;
}
} else if (ggml_cuda_can_fuse(cgraph, i, { op, op, GGML_OP_GLU }, {})) {
ggml_tensor * glu = cgraph->nodes[i + 2];
ggml_tensor * gate = glu->src[0];
ggml_tensor * up = glu->src[1];
bool ok = (gate == cgraph->nodes[i] && up == cgraph->nodes[i + 1]) ||
(gate == cgraph->nodes[i + 1] && up == cgraph->nodes[i]);
if (!ok) {
continue;
}
const ggml_tensor * src0 = up->src[0];
const ggml_tensor * src1 = up->src[1];
const ggml_tensor * ids = up->src[2];
if (ggml_cuda_should_fuse_mul_mat_vec_f(up)) {
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.gate = gate->src[0];
fusion_data.glu_op = ggml_get_glu_op(glu);
ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 3;
break;
}
if (ggml_cuda_should_fuse_mul_mat_vec_q(up)) {
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.gate = gate->src[0];
fusion_data.glu_op = ggml_get_glu_op(glu);
ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 3;
break;
}
}
}
if (fused_mul_mat_vec) {
return fused_node_count - 1;
}
fused_mul_mat_vec = false;
fused_node_count = 0;
// mul_mat + bias add
for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
if (!ggml_can_fuse(cgraph, i, { op, bias_op })) {
continue;
}
ggml_tensor * mm_node = cgraph->nodes[i];
ggml_tensor * bias_node = cgraph->nodes[i + 1];
ggml_tensor * bias_tensor = nullptr;
if (bias_op == GGML_OP_ADD) {
if (bias_node->src[0] == mm_node) {
bias_tensor = bias_node->src[1];
} else if (bias_node->src[1] == mm_node) {
bias_tensor = bias_node->src[0];
} else {
continue;
}
} else {
if (bias_node->src[0] != mm_node) {
continue;
}
bias_tensor = bias_node->src[1];
}
const ggml_tensor * src0 = mm_node->src[0];
const ggml_tensor * src1 = mm_node->src[1];
const ggml_tensor * ids = mm_node->src[2];
if (bias_op == GGML_OP_ADD_ID && bias_node->src[2] != ids) {
continue;
}
if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) {
continue;
}
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.x_bias = bias_tensor;
if (ggml_cuda_should_fuse_mul_mat_vec_f(mm_node)) {
ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 2;
break;
}
if (ggml_cuda_should_fuse_mul_mat_vec_q(mm_node)) {
ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 2;
break;
}
}
if (fused_mul_mat_vec) {
return fused_node_count - 1;
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD }, {})) {
ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i + 1], cgraph->nodes[i + 2]);
return 2;
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) {
ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i + 1]);
return 1;
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SSM_CONV, GGML_OP_ADD, GGML_OP_UNARY }, { GGML_UNARY_OP_SILU })) {
ggml_cuda_op_ssm_conv(*cuda_ctx, node, cgraph->nodes[i + 1], cgraph->nodes[i + 2]);
return 2;
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SSM_CONV, GGML_OP_UNARY }, { GGML_UNARY_OP_SILU })) {
ggml_cuda_op_ssm_conv(*cuda_ctx, node, /*bias_add_node=*/ nullptr, cgraph->nodes[i + 1]);
return 1;
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_MUL }, { GGML_UNARY_OP_SILU }) ||
ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_MUL }, { GGML_UNARY_OP_SIGMOID }) ||
ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_MUL }, { GGML_UNARY_OP_SOFTPLUS })) {
ggml_cuda_op_unary_mul(*cuda_ctx, node, cgraph->nodes[i + 1]);
return 1;
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_SQR }, { GGML_UNARY_OP_RELU })) {
ggml_cuda_op_relu_sqr(*cuda_ctx, node, cgraph->nodes[i + 1]);
return 1;
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i + 2], node);
return 2;
}
return 0;
}
static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, const bool use_cuda_graph, const bool cuda_graph_update_required, const void * graph_key) {
bool graph_evaluated_or_captured = false;
@@ -4170,11 +3732,349 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
continue;
}
int nodes_to_skip = ggml_cuda_try_fuse(cuda_ctx, cgraph, i);
// start of fusion operations
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
if (!disable_fusion) {
ggml_cuda_topk_moe_args args;
if (nodes_to_skip != 0) {
i += nodes_to_skip;
continue;
if (cgraph->nodes[i]->op == GGML_OP_UNARY || cgraph->nodes[i]->op == GGML_OP_SOFT_MAX ||
cgraph->nodes[i]->op == GGML_OP_ARGSORT) {
const bool can_fuse = ggml_cuda_topk_moe_fusion(cgraph, i, args);
std::vector<ggml_op> ops;
if (can_fuse) {
const ggml_tensor * logits = node->src[0];
ggml_tensor * weights = nullptr;
ggml_tensor * ids = nullptr;
const ggml_tensor * bias = nullptr;
const ggml_tensor * clamp = nullptr;
const ggml_tensor * scale = nullptr;
if (!args.delayed_softmax) {
ggml_op gating_op = args.sigmoid ? GGML_OP_UNARY : GGML_OP_SOFT_MAX;
int out_nodes[2]; // nodes which can't be elided
if (args.prob_bias) {
bias = cgraph->nodes[i + 2]->src[1];
ops.insert(ops.end(), { gating_op, GGML_OP_RESHAPE, GGML_OP_ADD, GGML_OP_ARGSORT,
GGML_OP_VIEW, GGML_OP_GET_ROWS });
out_nodes[0] = i + 4;
ids = cgraph->nodes[i + 4];
} else {
ops.insert(ops.end(), { gating_op, GGML_OP_RESHAPE, GGML_OP_ARGSORT, GGML_OP_VIEW,
GGML_OP_GET_ROWS });
out_nodes[0] = i + 3;
ids = cgraph->nodes[i + 3];
}
if (args.norm) {
ops.insert(ops.end(), { GGML_OP_RESHAPE, GGML_OP_SUM_ROWS, GGML_OP_CLAMP,
GGML_OP_DIV, GGML_OP_RESHAPE });
clamp = cgraph->nodes[i + ops.size() - 3];
}
if (args.scale) {
ops.insert(ops.end(), { GGML_OP_SCALE });
scale = cgraph->nodes[i + ops.size() - 1];
}
weights = cgraph->nodes[i + ops.size() - 1];
out_nodes[1] = i + ops.size() - 1;
if (ggml_can_fuse_subgraph(cgraph, i, ops.size(), ops.data(), out_nodes, 2) &&
ggml_cuda_should_use_topk_moe(node, logits, weights, ids) &&
ggml_cuda_check_fusion_memory_ranges(cgraph, i, ops.size(), out_nodes, 2, /*is_topk_moe=*/ true)) {
ggml_cuda_op_topk_moe(*cuda_ctx, logits, weights, ids, clamp, scale, bias, args);
i += ops.size() - 1;
continue;
}
} else if (!args.norm && !args.prob_bias) {
// special case: gpt-oss, no norm, no bias.
ops.insert(ops.end(), { GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS,
GGML_OP_RESHAPE, GGML_OP_SOFT_MAX, GGML_OP_RESHAPE });
weights = cgraph->nodes[i + 5];
ids = cgraph->nodes[i + 1];
const ggml_tensor * softmax = cgraph->nodes[i + 4];
int out_nodes[2] = { i + 1, i + 5 };
if (ggml_can_fuse_subgraph(cgraph, i, ops.size(), ops.data(), out_nodes, 2) &&
ggml_cuda_should_use_topk_moe(softmax, logits, weights, ids) &&
ggml_cuda_check_fusion_memory_ranges(cgraph, i, ops.size(), out_nodes, 2, /*is_topk_moe=*/ true)) {
ggml_cuda_op_topk_moe(*cuda_ctx, logits, weights, ids, clamp, scale, bias, args);
i += ops.size() - 1;
continue;
}
}
}
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, {})) {
ggml_tensor * rope = cgraph->nodes[i];
ggml_tensor * set_rows = cgraph->nodes[i + 2];
ggml_cuda_op_rope_fused(*cuda_ctx, rope, set_rows);
i += 2;
continue;
}
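// multi-(add or mul)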
if (node->op == GGML_OP_ADD || node->op == GGML_OP_MUL) {
int n_fuse = 0;
ggml_op ops[8];
std::fill(ops, ops + 8, node->op);
for (; n_fuse <= 6; ++n_fuse){
if (!ggml_can_fuse(cgraph, i + n_fuse, ops + n_fuse, 2)) {
break;
}
if (cgraph->nodes[i + n_fuse] != cgraph->nodes[i + n_fuse + 1]->src[0]) {
break;
}
if (!ggml_are_same_layout(cgraph->nodes[i + n_fuse]->src[1], cgraph->nodes[i + n_fuse + 1]->src[1])) {
break;
}
}
n_fuse++;
if (n_fuse > 1) {
ggml_tensor fused_node;
memcpy(&fused_node, node, sizeof(ggml_tensor));
for (int j = 0; j < n_fuse - 1; ++j) {
fused_node.src[j + 2] = cgraph->nodes[i + j + 1]->src[1];
}
fused_node.data = cgraph->nodes[i + n_fuse - 1]->data;
if (node->op == GGML_OP_ADD) {
ggml_cuda_op_fused_add(*cuda_ctx, &fused_node, n_fuse);
} else {
ggml_cuda_op_fused_mul(*cuda_ctx, &fused_node, n_fuse);
}
i += n_fuse - 1;
continue;
}
}
bool fused_mul_mat_vec = false;
int fused_node_count = 0;
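// gate + glu + up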
for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
if (ggml_cuda_can_fuse(cgraph, i, { op, bias_op, op, bias_op, GGML_OP_GLU }, {})) {
ggml_tensor * glu = cgraph->nodes[i + 4];
ggml_tensor * gate_bias_n = glu->src[0];
ggml_tensor * up_bias_n = glu->src[1];
// we don't assume the order of {gate, up}; instead, infer it from the bias tensor
ggml_tensor * gate_n = nullptr;
ggml_tensor * up_n = nullptr;
if (gate_bias_n->src[0] == cgraph->nodes[i] || gate_bias_n->src[1] == cgraph->nodes[i]) {
gate_n = cgraph->nodes[i];
up_n = cgraph->nodes[i + 2];
} else if (gate_bias_n->src[0] == cgraph->nodes[i + 2] || gate_bias_n->src[1] == cgraph->nodes[i + 2]) {
gate_n = cgraph->nodes[i + 2];
up_n = cgraph->nodes[i];
} else {
continue;
}
auto get_bias_tensor = [](const ggml_tensor * bias_node, const ggml_tensor * mul_node, ggml_op op_bias) {
if (op_bias == GGML_OP_ADD) {
if (bias_node->src[0] == mul_node) {
return bias_node->src[1];
}
if (bias_node->src[1] == mul_node) {
return bias_node->src[0];
}
return (ggml_tensor *) nullptr;
}
GGML_ASSERT(op_bias == GGML_OP_ADD_ID);
GGML_ASSERT(bias_node->src[0] == mul_node);
return bias_node->src[1];
};
ggml_tensor * up_bias_tensor = get_bias_tensor(up_bias_n, up_n, bias_op);
ggml_tensor * gate_bias_tensor = get_bias_tensor(gate_bias_n, gate_n, bias_op);
if (!up_bias_tensor || !gate_bias_tensor) {
continue;
}
// we don't support repeating adds
if (bias_op == GGML_OP_ADD &&
(!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) ||
!ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) {
continue;
}
const ggml_tensor * src0 = up_n->src[0];
const ggml_tensor * src1 = up_n->src[1];
const ggml_tensor * ids = up_n->src[2];
if (ggml_cuda_should_fuse_mul_mat_vec_f(up_n)) {
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.gate = gate_n->src[0];
fusion_data.x_bias = up_bias_tensor;
fusion_data.gate_bias = gate_bias_tensor;
fusion_data.glu_op = ggml_get_glu_op(glu);
ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 5;
break;
}
if (ggml_cuda_should_fuse_mul_mat_vec_q(up_n)) {
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.gate = gate_n->src[0];
fusion_data.x_bias = up_bias_tensor;
fusion_data.gate_bias = gate_bias_tensor;
fusion_data.glu_op = ggml_get_glu_op(glu);
ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 5;
break;
}
} else if (ggml_cuda_can_fuse(cgraph, i, { op, op, GGML_OP_GLU }, {})) {
ggml_tensor * glu = cgraph->nodes[i + 2];
ggml_tensor * gate = glu->src[0];
ggml_tensor * up = glu->src[1];
bool ok = (gate == cgraph->nodes[i] && up == cgraph->nodes[i + 1])
|| (gate == cgraph->nodes[i + 1] && up == cgraph->nodes[i]);
if (!ok) continue;
const ggml_tensor * src0 = up->src[0];
const ggml_tensor * src1 = up->src[1];
const ggml_tensor * ids = up->src[2];
if (ggml_cuda_should_fuse_mul_mat_vec_f(up)) {
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.gate = gate->src[0];
fusion_data.glu_op = ggml_get_glu_op(glu);
ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 3;
break;
}
if (ggml_cuda_should_fuse_mul_mat_vec_q(up)) {
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.gate = gate->src[0];
fusion_data.glu_op = ggml_get_glu_op(glu);
ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 3;
break;
}
}
}
if (fused_mul_mat_vec) {
i += fused_node_count - 1;
continue;
}
fused_mul_mat_vec = false;
fused_node_count = 0;
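// mul_mat + bias add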
for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
if (!ggml_can_fuse(cgraph, i, { op, bias_op })) {
continue;
}
ggml_tensor * mm_node = cgraph->nodes[i];
ggml_tensor * bias_node = cgraph->nodes[i + 1];
ggml_tensor * bias_tensor = nullptr;
if (bias_op == GGML_OP_ADD) {
if (bias_node->src[0] == mm_node) {
bias_tensor = bias_node->src[1];
} else if (bias_node->src[1] == mm_node) {
bias_tensor = bias_node->src[0];
} else {
continue;
}
} else {
if (bias_node->src[0] != mm_node) {
continue;
}
bias_tensor = bias_node->src[1];
}
const ggml_tensor * src0 = mm_node->src[0];
const ggml_tensor * src1 = mm_node->src[1];
const ggml_tensor * ids = mm_node->src[2];
if (bias_op == GGML_OP_ADD_ID && bias_node->src[2] != ids) {
continue;
}
if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) {
continue;
}
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.x_bias = bias_tensor;
if (ggml_cuda_should_fuse_mul_mat_vec_f(mm_node)) {
ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 2;
break;
}
if (ggml_cuda_should_fuse_mul_mat_vec_q(mm_node)) {
ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 2;
break;
}
}
if (fused_mul_mat_vec) {
i += fused_node_count - 1;
continue;
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD}, {})) {
ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
i += 2;
continue;
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL}, {})) {
ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]);
i++;
continue;
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SSM_CONV, GGML_OP_UNARY }, { GGML_UNARY_OP_SILU })) {
ggml_cuda_op_ssm_conv(*cuda_ctx, node, cgraph->nodes[i+1]);
i++;
continue;
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_MUL }, { GGML_UNARY_OP_SILU }) ||
ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_MUL }, { GGML_UNARY_OP_SIGMOID }) ||
ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_MUL }, { GGML_UNARY_OP_SOFTPLUS })) {
ggml_cuda_op_unary_mul(*cuda_ctx, node, cgraph->nodes[i+1]);
i++;
continue;
}
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
i += 2;
ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node);
continue;
}
}
#ifndef NDEBUG
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
@@ -4588,8 +4488,8 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .free = */ ggml_backend_cuda_free,
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
/* .set_tensor_2d_async = */ ggml_backend_cuda_set_tensor_2d_async,
/* .get_tensor_2d_async = */ ggml_backend_cuda_get_tensor_2d_async,
/* .get_tensor_2d_async = */ ggml_backend_cuda_set_tensor_2d_async,
/* .set_tensor_2d_async = */ ggml_backend_cuda_get_tensor_2d_async,
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
/* .synchronize = */ ggml_backend_cuda_synchronize,
/* .graph_plan_create = */ NULL,
View File
@@ -1015,35 +1015,25 @@ namespace ggml_cuda_mma {
#endif // AMD_MFMA_AVAILABLE
}
template <ggml_type type>
static __device__ __forceinline__ void mma_block_scaled_fp4(tile<16, 8, float> & D,
const tile<16, 8, int> & A,
const tile<8, 8, int> & B,
uint32_t a_scale,
uint32_t b_scale) {
static __device__ __forceinline__ void mma_block_scaled(tile<16, 8, float> & D,
const tile<16, 8, int> & A,
const tile<8, 8, int> & B,
uint32_t a_scale,
uint32_t b_scale) {
#ifdef BLACKWELL_MMA_AVAILABLE
const int * Axi = (const int *) A.x;
const int * Bxi = (const int *) B.x;
float * Dxi = (float *) D.x;
if constexpr (type == GGML_TYPE_MXFP4) {
asm volatile(
"mma.sync.aligned.kind::mxf4.block_scale.scale_vec::2X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue8m0 "
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
"%10, {0, 0}, %11, {0, 0};"
: "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
: "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
} else {
asm volatile(
"mma.sync.aligned.kind::mxf4nvf4.block_scale.scale_vec::4X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue4m3 "
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
"%10, {0, 0}, %11, {0, 0};"
: "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
: "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
}
asm volatile(
"mma.sync.aligned.kind::mxf4.block_scale.scale_vec::2X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue8m0 "
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
"%10, {0, 0}, %11, {0, 0};"
: "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
: "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
#else
GGML_UNUSED_VARS(D, A, B, a_scale, b_scale);
#endif // BLACKWELL_MMA_AVAILABLE
#endif // BLACKWELL_MMA_AVAILABLE
}
static __device__ __forceinline__ void mma(
View File
@@ -122,7 +122,7 @@ void ggml_cuda_mul_mat_q(
|| GGML_CUDA_CC_IS_CDNA(cc);
// TODO: tighter pool buffer size vs q8 path
const bool use_native_fp4 = blackwell_mma_available(cc) && (src0->type == GGML_TYPE_MXFP4 || src0->type == GGML_TYPE_NVFP4);
const bool use_native_mxfp4 = blackwell_mma_available(cc) && src0->type == GGML_TYPE_MXFP4;
if (!ids) {
const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
@@ -133,9 +133,9 @@ void ggml_cuda_mul_mat_q(
const int64_t s11 = src1->nb[1] / ts_src1;
const int64_t s12 = src1->nb[2] / ts_src1;
const int64_t s13 = src1->nb[3] / ts_src1;
if (use_native_fp4) {
if (use_native_mxfp4) {
static_assert(sizeof(block_fp4_mmq) == 4 * sizeof(block_q8_1));
quantize_mmq_fp4_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
quantize_mmq_mxfp4_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
ne11, ne12, ne13, stream);
} else {
@@ -146,8 +146,10 @@ void ggml_cuda_mul_mat_q(
}
// Stride depends on quantization format
const int64_t s12 = use_native_fp4 ?
ne11 * ne10_padded * sizeof(block_fp4_mmq) / (QK_K * sizeof(int)) : // block_fp4_mmq holds 256 values
const int64_t s12 = use_native_mxfp4 ?
ne11 * ne10_padded * sizeof(block_fp4_mmq) /
(8 * QK_MXFP4 * sizeof(int)) // block_fp4_mmq holds 256 values (8 blocks of 32)
:
ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
const int64_t s13 = ne12*s12;
@@ -196,8 +198,8 @@ void ggml_cuda_mul_mat_q(
const int64_t s12 = src1->nb[2] / ts_src1;
const int64_t s13 = src1->nb[3] / ts_src1;
if (use_native_fp4) {
quantize_mmq_fp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
if (use_native_mxfp4) {
quantize_mmq_mxfp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
} else {
quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
@@ -206,9 +208,8 @@ void ggml_cuda_mul_mat_q(
CUDA_CHECK(cudaGetLastError());
}
static_assert(QK_K == 8 * QK_MXFP4, "QK_K needs to be 8 * QK_MXFP4");
const int64_t s12 = use_native_fp4 ? ne11 * ne10_padded * sizeof(block_fp4_mmq) / (QK_K * sizeof(int)) :
ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
const int64_t s12 = use_native_mxfp4 ? ne11 * ne10_padded * sizeof(block_fp4_mmq) / (8 * QK_MXFP4 * sizeof(int)) :
ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
const int64_t s13 = ne12*s12;
// Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
View File
@@ -10,9 +10,9 @@
using namespace ggml_cuda_mma;
#define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available.
#define MMQ_ITER_K 256
#define MMQ_ITER_K_FP4 512
#define MMQ_NWARPS 8
#define MMQ_ITER_K 256
#define MMQ_ITER_K_MXFP4_FP4 512
#define MMQ_NWARPS 8
typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int kbx0, const int i_max, const int stride);
typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00);
@@ -46,12 +46,9 @@ struct block_q8_1_mmq {
int8_t qs[4*QK8_1]; // 128 values quantized to 8 bit each
};
// this struct is used for fp4 data types (currently only used for Blackwell)
// mxfp4 has block size 32, each int32 of d4 contains 2 e8m0 scales in the lower 16 bits
// nvfp4 has block size 16, each int32 of d4 contains 4 ue4m3 scales
struct block_fp4_mmq {
uint32_t d4[4];
int8_t qs[4 * 32]; // 256 FP4 values packed as 4-bit pairs (2 per byte)
uint32_t d4[4]; // 8 E8M0 scales (1 per 32 values), 2 packed per uint32: d4[0]={s0,s1}, d4[1]={s2,s3}, etc.
int8_t qs[4 * 32]; // 256 FP4 values packed as 4-bit pairs (2 per byte), 8 blocks of 32 values
};
static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size");
@@ -146,11 +143,10 @@ static int get_mmq_y_host(const int cc) {
static constexpr __device__ int get_iter_k([[maybe_unused]] const ggml_type type) {
#if defined(BLACKWELL_MMA_AVAILABLE)
if (type == GGML_TYPE_NVFP4 || type == GGML_TYPE_MXFP4) {
return MMQ_ITER_K_FP4;
}
#endif // defined(BLACKWELL_MMA_AVAILABLE)
return type == GGML_TYPE_MXFP4 ? MMQ_ITER_K_MXFP4_FP4 : MMQ_ITER_K;
#else
return MMQ_ITER_K;
#endif // defined(BLACKWELL_MMA_AVAILABLE)
}
static constexpr __device__ int get_mmq_y_device() {
@@ -217,8 +213,8 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml
}
#define MMQ_MMA_TILE_X_K_Q8_0 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4)
#define MMQ_MMA_TILE_X_K_FP4 (2*MMQ_TILE_NE_K + 8 + 4) // MXFP4 and NVFP4 Blackwell
#define MMQ_MMA_TILE_X_K_NVFP4 (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2 + 4) // NVFP4 Generic
#define MMQ_MMA_TILE_X_K_FP4 (2*MMQ_TILE_NE_K + 8 + 4) // MXFP4
#define MMQ_MMA_TILE_X_K_NVFP4 (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2 + 4) // NVFP4
#define MMQ_MMA_TILE_X_K_Q8_1 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4)
#define MMQ_MMA_TILE_X_K_Q2_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K + 4)
#define MMQ_MMA_TILE_X_K_Q3_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2 + 4)
@@ -244,11 +240,7 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
case GGML_TYPE_Q8_0: return MMQ_MMA_TILE_X_K_Q8_0;
// tile sizes are the same for Q8_1 and FP4 on Blackwell
case GGML_TYPE_MXFP4: return MMQ_MMA_TILE_X_K_Q8_1;
#if defined(BLACKWELL_MMA_AVAILABLE)
case GGML_TYPE_NVFP4: return MMQ_MMA_TILE_X_K_FP4;
#else
case GGML_TYPE_NVFP4: return MMQ_MMA_TILE_X_K_NVFP4;
#endif // defined(BLACKWELL_MMA_AVAILABLE)
case GGML_TYPE_Q2_K: return MMQ_MMA_TILE_X_K_Q2_K;
case GGML_TYPE_Q3_K: return MMQ_MMA_TILE_X_K_Q3_K;
case GGML_TYPE_Q4_K: return MMQ_MMA_TILE_X_K_Q8_1;
@@ -942,128 +934,6 @@ static __device__ __forceinline__ void load_tiles_mxfp4_fp4(const char * __restr
}
}
#ifdef BLACKWELL_MMA_AVAILABLE
template <int mmq_y, bool need_check>
static __device__ __forceinline__ void load_tiles_nvfp4_nvfp4(const char * __restrict__ x,
int * __restrict__ x_tile,
const int kbx0,
const int i_max,
const int stride) {
constexpr int nwarps = mmq_get_nwarps_device();
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
constexpr int iter_k = get_iter_k(GGML_TYPE_NVFP4);
constexpr int threads_per_row = iter_k / QK_NVFP4; // each thread processes 1 block
constexpr int rows_per_warp = warp_size / threads_per_row;
uint32_t * x_u32 = (uint32_t *) x_tile;
const int txi = threadIdx.x;
const int kbx = txi % threads_per_row;
const int row_in_warp = txi / threads_per_row;
const block_nvfp4 * bxi_base = (const block_nvfp4 *) x + kbx0 + kbx;
uint32_t * x_u32_scale = x_u32 + 64 + kbx;
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += rows_per_warp * nwarps) {
int i = i0 + threadIdx.y * rows_per_warp + row_in_warp;
if constexpr (need_check) {
i = min(i, i_max);
}
const block_nvfp4 * bxi = bxi_base + i * stride;
const int row_base = i * MMQ_MMA_TILE_X_K_FP4;
const int q_base = row_base + 8 * kbx;
const uint32_t * src_qs = reinterpret_cast<const uint32_t *>(bxi->qs);
#pragma unroll
for (int sub = 0; sub < QK_NVFP4 / QK_NVFP4_SUB; ++sub) {
x_u32[q_base + 2 * sub + 0] = src_qs[2 * sub + 0];
x_u32[q_base + 2 * sub + 1] = src_qs[2 * sub + 1];
}
x_u32_scale[row_base] = get_int_b4(bxi->d, 0);
}
}
// Shared MMA kernel for MXFP4 and NVFP4 on Blackwell.
// Both quantizations encode values as e2m1 (FP4) and produce one uint32 scale per
// m16n8k64 MMA call; only the PTX kind (scale_vec::2X ue8m0 vs scale_vec::4X ue4m3)
// and the per-type stride constant differ.
template <int mmq_x, int mmq_y, ggml_type type>
static __device__ __forceinline__ void vec_dot_fp4_fp4_mma(const int * __restrict__ x,
const int * __restrict__ y,
float * __restrict__ sum,
const int k00) {
static_assert(type == GGML_TYPE_MXFP4 || type == GGML_TYPE_NVFP4,
"vec_dot_fp4_fp4_mma: type must be MXFP4 or NVFP4");
typedef tile<16, 8, int> tile_A;
typedef tile<8, 8, int> tile_B;
typedef tile<16, 8, float> tile_C;
constexpr int stride = MMQ_MMA_TILE_X_K_FP4;
constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = 2 * granularity;
constexpr int ntx = rows_per_warp / tile_C::I;
constexpr int nfrags = MMQ_TILE_NE_K / tile_A::J;
y += (threadIdx.y % ntx) * (tile_C::J * MMQ_TILE_Y_K);
const int * x_qs = (const int *) x;
const uint32_t * x_sc = (const uint32_t *) (x_qs + 2 * MMQ_TILE_NE_K);
const int * y_qs = (const int *) y + 4;
const uint32_t * y_sc = (const uint32_t *) y;
// 2 threads per quad supply the packed scale register to the block_scale MMA,
// see https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-block-scaling
const int tidx_A = threadIdx.x / 4 + (threadIdx.x % 2) * 8;
const int tidx_B = threadIdx.x / 4;
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
tile_A A[ntx][nfrags];
uint32_t scaleA[ntx][nfrags];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
#pragma unroll
for (int frag = 0; frag < nfrags; ++frag) {
const int k0 = k00 + frag * tile_A::J;
load_ldmatrix(A[n][frag], x_qs + (i0 + n * tile_A::I) * stride + k0, stride);
scaleA[n][frag] = x_sc[(i0 + n * tile_A::I + tidx_A) * stride + k0 / tile_A::J];
}
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx * tile_C::J) {
tile_B B[nfrags];
uint32_t scaleB[nfrags];
#pragma unroll
for (int frag = 0; frag < nfrags; ++frag) {
const int k0 = frag * tile_B::J;
load_generic(B[frag], y_qs + j0 * MMQ_TILE_Y_K + k0, MMQ_TILE_Y_K);
scaleB[frag] = y_sc[(j0 + tidx_B) * MMQ_TILE_Y_K + frag];
}
#pragma unroll
for (int n = 0; n < ntx; ++n) {
#pragma unroll
for (int frag = 0; frag < nfrags; ++frag) {
tile_C C = {};
mma_block_scaled_fp4<type>(C, A[n][frag], B[frag], scaleA[n][frag], scaleB[frag]);
#pragma unroll
for (int l = 0; l < tile_C::ne; ++l) {
sum[(j0 / tile_C::J + n) * tile_C::ne + l] += C.x[l];
}
}
}
}
}
#endif // BLACKWELL_MMA_AVAILABLE
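Numerically, for MXFP4 each 32-value chunk fed to the block-scaled MMA contributes scale_a * scale_b * sum_i a_i * b_i with a and b decoded from e2m1; NVFP4 is analogous with 16-value sub-blocks and ue4m3 scales, per the comment above. The following is a hedged scalar reference of that per-chunk accumulation, with the scales passed as already-decoded floats and all names illustrative.
// Hedged scalar reference of one block-scaled FP4 accumulation step:
// C += scale_a * scale_b * sum_i e2m1(a_i) * e2m1(b_i).
// Scales are taken as already-decoded floats here; the kernel passes ue8m0/ue4m3 codes.
#include <cstdint>
#include <cstdio>
static const float kE2M1[8] = {0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f};
static inline float fp4_value(uint8_t nibble) {
    const float mag = kE2M1[nibble & 0x7];
    return (nibble & 0x8) ? -mag : mag;
}
// a4, b4: 32 FP4 codes each, packed two per byte.
static float block_scaled_dot32(const uint8_t a4[16], const uint8_t b4[16],
                                float scale_a, float scale_b) {
    float acc = 0.0f;
    for (int i = 0; i < 16; ++i) {
        acc += fp4_value(a4[i] & 0x0F) * fp4_value(b4[i] & 0x0F);
        acc += fp4_value(a4[i] >> 4)   * fp4_value(b4[i] >> 4);
    }
    return scale_a * scale_b * acc;
}
int main() {
    uint8_t a[16] = {0x21};   // mostly zeros
    uint8_t b[16] = {0x32};
    std::printf("%f\n", block_scaled_dot32(a, b, 1.0f, 0.5f));
}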
template <int mmq_y, bool need_check>
static __device__ __forceinline__ void load_tiles_nvfp4(const char * __restrict__ x,
@@ -1293,6 +1163,77 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
#endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
}
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_mxfp4_mxfp4_mma(const int * __restrict__ x,
const int * __restrict__ y,
float * __restrict__ sum,
const int k00) {
typedef tile<16, 8, int> tile_A;
typedef tile<8, 8, int> tile_B;
typedef tile<16, 8, float> tile_C; // Output is float for native scaled MMA
constexpr int granularity = mmq_get_granularity_device(mmq_x);
constexpr int rows_per_warp = 2 * granularity;
constexpr int ntx = rows_per_warp / tile_C::I; // Number of x minitiles per warp.
y += (threadIdx.y % ntx) * (tile_C::J * MMQ_TILE_Y_FP4_K);
// Match layout from load_tiles_mxfp4_fp4
const int * x_qs = (const int *) x;
const uint32_t * x_sc = (const uint32_t *) (x_qs + 2 * MMQ_TILE_NE_K);
const int * y_qs = (const int *) y + 4;
const uint32_t * y_sc = (const uint32_t *) y;
// tile_A has a length of 64 logical values vs. 32 values in block_mxfp4
tile_A A[ntx][MMQ_TILE_NE_K / (2 * QI_MXFP4)];
uint32_t scaleA[ntx][MMQ_TILE_NE_K / (2 * QI_MXFP4)];
// Block scale
    // Each thread has to point to a 4-byte scale value
// https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-block-scaling
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
#pragma unroll
for (int n = 0; n < ntx; ++n) {
#pragma unroll
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 2 * QI_MXFP4) {
const int k0 = k00 + k01;
load_ldmatrix(A[n][k01 / (2 * QI_MXFP4)], x_qs + (i0 + n * tile_A::I) * MMQ_MMA_TILE_X_K_FP4 + k0,
MMQ_MMA_TILE_X_K_FP4);
            // based on the block-scaling documentation, 2 threads in each quad need to supply the scale value
const int tidx = threadIdx.x / 4 + (threadIdx.x % 2) * 8;
scaleA[n][k01 / (2 * QI_MXFP4)] =
*(x_sc + (i0 + n * tile_A::I + tidx) * MMQ_MMA_TILE_X_K_FP4 + k0 / (2 * QI_MXFP4));
}
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += ntx * tile_C::J) {
#pragma unroll
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 2 * QI_MXFP4) {
tile_B B;
uint32_t scaleB; // 2xN scales
load_generic(B, y_qs + j0 * MMQ_TILE_Y_FP4_K + k01, MMQ_TILE_Y_FP4_K);
scaleB = y_sc[(j0 + threadIdx.x / 4) * MMQ_TILE_Y_FP4_K + k01 / (2 * QI_MXFP4)];
#pragma unroll
for (int n = 0; n < ntx; ++n) {
tile_C C;
mma_block_scaled(C, A[n][k01 / (2 * QI_MXFP4)], B, scaleA[n][k01 / (2 * QI_MXFP4)], scaleB);
#pragma unroll
for (int l = 0; l < tile_C::ne; ++l) {
sum[(j0 / tile_C::J + n) * tile_C::ne + l] += C.x[l];
}
}
}
}
}
template <int mmq_x, int mmq_y>
static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a(
@@ -3318,7 +3259,7 @@ struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_MXFP4> {
static constexpr int vdr = VDR_MXFP4_Q8_1_MMQ;
#ifdef BLACKWELL_MMA_AVAILABLE
static constexpr load_tiles_mmq_t load_tiles = load_tiles_mxfp4_fp4<mmq_y, need_check>;
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_fp4_fp4_mma<mmq_x, mmq_y, GGML_TYPE_MXFP4>;
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_mxfp4_mxfp4_mma<mmq_x, mmq_y>;
#else
static constexpr load_tiles_mmq_t load_tiles = load_tiles_mxfp4<mmq_y, need_check>;
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
@@ -3329,13 +3270,8 @@ struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_MXFP4> {
template <int mmq_x, int mmq_y, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_NVFP4> {
static constexpr int vdr = VDR_NVFP4_Q8_1_MMQ;
#ifdef BLACKWELL_MMA_AVAILABLE
static constexpr load_tiles_mmq_t load_tiles = load_tiles_nvfp4_nvfp4<mmq_y, need_check>;
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_fp4_fp4_mma<mmq_x, mmq_y, GGML_TYPE_NVFP4>;
#else
static constexpr load_tiles_mmq_t load_tiles = load_tiles_nvfp4<mmq_y, need_check>;
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y>;
#endif // BLACKWELL_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y>;
};
@@ -3470,7 +3406,7 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
#if defined(BLACKWELL_MMA_AVAILABLE)
// FP4 tile stores 8 blocks
constexpr int ne_block = (type == GGML_TYPE_MXFP4 || type == GGML_TYPE_NVFP4) ? QK_K : 4 * QK8_1;
constexpr int ne_block = (type == GGML_TYPE_MXFP4) ? 8 * QK_MXFP4 : 4 * QK8_1;
#else
constexpr int ne_block = 4 * QK8_1;
#endif // defined(BLACKWELL_MMA_AVAILABLE)
@@ -3542,10 +3478,10 @@ template <ggml_type type, int mmq_x, bool need_check>
static __global__ void mul_mat_q(
const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
const uint3 blocks_per_ne00, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
const uint3 channel_ratio, const uint3 nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const uint3 sample_ratio, const uint3 nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
const uint3 ntx) {
const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
const int ncols_max) {
// Skip unused template specializations for faster compilation:
if (mmq_x > get_mmq_x_max_device() || mmq_x % mmq_get_granularity_device(mmq_x) != 0) {
@@ -3559,7 +3495,8 @@ static __global__ void mul_mat_q(
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int mmq_y = get_mmq_y_device();
const uint32_t nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
const int ntx = (ncols_max + mmq_x - 1) / mmq_x; // Number of tiles x
const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
// Initialize the ids for writing back data with just the index.
// For regular matrix multiplications this is never changed.
@@ -3580,9 +3517,8 @@ static __global__ void mul_mat_q(
// On non-CDNA AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
#if (defined(GGML_USE_HIP) && !defined(CDNA)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
{
const uint2 tmp2 = fast_div_modulo(blockIdx.z, nchannels_y);
const int wt = tmp2.x;
const int zt = tmp2.y;
const int wt = blockIdx.z / nchannels_y;
const int zt = blockIdx.z - wt*nchannels_y;
const int jt = blockIdx.y;
const int it = blockIdx.x;
@@ -3625,40 +3561,40 @@ static __global__ void mul_mat_q(
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
const int tile_y_max_j = col_diff - jt*mmq_x - 1;
const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
constexpr bool fixup = false;
mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
tile_x_max_i, tile_y_max_j, 0, blocks_per_ne00.z);
tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
return;
}
#endif // (defined(GGML_USE_HIP) && !defined(CDNA4) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
constexpr int ITER_K = get_iter_k(type);
constexpr int blocks_per_iter = ITER_K / qk;
constexpr int ITER_K = get_iter_k(type);
const int64_t blocks_per_ne00 = ncols_x / qk;
constexpr int blocks_per_iter = ITER_K / qk;
// kbc == k block continuous, current index in continuous ijk space.
int kbc = int64_t(blockIdx.x) *(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
int kbc_stop = int64_t(blockIdx.x + 1)*(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
int64_t kbc = (int64_t) blockIdx.x *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
int64_t kbc_stop = (int64_t)(blockIdx.x + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
kbc -= fastmodulo(kbc, blocks_per_ne00) % blocks_per_iter;
kbc_stop -= fastmodulo(kbc_stop, blocks_per_ne00) % blocks_per_iter;
kbc -= (kbc % blocks_per_ne00) % blocks_per_iter;
kbc_stop -= (kbc_stop % blocks_per_ne00) % blocks_per_iter;
// kb0 == k index when doing the matrix multiplication for an output tile.
int kb0_start = fastmodulo(kbc, blocks_per_ne00);
int kb0_stop = min(blocks_per_ne00.z, uint32_t(kb0_start + kbc_stop - kbc));
while (kbc < kbc_stop && kb0_stop == int(blocks_per_ne00.z)) {
int tmp = fastdiv(kbc, blocks_per_ne00);
uint2 tmp2 = fast_div_modulo(tmp, ntx);
const int jt = tmp2.y;
tmp = tmp2.x;
tmp2 = fast_div_modulo(tmp, nchannels_y);
const int zt = tmp2.y;
tmp = tmp2.x;
tmp2 = fast_div_modulo(tmp, nsamples_y);
const int wt = tmp2.y;
const int it = tmp2.x;
int kb0_start = kbc % blocks_per_ne00;
int kb0_stop = min(blocks_per_ne00, kb0_start + kbc_stop - kbc);
while (kbc < kbc_stop && kb0_stop == blocks_per_ne00) {
int tmp = kbc;
const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
const int zt = tmp / (ntx*blocks_per_ne00);
tmp -= zt * (ntx*blocks_per_ne00);
const int jt = tmp / blocks_per_ne00;
// Defaults for regular matrix multiplication:
int col_low = 0;
@@ -3676,11 +3612,11 @@ static __global__ void mul_mat_q(
offset_dst = 0;
if (jt*mmq_x >= col_diff) {
kbc += blocks_per_ne00.z;
kbc -= fastmodulo(kbc, blocks_per_ne00);
kbc += blocks_per_ne00;
kbc -= kbc % blocks_per_ne00;
kb0_start = 0;
kb0_stop = min(blocks_per_ne00.z, uint32_t(kbc_stop - kbc));
kb0_stop = min(blocks_per_ne00, kbc_stop - kbc);
continue;
}
@@ -3705,34 +3641,32 @@ static __global__ void mul_mat_q(
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
const int tile_y_max_j = col_diff - jt*mmq_x - 1;
const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
kbc += blocks_per_ne00.z;
kbc -= fastmodulo(kbc, blocks_per_ne00);
kbc += blocks_per_ne00;
kbc -= kbc % blocks_per_ne00;
kb0_start = 0;
kb0_stop = min(blocks_per_ne00.z, uint32_t(kbc_stop - kbc));
kb0_stop = min(blocks_per_ne00, kbc_stop - kbc);
}
if (kbc >= kbc_stop) {
return;
}
int tmp = fastdiv(kbc, blocks_per_ne00);
uint2 tmp2 = fast_div_modulo(tmp, ntx);
const int jt = tmp2.y;
tmp = tmp2.x;
tmp2 = fast_div_modulo(tmp, nchannels_y);
const int zt = tmp2.y;
tmp = tmp2.x;
tmp2 = fast_div_modulo(tmp, nsamples_y);
const int wt = tmp2.y;
const int it = tmp2.x;
int tmp = kbc;
const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
const int zt = tmp / (ntx*blocks_per_ne00);
tmp -= zt * (ntx*blocks_per_ne00);
const int jt = tmp / blocks_per_ne00;
// Defaults for regular matrix multiplication:
int col_low = 0;
@@ -3774,7 +3708,7 @@ static __global__ void mul_mat_q(
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
const int tile_y_max_j = col_diff - jt*mmq_x - 1;
const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
@@ -3783,37 +3717,46 @@ static __global__ void mul_mat_q(
}
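The stream-k loop above walks a flattened kbc index over (row tile, sample, channel, column tile, k block) and peels coordinates off by repeated division; this hunk swaps the fastdiv helpers for plain division. A hedged host-side sketch of the same decomposition (plain-division variant), with hypothetical struct and helper names:
// Hedged sketch of the stream-k index decomposition used above.
// Names mirror the kernel: it = row tile, wt = sample, zt = channel, jt = column tile,
// kb0 = k-block offset inside the (it, wt, zt, jt) tile.
#include <cstdint>
#include <cstdio>
struct TileCoord { int it, wt, zt, jt; int64_t kb0; };
static TileCoord decompose_kbc(int64_t kbc, int nsamples_y, int nchannels_y,
                               int ntx, int64_t blocks_per_ne00) {
    TileCoord c;
    int64_t tmp = kbc;
    c.it = (int)(tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00));
    tmp -= c.it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
    c.wt = (int)(tmp / (nchannels_y*ntx*blocks_per_ne00));
    tmp -= c.wt * (nchannels_y*ntx*blocks_per_ne00);
    c.zt = (int)(tmp / (ntx*blocks_per_ne00));
    tmp -= c.zt * (ntx*blocks_per_ne00);
    c.jt = (int)(tmp / blocks_per_ne00);
    c.kb0 = tmp % blocks_per_ne00;
    return c;
}
int main() {
    // Example: 2 samples, 3 channels, 4 column tiles, 5 k-blocks per column.
    const TileCoord c = decompose_kbc(123, 2, 3, 4, 5);
    std::printf("it=%d wt=%d zt=%d jt=%d kb0=%lld\n", c.it, c.wt, c.zt, c.jt, (long long) c.kb0);
}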
template <ggml_type type, int mmq_x, bool need_check>
__launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device()/2, 1)
static __global__ void mul_mat_q_stream_k_fixup(
const int32_t * __restrict__ ids_dst, const int32_t * __restrict__ expert_bounds, float * __restrict__ dst,
float * __restrict__ tmp_last_tile, const uint3 blocks_per_ne00, const int nrows_x, const int ncols_dst,
const int stride_col_dst, const uint3 nchannels_y, const int stride_channel_dst, const uint3 nsamples_y,
const int stride_sample_dst, const uint3 ntx) {
constexpr int mmq_y = get_mmq_y_device();
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int ITER_K = get_iter_k(type);
constexpr int blocks_per_iter = ITER_K / qk;
static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
const int32_t * expert_bounds,
float * __restrict__ dst,
const float * __restrict__ tmp_last_tile,
const int ncols_x,
const int nrows_x,
const int ncols_dst,
const size_t stride_col_dst,
const int nchannels_y,
const size_t stride_channel_dst,
const int nsamples_y,
const size_t stride_sample_dst,
const int ncols_max) {
constexpr int mmq_y = get_mmq_y_device();
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int ITER_K = get_iter_k(type);
constexpr int nwarps = mmq_get_nwarps_device()/2;
constexpr int blocks_per_iter = ITER_K / qk;
const int64_t blocks_per_ne00 = ncols_x / qk;
constexpr int nwarps = mmq_get_nwarps_device();
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
float sum[mmq_x / nwarps] = {0.0f};
const int i = blockIdx.y*warp_size + threadIdx.x;
float sum[mmq_x*mmq_y / (nwarps*warp_size)] = {0.0f};
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
const int ntx = (ncols_max + mmq_x - 1) / mmq_x;
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
const int bidx0 = blockIdx.x;
// kbc == k block continuous, current index in continuous ijk space.
int kbc0 = int64_t(blockIdx.x) *(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
int kbc0_stop = int64_t(blockIdx.x + 1)*(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
int64_t kbc0 = (int64_t) bidx0 *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
int64_t kbc0_stop = (int64_t)(bidx0 + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
kbc0 -= fastmodulo(kbc0, blocks_per_ne00) % blocks_per_iter;
kbc0_stop -= fastmodulo(kbc0_stop, blocks_per_ne00) % blocks_per_iter;
kbc0 -= (kbc0 % blocks_per_ne00) % blocks_per_iter;
kbc0_stop -= (kbc0_stop % blocks_per_ne00) % blocks_per_iter;
const bool did_not_have_any_data = kbc0 == kbc0_stop;
const bool wrote_beginning_of_tile = fastmodulo(kbc0, blocks_per_ne00) == 0;
const bool did_not_write_last = fastdiv(kbc0, blocks_per_ne00) == fastdiv(kbc0_stop, blocks_per_ne00) && fastmodulo(kbc0_stop, blocks_per_ne00) != 0;
const bool wrote_beginning_of_tile = kbc0 % blocks_per_ne00 == 0;
const bool did_not_write_last = kbc0/blocks_per_ne00 == kbc0_stop/blocks_per_ne00 && kbc0_stop % blocks_per_ne00 != 0;
if (did_not_have_any_data || wrote_beginning_of_tile || did_not_write_last) {
return;
}
@@ -3822,11 +3765,11 @@ static __global__ void mul_mat_q_stream_k_fixup(
// Iterate over previous blocks and sum up partial sums written to fixup buffer.
// All CUDA blocks that get here must have a previous block that needs a fixup.
int bidx = bidx0 - 1;
int kbc_stop = kbc0;
int64_t bidx = bidx0 - 1;
int64_t kbc_stop = kbc0;
while(true) {
int kbc = int64_t(bidx)*(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
kbc -= fastmodulo(kbc, blocks_per_ne00) % blocks_per_iter;
int64_t kbc = bidx*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
kbc -= (kbc % blocks_per_ne00) % blocks_per_iter;
if (kbc == kbc_stop) { // Did not have any data.
bidx--;
@@ -3836,16 +3779,20 @@ static __global__ void mul_mat_q_stream_k_fixup(
any_fixup = true;
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = j0 + threadIdx.y;
sum[j0/nwarps] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i];
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
const int i = i0 + threadIdx.x;
sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i];
}
}
// If this block started in a previous tile we are done and don't need to combine additional partial results.
if (fastmodulo(kbc, blocks_per_ne00) == 0 || fastdiv(kbc, blocks_per_ne00) < fastdiv(kbc0, blocks_per_ne00)) {
if (kbc % blocks_per_ne00 == 0 || kbc/blocks_per_ne00 < kbc0/blocks_per_ne00) {
break;
}
bidx--;
@@ -3856,16 +3803,14 @@ static __global__ void mul_mat_q_stream_k_fixup(
return;
}
int tmp = fastdiv(kbc0, blocks_per_ne00);
uint2 tmp2 = fast_div_modulo(tmp, ntx);
const int jt = tmp2.y;
tmp = tmp2.x;
tmp2 = fast_div_modulo(tmp, nchannels_y);
const int zt = tmp2.y;
tmp = tmp2.x;
tmp2 = fast_div_modulo(tmp, nsamples_y);
const int wt = tmp2.y;
const int it = tmp2.x;
int tmp = kbc0;
const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
const int zt = tmp / (ntx*blocks_per_ne00);
tmp -= zt * (ntx*blocks_per_ne00);
const int jt = tmp / blocks_per_ne00;
if (!ids_dst) {
const int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y;
@@ -3873,9 +3818,6 @@ static __global__ void mul_mat_q_stream_k_fixup(
const int i_max = nrows_x - it*mmq_y - 1;
const int j_max = ncols_dst - jt*mmq_x - 1;
if (need_check && i > i_max) {
return;
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
@@ -3885,7 +3827,16 @@ static __global__ void mul_mat_q_stream_k_fixup(
return;
}
dst[j*stride_col_dst + i] += sum[j0/nwarps];
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
const int i = i0 + threadIdx.x;
if (need_check && i > i_max) {
continue;
}
dst[j*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size];
}
}
return;
}
@@ -3905,9 +3856,6 @@ static __global__ void mul_mat_q_stream_k_fixup(
const int i_max = nrows_x - it*mmq_y - 1;
const int j_max = col_diff - jt*mmq_x - 1;
if (need_check && i > i_max) {
return;
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
@@ -3917,7 +3865,16 @@ static __global__ void mul_mat_q_stream_k_fixup(
return;
}
dst[ids_dst_shared[j]*stride_col_dst + i] += sum[j0/nwarps];
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
const int i = i0 + threadIdx.x;
if (need_check && i > i_max) {
continue;
}
dst[ids_dst_shared[j]*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size];
}
}
}
@@ -3965,44 +3922,29 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
const int channel_ratio = args.nchannels_y / args.nchannels_x;
const int sample_ratio = args.nsamples_y / args.nsamples_x;
const uint3 blocks_per_ne00_fd = init_fastdiv_values(args.ncols_x / ggml_cuda_type_traits<type>::qk);
const uint3 ntx_fd = init_fastdiv_values(ntx);
const uint3 nchannels_y_fd = init_fastdiv_values(args.nchannels_y);
const uint3 nsamples_y_fd = init_fastdiv_values(args.nsamples_y);
const uint3 channel_ratio_fd = init_fastdiv_values(channel_ratio);
const uint3 sample_ratio_fd = init_fastdiv_values(sample_ratio);
if (!args.use_stream_k) {
if (args.nrows_x % mmq_y == 0) {
constexpr bool need_check = false;
mul_mat_q<type, mmq_x, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
ntx_fd);
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
args.ncols_max);
} else {
constexpr bool need_check = true;
mul_mat_q<type, mmq_x, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
ntx_fd);
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
args.ncols_max);
}
return;
}
// For the stream-k kernel it is possible to run it with tiling by setting the number of CUDA blocks equal to the number of tiles.
// This is worthwhile if the efficiency of tiling is high and skipping the fixup kernel is more important.
const int ntiles_dst = ntx * nty * ntzw;
const int tiles_nwaves = (ntiles_dst + nsm - 1) / nsm;
const int tiles_efficiency_percent = 100 * ntiles_dst / (nsm*tiles_nwaves);
const dim3 block_nums_stream_k(GGML_CUDA_CC_IS_NVIDIA(cc) && tiles_efficiency_percent >= 90 ? ntiles_dst : nsm, 1, 1);
GGML_ASSERT(ntiles_dst * blocks_per_ne00_fd.z < (1 << 30)); // Assert that variable kbc will not overflow.
const bool fixup_needed = ntiles_dst % block_nums_stream_k.x != 0;
const dim3 block_nums_stream_k(nsm, 1, 1);
const bool fixup_needed = ntx*nty*ntzw % nsm != 0;
ggml_cuda_pool & pool = ctx.pool(id);
ggml_cuda_pool_alloc<float> tmp_fixup(pool);
@@ -4010,45 +3952,40 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
tmp_fixup.alloc(block_nums_stream_k.x * mmq_x*mmq_y);
}
const dim3 block_nums_fixup(block_nums_stream_k.x, mmq_y/warp_size, 1);
const dim3 block_dims_fixup(block_dims.x, block_dims.y/2, block_dims.z);
if (args.nrows_x % mmq_y == 0) {
constexpr bool need_check = false;
mul_mat_q<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
ntx_fd);
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
args.ncols_max);
if (!fixup_needed) {
return;
}
CUDA_CHECK(cudaGetLastError());
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_fixup, block_dims_fixup, 0, stream>>>
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, blocks_per_ne00_fd, args.nrows_x, args.ncols_dst,
args.nrows_dst, nchannels_y_fd, args.stride_channel_dst, nsamples_y_fd, args.stride_sample_dst,
ntx_fd);
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst,
args.ncols_max);
} else {
constexpr bool need_check = true;
mul_mat_q<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
ntx_fd);
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
args.ncols_max);
if (!fixup_needed) {
return;
}
CUDA_CHECK(cudaGetLastError());
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_fixup, block_dims_fixup, 0, stream>>>
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, blocks_per_ne00_fd, args.nrows_x, args.ncols_dst,
args.nrows_dst, nchannels_y_fd, args.stride_channel_dst, nsamples_y_fd, args.stride_sample_dst,
ntx_fd);
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst,
args.ncols_max);
}
}
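One variant in this hunk sizes the stream-k grid with a tiling-efficiency heuristic and only runs the fixup kernel when the tile count does not divide the grid evenly. A hedged sketch of that launch-shape decision, with illustrative names:
// Hedged sketch of the stream-k launch-shape choice (heuristic variant of this hunk).
#include <cstdio>
struct LaunchShape { int nblocks; bool fixup_needed; };
static LaunchShape choose_stream_k_shape(int ntiles_dst, int nsm, bool is_nvidia) {
    const int tiles_nwaves = (ntiles_dst + nsm - 1) / nsm;
    const int tiles_efficiency_percent = 100 * ntiles_dst / (nsm * tiles_nwaves);
    // If tiling is efficient enough, launch exactly one block per tile and skip the fixup pass.
    const int nblocks = (is_nvidia && tiles_efficiency_percent >= 90) ? ntiles_dst : nsm;
    return { nblocks, ntiles_dst % nblocks != 0 };
}
int main() {
    const LaunchShape s = choose_stream_k_shape(/*ntiles_dst=*/300, /*nsm=*/132, /*is_nvidia=*/true);
    std::printf("nblocks=%d fixup=%d\n", s.nblocks, (int) s.fixup_needed);
}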

View File

@@ -115,7 +115,6 @@ static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_pascal_older(gg
case GGML_TYPE_IQ4_NL: return 6;
case GGML_TYPE_IQ4_XS: return 5;
case GGML_TYPE_MXFP4: return 4;
case GGML_TYPE_NVFP4: return 4;
case GGML_TYPE_Q2_K: return 4;
case GGML_TYPE_Q3_K: return 4;
case GGML_TYPE_Q4_0: return 6;
@@ -136,7 +135,6 @@ static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_turing_plus(ggm
case GGML_TYPE_IQ3_S: return 6;
case GGML_TYPE_IQ3_XXS: return 7;
case GGML_TYPE_MXFP4: return 7;
case GGML_TYPE_NVFP4: return 8;
case GGML_TYPE_Q2_K: return 7;
case GGML_TYPE_Q3_K: return 5;
default: return MMVQ_MAX_BATCH_SIZE;
@@ -223,7 +221,6 @@ static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_rdna4(ggml_type
case GGML_TYPE_IQ4_NL: return 7;
case GGML_TYPE_IQ4_XS: return 5;
case GGML_TYPE_MXFP4: return 5;
case GGML_TYPE_NVFP4: return 5;
case GGML_TYPE_Q3_K: return 4;
case GGML_TYPE_Q4_0: return 7;
case GGML_TYPE_Q4_1: return 7;

View File

@@ -70,102 +70,6 @@ __device__ __forceinline__ uint8_t compute_e8m0_scale(float amax) {
return static_cast<uint8_t>(biased);
}
static __global__ void quantize_mmq_nvfp4(
const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy,
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t ne0, const int64_t ne1, const int64_t ne2) {
#if defined(BLACKWELL_MMA_AVAILABLE)
const int64_t i0_base = ((int64_t) blockDim.x * blockIdx.y + threadIdx.x) * QK_NVFP4_SUB;
if (i0_base >= ne0) {
return;
}
const int64_t i1 = blockIdx.x;
const int64_t i2 = blockIdx.z % ne2;
const int64_t i3 = blockIdx.z / ne2;
const int64_t i01 = ids ? ids[i1] : i1;
const int64_t k_block = i0_base / QK_K;
const int64_t blocks_per_col = (ne0 + QK_K - 1) / QK_K;
if (k_block >= blocks_per_col) {
return;
}
const int64_t ib = blockIdx.z * ((int64_t) blocks_per_col * ne1) + k_block * ne1 + blockIdx.x;
block_fp4_mmq * y = (block_fp4_mmq *) vy;
block_fp4_mmq * yb = y + ib;
const int sub = (i0_base % QK_K) / QK_NVFP4_SUB;
float vals_raw[QK_NVFP4_SUB];
float amax_raw = 0.0f;
const int64_t base_idx = i3 * s03 + i2 * s02 + i01 * s01;
#pragma unroll
for (int k = 0; k < QK_NVFP4_SUB; k++) {
const int64_t i00 = i0_base + k;
if (i00 < ne00) {
const float v = x[base_idx + i00];
vals_raw[k] = v;
amax_raw = fmaxf(amax_raw, fabsf(v));
} else {
vals_raw[k] = 0.0f;
}
}
static constexpr int test_offsets[5] = { 0, -1, 1, -2, 2};
const int first_fp8_code = (int) ggml_cuda_fp32_to_ue4m3(amax_raw / 6.0f);
float best_err = FLT_MAX;
uint8_t fp8_code = 0;
float subblock_scale = 0.0f;
#pragma unroll // Check +/- 2 to find best code to reduce NVFP4 activation loss. Negligible overhead on Blackwell.
for (int i = 0; i < 5; i++) {
const int test_code = first_fp8_code + test_offsets[i];
if (test_code < 0 || test_code > 0x7e) {
continue;
}
const uint8_t code = (uint8_t) test_code;
const float test_scale = ggml_cuda_ue4m3_to_fp32(code);
const float test_inv_scale = test_scale > 0.0f ? 0.5f / test_scale : 0.0f;
float cur_err = 0.0f;
#pragma unroll
for (int k = 0; k < QK_NVFP4_SUB; ++k) {
const float v = vals_raw[k];
const uint8_t q = ggml_cuda_float_to_fp4_e2m1(v, test_inv_scale);
const float err_diff = fabsf(v) - fabsf(kvalues_mxfp4[q & 0x7]) * test_scale;
cur_err = fmaf(err_diff, err_diff, cur_err);
}
if (cur_err < best_err) {
best_err = cur_err;
fp8_code = test_code;
subblock_scale = test_scale;
}
}
const float inv_scale = subblock_scale > 0.0f ? 0.5f / subblock_scale : 0.0f;
uint32_t q0 = 0;
uint32_t q1 = 0;
#pragma unroll // this is faster than the previous __nv_fp4x4_e2m1
for (int k = 0; k < QK_NVFP4_SUB / 4; ++k) {
q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 0], inv_scale) << (8 * k);
q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 8], inv_scale) << (8 * k + 4);
q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 4], inv_scale) << (8 * k);
q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 12], inv_scale) << (8 * k + 4);
}
uint32_t * yqs = reinterpret_cast<uint32_t *>(yb->qs);
yqs[2 * sub + 0] = q0;
yqs[2 * sub + 1] = q1;
reinterpret_cast<uint8_t *>(yb->d4)[sub] = fp8_code;
#else
NO_DEVICE_CODE; // This is for Blackwell NVFP4 activations only.
#endif // defined(BLACKWELL_MMA_AVAILABLE)
}
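The kernel above chooses the ue4m3 sub-block scale by testing the initial code and its +/-2 neighbours and keeping the candidate with the smallest squared reconstruction error. Below is a hedged host-side sketch of that search over a 16-value sub-block, treating candidate scales as plain floats and using a nearest-value e2m1 quantizer instead of the kernel's fp4 conversion helpers; all names are illustrative.
// Hedged host-side sketch of the +/-2 scale-candidate search, with candidate scales
// supplied as plain floats (the kernel enumerates ue4m3 codes instead).
#include <cmath>
#include <cstdio>
static const float kE2M1[8] = {0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f};
// Nearest-value e2m1 quantizer (magnitudes only).
static float nearest_e2m1(float mag) {
    float best   = kE2M1[0];
    float best_d = std::fabs(mag - best);
    for (int i = 1; i < 8; ++i) {
        const float d = std::fabs(mag - kE2M1[i]);
        if (d < best_d) { best_d = d; best = kE2M1[i]; }
    }
    return best;
}
// Pick the candidate scale with the smallest squared reconstruction error over one 16-value sub-block.
static float pick_subblock_scale(const float vals[16], const float * candidates, int n_candidates) {
    float best_scale = candidates[0];
    float best_err   = 1e30f;
    for (int c = 0; c < n_candidates; ++c) {
        const float s = candidates[c];
        float err = 0.0f;
        for (int k = 0; k < 16; ++k) {
            const float mag = std::fabs(vals[k]);
            const float rec = (s > 0.0f) ? nearest_e2m1(mag / s) * s : 0.0f;
            const float d   = mag - rec;
            err += d * d;
        }
        if (err < best_err) { best_err = err; best_scale = s; }
    }
    return best_scale;
}
int main() {
    float vals[16]  = {0.1f, 0.4f, 1.2f, 5.9f};   // remaining values are zero
    float cands[3]  = {0.5f, 1.0f, 2.0f};
    std::printf("best scale = %f\n", pick_subblock_scale(vals, cands, 3));
}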
// quantize values into the interleaved-nibble layout in which mxfp4 is stored,
// i.e. a block a0-a31 is represented as a0a16, a1a17, ..., a15a31
static __global__ void quantize_mmq_mxfp4(const float * __restrict__ x,
@@ -412,32 +316,28 @@ void quantize_mmq_q8_1_cuda(
}
}
void quantize_mmq_fp4_cuda(
const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
GGML_ASSERT(type_src0 == GGML_TYPE_MXFP4 || type_src0 == GGML_TYPE_NVFP4);
GGML_ASSERT(ne0 > 0);
void quantize_mmq_mxfp4_cuda(const float * x,
const int32_t * ids,
void * vy,
[[maybe_unused]] const ggml_type type_src0,
const int64_t ne00,
const int64_t s01,
const int64_t s02,
const int64_t s03,
const int64_t ne0,
const int64_t ne1,
const int64_t ne2,
const int64_t ne3,
cudaStream_t stream) {
GGML_ASSERT(ne0 % (2 * QK_MXFP4) == 0);
if (type_src0 == GGML_TYPE_NVFP4) {
GGML_ASSERT(ne00 % QK_NVFP4 == 0);
constexpr int nvfp4_block_size = 128;
const int64_t block_num_y = (ne0 + QK_NVFP4_SUB * nvfp4_block_size - 1) / (QK_NVFP4_SUB * nvfp4_block_size);
const dim3 block_size(nvfp4_block_size, 1, 1);
const dim3 num_blocks(ne1, block_num_y, ne2 * ne3);
quantize_mmq_nvfp4<<<num_blocks, block_size, 0, stream>>>(
x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
} else {
GGML_ASSERT(ne0 % (2 * QK_MXFP4) == 0);
constexpr int nwarps = 8;
constexpr int vals_per_warp = 2 * QK_MXFP4;
constexpr int vals_per_block = nwarps * vals_per_warp;
constexpr int nwarps = 8;
constexpr int vals_per_warp = 2 * QK_MXFP4;
constexpr int vals_per_block = nwarps * vals_per_warp;
const int64_t block_num_y = (ne0 + vals_per_block - 1) / vals_per_block;
const dim3 num_blocks(ne1, block_num_y, ne2 * ne3);
const dim3 block_size(WARP_SIZE, nwarps, 1);
const int64_t block_num_y = (ne0 + vals_per_block - 1) / vals_per_block;
const dim3 num_blocks(ne1, block_num_y, ne2 * ne3);
const dim3 block_size(WARP_SIZE, nwarps, 1);
quantize_mmq_mxfp4<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
}
quantize_mmq_mxfp4<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
}

View File

@@ -26,7 +26,7 @@ void quantize_mmq_q8_1_cuda(
ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
void quantize_mmq_fp4_cuda(const float * x,
void quantize_mmq_mxfp4_cuda(const float * x,
const int32_t * ids,
void * vy,
ggml_type type_src0,

View File

@@ -3,7 +3,6 @@
template <bool apply_silu, size_t split_d_inner, size_t d_conv>
static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float * __restrict__ src1,
const float * __restrict__ bias,
const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1,
float * __restrict__ dst, const int dst_nb0, const int dst_nb1, const int dst_nb2,
const int64_t n_t) {
@@ -28,8 +27,6 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float
w[j] = w_block[tid * stride_w + j];
}
float b = bias != nullptr ? bias[bidy * split_d_inner + tid] : 0.0f;
for (int64_t i = 0; i < n_t; i++) {
float sumf = 0.0f;
@@ -45,14 +42,12 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float
for (size_t j = 0; j < d_conv; j++) {
sumf += x[(i + j) % d_conv] * w[j];
}
sumf += b;
y_block[i * stride_y + tid] = apply_silu ? ggml_cuda_op_silu_single(sumf) : sumf;
}
}
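Per channel, the kernel computes a short causal convolution over a d_conv-tap window for each time step, optionally adding the fused bias and applying SiLU. A hedged scalar reference follows, assuming x holds the d_conv-1 state samples followed by the n_t new tokens; names are illustrative.
// Hedged scalar reference for one channel of the short convolution, with optional
// fused bias and SiLU (mirrors the per-thread loop above, without the shared-memory staging).
#include <cmath>
#include <cstdio>
static float silu(float x) { return x / (1.0f + std::exp(-x)); }
// x has n_t + d_conv - 1 samples (the conv state prepended to the new tokens);
// w holds the d_conv filter taps for this channel.
static void ssm_conv_channel_ref(const float * x, const float * w, int d_conv,
                                 float bias, bool apply_silu, float * y, int n_t) {
    for (int i = 0; i < n_t; ++i) {
        float sumf = 0.0f;
        for (int j = 0; j < d_conv; ++j) {
            sumf += x[i + j] * w[j];
        }
        sumf += bias;
        y[i] = apply_silu ? silu(sumf) : sumf;
    }
}
int main() {
    const float x[6] = {1, 2, 3, 4, 5, 6};   // 3 state samples + 3 new tokens (d_conv = 4)
    const float w[4] = {0.25f, 0.25f, 0.25f, 0.25f};
    float y[3];
    ssm_conv_channel_ref(x, w, 4, 0.0f, false, y, 3);
    std::printf("%f %f %f\n", y[0], y[1], y[2]);
}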
template <bool apply_silu, size_t split_d_inner, size_t d_conv, int64_t split_n_t>
static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0, const float * __restrict__ src1,
const float * __restrict__ bias,
const int src0_nb0, const int src0_nb1, const int src0_nb2,
const int src1_nb1, float * __restrict__ dst, const int dst_nb0,
const int dst_nb1, const int dst_nb2, const int64_t n_t) {
@@ -102,8 +97,6 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0,
w[j] = w_block[tid * stride_w + j];
}
float b = bias != nullptr ? bias[bidy * split_d_inner + tid] : 0.0f;
// Compute from shared memory
for (int64_t i = 0; i < local_n_t; i++) {
float sumf = 0.0f;
@@ -111,13 +104,12 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0,
for (size_t j = 0; j < d_conv; j++) {
sumf += smem[tid * n_cols + i + j] * w[j];
}
sumf += b;
y_block[i * stride_y + tid] = apply_silu ? ggml_cuda_op_silu_single(sumf) : sumf;
}
}
template <bool apply_silu>
static void ssm_conv_f32_cuda(const float * src0, const float * src1, const float * bias, const int src0_nb0, const int src0_nb1,
static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int src0_nb0, const int src0_nb1,
const int src0_nb2, const int src1_nb1, float * dst, const int dst_nb0, const int dst_nb1,
const int dst_nb2, const int64_t nc, const int64_t nr, const int64_t n_t,
const int64_t n_s, cudaStream_t stream) {
@@ -128,14 +120,14 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const floa
constexpr int kNC = decltype(NC)::value;
if (n_t <= 32) {
const dim3 blocks(n_s, (nr + threads - 1) / threads, 1);
ssm_conv_f32<apply_silu, threads, kNC><<<blocks, threads, 0, stream>>>(src0, src1, bias, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
ssm_conv_f32<apply_silu, threads, kNC><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
dst, dst_nb0, dst_nb1, dst_nb2, n_t);
} else {
const int64_t split_n_t = 32;
dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
const size_t smem_size = threads * (kNC - 1 + split_n_t) * sizeof(float);
ssm_conv_long_token_f32<apply_silu, threads, kNC, split_n_t><<<blocks, threads, smem_size, stream>>>(
src0, src1, bias, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
}
};
@@ -148,18 +140,11 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const floa
}
}
void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * bias_add_node, ggml_tensor * silu_dst) {
void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * silu_dst) {
const struct ggml_tensor * src0 = dst->src[0]; // conv_x
const struct ggml_tensor * src1 = dst->src[1]; // conv1d.weight
const bool fuse_bias = bias_add_node != nullptr;
const bool fuse_silu = silu_dst != nullptr;
// bias always comes with silu.
GGML_ASSERT(!fuse_bias || fuse_silu);
// The bias (when fused) is the non-conv operand of the ADD node.
const struct ggml_tensor * bias = fuse_bias ? (bias_add_node->src[0] == dst ? bias_add_node->src[1] : bias_add_node->src[0]) : nullptr;
// When fusing, write to silu_dst (the node downstream references).
const struct ggml_tensor * out = fuse_silu ? silu_dst : dst;
@@ -175,23 +160,16 @@ void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst, g
const float * src0_d = (const float *) src0->data;
const float * src1_d = (const float *) src1->data;
const float * bias_d = fuse_bias ? (const float *) bias->data : nullptr;
float * dst_d = (float *) out->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(out->type == GGML_TYPE_F32);
if (fuse_bias) {
GGML_ASSERT(bias->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(bias));
GGML_ASSERT(ggml_nelements(bias) == nr);
}
if (fuse_silu) {
ssm_conv_f32_cuda<true>(src0_d, src1_d, bias_d, src0->nb[0], src0->nb[1], src0->nb[2], src1->nb[1], dst_d, out->nb[0], out->nb[1],
ssm_conv_f32_cuda<true>(src0_d, src1_d, src0->nb[0], src0->nb[1], src0->nb[2], src1->nb[1], dst_d, out->nb[0], out->nb[1],
out->nb[2], nc, nr, n_t, n_s, stream);
} else {
ssm_conv_f32_cuda<false>(src0_d, src1_d, bias_d, src0->nb[0], src0->nb[1], src0->nb[2], src1->nb[1], dst_d, out->nb[0], out->nb[1],
ssm_conv_f32_cuda<false>(src0_d, src1_d, src0->nb[0], src0->nb[1], src0->nb[2], src1->nb[1], dst_d, out->nb[0], out->nb[1],
out->nb[2], nc, nr, n_t, n_s, stream);
}
}

View File

@@ -1,3 +1,3 @@
#include "common.cuh"
void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * bias_add_node = nullptr, ggml_tensor * silu_dst = nullptr);
void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * silu_dst = nullptr);

View File

@@ -2,5 +2,4 @@
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE(320, 256, 1, 32);
DECL_FATTN_MMA_F16_CASE(576, 512, 1, 32);

View File

@@ -2,5 +2,4 @@
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE(320, 256, 2, 32);
DECL_FATTN_MMA_F16_CASE(576, 512, 2, 32);

View File

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-tile.cuh"
DECL_FATTN_TILE_CASE(320, 256);

View File

@@ -3,7 +3,7 @@
from glob import glob
import os
HEAD_SIZES_KQ = [40, 64, 72, 80, 96, 112, 128, 256, 320, 512, 576]
HEAD_SIZES_KQ = [40, 64, 72, 80, 96, 112, 128, 256, 512, 576]
TYPES_KV = ["GGML_TYPE_F16", "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_BF16"]
@@ -62,7 +62,7 @@ for filename in glob("*.cu"):
os.remove(filename)
for head_size_kq in HEAD_SIZES_KQ:
head_size_v = 256 if head_size_kq == 320 else (head_size_kq if head_size_kq != 576 else 512)
head_size_v = head_size_kq if head_size_kq != 576 else 512
with open(f"fattn-tile-instance-dkq{head_size_kq}-dv{head_size_v}.cu", "w") as f:
f.write(SOURCE_FATTN_TILE.format(head_size_kq=head_size_kq, head_size_v=head_size_v))
@@ -84,16 +84,13 @@ for ncols in [8, 16, 32, 64]:
continue
if head_size_kq == 72:
continue
# Skip compilation of unused ncols2 values for niche head sizes:
if head_size_kq == 320 and ncols2 != 32: # Mistral Small 4
if head_size_kq == 512 and ncols2 not in (4, 8):
continue
if head_size_kq == 512 and ncols2 not in (4, 8): # Gemma 4
if head_size_kq != 576 and ncols2 in (16, 32):
continue
if head_size_kq == 576 and ncols2 not in (4, 16, 32): # Deepseek, GLM 4.7 Flash
if head_size_kq == 576 and ncols2 not in (4, 16, 32):
continue
if head_size_kq not in (320, 576) and ncols2 in (16, 32):
continue
head_size_v = 256 if head_size_kq == 320 else (head_size_kq if head_size_kq != 576 else 512)
head_size_v = head_size_kq if head_size_kq != 576 else 512
f.write(SOURCE_FATTN_MMA_CASE.format(ncols1=ncols1, ncols2=ncols2, head_size_kq=head_size_kq, head_size_v=head_size_v))
for type in TYPES_MMQ:

View File

@@ -65,11 +65,6 @@ static __device__ __forceinline__ float op_sqr(float x) {
return x * x;
}
static __device__ __forceinline__ float op_relu_sqr(float x) {
const float r = fmaxf(x, 0.0f);
return r * r;
}
static __device__ __forceinline__ float op_sqrt(float x) {
return sqrtf(x);
}
@@ -620,21 +615,3 @@ void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary
GGML_ABORT("Unsupported unary op for fused unary+mul");
}
}
/* fused relu + sqr */
void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node) {
const ggml_tensor * src = relu_node->src[0];
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src));
GGML_ASSERT(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16);
GGML_ASSERT(src->type == sqr_node->type);
const int k = ggml_nelements(src);
if (src->type == GGML_TYPE_F16) {
unary_cuda<op_relu_sqr>((const half *)src->data, (half *)sqr_node->data, k, stream);
} else {
unary_cuda<op_relu_sqr>((const float *)src->data, (float *)sqr_node->data, k, stream);
}
}

View File

@@ -91,8 +91,6 @@ void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary_node, ggml_tensor * mul_node);
void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node);
__device__ __forceinline__ float ggml_cuda_op_silu_single(float x) {
return x / (1.0f + expf(-x));
}

View File

@@ -58,7 +58,6 @@
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorMemoryAllocation hipErrorOutOfMemory
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags hipEventCreateWithFlags

Some files were not shown because too many files have changed in this diff.