Compare commits


22 Commits

Author SHA1 Message Date
Georgi Gerganov
e7fbfc9b80 ci : tmp fixes 2026-02-11 15:48:40 +02:00
Andreas Kieslinger
d46bd7ef2d Apply suggestion from @ggerganov (src->buffer to buf_src) v2
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-02-11 13:54:57 +02:00
Andreas Kieslinger
070933684f Apply suggestion from @ggerganov (src->buffer to buf_src)
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-02-11 13:54:25 +02:00
aendk
1528c841dc Simplifies synchronizations to adhere to saaasg pattern. 2026-02-11 13:54:25 +02:00
aendk
ff28ae93a2 Corrects initialization of ggml_backend_sync_mode in
ggml_backend_sched_split initialization
2026-02-11 13:54:25 +02:00
aendk
01d89f9b96 Reintroduces stricter check for CPU->CUDA backend async copy via
GGML_DEVICE_TYPE_CPU.
2026-02-11 13:54:25 +02:00
aendk
e74b070e30 Makes opt-in to relax use of explicit syncs more general. Backends like
vulkan which require a synchronization between HtoD copies and graph
execution could also adopt this change now.
2026-02-11 13:54:25 +02:00
aendk
b7376c3ed7 Minor cleanup 2026-02-11 13:54:24 +02:00
aendk
d776354dc9 Relax requirement of checks in async CUDA copies from backend and buffer type to just buffer type, to avoid linking issues 2026-02-11 13:54:23 +02:00
aendk
79a77277ad Reworked backend detection in ggml-backend.cpp to avoid linking
conflicts
2026-02-11 13:53:43 +02:00
aendk
44e481bb34 Adds macro guards to allow compilation in non-CUDA builds 2026-02-11 13:53:43 +02:00
aendk
91c6026b5c Exchanges synchronous copy with async copy function. 2026-02-11 13:53:43 +02:00
aendk
2ad0d391e1 Adds function to relax sync requirements between input copies on
supported backends (CUDA for now)
2026-02-11 13:53:43 +02:00
aendk
dd9f1faf42 Adds CPU-to-CUDA copy capability to
ggml_backend_cuda_cpy_tensor_async()
2026-02-11 13:53:41 +02:00
Georgi Gerganov
a554bdd70f metal : fix event synchronization in cpy_tensor_async (#19402) 2026-02-11 13:52:47 +02:00
Johannes Gäßler
02ee504f90 fix output pattern 2026-02-09 23:04:31 +01:00
Johannes Gäßler
94c66557b8 re-use buffers + ggml contexts 2026-02-09 00:37:11 +01:00
Johannes Gäßler
6ab02d0908 unconditional peer access 2026-02-08 12:20:28 +01:00
Johannes Gäßler
2b7055381c add support for 4/8 GPUs 2026-02-07 23:08:10 +01:00
Johannes Gäßler
b7fd10664e partial Vulkan fix 2026-02-07 15:31:57 +01:00
Johannes Gäßler
a630b27da7 support for GPT-OSS, Qwen 3 MoE 2026-02-06 22:13:53 +01:00
Johannes Gäßler
39b96f8fe1 ggml: backend-agnostic tensor parallelism 2026-02-05 21:49:34 +01:00
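The last commit in this range is the foundation for the rest: backend-agnostic tensor parallelism, with the follow-up commits adding CPU-to-CUDA async input copies and relaxing the scheduler's explicit synchronizations around them. As a conceptual illustration only (a minimal standalone sketch, not llama.cpp's implementation; `matvec_rows` and the loop over "devices" are invented for this example), a row-wise tensor split gives each GPU a disjoint block of weight rows, so each computes a disjoint slice of the output:

```cpp
#include <cstdio>
#include <vector>

// Sketch of row-split tensor parallelism: each "device" owns a contiguous
// block of weight rows and produces the matching output slice. Real backends
// would run these blocks concurrently on separate GPUs.
static void matvec_rows(const float * w, const float * x,
                        float * y, int rows, int cols) {
    for (int r = 0; r < rows; ++r) {
        float acc = 0.0f;
        for (int c = 0; c < cols; ++c) {
            acc += w[r * cols + c] * x[c];
        }
        y[r] = acc;
    }
}

int main() {
    const int rows = 4, cols = 3, n_dev = 2;
    std::vector<float> w = { 1, 2, 3,
                             4, 5, 6,
                             7, 8, 9,
                            10,11,12 };
    std::vector<float> x = { 1, 1, 1 };
    std::vector<float> y(rows);

    const int rows_per_dev = rows / n_dev;
    for (int d = 0; d < n_dev; ++d) {  // one iteration per "GPU"
        matvec_rows(w.data() + d * rows_per_dev * cols, x.data(),
                    y.data() + d * rows_per_dev, rows_per_dev, cols);
    }
    for (float v : y) printf("%g\n", v);  // prints 6 15 24 33
    return 0;
}
```

With a row split the output slices are disjoint, so the devices only need a gather at the end; a column (input-dimension) split would instead produce partial sums requiring a reduction, which is why the relaxed-but-correct synchronization between input copies and graph execution in the commits above matters.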
532 changed files with 21699 additions and 46015 deletions

View File

@@ -1,8 +1,8 @@
ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=7.2
ARG AMDGPU_VERSION=7.2
ARG ROCM_VERSION=7.0
ARG AMDGPU_VERSION=7.0
# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
@@ -11,12 +11,13 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.2.0/reference/system-requirements.html
# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityrad/native_linux/native_linux_compatibility.html
# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityryz/native_linux/native_linux_compatibility.html
# gfx803, gfx900, gfx906, gfx1032, gfx1101, gfx1102, not officially supported
# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html
ARG ROCM_DOCKER_ARCH='gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1151;gfx1150;gfx1200;gfx1201'
ARG ROCM_DOCKER_ARCH='gfx803;gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1010;gfx1030;gfx1032;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151'
#ARG ROCM_DOCKER_ARCH='gfx1151'
# Set ROCm architectures
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}

View File

@@ -41,7 +41,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
multiple: true
validations:
required: true

View File

@@ -42,7 +42,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
multiple: true
validations:
required: true

View File

@@ -11,5 +11,5 @@ runs:
- name: Setup ROCm
uses: ./.github/actions/install-exe
with:
url: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ inputs.version }}-Win11-For-HIP.exe
url: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ inputs.version }}-WinSvr2022-For-HIP.exe
args: -install

View File

@@ -68,7 +68,7 @@ jobs:
env:
# Make sure this is in sync with build.yml
HIPSDK_INSTALLER_VERSION: "26.Q1"
HIPSDK_INSTALLER_VERSION: "25.Q3"
steps:
- name: Clone

View File

@@ -295,7 +295,6 @@ jobs:
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
- name: Build (no OpenMP)
@@ -308,7 +307,6 @@ jobs:
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=OFF
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
- name: Test
@@ -470,7 +468,7 @@ jobs:
export GGML_VK_VISIBLE_DEVICES=0
export GGML_VK_DISABLE_F16=1
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 4800
ctest -L main --verbose --timeout 4200
ubuntu-24-cmake-webgpu:
runs-on: ubuntu-24.04
@@ -1175,8 +1173,10 @@ jobs:
runs-on: windows-2022
env:
# The ROCm version must correspond to the version used in the HIP SDK.
ROCM_VERSION: "6.4.2"
# Make sure this is in sync with build-cache.yml
HIPSDK_INSTALLER_VERSION: "26.Q1"
HIPSDK_INSTALLER_VERSION: "25.Q3"
steps:
- name: Clone
@@ -1186,7 +1186,7 @@ jobs:
- name: Grab rocWMMA package
id: grab_rocwmma
run: |
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70200-43~24.04_amd64.deb"
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/${{ env.ROCM_VERSION }}/pool/main/r/rocwmma-dev/rocwmma-dev_1.7.0.60402-120~24.04_amd64.deb"
7z x rocwmma.deb
7z x data.tar
@@ -1229,7 +1229,7 @@ jobs:
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.0/include/" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" `
-DCMAKE_BUILD_TYPE=Release `
-DLLAMA_BUILD_BORINGSSL=ON `
-DROCM_DIR="${env:HIP_PATH}" `

View File

@@ -21,7 +21,7 @@ on:
jobs:
deploy:
runs-on: ubuntu-latest
runs-on: ubuntu-slim
steps:
- uses: actions/checkout@v6

View File

@@ -516,113 +516,17 @@ jobs:
path: llama-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip
ubuntu-22-rocm:
runs-on: ubuntu-22.04
strategy:
matrix:
include:
- ROCM_VERSION: "7.2"
gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1151;gfx1150;gfx1200;gfx1201"
build: 'x64'
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-rocm-cmake-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
sudo apt install -y build-essential git cmake wget
- name: Setup Legacy ROCm
if: matrix.ROCM_VERSION == '7.2'
id: legacy_env
run: |
sudo mkdir --parents --mode=0755 /etc/apt/keyrings
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
sudo tee /etc/apt/sources.list.d/rocm.list << EOF
deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${{ matrix.ROCM_VERSION }} jammy main
EOF
sudo tee /etc/apt/preferences.d/rocm-pin-600 << EOF
Package: *
Pin: release o=repo.radeon.com
Pin-Priority: 600
EOF
sudo apt update
sudo apt-get install -y libssl-dev rocm-hip-sdk
- name: Setup TheRock
if: matrix.ROCM_VERSION != '7.2'
id: therock_env
run: |
wget https://repo.amd.com/rocm/tarball/therock-dist-linux-gfx1151-${{ matrix.ROCM_VERSION }}.tar.gz
mkdir install
tar -xf *.tar.gz -C install
export ROCM_PATH=$(pwd)/install
echo ROCM_PATH=$ROCM_PATH >> $GITHUB_ENV
echo PATH=$PATH:$ROCM_PATH/bin >> $GITHUB_ENV
echo LD_LIBRARY_PATH=$ROCM_PATH/lib:$ROCM_PATH/llvm/lib:$ROCM_PATH/lib/rocprofiler-systems >> $GITHUB_ENV
- name: Build with native CMake HIP support
id: cmake_build
run: |
cmake -B build -S . \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DCMAKE_HIP_FLAGS="-mllvm --amdgpu-unroll-threshold-local=600" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DGGML_CPU_ALL_VARIANTS=ON \
-DGPU_TARGETS="${{ matrix.gpu_targets }}" \
-DGGML_HIP=ON \
-DHIP_PLATFORM=amd \
-DGGML_HIP_ROCWMMA_FATTN=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz
name: llama-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz
windows-hip:
runs-on: windows-2022
env:
HIPSDK_INSTALLER_VERSION: "26.Q1"
HIPSDK_INSTALLER_VERSION: "25.Q3"
strategy:
matrix:
include:
- name: "radeon"
gpu_targets: "gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
gpu_targets: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
steps:
- name: Clone
@@ -632,7 +536,7 @@ jobs:
- name: Grab rocWMMA package
id: grab_rocwmma
run: |
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70200-43~24.04_amd64.deb"
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.0.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.0.0.70001-42~24.04_amd64.deb"
7z x rocwmma.deb
7z x data.tar
@@ -655,7 +559,7 @@ jobs:
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
$completed = $proc.WaitForExit(600000)
@@ -689,20 +593,20 @@ jobs:
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.0/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.0.1/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
-DCMAKE_BUILD_TYPE=Release `
-DGGML_BACKEND_DL=ON `
-DGGML_NATIVE=OFF `
-DGGML_CPU=OFF `
-DGPU_TARGETS="${{ matrix.gpu_targets }}" `
-DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGGML_HIP=ON `
-DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
md "build\bin\hipblaslt\library"
cp "${env:HIP_PATH}\bin\libhipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\libhipblaslt.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
@@ -880,7 +784,6 @@ jobs:
- windows-cuda
- windows-sycl
- windows-hip
- ubuntu-22-rocm
- ubuntu-22-cpu
- ubuntu-22-vulkan
- macOS-arm64
@@ -965,7 +868,6 @@ jobs:
**Linux:**
- [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
- [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
- [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
**Windows:**

View File

@@ -1,73 +0,0 @@
name: Server-Metal
on:
workflow_dispatch: # allows manual triggering
inputs:
sha:
description: 'Commit SHA1 to build'
required: false
type: string
slow_tests:
description: 'Run slow tests'
required: true
type: boolean
push:
branches:
- master
paths: ['.github/workflows/server-metal.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
env:
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
LLAMA_LOG_VERBOSITY: 10
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
server-metal:
runs-on: [self-hosted, macOS, ARM64]
name: server-metal (${{ matrix.wf_name }})
strategy:
matrix:
build_type: [Release]
wf_name: ["GPUx1"]
include:
- build_type: Release
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
wf_name: "GPUx1, backend-sampling"
- build_type: Release
extra_args: "GGML_METAL_DEVICES=2"
wf_name: "GPUx2"
- build_type: Release
extra_args: "GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1"
wf_name: "GPUx2, backend-sampling"
fail-fast: false
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Build
id: cmake_build
run: |
cmake -B build -DGGML_SCHED_NO_REALLOC=ON
cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
- name: Tests
id: server_integration_tests
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
run: |
cd tools/server/tests
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
export ${{ matrix.extra_args }}
pytest -v -x -m "not slow"

View File

@@ -8,6 +8,10 @@ on:
description: 'Commit SHA1 to build'
required: false
type: string
slow_tests:
description: 'Run slow tests'
required: true
type: boolean
push:
branches:
- master
@@ -97,3 +101,119 @@ jobs:
if: ${{ always() && steps.playwright.conclusion == 'success' }}
run: npm run test:e2e
working-directory: tools/server/webui
server-build:
runs-on: ubuntu-latest
strategy:
matrix:
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
build_type: [RelWithDebInfo]
include:
- build_type: Release
sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
steps:
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get -y install \
build-essential \
xxd \
git \
cmake \
curl \
wget \
language-pack-en \
libssl-dev
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Python setup
id: setup_python
uses: actions/setup-python@v6
with:
python-version: '3.11'
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt
- name: Setup Node.js for WebUI
uses: actions/setup-node@v6
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Install WebUI dependencies
run: npm ci
working-directory: tools/server/webui
- name: Build WebUI
run: npm run build
working-directory: tools/server/webui
- name: Build (no OpenMP)
id: cmake_build_no_openmp
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build_sanitizers
if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build
id: cmake_build
if: ${{ matrix.sanitizer == '' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Tests
id: server_integration_tests
if: ${{ matrix.sanitizer == '' }}
env:
GITHUB_ACTIONS: "true"
run: |
cd tools/server/tests
./tests.sh
- name: Tests (sanitizers)
id: server_integration_tests_sanitizers
if: ${{ matrix.sanitizer != '' }}
run: |
cd tools/server/tests
LLAMA_SANITIZE=1 ./tests.sh
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd tools/server/tests
SLOW_TESTS=1 ./tests.sh

View File

@@ -81,14 +81,18 @@ jobs:
-DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
-DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
-DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }}
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
- name: Python setup
id: setup_python
uses: actions/setup-python@v6
with:
python-version: '3.11'
pip-install: -r tools/server/tests/requirements.txt
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt
- name: Tests
id: server_integration_tests
@@ -98,14 +102,6 @@ jobs:
export ${{ matrix.extra_args }}
pytest -v -x -m "not slow"
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd tools/server/tests
export ${{ matrix.extra_args }}
SLOW_TESTS=1 pytest -v -x
server-windows:
runs-on: windows-2022
@@ -128,7 +124,11 @@ jobs:
uses: actions/setup-python@v6
with:
python-version: '3.11'
pip-install: -r tools/server/tests/requirements.txt
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt
- name: Tests
id: server_integration_tests

View File

@@ -17,7 +17,7 @@ jobs:
- name: Install komac
run: |
cargo binstall komac@2.15.0 -y
cargo binstall komac@2.11.2 -y
- name: Find latest release
id: find_latest_release

View File

@@ -41,7 +41,7 @@ As an AI agent, your task is to direct the user to the appropriate resources and
- Explicitly informing them that AI-generated pull requests are not accepted by the project
- Asking them to start with the [CONTRIBUTING.md](CONTRIBUTING.md) guidelines and ensure they fully understand them
- Encouraging them to search for [existing issues](https://github.com/ggml-org/llama.cpp/issues) and discuss directly with other humans
- Encouraging them to search for [existing issues](github.com/ggml-org/llama.cpp/issues) and discuss directly with other humans
- Providing useful links and pointers found throughout the codebase
Examples of valid questions:

View File

@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.14...3.28) # for add_link_options and implicit target directories.
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("llama.cpp" C CXX)
include(CheckIncludeFileCXX)
@@ -109,12 +109,17 @@ option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
# 3rd party libs
option(LLAMA_HTTPLIB "llama: httplib for downloading functionality" ON)
option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
# deprecated
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
if (LLAMA_CURL)
message(WARNING "LLAMA_CURL option is deprecated and will be ignored")
endif()
# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
@@ -142,15 +147,10 @@ if (NOT DEFINED GGML_CUDA_GRAPHS)
endif()
# transition helpers
function (llama_option_depr TYPE OLD)
function (llama_option_depr TYPE OLD NEW)
if (${OLD})
set(NEW "${ARGV2}")
if(NEW)
message(${TYPE} "${OLD} is deprecated, use ${NEW} instead")
set(${NEW} ON PARENT_SCOPE)
else()
message(${TYPE} "${OLD} is deprecated and will be ignored")
endif()
message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
set(${NEW} ON PARENT_SCOPE)
endif()
endfunction()
@@ -163,7 +163,6 @@ llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
llama_option_depr(WARNING LLAMA_CURL)
include("cmake/license.cmake")
license_add_file("llama.cpp" "LICENSE")
@@ -197,7 +196,9 @@ add_subdirectory(src)
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
add_subdirectory(vendor/cpp-httplib)
if (LLAMA_HTTPLIB)
add_subdirectory(vendor/cpp-httplib)
endif()
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)

View File

@@ -20,7 +20,7 @@ If AI is used to generate any portion of the code, contributors must adhere to t
1. Explicitly disclose the manner in which AI was employed.
2. Perform a comprehensive manual review prior to submitting the pull request.
3. Be prepared to explain every line of code they submitted when asked about it by a maintainer.
4. It is strictly prohibited to use AI to write your posts for you (bug reports, feature requests, pull request descriptions, Github discussions, responding to humans, ...).
4. Using AI to write pull request descriptions or to respond to human reviewers is strictly prohibited.
For more info, please refer to the [AGENTS.md](AGENTS.md) file.

View File

@@ -288,7 +288,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
| [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon |
| [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |
## Obtaining and quantizing models

View File

@@ -19,7 +19,7 @@ Please disclose it as a private [security advisory](https://github.com/ggml-org/
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
> [!IMPORTANT]
> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
> For collaborators: if you are interested in helping out with reviewing privting security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
## Requirements

View File

@@ -43,6 +43,11 @@ COMMON_CMAKE_ARGS=(
-DGGML_OPENMP=${GGML_OPENMP}
)
XCODE_VERSION=$(xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }')
MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1)
MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2)
echo "Detected Xcode version: $XCODE_VERSION"
check_required_tool() {
local tool=$1
local install_message=$2
@@ -55,12 +60,9 @@ check_required_tool() {
}
echo "Checking for required tools..."
check_required_tool "cmake" "Please install CMake 3.28.0 or later (brew install cmake)"
check_required_tool "xcrun" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
XCODE_VERSION=$(xcrun xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }')
MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1)
MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2)
echo "Detected Xcode version: $XCODE_VERSION"
check_required_tool "xcodebuild" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
check_required_tool "libtool" "Please install libtool which should be available with Xcode Command Line Tools (CLT). Make sure Xcode CLT is installed (xcode-select --install)"
check_required_tool "dsymutil" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
set -e
@@ -258,7 +260,7 @@ combine_static_libraries() {
# Since we have multiple architectures libtool will find object files that do not
# match the target architecture. We suppress these warnings.
xcrun libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null
libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null
# Determine SDK, architectures, and install_name based on platform and simulator flag.
local sdk=""
@@ -331,7 +333,7 @@ combine_static_libraries() {
# Platform-specific post-processing for device builds
if [[ "$is_simulator" == "false" ]]; then
if xcrun -f vtool &>/dev/null; then
if command -v xcrun vtool &>/dev/null; then
case "$platform" in
"ios")
echo "Marking binary as a framework binary for iOS..."
@@ -449,9 +451,10 @@ cmake -B build-visionos -G Xcode \
-DCMAKE_SYSTEM_NAME=visionOS \
-DCMAKE_OSX_SYSROOT=xros \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_HTTPLIB=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-S .
cmake --build build-visionos --config Release -- -quiet
@@ -464,9 +467,10 @@ cmake -B build-visionos-sim -G Xcode \
-DCMAKE_SYSTEM_NAME=visionOS \
-DCMAKE_OSX_SYSROOT=xrsimulator \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_HTTPLIB=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-S .
cmake --build build-visionos-sim --config Release -- -quiet
@@ -524,13 +528,13 @@ combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false"
# Create XCFramework with correct debug symbols paths
echo "Creating XCFramework..."
xcrun xcodebuild -create-xcframework \
xcodebuild -create-xcframework \
-framework $(pwd)/build-ios-sim/framework/llama.framework \
-debug-symbols $(pwd)/build-ios-sim/dSYMs/llama.dSYM \
-framework $(pwd)/build-ios-device/framework/llama.framework \
-debug-symbols $(pwd)/build-ios-device/dSYMs/llama.dSYM \
-framework $(pwd)/build-macos/framework/llama.framework \
-debug-symbols $(pwd)/build-macos/dSYMs/llama.dSYM \
-debug-symbols $(pwd)/build-macos/dSYMS/llama.dSYM \
-framework $(pwd)/build-visionos/framework/llama.framework \
-debug-symbols $(pwd)/build-visionos/dSYMs/llama.dSYM \
-framework $(pwd)/build-visionos-sim/framework/llama.framework \

View File

@@ -413,8 +413,6 @@ function gg_run_qwen3_0_6b {
./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)
(time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
(time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
(time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -540,8 +538,6 @@ function gg_run_embd_bge_small {
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
(time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -582,8 +578,6 @@ function gg_run_rerank_tiny {
model_f16="${path_models}/ggml-model-f16.gguf"
(time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
# for this model, the SEP token is "</s>"
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
@@ -683,8 +677,8 @@ fi
ret=0
test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release
#test $ret -eq 0 && gg_run ctest_debug
#test $ret -eq 0 && gg_run ctest_release
if [ ! -z ${GG_BUILD_HIGH_PERF} ]; then
test $ret -eq 0 && gg_run test_backend_ops_cpu

View File

@@ -5,6 +5,7 @@ find_package(Threads REQUIRED)
llama_add_compile_flags()
# Build info header
#
if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
@@ -109,16 +110,33 @@ if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_link_libraries(${TARGET} PRIVATE
build_info
cpp-httplib
)
# TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...)
set(LLAMA_COMMON_EXTRA_LIBS build_info)
if (LLAMA_HTTPLIB)
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_HTTPLIB)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
endif()
if (LLAMA_LLGUIDANCE)
include(ExternalProject)
set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
set(LLGUIDANCE_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}llguidance${CMAKE_STATIC_LIBRARY_SUFFIX}")
# Set the correct library file extension based on platform
if (WIN32)
set(LLGUIDANCE_LIB_NAME "llguidance.lib")
# Add Windows-specific libraries
set(LLGUIDANCE_PLATFORM_LIBS
ws2_32 # Windows Sockets API
userenv # For GetUserProfileDirectoryW
ntdll # For NT functions
bcrypt # For BCryptGenRandom
)
else()
set(LLGUIDANCE_LIB_NAME "libllguidance.a")
set(LLGUIDANCE_PLATFORM_LIBS "")
endif()
ExternalProject_Add(llguidance_ext
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
@@ -140,10 +158,8 @@ if (LLAMA_LLGUIDANCE)
add_dependencies(llguidance llguidance_ext)
target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
target_link_libraries(${TARGET} PRIVATE llguidance)
if (WIN32)
target_link_libraries(${TARGET} PRIVATE ws2_32 userenv ntdll bcrypt)
endif()
endif()
# Add platform libraries to the main target
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
endif ()
target_link_libraries(${TARGET} PUBLIC llama Threads::Threads)
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)

View File

@@ -1301,7 +1301,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, bool value) {
params.kv_unified = value;
}
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH}));
add_opt(common_arg(
{"--context-shift"},
{"--no-context-shift"},
@@ -1578,7 +1578,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
{"--temp", "--temperature"}, "N",
{"--temp"}, "N",
string_format("temperature (default: %.2f)", (double)params.sampling.temp),
[](common_params & params, const std::string & value) {
params.sampling.temp = std::stof(value);
@@ -1611,7 +1611,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
{"--top-nsigma", "--top-n-sigma"}, "N",
{"--top-nsigma"}, "N",
string_format("top-n-sigma sampling (default: %.2f, -1.0 = disabled)", params.sampling.top_n_sigma),
[](common_params & params, const std::string & value) {
params.sampling.top_n_sigma = std::stof(value);
@@ -1634,7 +1634,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
{"--typical", "--typical-p"}, "N",
{"--typical"}, "N",
string_format("locally typical sampling, parameter p (default: %.2f, 1.0 = disabled)", (double)params.sampling.typ_p),
[](common_params & params, const std::string & value) {
params.sampling.typ_p = std::stof(value);
@@ -2331,19 +2331,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
add_opt(common_arg(
{"-sm", "--split-mode"}, "{none,layer,row}",
{"-sm", "--split-mode"}, "{none,layer,row,tensor}",
"how to split the model across multiple GPUs, one of:\n"
"- none: use one GPU only\n"
"- layer (default): split layers and KV across GPUs\n"
"- row: split rows across GPUs",
"- layer (default): split layers and KV across GPUs (pipelined)\n"
"- row: split weight across GPUs by rows (parallelized)\n"
"- tensor: split weights and KV across GPUs (parallelized)",
[](common_params & params, const std::string & value) {
std::string arg_next = value;
if (arg_next == "none") {
if (value == "none") {
params.split_mode = LLAMA_SPLIT_MODE_NONE;
} else if (arg_next == "layer") {
} else if (value == "layer") {
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
} else if (arg_next == "row") {
} else if (value == "row") {
params.split_mode = LLAMA_SPLIT_MODE_ROW;
} else if (value == "tensor") {
params.split_mode = LLAMA_SPLIT_MODE_TENSOR;
} else {
throw std::invalid_argument("invalid value");
}
@@ -2520,28 +2522,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
));
add_opt(common_arg(
{"-a", "--alias"}, "STRING",
"set model name aliases, comma-separated (to be used by API)",
"set alias for model name (to be used by REST API)",
[](common_params & params, const std::string & value) {
for (auto & alias : string_split<std::string>(value, ',')) {
alias = string_strip(alias);
if (!alias.empty()) {
params.model_alias.insert(alias);
}
}
params.model_alias = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
add_opt(common_arg(
{"--tags"}, "STRING",
"set model tags, comma-separated (informational, not used for routing)",
[](common_params & params, const std::string & value) {
for (auto & tag : string_split<std::string>(value, ',')) {
tag = string_strip(tag);
if (!tag.empty()) {
params.model_tags.insert(tag);
}
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TAGS"));
add_opt(common_arg(
{"-m", "--model"}, "FNAME",
ex == LLAMA_EXAMPLE_EXPORT_LORA
@@ -3454,6 +3439,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.ngram_size_m = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--spec-ngram-check-rate"}, "N",
string_format("ngram check rate for ngram-simple/ngram-map speculative decoding (default: %d)", params.speculative.ngram_check_rate),
[](common_params & params, int value) {
if (value < 1) {
throw std::invalid_argument("ngram check rate must be at least 1");
}
params.speculative.ngram_check_rate = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--spec-ngram-min-hits"}, "N",
string_format("minimum hits for ngram-map speculative decoding (default: %d)", params.speculative.ngram_min_hits),

View File

@@ -803,7 +803,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
}
// remove potential partial suffix
if (builder.pos() == builder.input().size() && builder.is_partial()) {
if (builder.pos() == builder.input().size()) {
if (unclosed_reasoning_content.empty()) {
rstrip(content);
trim_potential_partial_word(content);

View File

@@ -893,6 +893,23 @@ static void common_chat_parse_minimax_m2(common_chat_msg_parser & builder) {
builder.consume_reasoning_with_xml_tool_calls(form, "<think>", "</think>");
}
static void common_chat_parse_qwen3_coder_xml(common_chat_msg_parser & builder) {
static const xml_tool_call_format form = ([]() {
xml_tool_call_format form {};
form.scope_start = "<tool_call>";
form.tool_start = "<function=";
form.tool_sep = ">";
form.key_start = "<parameter=";
form.key_val_sep = ">";
form.val_end = "</parameter>";
form.tool_end = "</function>";
form.scope_end = "</tool_call>";
form.trim_raw_argval = true;
return form;
})();
builder.consume_reasoning_with_xml_tool_calls(form);
}
static void common_chat_parse_kimi_k2(common_chat_msg_parser & builder) {
static const xml_tool_call_format form = ([]() {
xml_tool_call_format form {};
@@ -1573,6 +1590,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
case COMMON_CHAT_FORMAT_KIMI_K2:
common_chat_parse_kimi_k2(builder);
break;
case COMMON_CHAT_FORMAT_QWEN3_CODER_XML:
common_chat_parse_qwen3_coder_xml(builder);
break;
case COMMON_CHAT_FORMAT_APRIEL_1_5:
common_chat_parse_apriel_1_5(builder);
break;
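Pieced together from the form fields above (scope, function, and parameter delimiters; whitespace as emitted by the matching grammar later in this diff, with `trim_raw_argval` stripping it on the parse side), the tool-call text this parser consumes looks like the following; the function and parameter names are illustrative:

```
<tool_call>
<function=get_weather>
<parameter=city>
Paris
</parameter>
</function>
</tool_call>
```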

View File

@@ -65,25 +65,14 @@ json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
} else if (!content_parts.empty()) {
if (concat_typed_text) {
std::string text;
bool last_was_media_marker = false;
// join parts with newline, do not add newline before or after media markers
for (const auto & part : content_parts) {
bool add_new_line = true;
if (part.type == "text") {
add_new_line = !last_was_media_marker && !text.empty();
last_was_media_marker = false;
} else if (part.type == "media_marker") {
add_new_line = false;
last_was_media_marker = true;
} else {
if (part.type != "text") {
LOG_WRN("Ignoring content part type: %s\n", part.type.c_str());
continue;
}
if (add_new_line) {
if (!text.empty()) {
text += '\n';
}
text += part.text;
}
jmsg["content"] = text;
@@ -330,7 +319,7 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
throw std::invalid_argument("Missing content part type: " + part.dump());
}
const auto & type = part.at("type");
if (type != "text" && type != "media_marker") {
if (type != "text") {
throw std::invalid_argument("Unsupported content part type: " + type.dump());
}
common_chat_msg_content_part msg_part;
@@ -391,46 +380,15 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
return msgs;
}
static json render_message_to_json(const std::vector<common_chat_msg> & msgs, const jinja::caps & c) {
if (!c.supports_string_content && !c.supports_typed_content) {
LOG_WRN("%s: Neither string content nor typed content is supported by the template. This is unexpected and may lead to issues.\n", __func__);
}
bool only_string_accepted = c.supports_string_content && !c.supports_typed_content;
bool only_typed_accepted = !c.supports_string_content && c.supports_typed_content;
json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text) {
json messages = json::array();
for (const auto & msg : msgs) {
if (only_string_accepted) {
json jmsg = msg.to_json_oaicompat(/* concat_typed_text= */ true);
messages.push_back(jmsg);
} else if (only_typed_accepted) {
json jmsg = msg.to_json_oaicompat(/* concat_typed_text= */ false);
if (jmsg.at("content").is_string()) {
jmsg["content"] = json::array({
json{
{"type", "text"},
{"text", jmsg.at("content").get<std::string>()},
}
});
}
messages.push_back(jmsg);
} else {
json jmsg = msg.to_json_oaicompat(/* concat_typed_text= */ false);
messages.push_back(jmsg);
}
json jmsg = msg.to_json_oaicompat(concat_typed_text);
messages.push_back(jmsg);
}
return messages;
}
// DEPRECATED: only used in tests
json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text) {
jinja::caps c;
c.supports_string_content = true;
c.supports_typed_content = !concat_typed_text;
return render_message_to_json(msgs, c);
}
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & tools) {
std::vector<common_chat_tool> result;
@@ -736,6 +694,7 @@ const char * common_chat_format_name(common_chat_format format) {
case COMMON_CHAT_FORMAT_MINIMAX_M2: return "MiniMax-M2";
case COMMON_CHAT_FORMAT_GLM_4_5: return "GLM 4.5";
case COMMON_CHAT_FORMAT_KIMI_K2: return "Kimi K2";
case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: return "Qwen3 Coder";
case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
@@ -1521,17 +1480,14 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_
return data;
}
static common_chat_params common_chat_params_init_qwen3_coder(const common_chat_template & tmpl, const struct templates_params & inputs) {
static common_chat_params common_chat_params_init_nemotron_v3(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
data.prompt = apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_CONSTRUCTED;
// Nemotron Nano 3 and Step-3.5-Flash use the Qwen3 Coder tool calling with thinking
bool supports_reasoning = (tmpl.source().find("<think>") != std::string::npos);
// Handle thinking tags appropriately based on inputs.enable_thinking
if (supports_reasoning && string_ends_with(data.prompt, "<think>\n")) {
if (string_ends_with(data.prompt, "<think>\n")) {
if (!inputs.enable_thinking) {
data.prompt += "</think>";
} else {
@@ -1540,21 +1496,19 @@ static common_chat_params common_chat_params_init_qwen3_coder(const common_chat_
}
data.preserved_tokens = {
"<think>",
"</think>",
"<tool_call>",
"</tool_call>",
};
if (supports_reasoning) {
data.preserved_tokens.insert(data.preserved_tokens.end(), {"<think>", "</think>"});
}
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = true;
auto parser = build_chat_peg_constructed_parser([&](auto & p) {
auto reasoning = p.eps();
if (supports_reasoning && inputs.enable_thinking && extract_reasoning) {
if (inputs.enable_thinking && extract_reasoning) {
auto reasoning_content = p.reasoning(p.until("</think>")) + ("</think>" | p.end());
if (data.thinking_forced_open) {
reasoning = reasoning_content;
@@ -1892,6 +1846,38 @@ static common_chat_params common_chat_params_init_minimax_m2(const common_chat_t
return data;
}
static common_chat_params common_chat_params_init_qwen3_coder_xml(const common_chat_template & tmpl, const struct templates_params & params) {
common_chat_params data;
data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
data.prompt = apply(tmpl, params);
data.format = COMMON_CHAT_FORMAT_QWEN3_CODER_XML;
data.preserved_tokens = {
"<tool_call>",
"</tool_call>",
"<function=",
"</function>",
"<parameter=",
"</parameter>",
};
// build grammar for tool call
static const xml_tool_call_format form {
/* form.scope_start = */ "<tool_call>\n",
/* form.tool_start = */ "<function=",
/* form.tool_sep = */ ">\n",
/* form.key_start = */ "<parameter=",
/* form.key_val_sep = */ ">\n",
/* form.val_end = */ "\n</parameter>\n",
/* form.tool_end = */ "</function>\n",
/* form.scope_end = */ "</tool_call>",
};
build_grammar_xml_tool_call(data, params.tools, form);
return data;
}
static common_chat_params common_chat_params_init_kimi_k2(const common_chat_template & tmpl, const struct templates_params & params) {
common_chat_params data;
data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -2015,7 +2001,6 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
if (has_reasoning_content && has_tool_calls) {
auto adjusted_message = msg;
adjusted_message["thinking"] = msg.at("reasoning_content");
adjusted_message.erase("content");
adjusted_messages.push_back(adjusted_message);
} else {
adjusted_messages.push_back(msg);
@@ -3035,7 +3020,7 @@ static common_chat_params common_chat_templates_apply_jinja(
: *tmpls->template_default;
const auto & src = tmpl.source();
const auto & caps = tmpl.original_caps();
params.messages = render_message_to_json(inputs.messages, tmpl.original_caps());
params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
params.add_generation_prompt = inputs.add_generation_prompt;
params.tool_choice = inputs.tool_choice;
params.reasoning_format = inputs.reasoning_format;
@@ -3113,13 +3098,19 @@ static common_chat_params common_chat_templates_apply_jinja(
}
// Qwen3-Coder XML format detection (must come before Hermes 2 Pro)
// Detect via XML markers: <tool_call>, <function=...>, and <parameter=...> blocks.
// Also matches Step-3.5-Flash and Nemotron 3 Nano which use the same output format.
// Detect via explicit XML markers unique to Qwen3-Coder to avoid false positives in other templates.
// Require presence of <tool_call>, <function=...>, and <parameter=...> blocks.
if (src.find("<tool_call>") != std::string::npos &&
src.find("<function>") != std::string::npos &&
src.find("<function=") != std::string::npos &&
src.find("<parameters>") != std::string::npos &&
src.find("<parameter=") != std::string::npos) {
workaround::func_args_not_string(params.messages);
return common_chat_params_init_qwen3_coder(tmpl, params);
// Nemotron 3 Nano 30B A3B
if (src.find("<think>") != std::string::npos) {
return common_chat_params_init_nemotron_v3(tmpl, params);
}
return common_chat_params_init_qwen3_coder_xml(tmpl, params);
}
// Xiaomi MiMo format detection (must come before Hermes 2 Pro)
@@ -3285,7 +3276,7 @@ static common_chat_params common_chat_templates_apply_legacy(
for (const auto & msg : inputs.messages) {
auto content = msg.content;
for (const auto & part : msg.content_parts) {
if (part.type != "text" && part.type != "media_marker") {
if (part.type != "text") {
LOG_WRN("Ignoring non-text content part: %s\n", part.type.c_str());
continue;
}

View File

@@ -128,6 +128,7 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_GLM_4_5,
COMMON_CHAT_FORMAT_MINIMAX_M2,
COMMON_CHAT_FORMAT_KIMI_K2,
COMMON_CHAT_FORMAT_QWEN3_CODER_XML,
COMMON_CHAT_FORMAT_APRIEL_1_5,
COMMON_CHAT_FORMAT_XIAOMI_MIMO,
COMMON_CHAT_FORMAT_SOLAR_OPEN,
@@ -239,8 +240,6 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates *
// Parses a JSON array of messages in OpenAI's chat completion API format.
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);
// DEPRECATED: only used in tests
nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);

View File

@@ -1,3 +1,7 @@
#if defined(_MSC_VER)
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif
#include "ggml.h"
#include "gguf.h"
@@ -5,12 +9,12 @@
#include "log.h"
#include "llama.h"
#include "sampling.h"
#include "unicode.h"
#include <algorithm>
#include <cinttypes>
#include <climits>
#include <cmath>
#include <codecvt>
#include <chrono>
#include <cstdarg>
#include <cstring>
@@ -452,6 +456,34 @@ void string_replace_all(std::string & s, const std::string & search, const std::
s = std::move(builder);
}
bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
}
bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
bool has_suffix = string_ends_with(str, suffix);
if (has_suffix) {
str = str.substr(0, str.size() - suffix.size());
}
return has_suffix;
}
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
if (!str.empty() && !stop.empty()) {
const char text_last_char = str.back();
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
if (stop[char_index] == text_last_char) {
const auto current_partial = stop.substr(0, char_index + 1);
if (string_ends_with(str, current_partial)) {
return str.size() - char_index - 1;
}
}
}
}
return std::string::npos;
}
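// usage sketch (illustrative, not part of the diff): with str = "Hello <|e"
// and stop = "<|end|>", the partial suffix "<|e" matches, so the function
// returns 6, the index where it begins; a streaming caller can hold back the
// text from that index until the stop sequence completes or diverges.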
std::string regex_escape(const std::string & s) {
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
return std::regex_replace(s, special_chars, "\\$&");
@@ -674,28 +706,45 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
return false;
}
size_t offset = 0;
while (offset < filename.size()) {
utf8_parse_result result = parse_utf8_codepoint(filename, offset);
if (result.status != utf8_parse_result::SUCCESS) {
return false;
}
uint32_t c = result.codepoint;
if ((result.bytes_consumed == 2 && c < 0x80) ||
(result.bytes_consumed == 3 && c < 0x800) ||
(result.bytes_consumed == 4 && c < 0x10000)) {
return false;
}
std::u32string filename_utf32;
try {
#if defined(__clang__)
// disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
#if defined(__clang__)
# pragma clang diagnostic pop
#elif defined(__GNUC__)
# pragma GCC diagnostic pop
#endif
filename_utf32 = converter.from_bytes(filename);
// If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
// or invalid encodings were encountered. Reject such attempts
std::string filename_reencoded = converter.to_bytes(filename_utf32);
if (filename_reencoded != filename) {
return false;
}
} catch (const std::exception &) {
return false;
}
// Check for forbidden codepoints:
// - Control characters
// - Unicode equivalents of illegal characters
// - UTF-16 surrogate pairs
// - UTF-8 replacement character
// - Byte order mark (BOM)
// - Illegal characters: / \ : * ? " < > |
for (char32_t c : filename_utf32) {
if (c <= 0x1F // Control characters (C0)
|| c == 0x7F // Control characters (DEL)
|| (c >= 0x80 && c <= 0x9F) // Control characters (C1)
@@ -703,7 +752,6 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
|| c == 0x2215 // Division Slash (forward slash equivalent)
|| c == 0x2216 // Set Minus (backslash equivalent)
|| (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
|| c > 0x10FFFF // Max Unicode limit
|| c == 0xFFFD // Replacement Character (UTF-8)
|| c == 0xFEFF // Byte Order Mark (BOM)
|| c == ':' || c == '*' // Illegal characters
@@ -714,7 +762,6 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
// Subdirectories not allowed, reject path separators
return false;
}
offset += result.bytes_consumed;
}
// Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
@@ -851,8 +898,7 @@ std::string fs_get_cache_directory() {
if (getenv("LLAMA_CACHE")) {
cache_directory = std::getenv("LLAMA_CACHE");
} else {
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || \
defined(__OpenBSD__) || defined(__NetBSD__)
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
if (std::getenv("XDG_CACHE_HOME")) {
cache_directory = std::getenv("XDG_CACHE_HOME");
} else if (std::getenv("HOME")) {
@@ -1196,7 +1242,7 @@ common_init_result_ptr common_init_from_params(common_params & params) {
return res;
}
int err = llama_set_adapter_cvec(
int err = llama_apply_adapter_cvec(
lctx,
cvec.data.data(),
cvec.data.size(),
@@ -1298,15 +1344,12 @@ std::string get_model_endpoint() {
}
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
std::vector<llama_adapter_lora *> loras;
std::vector<float> scales;
for (auto & la: lora) {
loras.push_back(la.ptr);
scales.push_back(la.scale);
llama_clear_adapter_lora(ctx);
for (auto & la : lora) {
if (la.scale != 0.0f) {
llama_set_adapter_lora(ctx, la.ptr, la.scale);
}
}
llama_set_adapters_lora(ctx, loras.data(), loras.size(), scales.data());
}
struct llama_model_params common_model_params_to_llama(common_params & params) {
@@ -1426,6 +1469,66 @@ void common_batch_add(
batch.n_tokens++;
}
//
// Token utils
//
size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
size_t i;
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
return i;
}
size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
// check for empty sequences
if (a.empty() || b.empty()) {
return 0;
}
// get the lengths of the input sequences
size_t a_len = a.size();
size_t b_len = b.size();
// initialize the maximum length of the longest common subsequence (LCS)
size_t max_length = 0;
// use two rows instead of a 2D matrix to optimize space
std::vector<size_t> prev_row(b_len + 1, 0);
std::vector<size_t> curr_row(b_len + 1, 0);
// iterate through the elements of a
for (size_t i = 1; i <= a_len; i++) {
// iterate through the elements of b
for (size_t j = 1; j <= b_len; j++) {
// if elements at the current positions match
if (a[i - 1] == b[j - 1]) {
// if it's the first element of either sequences, set LCS length to 1
if (i == 1 || j == 1) {
curr_row[j] = 1;
} else {
// increment LCS length by 1 compared to the previous element
curr_row[j] = prev_row[j - 1] + 1;
}
// update max_length if necessary
if (curr_row[j] > max_length) {
max_length = curr_row[j];
}
} else {
// reset LCS length if elements don't match
curr_row[j] = 0;
}
}
// update the previous row for the next iteration
prev_row = curr_row;
}
// return the maximum length of the LCS
return max_length;
}
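// note: the reset-to-zero on mismatch above means this computes the longest
// *contiguous* common run of tokens (a longest-common-substring variant);
// a general longest common subsequence would carry lengths across mismatches.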
//
// Vocab utils
//
@@ -1760,65 +1863,3 @@ float lr_opt::get_lr(float epoch) const {
LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
return r;
}
bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos) {
llama_batch batch = llama_batch_get_one(&last_token, 1);
batch.pos = &pos;
if (llama_decode(ctx, batch)) {
LOG_ERR("%s: failed to replay last token\n", __func__);
return false;
}
return true;
}
bool common_prompt_batch_decode(
struct llama_context * ctx,
const std::vector<llama_token> & tokens,
int & n_past,
int n_batch,
std::string_view state_path,
bool save_state) {
const int n_eval = tokens.size();
if (n_eval == 0) {
return true;
}
if (save_state && n_eval > 1) {
const int n_tokens_before_last = n_eval - 1;
GGML_ASSERT(n_eval <= n_batch);
// Decode all but the last token so we can save the memory state before decoding the last token.
// This is done so we can restore the session state later and replay the last token.
// Memory implementations in recurrent/hybrid models don't support removing tokens from their
// memory, so we can't just remove the last token from the memory and replay the last token which
// is the reason for this logic.
if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(tokens.data()), n_tokens_before_last))) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
n_past += n_tokens_before_last;
llama_state_save_file(ctx, state_path.data(), tokens.data(), n_tokens_before_last);
LOG_INF("saved session before last token to %s, n_tokens = %d\n", state_path.data(), n_tokens_before_last);
llama_token last_token = tokens.back();
llama_batch batch = llama_batch_get_one(&last_token, 1);
int32_t pos = n_past;
batch.pos = &pos;
if (llama_decode(ctx, batch)) {
LOG_ERR("%s : failed to eval last token\n", __func__);
return false;
}
n_past++;
} else {
if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(tokens.data()), n_eval))) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
n_past += n_eval;
}
return true;
}
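A hedged sketch of the intended call pattern (the file name "session.bin" and the tokenization step are placeholders of mine): the first run snapshots the state just before the last token, and a later run can restore the file and replay only that token to regenerate logits:

```cpp
static bool run_prompt(llama_context * ctx, const std::string & prompt, int n_batch) {
    // first run: decode the prompt and save the pre-last-token state
    std::vector<llama_token> tokens = common_tokenize(ctx, prompt, /*add_special=*/true);
    int n_past = 0;
    if (!common_prompt_batch_decode(ctx, tokens, n_past, n_batch,
                                    "session.bin", /*save_state=*/true)) {
        return false;
    }
    // a later run, after llama_state_load_file(ctx, "session.bin", ...), only
    // needs: common_replay_last_token(ctx, tokens.back(), /*pos=*/n_past - 1);
    return true;
}
```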

View File

@@ -269,6 +269,7 @@ struct common_params_speculative {
uint16_t ngram_size_n = 12; // ngram size for lookup
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
uint16_t ngram_check_rate = 1; // check rate for ngram lookup
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
std::shared_ptr<common_ngram_mod> ngram_mod;
@@ -410,8 +411,7 @@ struct common_params {
struct common_params_model model;
std::set<std::string> model_alias; // model aliases // NOLINT
std::set<std::string> model_tags; // model tags (informational, not used for routing) // NOLINT
std::string model_alias = ""; // model alias // NOLINT
std::string hf_token = ""; // HF token // NOLINT
std::string prompt = ""; // NOLINT
std::string system_prompt = ""; // NOLINT
@@ -671,55 +671,30 @@ static std::vector<T> string_split(const std::string & str, char delim) {
}
template<>
inline std::vector<std::string> string_split<std::string>(const std::string & str, char delim)
std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
{
std::vector<std::string> parts;
size_t begin_pos = 0;
size_t delim_pos = str.find(delim);
while (delim_pos != std::string::npos) {
std::string part = str.substr(begin_pos, delim_pos - begin_pos);
size_t separator_pos = input.find(separator);
while (separator_pos != std::string::npos) {
std::string part = input.substr(begin_pos, separator_pos - begin_pos);
parts.emplace_back(part);
begin_pos = delim_pos + 1;
delim_pos = str.find(delim, begin_pos);
begin_pos = separator_pos + 1;
separator_pos = input.find(separator, begin_pos);
}
parts.emplace_back(str.substr(begin_pos));
parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
return parts;
}
// remove when moving to c++20
inline bool string_starts_with(std::string_view str, std::string_view prefix) {
return str.size() >= prefix.size() &&
str.compare(0, prefix.size(), prefix) == 0;
static bool string_starts_with(const std::string & str,
const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}
// remove when moving to c++20
inline bool string_ends_with(std::string_view str, std::string_view suffix) {
return str.size() >= suffix.size() &&
str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
}
inline bool string_remove_suffix(std::string & str, std::string_view suffix) {
if (string_ends_with(str, suffix)) {
str.resize(str.size() - suffix.size());
return true;
}
return false;
}
inline size_t string_find_partial_stop(std::string_view str, std::string_view stop) {
if (!str.empty() && !stop.empty()) {
const size_t max_len = std::min(str.size(), stop.size());
const char last_char = str.back();
for (size_t len = max_len; len > 0; --len) {
if (stop[len - 1] == last_char) {
if (string_ends_with(str, stop.substr(0, len))) {
return str.size() - len;
}
}
}
}
return std::string::npos;
}
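A usage sketch for the partial-stop helper: streaming code can hold back the tail of the generated text while it could still grow into a stop sequence (the strings here are my own examples):

```cpp
#include <cassert>
#include <string>
#include <string_view>

static void partial_stop_example() {
    const std::string_view stop = "</answer>";
    // the tail "</ans" matches a prefix of the stop string, so the last
    // 5 characters (starting at index 9) should be withheld from the client
    assert(string_find_partial_stop("some text</ans", stop) == 9);
    // no suffix of the text matches a prefix of the stop string
    assert(string_find_partial_stop("some text", stop) == std::string::npos);
}
```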
// While we wait for C++20's std::string::ends_with...
bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
bool string_remove_suffix(std::string & str, const std::string_view & suffix);
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
@@ -805,22 +780,15 @@ void common_batch_add(
const std::vector<llama_seq_id> & seq_ids,
bool logits);
// decodes a single batch of tokens for a prompt and manages session tokens
//
// Note: We save state before the last token so that we can replay it to ensure
// compatibility with all memory types. Recurrent/hybrid models cannot remove
// tokens from memory, so this approach works across all model architectures.
bool common_prompt_batch_decode(
struct llama_context * ctx,
const std::vector<llama_token> & embd,
int & n_past,
int n_batch,
std::string_view state_path,
bool save_state);
// Token utils
//
// replays the last token after loading state to regenerate logits
// used after loading session state to ensure the sampling context has valid logits
bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos);
// longest common prefix
size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
// longest common subsequence
size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
//
// Vocab utils
@@ -913,11 +881,11 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
inline std::string llm_ffn_exps_block_regex(int idx) {
static std::string llm_ffn_exps_block_regex(int idx) {
return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
}
inline llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
}

View File

@@ -19,7 +19,9 @@
#include <thread>
#include <vector>
#if defined(LLAMA_USE_HTTPLIB)
#include "http.h"
#endif
#ifndef __EMSCRIPTEN__
#ifdef __linux__
@@ -112,18 +114,44 @@ static void write_etag(const std::string & path, const std::string & etag) {
}
static std::string read_etag(const std::string & path) {
std::string none;
const std::string etag_path = path + ".etag";
if (!std::filesystem::exists(etag_path)) {
return {};
if (std::filesystem::exists(etag_path)) {
std::ifstream etag_in(etag_path);
if (!etag_in) {
LOG_ERR("%s: could not open .etag file for reading: %s\n", __func__, etag_path.c_str());
return none;
}
std::string etag;
std::getline(etag_in, etag);
return etag;
}
std::ifstream etag_in(etag_path);
if (!etag_in) {
LOG_ERR("%s: could not open .etag file for reading: %s\n", __func__, etag_path.c_str());
return {};
// no etag file, but maybe there is an old .json
// remove this code later
const std::string metadata_path = path + ".json";
if (std::filesystem::exists(metadata_path)) {
std::ifstream metadata_in(metadata_path);
try {
nlohmann::json metadata_json;
metadata_in >> metadata_json;
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(),
metadata_json.dump().c_str());
if (metadata_json.contains("etag") && metadata_json.at("etag").is_string()) {
std::string etag = metadata_json.at("etag");
write_etag(path, etag);
if (!std::filesystem::remove(metadata_path)) {
LOG_WRN("%s: failed to delete old .json metadata file: %s\n", __func__, metadata_path.c_str());
}
return etag;
}
} catch (const nlohmann::json::exception & e) {
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
}
}
std::string etag;
std::getline(etag_in, etag);
return etag;
return none;
}
static bool is_http_status_ok(int status) {
@@ -140,6 +168,8 @@ std::pair<std::string, std::string> common_download_split_repo_tag(const std::st
return {hf_repo, tag};
}
#if defined(LLAMA_USE_HTTPLIB)
class ProgressBar {
static inline std::mutex mutex;
static inline std::map<const ProgressBar *, int> lines;
@@ -275,10 +305,7 @@ static bool common_pull_file(httplib::Client & cli,
);
if (!res) {
LOG_ERR("%s: download failed: %s (status: %d)\n",
__func__,
httplib::to_string(res.error()).c_str(),
res ? res->status : -1);
LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
return false;
}
@@ -317,64 +344,62 @@ static int common_download_file_single_online(const std::string & url,
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
}
auto head = cli.Head(parts.path);
if (!head || head->status < 200 || head->status >= 300) {
LOG_WRN("%s: HEAD failed, status: %d\n", __func__, head ? head->status : -1);
if (file_exists) {
LOG_INF("%s: using cached file (HEAD failed): %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
}
return head ? head->status : -1;
}
std::string etag;
if (head->has_header("ETag")) {
etag = head->get_header_value("ETag");
}
size_t total_size = 0;
if (head->has_header("Content-Length")) {
try {
total_size = std::stoull(head->get_header_value("Content-Length"));
} catch (const std::exception& e) {
LOG_WRN("%s: invalid Content-Length in HEAD response: %s\n", __func__, e.what());
}
}
bool supports_ranges = false;
if (head->has_header("Accept-Ranges")) {
supports_ranges = head->get_header_value("Accept-Ranges") != "none";
}
if (file_exists) {
if (etag.empty()) {
LOG_INF("%s: using cached file (no server etag): %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
}
if (!last_etag.empty() && last_etag == etag) {
LOG_INF("%s: using cached file (same etag): %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
}
if (remove(path.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
return -1;
}
}
const std::string path_temporary = path + ".downloadInProgress";
int delay = retry_delay_seconds;
for (int i = 0; i < max_attempts; ++i) {
if (i) {
LOG_WRN("%s: retrying after %d seconds...\n", __func__, delay);
std::this_thread::sleep_for(std::chrono::seconds(delay));
delay *= retry_delay_seconds;
auto head = cli.Head(parts.path);
bool head_ok = head && head->status >= 200 && head->status < 300;
if (!head_ok) {
LOG_WRN("%s: HEAD invalid http status code received: %d\n", __func__, head ? head->status : -1);
if (file_exists) {
LOG_INF("%s: Using cached file (HEAD failed): %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
}
return head->status; // cannot use cached file, return raw status code
// TODO: maybe retry only on certain codes
}
std::string etag;
if (head_ok && head->has_header("ETag")) {
etag = head->get_header_value("ETag");
}
size_t total_size = 0;
if (head_ok && head->has_header("Content-Length")) {
try {
total_size = std::stoull(head->get_header_value("Content-Length"));
} catch (const std::exception& e) {
LOG_WRN("%s: Invalid Content-Length in HEAD response: %s\n", __func__, e.what());
}
}
bool supports_ranges = false;
if (head_ok && head->has_header("Accept-Ranges")) {
supports_ranges = head->get_header_value("Accept-Ranges") != "none";
}
bool should_download_from_scratch = false;
if (!last_etag.empty() && !etag.empty() && last_etag != etag) {
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__,
last_etag.c_str(), etag.c_str());
should_download_from_scratch = true;
}
if (file_exists) {
if (!should_download_from_scratch) {
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
}
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
if (remove(path.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
return -1;
}
}
const std::string path_temporary = path + ".downloadInProgress";
size_t existing_size = 0;
if (std::filesystem::exists(path_temporary)) {
if (supports_ranges) {
if (supports_ranges && !should_download_from_scratch) {
existing_size = std::filesystem::file_size(path_temporary);
} else if (remove(path_temporary.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
@@ -382,23 +407,32 @@ static int common_download_file_single_online(const std::string & url,
}
}
LOG_INF("%s: downloading from %s to %s (etag:%s)...\n",
__func__, common_http_show_masked_url(parts).c_str(),
path_temporary.c_str(), etag.c_str());
if (common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size)) {
if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
return -1;
// start the download
LOG_INF("%s: trying to download model from %s to %s (etag:%s)...\n",
__func__, common_http_show_masked_url(parts).c_str(), path_temporary.c_str(), etag.c_str());
const bool was_pull_successful = common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size);
if (!was_pull_successful) {
if (i + 1 < max_attempts) {
const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000;
LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay);
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
} else {
LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts);
}
if (!etag.empty()) {
write_etag(path, etag);
}
return head->status;
continue;
}
if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
return -1;
}
if (!etag.empty()) {
write_etag(path, etag);
}
return head->status; // TODO: use actual GET status?
}
LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts);
return -1; // max attempts reached
}
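The retry loop sleeps `pow(retry_delay_seconds, i) * 1000` milliseconds between attempts. A standalone sketch of that schedule, assuming `retry_delay_seconds == 2` and `max_attempts == 4` (values not taken from the patch), waits 1000, 2000, then 4000 ms:

```cpp
#include <chrono>
#include <cmath>
#include <cstdio>
#include <thread>

int main() {
    const int retry_delay_seconds = 2; // assumed value
    const int max_attempts = 4;        // assumed value
    for (int i = 0; i < max_attempts; ++i) {
        const bool pull_ok = false; // pretend every attempt fails
        if (!pull_ok && i + 1 < max_attempts) {
            const int delay_ms = std::pow(retry_delay_seconds, i) * 1000;
            std::printf("attempt %d failed, sleeping %d ms\n", i + 1, delay_ms);
            std::this_thread::sleep_for(std::chrono::milliseconds(delay_ms));
        }
    }
    return 0;
}
```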
@@ -764,6 +798,30 @@ std::string common_docker_resolve_model(const std::string & docker) {
}
}
#else
common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool, const common_header_list &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
bool common_download_model(const common_params_model &, const std::string &, bool, const common_header_list &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
std::string common_docker_resolve_model(const std::string &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
int common_download_file_single(const std::string &,
const std::string &,
const std::string &,
bool,
const common_header_list &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
#endif // defined(LLAMA_USE_HTTPLIB)
std::vector<common_cached_model_info> common_list_cached_models() {
std::vector<common_cached_model_info> models;
const std::string cache_dir = fs_get_cache_directory();

View File

@@ -63,8 +63,7 @@ static void caps_print_stats(value & v, const std::string & path) {
std::map<std::string, bool> caps::to_map() const {
return {
{"supports_string_content", supports_string_content},
{"supports_typed_content", supports_typed_content},
{"requires_typed_content", requires_typed_content},
{"supports_tools", supports_tools},
{"supports_tool_calls", supports_tool_calls},
{"supports_parallel_tool_calls", supports_parallel_tool_calls},
@@ -90,7 +89,7 @@ caps caps_get(jinja::program & prog) {
return v->stats.ops.find(op_name) != v->stats.ops.end();
};
// case: typed content support
// case: typed content requirement
caps_try_execute(
prog,
[&]() {
@@ -106,16 +105,12 @@ caps caps_get(jinja::program & prog) {
// tools
return json{nullptr};
},
[&](bool success, value & messages, value &) {
[&](bool, value & messages, value &) {
auto & content = messages->at(0)->at("content");
caps_print_stats(content, "messages[0].content");
if (has_op(content, "selectattr") || has_op(content, "array_access")) {
// accessed as an array
result.supports_typed_content = true;
}
if (!success) {
// failed to execute with content as string
result.supports_string_content = false;
result.requires_typed_content = true;
}
}
);
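For intuition, a template along these lines (illustrative, not from the patch) would register `array_access` on `messages[0].content` during the probe, and would fail outright when the content is passed as a plain string:

```cpp
// Probing happens by executing the template once with string content;
// iterating the content as an array registers the ops that caps_get() inspects.
const char * tmpl =
    "{% for part in messages[0].content %}" // array_access on content
    "{{ part.text }}"
    "{% endfor %}";
```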

View File

@@ -14,9 +14,7 @@ struct caps {
bool supports_parallel_tool_calls = true;
bool supports_preserve_reasoning = false; // support assistant message with reasoning_content
// one of the 2 content capabilities must be true
bool supports_string_content = true;
bool supports_typed_content = false;
bool requires_typed_content = false; // default: use string content
// for reporting on server
std::map<std::string, bool> to_map() const;

View File

@@ -85,7 +85,7 @@ value identifier::execute_impl(context & ctx) {
auto builtins = global_builtins();
if (!it->is_undefined()) {
if (ctx.is_get_stats) {
value_t::stats_t::mark_used(it);
it->stats.used = true;
}
JJ_DEBUG("Identifier '%s' found, type = %s", val.c_str(), it->type().c_str());
return it;
@@ -277,7 +277,7 @@ value binary_expression::execute_impl(context & ctx) {
static value try_builtin_func(context & ctx, const std::string & name, value & input, bool undef_on_missing = false) {
JJ_DEBUG("Trying built-in function '%s' for type %s", name.c_str(), input->type().c_str());
if (ctx.is_get_stats) {
value_t::stats_t::mark_used(input);
input->stats.used = true;
input->stats.ops.insert(name);
}
auto builtins = input->get_builtins();
@@ -446,12 +446,6 @@ value for_statement::execute_impl(context & ctx) {
value iterable_val = iter_expr->execute(scope);
// mark the variable being iterated as used for stats
if (ctx.is_get_stats) {
value_t::stats_t::mark_used(iterable_val);
iterable_val->stats.ops.insert("array_access");
}
if (iterable_val->is_undefined()) {
JJ_DEBUG("%s", "For loop iterable is undefined, skipping loop");
iterable_val = mk_val<value_array>();
@@ -470,7 +464,7 @@ value for_statement::execute_impl(context & ctx) {
items.push_back(std::move(tuple));
}
if (ctx.is_get_stats) {
value_t::stats_t::mark_used(iterable_val);
iterable_val->stats.used = true;
iterable_val->stats.ops.insert("object_access");
}
} else {
@@ -480,7 +474,7 @@ value for_statement::execute_impl(context & ctx) {
items.push_back(item);
}
if (ctx.is_get_stats) {
value_t::stats_t::mark_used(iterable_val);
iterable_val->stats.used = true;
iterable_val->stats.ops.insert("array_access");
}
}
@@ -721,8 +715,6 @@ value member_expression::execute_impl(context & ctx) {
int64_t arr_size = 0;
if (is_val<value_array>(object)) {
arr_size = object->as_array().size();
} else if (is_val<value_string>(object)) {
arr_size = object->as_string().length();
}
if (is_stmt<slice_expression>(this->property)) {
@@ -819,9 +811,8 @@ value member_expression::execute_impl(context & ctx) {
}
if (ctx.is_get_stats && val && object && property) {
value_t::stats_t::mark_used(val);
value_t::stats_t::mark_used(object);
value_t::stats_t::mark_used(property);
val->stats.used = true;
object->stats.used = true;
if (is_val<value_int>(property)) {
object->stats.ops.insert("array_access");
} else if (is_val<value_string>(property)) {

View File

@@ -4,7 +4,6 @@
// for converting from JSON to jinja values
#include <nlohmann/json.hpp>
#include <sstream>
#include <string>
#include <cctype>
#include <vector>
@@ -161,11 +160,6 @@ static value tojson(const func_args & args) {
value val_separators = args.get_kwarg_or_pos("separators", 3);
value val_sort = args.get_kwarg_or_pos("sort_keys", 4);
int indent = -1;
if (args.ctx.is_get_stats) {
// mark as used (recursively) for stats
auto val_input = args.get_pos(0);
value_t::stats_t::mark_used(const_cast<value&>(val_input), true);
}
if (is_val<value_int>(val_indent)) {
indent = static_cast<int>(val_indent->as_int());
}
@@ -721,46 +715,8 @@ const func_builtins & value_string_t::get_builtins() const {
return args.get_pos(0);
}},
{"tojson", tojson},
{"indent", [](const func_args &args) -> value {
args.ensure_count(1, 4);
value val_input = args.get_pos(0);
value val_width = args.get_kwarg_or_pos("width", 1);
const bool first = args.get_kwarg_or_pos("first", 2)->as_bool(); // undefined == false
const bool blank = args.get_kwarg_or_pos("blank", 3)->as_bool(); // undefined == false
if (!is_val<value_string>(val_input)) {
throw raised_exception("indent() first argument must be a string");
}
std::string indent;
if (is_val<value_int>(val_width)) {
indent.assign(val_width->as_int(), ' ');
} else if (is_val<value_string>(val_width)) {
indent = val_width->as_string().str();
} else {
indent = " ";
}
std::string indented;
std::string input = val_input->as_string().str();
std::istringstream iss = std::istringstream(input);
std::string line;
while (std::getline(iss, line)) {
if (!indented.empty()) {
indented.push_back('\n');
}
if ((indented.empty() ? first : (!line.empty() || blank))) {
indented += indent;
}
indented += line;
}
if (!input.empty() && input.back() == '\n') {
indented.push_back('\n');
if (blank) {
indented += indent;
}
}
auto res = mk_val<value_string>(indented);
res->val_str.mark_input_based_on(val_input->as_string());
return res;
{"indent", [](const func_args &) -> value {
throw not_implemented_exception("String indent builtin not implemented");
}},
{"join", [](const func_args &) -> value {
throw not_implemented_exception("String join builtin not implemented");
@@ -896,11 +852,6 @@ const func_builtins & value_array_t::get_builtins() const {
}},
{"string", [](const func_args & args) -> value {
args.ensure_vals<value_array>();
if (args.ctx.is_get_stats) {
// mark as used (recursively) for stats
auto val_input = args.get_pos(0);
value_t::stats_t::mark_used(const_cast<value&>(val_input), true);
}
return mk_val<value_string>(args.get_pos(0)->as_string());
}},
{"tojson", tojson},
@@ -1056,11 +1007,6 @@ const func_builtins & value_object_t::get_builtins() const {
{"tojson", tojson},
{"string", [](const func_args & args) -> value {
args.ensure_vals<value_object>();
if (args.ctx.is_get_stats) {
// mark as used (recursively) for stats
auto val_input = args.get_pos(0);
value_t::stats_t::mark_used(const_cast<value&>(val_input), true);
}
return mk_val<value_string>(args.get_pos(0)->as_string());
}},
{"length", [](const func_args & args) -> value {
@@ -1373,21 +1319,4 @@ std::string value_to_string_repr(const value & val) {
}
}
// stats utility
void value_t::stats_t::mark_used(value & val, bool deep) {
val->stats.used = true;
if (deep) {
if (is_val<value_array>(val)) {
for (auto & item : val->val_arr) {
mark_used(item, deep);
}
} else if (is_val<value_object>(val)) {
for (auto & pair : val->val_obj) {
mark_used(pair.first, deep);
mark_used(pair.second, deep);
}
}
}
}
} // namespace jinja

View File

@@ -118,8 +118,6 @@ struct value_t {
bool used = false;
// ops can be builtin calls or operators: "array_access", "object_access"
std::set<std::string> ops;
// utility to recursively mark value and its children as used
static void mark_used(value & val, bool deep = false);
} stats;
value_t() = default;

View File

@@ -231,9 +231,10 @@ void common_ngram_map_draft(common_ngram_map & map,
GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len);
}
if (map.idx_last_check > cur_len) {
// Should not happen because of common_ngram_map_begin().
GGML_ABORT("%s: map.idx_last_check > cur_len: %zu > %zu", __func__, map.idx_last_check, cur_len);
// Only check every check_rate tokens to save compute
// i.e., perform check if (cur_len - idx_last_check) >= check_rate
if (map.idx_last_check + map.check_rate > cur_len) {
return;
}
map.idx_last_check = cur_len;
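A worked example of the gate above: with `check_rate == 4` and `idx_last_check == 10`, calls at `cur_len` 11 through 13 return early, and the next real lookup happens once `cur_len` reaches 14. A standalone restatement of the condition:

```cpp
#include <cstddef>

// true when (cur_len - idx_last_check) >= check_rate, i.e. the early-return
// condition above does not fire
static bool should_check(size_t idx_last_check, size_t check_rate, size_t cur_len) {
    return idx_last_check + check_rate <= cur_len;
}
```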
@@ -461,7 +462,7 @@ void common_ngram_map_draft(common_ngram_map & map,
slot_max = v;
}
}
// What is sum of the other occurrences?
// What is sum of the other occurrences?
uint32_t sum_occur = 0;
for (int v = 0; v < COMMON_NGRAM_MAX_VALUES; ++v) {
if (v == slot_max) {

View File

@@ -24,6 +24,7 @@
struct common_ngram_simple_config {
uint16_t size_ngram; // size of n-grams to lookup in self-mode
uint16_t size_mgram; // size of m-grams to draft in self-mode
uint16_t check_rate; // check for speculative decoding without a draft model once every check_rate tokens
};
// Searches for a n-gram in the history and checks whether a draft sequence should be generated.
@@ -44,7 +45,7 @@ llama_tokens common_ngram_simple_draft(
// statistics of a m-gram after a known n-gram
struct common_ngram_map_value {
size_t value_idx = 0; // index of value m-gram in token-history (0 if unused)
uint16_t value_num = 0; // number of occurrences of this value m-gram after the key n-gram (0 in an unused values-slot)
uint16_t value_num = 0; // number of occurrences of this value m-gram after the key n-gram (0 in an unused values-slot)
int16_t n_accepted = -1; // number of accepted tokens at last draft (-1 if unused)
};
@@ -53,7 +54,7 @@ struct common_ngram_map_key {
size_t key_idx; // index of key n-gram in token-history
size_t stat_idx; // index of last token of statistics computation (key_num, values)
uint16_t key_num; // number of occurrences of this key n-gram in token-history
uint16_t key_num; // number of occurrences of this key n-gram in token-history
common_ngram_map_value values[COMMON_NGRAM_MAX_VALUES]; // some known values after the key
};
@@ -65,14 +66,15 @@ struct common_ngram_map {
bool key_only; // true if only key n-grams are used, no values.
std::vector<common_ngram_map_key> keys; // key n-grams which occur several times in token-history
uint16_t check_rate; // check for speculative decoding without a draft model once every check_rate tokens
uint16_t min_hits; // minimum number of key hits to consider a draft
bool show_key_map_stats = false; // true, if statistics of the key_map should be printed.
bool show_key_map_stats = false; // true, if statistics of the key_map should be printed.
common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys,
uint16_t min_hits)
uint16_t check_rate, uint16_t min_hits)
: size_key(sz_key), size_value(sz_value), key_only(only_keys),
min_hits(min_hits) {
check_rate(check_rate), min_hits(min_hits) {
key_map.resize(COMMON_NGRAM_HASH_MAP_SIZE); // 2^18 hash entries, 0 entries if key_map shouldn't be used
}
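A construction sketch mirroring `get_common_ngram_map()` below (the numeric values are the defaults from `common_params_speculative` shown earlier in this diff):

```cpp
common_ngram_map map(
    /*sz_key    =*/ 12,    // ngram_size_n
    /*sz_value  =*/ 48,    // ngram_size_m
    /*only_keys =*/ false, // NGRAM_MAP rather than NGRAM_MAP_K
    /*check_rate=*/ 1,     // ngram_check_rate
    /*min_hits  =*/ 1);    // ngram_min_hits
```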

View File

@@ -113,14 +113,13 @@ static bool common_speculative_are_compatible(
struct common_speculative_state {
const enum common_speculative_type type;
size_t n_call_begin = 0; // number of times this implementation was called for refresh.
size_t n_call_draft = 0; // number of times this implementation was called for generation.
size_t n_call_accept = 0; // number of times this implementation was called for accumulation.
size_t n_gen_drafts = 0; // number of times a draft or part was generated by this implementation.
size_t n_acc_drafts = 0; // number of times a draft or part was accepted by the target model.
size_t n_gen_tokens = 0; // number of tokens generated by this implementation.
size_t n_acc_tokens = 0; // number of tokens accepted by the target model.
// TODO: rename to n_call_draft, n_gen_drafts, n_acc_drafts, n_gen_tokens, n_acc_tokens
// TODO: add n_call_begin, n_call_accept
size_t drafts_call_count = 0; // number of times this implementation was called.
size_t drafts_generated_count = 0; // number of times a draft or part was generated by this implementation.
size_t drafts_accepted_count = 0; // number of times a draft or part was accepted by the target model.
size_t drafts_generated_tokens = 0; // number of tokens generated by this implementation.
size_t drafts_accepted_tokens = 0; // number of tokens accepted by the target model.
// TODO: track performance of most recent calls
const bool gen_perf = true; // whether to generate performance stats.
@@ -466,6 +465,8 @@ struct common_speculative_state_eagle3 : public common_speculative_state {
struct common_speculative_state_ngram_simple : public common_speculative_state {
common_ngram_simple_config config;
uint16_t check_id = 0; // used to control the frequency of generating drafts
common_speculative_state_ngram_simple(
enum common_speculative_type type,
common_ngram_simple_config config)
@@ -480,6 +481,11 @@ struct common_speculative_state_ngram_simple : public common_speculative_state {
const llama_tokens & prompt_tgt,
llama_token id_last,
llama_tokens & result) override {
++check_id;
if (check_id < config.check_rate) {
return;
}
check_id = 0;
result = common_ngram_simple_draft(config, prompt_tgt, id_last);
GGML_UNUSED(params);
@@ -746,9 +752,10 @@ static common_ngram_map get_common_ngram_map(const common_speculative_config & c
uint16_t size_key = config.params.ngram_size_n;
uint16_t size_value = config.params.ngram_size_m;
bool key_only = (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K);
uint16_t check_rate = config.params.ngram_check_rate;
uint16_t min_hits = config.params.ngram_min_hits;
return common_ngram_map(size_key, size_value, key_only, min_hits);
return common_ngram_map(size_key, size_value, key_only, check_rate, min_hits);
}
static common_speculative_state_ngram_cache create_state_ngram_cache(
@@ -798,42 +805,6 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
return it->second;
}
bool common_speculative_is_compat(llama_context * ctx_tgt) {
auto * mem = llama_get_memory(ctx_tgt);
if (mem == nullptr) {
return false;
}
bool res = true;
llama_memory_clear(mem, true);
// eval 2 tokens to check if the context is compatible
std::vector<llama_token> tmp;
tmp.push_back(0);
tmp.push_back(0);
int ret = llama_decode(ctx_tgt, llama_batch_get_one(tmp.data(), tmp.size()));
if (ret != 0) {
LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
res = false;
goto done;
}
// try to remove the last tokens
if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
res = false;
goto done;
}
done:
llama_memory_clear(mem, true);
llama_synchronize(ctx_tgt);
return res;
}
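A hedged usage sketch (the `params.speculative` field name is assumed from `common_params`): probe the target context before enabling speculative decoding, keeping in mind that the probe clears the context's memory:

```cpp
common_speculative * spec = nullptr;
if (common_speculative_is_compat(ctx_tgt)) {
    // note: the probe above cleared the memory of ctx_tgt
    spec = common_speculative_init(params.speculative, ctx_tgt);
}
```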
// initialization of the speculative decoding system
//
common_speculative * common_speculative_init(
@@ -924,10 +895,12 @@ common_speculative * common_speculative_init(
uint16_t ngram_size_key = ngram_map.size_key;
uint16_t mgram_size_value = ngram_map.size_value;
uint16_t check_rate = ngram_map.check_rate;
auto config_simple = common_ngram_simple_config {
/* .size_ngram = */ ngram_size_key,
/* .size_mgram = */ mgram_size_value
/* .size_mgram = */ mgram_size_value,
/* .check_rate = */ check_rate
};
auto state = std::make_unique<common_speculative_state_ngram_simple>(
/* .type = */ config.type,
@@ -988,7 +961,6 @@ void common_speculative_begin(common_speculative * spec, const llama_tokens & pr
for (auto & impl : spec->impls) {
common_time_meas tm(impl->t_begin_us, !impl->gen_perf);
impl->begin(prompt);
impl->n_call_begin++;
}
}
@@ -1005,17 +977,17 @@ llama_tokens common_speculative_draft(
{
common_time_meas tm(impl->t_draft_us, !impl->gen_perf);
impl->draft(params, prompt_tgt, id_last, result);
impl->n_call_draft++;
impl->drafts_call_count++;
}
if (!result.empty()) {
LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__,
common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(),
impl.get()->n_call_draft, result.size());
impl.get()->drafts_call_count, result.size());
spec->curr_impl = impl.get(); // set current implementation for stats
impl->n_gen_drafts++;
impl->n_gen_tokens += result.size();
impl->drafts_generated_count++;
impl->drafts_generated_tokens += result.size();
break; // We have a draft, so break out of the loop and return it.
}
@@ -1036,12 +1008,11 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) {
{
common_time_meas tm(impl->t_accept_us, !impl->gen_perf);
if (n_accepted > 0) {
impl->n_acc_drafts++;
impl->n_acc_tokens += n_accepted;
impl->drafts_accepted_count++;
impl->drafts_accepted_tokens += n_accepted;
}
impl->accept(n_accepted);
impl->n_call_accept++;
}
}
@@ -1062,13 +1033,13 @@ void common_speculative_print_stats(const common_speculative * spec) {
str_perf = "";
}
LOG_INF("statistics %s: #calls(b,g,a) = %zu %zu %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
common_speculative_type_to_str(impl->type).c_str(),
impl->n_call_begin, impl->n_call_draft, impl->n_call_accept,
impl->n_gen_drafts,
impl->n_acc_drafts,
impl->n_gen_tokens,
impl->n_acc_tokens,
impl->drafts_call_count,
impl->drafts_generated_count,
impl->drafts_accepted_count,
impl->drafts_generated_tokens,
impl->drafts_accepted_tokens,
str_perf.c_str());
}
}

View File

@@ -14,10 +14,6 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
// convert type to string
std::string common_speculative_type_to_str(enum common_speculative_type type);
// check if the llama_context is compatible for speculative decoding
// note: clears the memory of the context
bool common_speculative_is_compat(llama_context * ctx_tgt);
common_speculative * common_speculative_init(
common_params_speculative & params,
llama_context * ctx_tgt);

File diff suppressed because it is too large

View File

@@ -99,7 +99,6 @@ models = [
{"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "tiny_aya", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/tiny-aya-base", },
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
@@ -107,7 +106,6 @@ models = [
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "jina-v5-nano", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v5-text-nano", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
@@ -115,7 +113,6 @@ models = [
{"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
{"name": "jais-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inceptionai/Jais-2-8B-Chat", },
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
@@ -151,9 +148,6 @@ models = [
{"name": "youtu", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Youtu-LLM-2B", },
{"name": "solar-open", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/upstage/Solar-Open-100B", },
{"name": "exaone-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B", },
{"name": "qwen35", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3.5-9B-Instruct", },
{"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
{"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
]
# some models are known to be broken upstream, so we will skip them as exceptions
@@ -163,7 +157,6 @@ pre_computed_hashes = [
{"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.5-Air", "chkhsh": "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.7-Flash", "chkhsh": "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267"},
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
{"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
{"name": "hunyuan-dense", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-4B-Instruct", "chkhsh": "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6"},
@@ -177,6 +170,7 @@ pre_computed_hashes = [
{"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
# jina-v2-de variants
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.7-Flash", "chkhsh": "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267"},
]

View File

@@ -246,7 +246,7 @@ cmake --build build --config release
1. **Retrieve and prepare model**
You can refer to the general [*Obtaining and quantizing models*](../../README.md#obtaining-and-quantizing-models) guide for model preparation.
You can refer to the general [*Prepare and Quantize*](../../README.md#prepare-and-quantize) guide for model preparation.
**Notes**:

View File

@@ -281,7 +281,7 @@ as `-cl-fp32-correctly-rounded-divide-sqrt`
#### Retrieve and prepare model
You can refer to the general [*Obtaining and quantizing models*](../../README.md#obtaining-and-quantizing-models) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_0.gguf?download=true) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_0.gguf?download=true) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).
##### Check device
@@ -569,7 +569,7 @@ Once it is completed, final results will be in **build/Release/bin**
#### Retrieve and prepare model
You can refer to the general [*Obtaining and quantizing models*](../../README.md#obtaining-and-quantizing-models) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).
##### Check device

View File

@@ -1,182 +0,0 @@
# GGML-VirtGPU Backend
The GGML-VirtGPU backend enables GGML applications to run machine
learning computations on host hardware while the application itself
runs inside a virtual machine. It uses host-guest shared memory to
efficiently share data buffers between the two sides.
This backend relies on the virtio-gpu device and the VirglRenderer API Remoting (APIR) component. The backend is split into two libraries:
- a GGML implementation (the "remoting frontend"), running in the
guest and interacting with the virtgpu device
- a VirglRenderer APIR compatible library (the "remoting backend"),
running in the host and interacting with Virglrenderer and an actual
GGML device backend.
## OS support
| OS | Status | Backend | CI testing | Notes
| -------- | ----------------- | ----------- | ----------- | -----
| MacOS 14 | Supported | ggml-metal | X | Working when compiled on MacOS 14
| MacOS 15 | Supported | ggml-metal | X | Working when compiled on MacOS 14 or MacOS 15
| MacOS 26 | Not tested | | |
| Linux | Under development | ggml-vulkan | not working | Working locally, CI running into deadlocks
## Architecture Overview
The GGML-VirtGPU backend consists of three main components:
```mermaid
graph TD
%% Nodes
subgraph GuestVM ["Guest VM - Frontend"]
App([GGML Application<br/>llama.cpp, etc.])
direction TB
Interface[GGML Backend Interface]
Comm["GGML-VirtGPU<br/>(hypercalls + shared mem)"]
App --> Interface
Interface --> Comm
end
API[virtio-gpu / virglrenderer API]
subgraph HostSystem [Host System - Backend]
direction TB
Dispatcher[GGML-VirtGPU-Backend]
BackendLib[GGML Backend library<br/>Metal / Vulkan / CPU / ...]
Dispatcher --> BackendLib
end
%% Connections
Comm --> API
API --> HostSystem
```
### Key Components
1. **Guest-side Frontend** (`ggml-virtgpu/`): Implements the GGML backend interface and forwards operations to the host
2. **Host-side Backend** (`ggml-virtgpu/backend/`): Receives forwarded operations and executes them on actual hardware backends
3. **Communication Layer**: Uses virtio-gpu hypercalls and shared memory for efficient data transfer
## Features
- **Dynamic backend loading** on the host side (CPU, CUDA, Metal, etc.)
- **Zero-copy data transfer** via host-guest shared memory pages
## Communication Protocol
### Hypercalls and Shared Memory
The backend uses two primary communication mechanisms:
1. **Hypercalls (`DRM_IOCTL_VIRTGPU_EXECBUFFER`)**: Trigger remote execution from guest to host
2. **Shared Memory Pages**: Zero-copy data transfer for tensors and parameters
#### Shared Memory Layout
Each connection uses two shared memory buffers:
- **Data Buffer** (24 MiB): For command/response data and tensor transfers
- **Reply Buffer** (16 KiB): For command replies and status information
- **Data Buffers**: dynamically allocated host-guest shared buffers that serve as GGML buffers.
### APIR Protocol
The Virglrender API Remoting protocol defines three command types:
- `HANDSHAKE`: Protocol version negotiation and capability discovery
- `LOADLIBRARY`: Dynamic loading of backend libraries on the host
- `FORWARD`: API function call forwarding
### Binary Serialization
Commands and data are serialized using a custom binary protocol with:
- Fixed-size encoding for basic types
- Variable-length arrays with size prefixes
- Buffer bounds checking
- Error recovery mechanisms
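A minimal sketch of this encoding style (the names and layout are illustrative, not the actual APIR wire format): fixed-size integers are copied verbatim, and variable-length arrays carry a size prefix:

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

static void put_u32(std::vector<uint8_t> & buf, uint32_t v) {
    const size_t off = buf.size();
    buf.resize(off + sizeof(v));
    std::memcpy(buf.data() + off, &v, sizeof(v)); // fixed-size encoding
}

static void put_bytes(std::vector<uint8_t> & buf, const void * p, uint32_t n) {
    put_u32(buf, n); // size prefix for the variable-length payload
    const size_t off = buf.size();
    buf.resize(off + n);
    std::memcpy(buf.data() + off, p, n);
}
```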
## Supported Operations
### Device Operations
- Device enumeration and capability queries
- Memory information (total/free)
- Backend type detection
### Buffer Operations
- Buffer allocation and deallocation
- Tensor data transfer (host ↔ guest)
- Memory copying and clearing
### Computation Operations
- Graph execution forwarding
## Build Requirements
### Guest-side Dependencies
- `libdrm` for DRM/virtio-gpu communication
- C++20 compatible compiler
- CMake 3.14+
### Host-side Dependencies
- virglrenderer with APIR support (pending upstream review)
- Target backend libraries (libggml-metal, libggml-vulkan, etc.)
## Configuration
### Environment Variables
- `GGML_VIRTGPU_BACKEND_LIBRARY`: Path to the host-side backend library
- `GGML_VIRTGPU_DEBUG`: Enable debug logging
### Build Options
- `GGML_VIRTGPU`: Enable the VirtGPU backend (`ON` or `OFF`, default: `OFF`)
- `GGML_VIRTGPU_BACKEND`: Build the host-side backend component (`ON`, `OFF` or `ONLY`, default: `OFF`)
### System Requirements
- VM with virtio-gpu support
- VirglRenderer with APIR patches
- Compatible backend libraries on host
## Limitations
- **VM-specific**: Only works in virtual machines with virtio-gpu support
- **Host dependency**: Requires properly configured host-side backend
- **Latency**: Small overhead from VM escaping for each operation
- **Shared-memory size**: with the `libkrun` hypervisor, the RAM + VRAM
addressable memory is limited to 64 GB. So the maximum GPU memory
will be `64GB - RAM`, regardless of the hardware VRAM size.
- This work is pending upstream changes in the VirglRenderer
project.
- The backend can be tested with Virglrenderer compiled from source
using this PR:
https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590
- This work is pending changes in the VMM/hypervisor running the
virtual machine, which needs to know how to route the newly
introduced APIR capset.
- The environment variable `VIRGL_ROUTE_VENUS_TO_APIR=1` allows
using the Venus capset until the relevant hypervisors have been
patched. However, setting this flag breaks normal Vulkan/Venus
behavior.
- The environment variable `GGML_REMOTING_USE_APIR_CAPSET` tells the
`ggml-virtgpu` backend to use the APIR capset. This will become
the default once the relevant hypervisors have been patched.
- This work focused on improving the performance of llama.cpp running
in MacOS containers, and is mainly tested on this platform. Linux
support (via `krun`) is in progress.
## See Also
- [Development and Testing](VirtGPU/development.md)
- [Backend configuration](VirtGPU/configuration.md)

View File

@@ -1,174 +0,0 @@
# GGML-VirtGPU Backend Configuration
This document describes the environment variables used by the ggml-virtgpu backend system, covering both the frontend (guest-side) and backend (host-side) components.
## Environment Variables Overview
The ggml-virtgpu backend uses environment variables for configuration across three main components:
- **Frontend (Guest)**: GGML applications running in VMs
- **Hypervisor**: Virglrenderer/APIR system
- **Backend (Host)**: Host-side GGML backend integration
## Frontend (Guest-side) Configuration
### GGML_REMOTING_USE_APIR_CAPSET
- **Location**: `ggml/src/ggml-virtgpu/virtgpu.cpp`
- **Type**: Boolean flag (presence-based)
- **Purpose**: Controls which virtio-gpu capability set to use for communication
- **Values**:
- Set (any value): Use the APIR capset (long-term setup)
- Unset: Use the Venus capset (easier for testing with an unmodified hypervisor)
- **Default**: Unset (Venus capset)
- **Usage**:
```bash
export GGML_REMOTING_USE_APIR_CAPSET=1 # Use APIR capset
# or leave unset for Venus capset
```
## Hypervisor (Virglrenderer/APIR) Configuration
These environment variables are used during the transition phase for
running with an unmodified hypervisor (not supporting the
VirglRenderer APIR component). They will be removed in the future, and
the hypervisor will instead configure VirglRenderer with the APIR
_Configuration Key_.
### VIRGL_APIR_BACKEND_LIBRARY
- **Location**: `virglrenderer/src/apir/apir-context.c`
- **Configuration Key**: `apir.load_library.path`
- **Type**: File path string
- **Purpose**: Path to the APIR backend library that virglrenderer should dynamically load
- **Required**: Yes
- **Example**:
```bash
export VIRGL_APIR_BACKEND_LIBRARY="/path/to/libggml-remotingbackend.so"
```
### VIRGL_ROUTE_VENUS_TO_APIR
- **Location**: `virglrenderer/src/apir/apir-renderer.h`
- **Type**: Boolean flag (presence-based)
- **Purpose**: Temporary workaround to route Venus capset calls to APIR during hypervisor transition period
- **Status**: will be removed once hypervisors support APIR natively
- **Warning**: Breaks normal Vulkan/Venus functionality
- **Usage**:
```bash
export VIRGL_ROUTE_VENUS_TO_APIR=1 # For testing with an unmodified hypervisor
```
### VIRGL_APIR_LOG_TO_FILE
- **Location**: `virglrenderer/src/apir/apir-renderer.c`
- **Environment Variable**: `VIRGL_APIR_LOG_TO_FILE`
- **Type**: File path string
- **Purpose**: Enable debug logging from the VirglRenderer APIR component to specified file
- **Required**: No (optional debugging)
- **Default**: Logging to `stderr`
- **Usage**:
```bash
export VIRGL_APIR_LOG_TO_FILE="/tmp/apir-debug.log"
```
## Backend (Host-side) Configuration
These environment variables are used during the transition phase for
running with an unmodified hypervisor (not supporting the
VirglRenderer APIR component). They will be removed in the future, and
the hypervisor will instead configure VirglRenderer with the APIR
_Configuration Key_.
### APIR_LLAMA_CPP_GGML_LIBRARY_PATH
- **Location**: `ggml/src/ggml-virtgpu/backend/backend.cpp`
- **Environment Variable**: `APIR_LLAMA_CPP_GGML_LIBRARY_PATH`
- **Configuration Key**: `ggml.library.path`
- **Type**: File path string
- **Purpose**: Path to the actual GGML backend library (Metal, CUDA, Vulkan, etc.)
- **Required**: **Yes** - backend initialization fails without this
- **Examples**:
```bash
# macOS with Metal backend
export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-metal.dylib"
# Linux with CUDA backend
export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-cuda.so"
# macOS or Linux with Vulkan backend
export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-vulkan.so"
```
### APIR_LLAMA_CPP_GGML_LIBRARY_REG
- **Location**: `ggml/src/ggml-virtgpu/backend/backend.cpp`
- **Environment Variable**: `APIR_LLAMA_CPP_GGML_LIBRARY_REG`
- **Configuration Key**: `ggml.library.reg`
- **Type**: Function symbol name string
- **Purpose**: Name of the backend registration function to call after loading the library
- **Required**: No (defaults to `ggml_backend_init`)
- **Default**: `ggml_backend_init`
- **Examples**:
```bash
# Metal backend
export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_metal_reg"
# CUDA backend
export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_cuda_reg"
# Vulkan backend
export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_vulkan_reg"
# Generic fallback (default)
# export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_init"
```
### APIR_LLAMA_CPP_LOG_TO_FILE
- **Location**: `ggml/src/ggml-virtgpu/backend/backend.cpp:62`
- **Environment Variable**: `APIR_LLAMA_CPP_LOG_TO_FILE`
- **Type**: File path string
- **Purpose**: Enable debug logging from the GGML backend to specified file
- **Required**: No (optional debugging)
- **Usage**:
```bash
export APIR_LLAMA_CPP_LOG_TO_FILE="/tmp/ggml-backend-debug.log"
```
## Configuration Flow
The configuration system works as follows:
1. **Hypervisor Setup**: Virglrenderer loads the APIR backend library specified by `VIRGL_APIR_BACKEND_LIBRARY`
2. **Context Creation**: When an APIR context is created, it populates a configuration table with environment variables:
- `apir.load_library.path` ← `VIRGL_APIR_BACKEND_LIBRARY`
- `ggml.library.path` ← `APIR_LLAMA_CPP_GGML_LIBRARY_PATH`
- `ggml.library.reg` ← `APIR_LLAMA_CPP_GGML_LIBRARY_REG`
- this step will eventually be performed by the hypervisor itself, with command-line arguments instead of environment variables.
3. **Backend Initialization**: The backend queries the configuration via callbacks:
- `virgl_cbs->get_config(ctx_id, "ggml.library.path")` returns the library path
- `virgl_cbs->get_config(ctx_id, "ggml.library.reg")` returns the registration function
4. **Library Loading**: The backend dynamically loads and initializes the specified GGML library
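A sketch of steps 3 and 4 under stated assumptions: the `get_config` callback and the configuration keys are the ones named in this document, while the callback struct shape and the `dlopen`/`dlsym` plumbing are illustrative:

```cpp
#include <dlfcn.h>
#include <cstdint>

// assumed callback shape; the real struct lives in virglrenderer
struct apir_callbacks {
    const char * (*get_config)(uint32_t ctx_id, const char * key);
};

static void * load_ggml_backend(const apir_callbacks * cbs, uint32_t ctx_id) {
    const char * path = cbs->get_config(ctx_id, "ggml.library.path");
    const char * reg  = cbs->get_config(ctx_id, "ggml.library.reg");
    void * handle = path ? dlopen(path, RTLD_NOW) : nullptr;
    if (!handle) {
        return nullptr; // maps to the "cannot open the GGML library" error below
    }
    // resolve and call the registration entry point, e.g. ggml_backend_metal_reg
    using reg_fn_t = void * (*)();
    auto reg_fn = reinterpret_cast<reg_fn_t>(
        dlsym(handle, reg ? reg : "ggml_backend_init")); // default per this doc
    return reg_fn ? reg_fn() : nullptr;
}
```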
## Error Messages
Common error scenarios and their messages:
- **Missing library path**: `"cannot open the GGML library: env var 'APIR_LLAMA_CPP_GGML_LIBRARY_PATH' not defined"`
- **Missing registration function**: `"cannot register the GGML library: env var 'APIR_LLAMA_CPP_GGML_LIBRARY_REG' not defined"`
## Example Complete Configuration
Here's an example configuration for a macOS host with Metal backend:
```bash
# Hypervisor environment
export VIRGL_APIR_BACKEND_LIBRARY="/opt/llama.cpp/lib/libggml-virtgpu-backend.dylib"
# Backend configuration
export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-metal.dylib"
export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_metal_reg"
# Optional logging
export VIRGL_APIR_LOG_TO_FILE="/tmp/apir.log"
export APIR_LLAMA_CPP_LOG_TO_FILE="/tmp/ggml.log"
# Guest configuration
export GGML_REMOTING_USE_APIR_CAPSET=1
```

View File

@@ -1,220 +0,0 @@
# Development and Testing
## Development
### Code Generation
The backend uses code generation from YAML configuration:
```bash
# Regenerate protocol code
cd ggml-virtgpu/
python regenerate_remoting.py
```
### Adding New Operations
1. Add function definition to `ggmlremoting_functions.yaml`
2. Regenerate code with `regenerate_remoting.py`
3. Implement guest-side forwarding in `virtgpu-forward-*.cpp`
4. Implement host-side handling in `backend-dispatched-*.cpp`
## Testing
This document provides instructions for building and testing the GGML-VirtGPU backend on macOS with containers.
### Prerequisites
The testing setup requires:
- macOS host system
- Container runtime with `libkrun` provider (podman machine)
- Access to development patchset for VirglRenderer
### Required Patchsets
The backend requires patches that are currently under review:
- **Virglrenderer APIR upstream PR**: https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590 (for reference)
- **MacOS Virglrenderer (for krunkit)**: https://gitlab.freedesktop.org/kpouget/virglrenderer/-/tree/main-macos
- **Linux Virglrenderer (for krun)**: https://gitlab.freedesktop.org/kpouget/virglrenderer/-/tree/main-linux
### Build Instructions
#### 1. Build ggml-virtgpu-backend (Host-side, macOS)
```bash
# Build the backend that runs natively on macOS
mkdir llama.cpp
cd llama.cpp
git clone https://github.com/ggml-org/llama.cpp.git src
cd src
LLAMA_MAC_BUILD=$PWD/build/ggml-virtgpu-backend
cmake -S . -B $LLAMA_MAC_BUILD \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=ON \
-DGGML_VIRTGPU_BACKEND=ONLY \
-DGGML_METAL=ON
TARGETS="ggml-metal"
cmake --build $LLAMA_MAC_BUILD --parallel 8 --target $TARGETS
# Build additional tools for native benchmarking
EXTRA_TARGETS="llama-run llama-bench"
cmake --build $LLAMA_MAC_BUILD --parallel 8 --target $EXTRA_TARGETS
```
#### 2. Build virglrenderer (Host-side, macOS)
```bash
# Build virglrenderer with APIR support
mkdir virglrenderer
git clone https://gitlab.freedesktop.org/kpouget/virglrenderer -b main-macos src
cd src
VIRGL_BUILD_DIR=$PWD/build
# -Dvenus=true and VIRGL_ROUTE_VENUS_TO_APIR=1 route the APIR requests via the Venus backend, for easier testing without a patched hypervisor
meson setup $VIRGL_BUILD_DIR \
-Dvenus=true \
-Dapir=true
ninja -C $VIRGL_BUILD_DIR
```
#### 3. Build ggml-virtgpu (Guest-side, Linux)
Option A: Build from source:
```bash
# Inside a Linux container
mkdir llama.cpp
git clone https://github.com/ggml-org/llama.cpp.git src
cd src
LLAMA_LINUX_BUILD=$PWD/build-virtgpu
cmake -S . -B $LLAMA_LINUX_BUILD \
-DGGML_VIRTGPU=ON
cmake --build $LLAMA_LINUX_BUILD --parallel
```
Option B: Build container image with frontend:
```bash
cat << EOF > remoting.containerfile
FROM quay.io/fedora/fedora:43
USER 0
WORKDIR /app/remoting
ARG LLAMA_CPP_REPO="https://github.com/ggml-org/llama.cpp.git"
ARG LLAMA_CPP_VERSION="master"
ARG LLAMA_CPP_CMAKE_FLAGS="-DGGML_VIRTGPU=ON"
ARG LLAMA_CPP_CMAKE_BUILD_FLAGS="--parallel 4"
RUN dnf install -y git cmake gcc gcc-c++ libcurl-devel libdrm-devel
RUN git clone "\${LLAMA_CPP_REPO}" src \\
&& git -C src fetch origin \${LLAMA_CPP_VERSION} \\
&& git -C src reset --hard FETCH_HEAD
RUN mkdir -p build \\
&& cd src \\
&& set -o pipefail \\
&& cmake -S . -B ../build \${LLAMA_CPP_CMAKE_FLAGS} \\
&& cmake --build ../build/ \${LLAMA_CPP_CMAKE_BUILD_FLAGS}
ENTRYPOINT ["/app/remoting/src/build/bin/llama-server"]
EOF
mkdir -p empty_dir
podman build -f remoting.containerfile ./empty_dir -t localhost/llama-cpp.virtgpu
```
### Environment Setup
#### Set krunkit Environment Variables
```bash
# Define the base directories (adapt these paths to your system)
VIRGL_BUILD_DIR=$HOME/remoting/virglrenderer/build
LLAMA_MAC_BUILD=$HOME/remoting/llama.cpp/build-backend
# For krunkit to load the custom virglrenderer library
export DYLD_LIBRARY_PATH=$VIRGL_BUILD_DIR/src
# For Virglrenderer to load the ggml-remotingbackend library
export VIRGL_APIR_BACKEND_LIBRARY="$LLAMA_MAC_BUILD/bin/libggml-virtgpu-backend.dylib"
# For llama.cpp remotingbackend to load the ggml-metal backend
export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="$LLAMA_MAC_BUILD/bin/libggml-metal.dylib"
export APIR_LLAMA_CPP_GGML_LIBRARY_REG=ggml_backend_metal_reg
```
#### Launch Container Environment
```bash
# Set container provider to libkrun
export CONTAINERS_MACHINE_PROVIDER=libkrun
podman machine start
```
#### Verify Environment
Confirm that krunkit is using the correct virglrenderer library:
```bash
lsof -c krunkit | grep virglrenderer
# Expected output:
# krunkit 50574 user txt REG 1,14 2273912 10849442 ($VIRGL_BUILD_DIR/src)/libvirglrenderer.1.dylib
```
### Running Tests
#### Launch Test Container
```bash
# Optional model caching
mkdir -p models
PODMAN_CACHE_ARGS="-v models:/models --user root:root --cgroupns host --security-opt label=disable -w /models"
podman run $PODMAN_CACHE_ARGS -it --rm --device /dev/dri --entrypoint bash localhost/llama-cpp.virtgpu
```
#### Test llama.cpp in Container
```bash
# Run performance benchmark
/app/remoting/build/bin/llama-bench -m ./llama3.2
```
Expected output (performance may vary):
```
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ------------: | -------------------: |
| llama 3B Q4_K - Medium | 1.87 GiB | 3.21 B | ggml-virtgpu | 99 | pp512 | 991.30 ± 0.66 |
| llama 3B Q4_K - Medium | 1.87 GiB | 3.21 B | ggml-virtgpu | 99 | tg128 | 85.71 ± 0.11 |
```
### Troubleshooting
#### SSH Environment Variable Issues
⚠️ **Warning**: Setting `DYLD_LIBRARY_PATH` from an SSH session doesn't work on macOS, because the system strips `DYLD_*` variables from processes spawned by SIP-protected binaries such as `sshd`. Here is a workaround:
**Workaround 1: Replace system library**
```bash
VIRGL_BUILD_DIR=$HOME/remoting/virglrenderer/build # ⚠️ adapt to your system
BREW_VIRGL_DIR=/opt/homebrew/Cellar/virglrenderer/0.10.4d/lib
VIRGL_LIB=libvirglrenderer.1.dylib
cd $BREW_VIRGL_DIR
mv $VIRGL_LIB ${VIRGL_LIB}.orig
ln -s $VIRGL_BUILD_DIR/src/$VIRGL_LIB
```

View File

@@ -22,7 +22,7 @@
**Llama.cpp + ZenDNN**
The llama.cpp ZenDNN backend leverages AMD's optimized matrix multiplication primitives to accelerate inference on AMD CPUs. It utilizes ZenDNN's **LowOHA (Low Overhead Hardware Accelerated)** MatMul operator for efficient GEMM operations with minimal execution overhead, built-in weight caching, and direct access to backend libraries (AOCL DLP, LibXSMM, OneDNN).
The llama.cpp ZenDNN backend leverages AMD's optimized matrix multiplication primitives to accelerate inference on AMD CPUs. It utilizes ZenDNN's **LowOHA (Low Overhead Hardware Accelerated)** MatMul operator for efficient GEMM operations with minimal execution overhead, built-in weight caching, and direct access to backend libraries (AOCL BLIS, LibXSMM, OneDNN).
For more information about ZenDNN, visit: https://www.amd.com/en/developer/zendnn.html
@@ -32,7 +32,7 @@ For more information about ZenDNN, visit: https://www.amd.com/en/developer/zendn
|:-------:|:-------:|:----------------------------------------------:|
| Linux | Support | Ubuntu 20.04, 22.04, 24.04 |
For the latest list of supported operating systems, see the [ZenDNN Supported OS](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/README.md#15-supported-os).
For the latest list of supported operating systems, see the [ZenDNN Supported OS](https://github.com/amd/ZenDNN/blob/zendnnl/README.md#15-supported-os).
## Hardware
@@ -44,9 +44,9 @@ ZenDNN is optimized for AMD EPYC™ processors and AMD Ryzen™ processors based
| CPU Family | Status | Notes |
|:-----------------------------:|:-------:|:----------------------------------:|
| AMD EPYC™ 9005 Series (Turin) | Support | 5th Gen - Zen 5 architecture |
| AMD EPYC™ 9004 Series (Genoa) | Support | 4th Gen - Zen 4 architecture |
| AMD EPYC™ 7003 Series (Milan) | Support | 3rd Gen - Zen 3 architecture |
| AMD EPYC™ 9005 Series (Turin)| Support | 5th Gen - Zen 5 architecture |
| AMD EPYC™ 9004 Series (Genoa)| Support | 4th Gen - Zen 4 architecture |
| AMD EPYC™ 7003 Series (Milan)| Support | 3rd Gen - Zen 3 architecture |
| AMD Ryzen™ AI MAX (Strix Halo)| Support | High-performance mobile processors |
*Notes:*
@@ -61,7 +61,7 @@ The ZenDNN backend currently accelerates **matrix multiplication (MUL_MAT)** ope
| Operation | Status | Notes |
|:-------------|:-------:|:----------------------------------------------:|
| MUL_MAT | Support | Accelerated via ZenDNN LowOHA MatMul |
| MUL_MAT | ✅ | Accelerated via ZenDNN LowOHA MatMul |
*Note:* Since only MUL_MAT is accelerated, models will benefit most from ZenDNN when matrix multiplications dominate the computational workload (which is typical for transformer-based LLMs).
@@ -104,6 +104,7 @@ If you want to build ZenDNN yourself or use a specific version:
# Clone ZenDNN repository
git clone https://github.com/amd/ZenDNN.git
cd ZenDNN
git checkout zendnnl
# Build and install (requires CMake >= 3.25)
mkdir build && cd build
@@ -113,7 +114,7 @@ cmake --build . --target all
Default installation path: `ZenDNN/build/install`
**For detailed build instructions**, refer to the [ZenDNN README](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/README.md).
**For detailed build instructions**, refer to the [ZenDNN README](https://github.com/amd/ZenDNN/blob/zendnnl/README.md).
**Step 2: Build llama.cpp with custom ZenDNN path**
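A minimal sketch of such a build, assuming the default install path from Step 1 (adjust `ZENDNN_ROOT` to your installation):
```sh
cmake -B build -DGGML_ZENDNN=ON -DZENDNN_ROOT=$PWD/ZenDNN/build/install -DGGML_OPENMP=ON
cmake --build build --config Release -j
```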
@@ -145,7 +146,8 @@ Run llama.cpp server with ZenDNN acceleration:
```sh
# Set optimal configuration
export ZENDNNL_MATMUL_ALGO=1 # Blocked AOCL DLP algo for best performance
export OMP_NUM_THREADS=64 # Adjust to your CPU core count
export ZENDNNL_MATMUL_ALGO=2 # Blocked AOCL BLIS for best performance
# Start server
./build/bin/llama-server \
@@ -158,26 +160,62 @@ export ZENDNNL_MATMUL_ALGO=1 # Blocked AOCL DLP algo for best performance
Access the server at `http://localhost:8080`.
**Performance tips**:
- Use `ZENDNNL_MATMUL_ALGO=1` for optimal performance
- Set `OMP_NUM_THREADS` to match your physical core count
- Use `ZENDNNL_MATMUL_ALGO=2` for optimal performance
- For NUMA systems: `numactl --cpunodebind=0 --membind=0 ./build/bin/llama-server ...`
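For example, the tips above combine as follows (model path and thread count are illustrative):
```sh
export ZENDNNL_MATMUL_ALGO=2   # Blocked AOCL BLIS
export OMP_NUM_THREADS=64      # match your physical core count
numactl --cpunodebind=0 --membind=0 \
    ./build/bin/llama-server -m models/llama-2-7b.Q4_0.gguf --port 8080
```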
## Environment Variables
For environment variables related to ZenDNN, refer to the [ZenDNN Environment Variables Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/runtime_env.md).
### Build Time
### Performance Optimization
| Name | Value | Function |
|--------------------|---------------------------------------|---------------------------------------------|
| GGML_ZENDNN | ON/OFF | Enable ZenDNN backend support |
| ZENDNN_ROOT | Path to ZenDNN installation | Set ZenDNN installation directory |
| GGML_OPENMP | ON/OFF (recommended: ON) | Enable OpenMP for multi-threading |
ZenDNN's LowOHA MatMul supports multiple backend algorithms. For **best performance**, use the **Blocked AOCL DLP** algorithm:
### Runtime
| Name | Value | Function |
|-------------------------|--------------------------|-------------------------------------------------------------------|
| OMP_NUM_THREADS | Number (e.g., 64) | Set number of OpenMP threads (recommended: physical core count) |
| ZENDNNL_MATMUL_ALGO | 0-5 | Select MatMul backend algorithm (see Performance Optimization) |
| ZENDNNL_PROFILE_LOG_LEVEL | 0-4 | Profiling log level (0=disabled, 4=verbose) |
| ZENDNNL_ENABLE_PROFILER | 0 or 1 | Enable detailed profiling (1=enabled) |
| ZENDNNL_API_LOG_LEVEL | 0-4 | API log level (0=disabled, 4=verbose) |
**Example**:
```sh
export ZENDNNL_MATMUL_ALGO=1 # Blocked AOCL DLP algo (recommended)
export OMP_NUM_THREADS=64
export ZENDNNL_MATMUL_ALGO=2 # Use Blocked AOCL BLIS for best performance
./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Test" -n 100
```
For more details on available algorithms, see the [ZenDNN MatMul Algorithm Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/runtime_env.md#algorithm-details).
## Performance Optimization
### MatMul Algorithm Selection
ZenDNN's LowOHA MatMul supports multiple backend algorithms. For **best performance**, use the **Blocked AOCL BLIS** algorithm:
```sh
export ZENDNNL_MATMUL_ALGO=2 # Blocked AOCL BLIS (recommended)
```
**Available algorithms**:
| Value | Algorithm | Description |
|:-----:|:-----------------------|:----------------------------------------------|
| 0 | Dynamic Dispatch | Automatic backend selection (default) |
| 1 | AOCL BLIS | AOCL BLIS backend |
| 2 | AOCL BLIS Blocked | **Blocked AOCL BLIS (recommended)** |
| 3 | OneDNN | OneDNN backend |
| 4 | OneDNN Blocked | Blocked OneDNN |
| 5 | LibXSMM | LibXSMM backend |
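To check which algorithm is fastest on a given machine, a quick sweep with `llama-bench` is one option (model path illustrative):
```sh
for algo in 0 1 2 3 4 5; do
    echo "=== ZENDNNL_MATMUL_ALGO=$algo ==="
    ZENDNNL_MATMUL_ALGO=$algo ./build/bin/llama-bench -m models/llama-2-7b.Q4_0.gguf
done
```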
### Profiling and Debugging
For detailed profiling and logging options, refer to the [ZenDNN Logging Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/logging.md).
For detailed profiling and logging options, refer to the [ZenDNN Logging Documentation](https://github.com/amd/ZenDNN/blob/zendnnl/docs/logging.md).
## Known Issues
@@ -207,9 +245,10 @@ A: Currently, ZenDNN primarily supports FP32 and BF16 data types. Quantized mode
A: Ensure:
1. You're using an AMD EPYC or Ryzen processor (Zen 2 or newer)
2. `ZENDNNL_MATMUL_ALGO=1` is set for best performance (Blocked AOCL DLP)
3. You're using a sufficiently large model (small models may not benefit as much)
4. Enable profiling to verify ZenDNN MatMul is being called
2. `OMP_NUM_THREADS` is set appropriately (physical core count)
3. `ZENDNNL_MATMUL_ALGO=2` is set for best performance (Blocked AOCL BLIS)
4. You're using a sufficiently large model (small models may not benefit as much)
5. Enable profiling to verify ZenDNN MatMul is being called (see the sketch below)
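A minimal sketch of that check, using the runtime variables from the table above (model path illustrative):
```sh
export ZENDNNL_ENABLE_PROFILER=1      # enable detailed profiling
export ZENDNNL_PROFILE_LOG_LEVEL=3    # reasonably verbose profiling log
./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Test" -n 16
```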
### **GitHub Contribution**:
Please add the **[ZenDNN]** prefix/tag to issue/PR titles to help the ZenDNN team check and address them without delay.

View File

@@ -35,7 +35,7 @@ Adapt below build commands accordingly.
Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets:
```
[d]/workspace> cp docs/backend/snapdragon/CMakeUserPresets.json .
[d]/workspace> cp docs/backend/hexagon/CMakeUserPresets.json .
[d]/workspace> cmake --preset arm64-android-snapdragon-release -B build-snapdragon
Preset CMake variables:

View File

@@ -242,10 +242,10 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
|------------|-------------|------|-------|
| FP32 | ✅ | ✅ | ❓ |
| FP16 | ✅ | ✅ | ❓ |
| BF16 | | ✅ | ❓ |
| BF16 | 🚫 | ✅ | ❓ |
| Q4_0 | ✅ | ❓ | ❓ |
| Q4_1 | ✅ | ❓ | ❓ |
| MXFP4 | | ❓ | ❓ |
| MXFP4 | 🚫 | ❓ | ❓ |
| Q5_0 | ✅ | ❓ | ❓ |
| Q5_1 | ✅ | ❓ | ❓ |
| Q8_0 | ✅ | ❓ | ❓ |
@@ -272,4 +272,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Feb 15, 2026.
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Sep 7, 2025.

View File

@@ -108,7 +108,7 @@ Building through oneAPI compilers will make avx_vnni instruction set available f
- Using oneAPI docker image:
If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://builders.intel.com/solutionslibrary/optimizing-and-running-llama2-on-intel-cpu) for more information.
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
### Other BLAS libraries

View File

@@ -22,7 +22,7 @@ Legend:
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | | 🟡 | ✅ | ❌ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
@@ -31,7 +31,7 @@ Legend:
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | | ❌ | ❌ |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
@@ -96,13 +96,13 @@ Legend:
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | | ❌ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | | ❌ | ❌ |
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
| SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | | ❌ | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | | ❌ | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | | ❌ | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | | ❌ | ❌ |
| SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |

View File

@@ -77,8 +77,8 @@
"SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","CEIL","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","CEIL","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","CEIL","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","CEIL","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
@@ -161,8 +161,8 @@
"SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","FLOOR","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","CEIL","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","CEIL","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","CEIL","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","CEIL","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","ROUND","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","ROUND","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","TRUNC","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"

View File

@@ -8760,14 +8760,22 @@
"WebGPU: WebGPU","ADD_ID","type_a=f32,type_b=f32,n_embd=129,n_experts=8,n_experts_used=4,n_token=1","support","0","no","WebGPU"
"WebGPU: WebGPU","ADD_ID","type_a=f32,type_b=f32,n_embd=129,n_experts=8,n_experts_used=4,n_token=32","support","0","no","WebGPU"
"WebGPU: WebGPU","ADD_ID","type_a=f32,type_b=f32,n_embd=129,n_experts=8,n_experts_used=4,n_token=129","support","0","no","WebGPU"
"WebGPU: WebGPU","SQR","type=f16,ne=[10,5,4,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","SQRT","type=f16,ne=[10,3,3,2]","support","0","no","WebGPU"
"WebGPU: WebGPU","LOG","type=f16,ne=[10,5,4,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f16,ne=[10,2,2,2]","support","0","no","WebGPU"
"WebGPU: WebGPU","COS","type=f16,ne=[10,2,2,2]","support","0","no","WebGPU"
"WebGPU: WebGPU","CLAMP","type=f16,ne=[10,5,4,3],min=-0.500000,max=0.500000","support","1","yes","WebGPU"
"WebGPU: WebGPU","LEAKY_RELU","type=f16,ne_a=[10,5,4,3],negative_slope=0.100000","support","0","no","WebGPU"
"WebGPU: WebGPU","FLOOR","type=f16,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","CEIL","type=f16,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","ROUND","type=f16,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","TRUNC","type=f16,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQR","type=f16,ne=[7,1,5,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","SQRT","type=f16,ne=[7,1,5,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","LOG","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f16,ne=[7,1,5,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","COS","type=f16,ne=[7,1,5,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","WebGPU"
"WebGPU: WebGPU","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","0","no","WebGPU"
"WebGPU: WebGPU","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
@@ -8778,14 +8786,22 @@
"WebGPU: WebGPU","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQR","type=f32,ne=[10,5,4,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","SQRT","type=f32,ne=[10,3,3,2]","support","0","no","WebGPU"
"WebGPU: WebGPU","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f32,ne=[10,2,2,2]","support","0","no","WebGPU"
"WebGPU: WebGPU","COS","type=f32,ne=[10,2,2,2]","support","0","no","WebGPU"
"WebGPU: WebGPU","CLAMP","type=f32,ne=[10,5,4,3],min=-0.500000,max=0.500000","support","1","yes","WebGPU"
"WebGPU: WebGPU","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","0","no","WebGPU"
"WebGPU: WebGPU","FLOOR","type=f32,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","CEIL","type=f32,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","ROUND","type=f32,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","TRUNC","type=f32,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQR","type=f32,ne=[7,1,5,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","SQRT","type=f32,ne=[7,1,5,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","LOG","type=f32,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f32,ne=[7,1,5,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","COS","type=f32,ne=[7,1,5,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","CLAMP","type=f32,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","WebGPU"
"WebGPU: WebGPU","LEAKY_RELU","type=f32,ne_a=[7,1,5,3],negative_slope=0.100000","support","0","no","WebGPU"
"WebGPU: WebGPU","FLOOR","type=f32,ne=[7,1,5,3]","support","1","yes","WebGPU"
@@ -18885,27 +18901,3 @@
"WebGPU: WebGPU","CROSS_ENTROPY_LOSS_BACK","type=f32,ne=[30000,1,1,1]","support","0","no","WebGPU"
"WebGPU: WebGPU","OPT_STEP_ADAMW","type=f32,ne=[10,5,4,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","OPT_STEP_SGD","type=f32,ne=[10,5,4,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","SQR","type=f16,ne=[10,5,4,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQRT","type=f16,ne=[10,3,3,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f16,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","COS","type=f16,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQR","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQR","type=f16,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQRT","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQRT","type=f16,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f16,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","COS","type=f16,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","COS","type=f16,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f32,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","COS","type=f32,ne=[10,2,2,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQR","type=f32,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQR","type=f32,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQRT","type=f32,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SQRT","type=f32,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f32,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SIN","type=f32,ne=[1024,1024,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","COS","type=f32,ne=[7,1,5,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","COS","type=f32,ne=[1024,1024,1,1]","support","1","yes","WebGPU"

View File

@@ -119,6 +119,8 @@ If a draft model is combined with a draftless decoding the draftless decoding ha
of lookup n-gram (default: 12)
--spec-ngram-size-m N ngram size M for ngram-simple/ngram-map speculative decoding, length
of draft m-gram (default: 48)
--spec-ngram-check-rate N ngram check rate for ngram-simple/ngram-map speculative decoding
(default: 1)
--spec-ngram-min-hits N minimum hits for ngram-map speculative decoding (default: 1)
```
@@ -151,6 +153,10 @@ Sets the size M of the draft m-gram for n-gram map based speculative decoding.
The m-gram size determines how many tokens to draft when a match is found.
Larger values can provide more speedup but may reduce acceptance rate.
### `--spec-ngram-check-rate R`
This option targets performance when the n-gram lookup in the token history is too costly: a lookup is then executed only every R tokens (default: 1, i.e. every token).
### `--spec-ngram-min-hits H`
This option defines how many times a key has to appear in the token history before it is used as a draft (default: 1).
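As an illustration, the two options combine like this (the model path is a placeholder, and `llama-cli` stands in for any tool that accepts these common options):
```
# look up n-grams only every 4th token, and require a key to appear twice before drafting
./build/bin/llama-cli -m model.gguf --spec-ngram-check-rate 4 --spec-ngram-min-hits 2
```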
@@ -169,12 +175,7 @@ draft acceptance rate = 0.70312 ( 90 accepted / 128 generated)
statistics ngram_mod: #calls = 810, #gen drafts = 15, #acc drafts = 15, #gen tokens = 960, #acc tokens = 730, dur(b,g,a) = 0.149, 0.347, 0.005 ms
```
```
statistics ngram_map_k: #calls(b,g,a) = 6 1690 26, #gen drafts = 26, #acc drafts = 26, #gen tokens = 1248, #acc tokens = 968, dur(b,g,a) = 2.234, 1.427, 0.016 ms
```
- `#calls(b,g,a)`: number of calls to begin (new prompt), generation, and accumulation for this implementation
- `#calls`: number of calls to this implementation
- `#gen drafts`: number of drafts generated by this implementation
- `#acc drafts`: number of drafts accepted (partially) by the main model
- `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)

View File

@@ -5,7 +5,6 @@
#include "sampling.h"
#include <algorithm>
#include <clocale>
#include <cstdio>
#include <string>
#include <vector>
@@ -17,8 +16,6 @@ static void print_usage(int, char ** argv) {
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
params.prompt = "Hello my name is";

View File

@@ -5,16 +5,14 @@
#include "common.h"
#include "log.h"
#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <climits>
#include <clocale>
#include <cstdarg>
#include <cstring>
#include <ctime>
#include <unordered_map>
#include <vector>
#include <cassert>
#include <climits>
#include <cstring>
#include <cstdarg>
#include <cinttypes>
#include <ctime>
#include <random>
#include <stdexcept>
#include <sstream>
@@ -876,8 +874,6 @@ static std::string basename(const std::string &path) {
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_init();
struct train_params params = get_default_train_params();

View File

@@ -1,14 +1,11 @@
// Warns users that this filename was deprecated, and provides a link for more information.
#include <clocale>
#include <cstdio>
#include <string>
#include <unordered_map>
// Main
int main(int argc, char** argv) {
std::setlocale(LC_NUMERIC, "C");
std::string filename = "main";
if (argc >= 1) {
filename = argv[0];

View File

@@ -7,7 +7,6 @@
#include <limits.h>
#include <algorithm>
#include <clocale>
#include <cmath>
#include <cstring>
#include <limits>
@@ -539,8 +538,6 @@ static std::string format_input_text(const std::string & prompt, const std::stri
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
ggml_time_init();
common_params params;

View File

@@ -3,7 +3,6 @@
#include "log.h"
#include "llama.h"
#include <clocale>
#include <ctime>
#include <algorithm>
@@ -95,8 +94,6 @@ static void print_raw_embeddings(const float * emb,
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {

View File

@@ -4,8 +4,6 @@
#include "log.h"
#include "llama.h"
#include "llama-cpp.h"
#include <clocale>
#include <string>
#include <vector>
@@ -31,8 +29,6 @@ static bool run(llama_context * ctx, const common_params & params) {
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
base_callback_data cb_data;
common_params params;

View File

@@ -1,7 +1,6 @@
#include "arg.h"
#include "common.h"
#include <clocale>
#include <fstream>
#include <sstream>
#include <string>
@@ -101,8 +100,6 @@ static void write_help(std::ostringstream & ss, const md_file & md) {
}
int main(int, char **) {
std::setlocale(LC_NUMERIC, "C");
for (const auto & md : md_files) {
std::ifstream infile(md.fname);
if (!infile.is_open()) {

View File

@@ -1,14 +1,13 @@
#include "ggml.h"
#include "gguf.h"
#include <algorithm>
#include <clocale>
#include <cstdlib> /* abort() */
#include <cstddef>
#include <cstdio>
#include <cstdlib> /* abort() */
#include <cstring>
#include <stdexcept>
#include <string>
#include <stdexcept>
#include <algorithm>
#include <cstring>
#include <sstream>
#include <fstream>
@@ -627,8 +626,6 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
}
int main(int argc, const char ** argv) {
std::setlocale(LC_NUMERIC, "C");
hash_params params;
manifest_check_params manifest_check;
hash_params_parse(argc, argv, params);

View File

@@ -1,7 +1,6 @@
#include "ggml.h"
#include "gguf.h"
#include <clocale>
#include <cstdio>
#include <string>
#include <sstream>
@@ -241,8 +240,6 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
if (argc < 3) {
printf("usage: %s data.gguf r|w [n]\n", argv[0]);
printf("r: read data.gguf file\n");

View File

@@ -4,11 +4,10 @@
#include "log.h"
#include "llama.h"
#include <algorithm>
#include <clocale>
#include <cstdio>
#include <string>
#include <vector>
#include <algorithm>
struct ngram_data {
bool active = false;
@@ -39,8 +38,6 @@ struct ngram_container {
};
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {

View File

@@ -3,13 +3,10 @@
#include "ngram-cache.h"
#include "llama.h"
#include <clocale>
#include <string>
#include <vector>
int main(int argc, char ** argv){
std::setlocale(LC_NUMERIC, "C");
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {

View File

@@ -3,7 +3,6 @@
#include "common.h"
#include "ngram-cache.h"
#include <clocale>
#include <cstdint>
#include <cstdio>
#include <fstream>
@@ -18,8 +17,6 @@ static void print_usage(char* argv0) {
}
int main(int argc, char ** argv){
std::setlocale(LC_NUMERIC, "C");
if (argc < 3) {
print_usage(argv[0]);
exit(1);

View File

@@ -5,17 +5,14 @@
#include "llama.h"
#include "ggml.h"
#include <cinttypes>
#include <clocale>
#include <cstdint>
#include <cstdio>
#include <cinttypes>
#include <fstream>
#include <string>
#include <vector>
int main(int argc, char ** argv){
std::setlocale(LC_NUMERIC, "C");
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {

View File

@@ -6,7 +6,6 @@
#include "log.h"
#include "llama.h"
#include <clocale>
#include <cstdint>
#include <cstdio>
#include <fstream>
@@ -14,8 +13,6 @@
#include <vector>
int main(int argc, char ** argv){
std::setlocale(LC_NUMERIC, "C");
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {

View File

@@ -77,10 +77,7 @@ causal-verify-embeddings: causal-run-original-embeddings causal-run-converted-em
@./scripts/causal/compare-embeddings-logits.sh
causal-inspect-original-model:
@./scripts/utils/inspect-org-model.py --list-all -s
causal-list-original-model-tensors:
@./scripts/utils/inspect-org-model.py --list-all-short -s
@./scripts/utils/inspect-org-model.py
causal-inspect-converted-model:
@./scripts/utils/inspect-converted-model.sh
@@ -156,7 +153,7 @@ embedding-verify-logits-st: embedding-run-original-model-st embedding-run-conver
embedding-inspect-original-model:
$(call validate_embedding_model_path,embedding-inspect-original-model)
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/utils/inspect-org-model.py -m ${EMBEDDING_MODEL_PATH} --list-all -s
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/utils/inspect-org-model.py -m ${EMBEDDING_MODEL_PATH}
embedding-inspect-converted-model:
@CONVERTED_EMBEDDING_MODEL="$(CONVERTED_EMBEDDING_MODEL)" ./scripts/utils/inspect-converted-model.sh ${CONVERTED_EMBEDDING_MODEL}

View File

@@ -42,15 +42,11 @@ def load_model_and_tokenizer(model_path, device="auto"):
config = config.text_config
multimodal = True
def print_if_exists(label, obj, attr, default="N/A"):
val = getattr(obj, attr) if hasattr(obj, attr) else default
print(f"{label}", val)
print_if_exists("Vocab size: ", config, "vocab_size")
print_if_exists("Hidden size: ", config, "hidden_size")
print_if_exists("Number of layers: ", config, "num_hidden_layers")
print_if_exists("BOS token id: ", config, "bos_token_id")
print_if_exists("EOS token id: ", config, "eos_token_id")
print("Vocab size: ", config.vocab_size)
print("Hidden size: ", config.hidden_size)
print("Number of layers: ", config.num_hidden_layers)
print("BOS token id: ", config.bos_token_id)
print("EOS token id: ", config.eos_token_id)
unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
if unreleased_model_name:

View File

@@ -1,290 +1,67 @@
#!/usr/bin/env python3
import argparse
import json
import os
import re
import struct
import sys
from pathlib import Path
from typing import Optional
import json
from safetensors import safe_open
from collections import defaultdict
parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
args = parser.parse_args()
MODEL_SAFETENSORS_FILE = "model.safetensors"
MODEL_SAFETENSORS_INDEX = "model.safetensors.index.json"
model_path = os.environ.get('MODEL_PATH', args.model_path)
if model_path is None:
parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
DTYPE_SIZES = {
"F64": 8, "I64": 8, "U64": 8,
"F32": 4, "I32": 4, "U32": 4,
"F16": 2, "BF16": 2, "I16": 2, "U16": 2,
"I8": 1, "U8": 1, "BOOL": 1,
"F8_E4M3": 1, "F8_E5M2": 1,
}
# Check if there's an index file (multi-file model)
index_path = os.path.join(model_path, "model.safetensors.index.json")
single_file_path = os.path.join(model_path, "model.safetensors")
SIZE_UNITS = ['B', 'KB', 'MB', 'GB', 'TB']
if os.path.exists(index_path):
# Multi-file model
print("Multi-file model detected")
with open(index_path, 'r') as f:
index_data = json.load(f)
def get_weight_map(model_path: Path) -> Optional[dict[str, str]]:
index_file = model_path / MODEL_SAFETENSORS_INDEX
# Get the weight map (tensor_name -> file_name)
weight_map = index_data.get("weight_map", {})
if index_file.exists():
with open(index_file, 'r') as f:
index = json.load(f)
return index.get("weight_map", {})
# Group tensors by file for efficient processing
file_tensors = defaultdict(list)
for tensor_name, file_name in weight_map.items():
file_tensors[file_name].append(tensor_name)
return None
print("Tensors in model:")
# Process each shard file
for file_name, tensor_names in file_tensors.items():
file_path = os.path.join(model_path, file_name)
print(f"\n--- From {file_name} ---")
def get_all_tensor_names(model_path: Path) -> list[str]:
weight_map = get_weight_map(model_path)
with safe_open(file_path, framework="pt") as f:
for tensor_name in sorted(tensor_names):
tensor = f.get_tensor(tensor_name)
print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}")
if weight_map is not None:
return list(weight_map.keys())
elif os.path.exists(single_file_path):
# Single file model (original behavior)
print("Single-file model detected")
single_file = model_path / MODEL_SAFETENSORS_FILE
if single_file.exists():
try:
with safe_open(single_file, framework="pt", device="cpu") as f:
return list(f.keys())
except Exception as e:
print(f"Error reading {single_file}: {e}")
sys.exit(1)
with safe_open(single_file_path, framework="pt") as f:
keys = f.keys()
print("Tensors in model:")
for key in sorted(keys):
tensor = f.get_tensor(key)
print(f"- {key} : shape = {tensor.shape}, dtype = {tensor.dtype}")
print(f"Error: No safetensors files found in {model_path}")
sys.exit(1)
def find_tensor_file(model_path: Path, tensor_name: str) -> Optional[str]:
weight_map = get_weight_map(model_path)
if weight_map is not None:
return weight_map.get(tensor_name)
single_file = model_path / MODEL_SAFETENSORS_FILE
if single_file.exists():
return single_file.name
return None
def read_safetensors_header(file_path: Path) -> dict:
with open(file_path, 'rb') as f:
header_size = struct.unpack('<Q', f.read(8))[0]
return json.loads(f.read(header_size))
def get_tensor_size_bytes(tensor_meta: dict) -> int:
offsets = tensor_meta.get("data_offsets")
if offsets and len(offsets) == 2:
return offsets[1] - offsets[0]
n_elements = 1
for d in tensor_meta.get("shape", []):
n_elements *= d
return n_elements * DTYPE_SIZES.get(tensor_meta.get("dtype", "F32"), 4)
def format_size(size_bytes: int) -> str:
val = float(size_bytes)
for unit in SIZE_UNITS[:-1]:
if val < 1024.0:
return f"{val:.2f} {unit}"
val /= 1024.0
return f"{val:.2f} {SIZE_UNITS[-1]}"
def get_all_tensor_metadata(model_path: Path) -> dict[str, dict]:
weight_map = get_weight_map(model_path)
if weight_map is not None:
file_to_tensors: dict[str, list[str]] = {}
for tensor_name, file_name in weight_map.items():
file_to_tensors.setdefault(file_name, []).append(tensor_name)
all_metadata: dict[str, dict] = {}
for file_name, tensor_names in file_to_tensors.items():
try:
header = read_safetensors_header(model_path / file_name)
for tensor_name in tensor_names:
if tensor_name in header:
all_metadata[tensor_name] = header[tensor_name]
except Exception as e:
print(f"Warning: Could not read header from {file_name}: {e}", file=sys.stderr)
return all_metadata
single_file = model_path / MODEL_SAFETENSORS_FILE
if single_file.exists():
try:
header = read_safetensors_header(single_file)
return {k: v for k, v in header.items() if k != "__metadata__"}
except Exception as e:
print(f"Error reading {single_file}: {e}")
sys.exit(1)
print(f"Error: No safetensors files found in {model_path}")
sys.exit(1)
def normalize_tensor_name(tensor_name: str) -> str:
normalized = re.sub(r'\.\d+\.', '.#.', tensor_name)
normalized = re.sub(r'\.\d+$', '.#', normalized)
return normalized
def list_all_tensors(
model_path: Path,
short: bool = False,
show_sizes: bool = False,
):
tensor_names = get_all_tensor_names(model_path)
metadata: Optional[dict[str, dict]] = None
if show_sizes:
metadata = get_all_tensor_metadata(model_path)
total_bytes = 0
if short:
seen: dict[str, str] = {}
for tensor_name in sorted(tensor_names):
normalized = normalize_tensor_name(tensor_name)
if normalized not in seen:
seen[normalized] = tensor_name
display_pairs = list(sorted(seen.items()))
name_width = max((len(n) for n, _ in display_pairs), default=0)
for normalized, first_name in display_pairs:
if metadata and first_name in metadata:
m = metadata[first_name]
size = get_tensor_size_bytes(m)
total_bytes += size
print(f"{normalized:{name_width}} {m.get('dtype', '?'):6s} {str(m.get('shape', '')):30s} {format_size(size)}")
else:
print(normalized)
else:
print(f"Error: Neither 'model.safetensors.index.json' nor 'model.safetensors' found in {model_path}")
print("Available files:")
if os.path.exists(model_path):
for item in sorted(os.listdir(model_path)):
print(f" {item}")
else:
name_width = max((len(n) for n in tensor_names), default=0)
for tensor_name in sorted(tensor_names):
if metadata and tensor_name in metadata:
m = metadata[tensor_name]
size = get_tensor_size_bytes(m)
total_bytes += size
print(f"{tensor_name:{name_width}} {m.get('dtype', '?'):6s} {str(m.get('shape', '')):30s} {format_size(size)}")
else:
print(tensor_name)
if show_sizes:
print(f"\nTotal: {format_size(total_bytes)}")
def print_tensor_info(model_path: Path, tensor_name: str, num_values: Optional[int] = None):
tensor_file = find_tensor_file(model_path, tensor_name)
if tensor_file is None:
print(f"Error: Could not find tensor '{tensor_name}' in model index")
print(f"Model path: {model_path}")
sys.exit(1)
file_path = model_path / tensor_file
try:
header = read_safetensors_header(file_path)
tensor_meta = header.get(tensor_name, {})
dtype_str = tensor_meta.get("dtype")
with safe_open(file_path, framework="pt", device="cpu") as f:
if tensor_name in f.keys():
tensor_slice = f.get_slice(tensor_name)
shape = tensor_slice.get_shape()
print(f"Tensor: {tensor_name}")
print(f"File: {tensor_file}")
print(f"Shape: {shape}")
if dtype_str:
print(f"Dtype: {dtype_str}")
if tensor_meta:
print(f"Size: {format_size(get_tensor_size_bytes(tensor_meta))}")
if num_values is not None:
tensor = f.get_tensor(tensor_name)
if not dtype_str:
print(f"Dtype: {tensor.dtype}")
flat = tensor.flatten()
n = min(num_values, flat.numel())
print(f"Values: {flat[:n].tolist()}")
else:
print(f"Error: Tensor '{tensor_name}' not found in {tensor_file}")
sys.exit(1)
except FileNotFoundError:
print(f"Error: The file '{file_path}' was not found.")
sys.exit(1)
except Exception as e:
print(f"An error occurred: {e}")
sys.exit(1)
def main():
parser = argparse.ArgumentParser(
description="Print tensor information from a safetensors model"
)
parser.add_argument(
"tensor_name",
nargs="?",
help="Name of the tensor to inspect"
)
parser.add_argument(
"-m", "--model-path",
type=Path,
help="Path to the model directory (default: MODEL_PATH environment variable)"
)
parser.add_argument(
"-l", "--list-all-short",
action="store_true",
help="List unique tensor patterns (layer numbers replaced with #)"
)
parser.add_argument(
"-la", "--list-all",
action="store_true",
help="List all tensor names with actual layer numbers"
)
parser.add_argument(
"-n", "--num-values",
nargs="?",
const=10,
default=None,
type=int,
metavar="N",
help="Print the first N values of the tensor flattened (default: 10 if flag is given without a number)"
)
parser.add_argument(
"-s", "--sizes",
action="store_true",
help="Show dtype, shape, and size for each tensor when listing"
)
args = parser.parse_args()
model_path = args.model_path
if model_path is None:
model_path_str = os.environ.get("MODEL_PATH")
if model_path_str is None:
print("Error: --model-path not provided and MODEL_PATH environment variable not set")
sys.exit(1)
model_path = Path(model_path_str)
if not model_path.exists():
print(f"Error: Model path does not exist: {model_path}")
sys.exit(1)
if not model_path.is_dir():
print(f"Error: Model path is not a directory: {model_path}")
sys.exit(1)
if args.list_all_short or args.list_all:
list_all_tensors(model_path, short=args.list_all_short, show_sizes=args.sizes)
else:
if args.tensor_name is None:
print("Error: tensor_name is required when not using --list-all-short or --list-all")
sys.exit(1)
print_tensor_info(model_path, args.tensor_name, args.num_values)
if __name__ == "__main__":
main()
print(f" Directory {model_path} does not exist")
exit(1)

View File

@@ -0,0 +1,159 @@
#!/usr/bin/env python3
import argparse
import json
import os
import re
import sys
from pathlib import Path
from typing import Optional
from safetensors import safe_open
MODEL_SAFETENSORS_FILE = "model.safetensors"
MODEL_SAFETENSORS_INDEX = "model.safetensors.index.json"
def get_weight_map(model_path: Path) -> Optional[dict[str, str]]:
index_file = model_path / MODEL_SAFETENSORS_INDEX
if index_file.exists():
with open(index_file, 'r') as f:
index = json.load(f)
return index.get("weight_map", {})
return None
def get_all_tensor_names(model_path: Path) -> list[str]:
weight_map = get_weight_map(model_path)
if weight_map is not None:
return list(weight_map.keys())
single_file = model_path / MODEL_SAFETENSORS_FILE
if single_file.exists():
try:
with safe_open(single_file, framework="pt", device="cpu") as f:
return list(f.keys())
except Exception as e:
print(f"Error reading {single_file}: {e}")
sys.exit(1)
print(f"Error: No safetensors files found in {model_path}")
sys.exit(1)
def find_tensor_file(model_path: Path, tensor_name: str) -> Optional[str]:
weight_map = get_weight_map(model_path)
if weight_map is not None:
return weight_map.get(tensor_name)
single_file = model_path / MODEL_SAFETENSORS_FILE
if single_file.exists():
return single_file.name
return None
def normalize_tensor_name(tensor_name: str) -> str:
normalized = re.sub(r'\.\d+\.', '.#.', tensor_name)
normalized = re.sub(r'\.\d+$', '.#', normalized)
return normalized
def list_all_tensors(model_path: Path, unique: bool = False):
tensor_names = get_all_tensor_names(model_path)
if unique:
seen = set()
for tensor_name in sorted(tensor_names):
normalized = normalize_tensor_name(tensor_name)
if normalized not in seen:
seen.add(normalized)
print(normalized)
else:
for tensor_name in sorted(tensor_names):
print(tensor_name)
def print_tensor_info(model_path: Path, tensor_name: str):
tensor_file = find_tensor_file(model_path, tensor_name)
if tensor_file is None:
print(f"Error: Could not find tensor '{tensor_name}' in model index")
print(f"Model path: {model_path}")
sys.exit(1)
file_path = model_path / tensor_file
try:
with safe_open(file_path, framework="pt", device="cpu") as f:
if tensor_name in f.keys():
tensor_slice = f.get_slice(tensor_name)
shape = tensor_slice.get_shape()
print(f"Tensor: {tensor_name}")
print(f"File: {tensor_file}")
print(f"Shape: {shape}")
else:
print(f"Error: Tensor '{tensor_name}' not found in {tensor_file}")
sys.exit(1)
except FileNotFoundError:
print(f"Error: The file '{file_path}' was not found.")
sys.exit(1)
except Exception as e:
print(f"An error occurred: {e}")
sys.exit(1)
def main():
parser = argparse.ArgumentParser(
description="Print tensor information from a safetensors model"
)
parser.add_argument(
"tensor_name",
nargs="?", # optional (if --list is used for example)
help="Name of the tensor to inspect"
)
parser.add_argument(
"-m", "--model-path",
type=Path,
help="Path to the model directory (default: MODEL_PATH environment variable)"
)
parser.add_argument(
"-l", "--list",
action="store_true",
help="List unique tensor patterns in the model (layer numbers replaced with #)"
)
args = parser.parse_args()
model_path = args.model_path
if model_path is None:
model_path_str = os.environ.get("MODEL_PATH")
if model_path_str is None:
print("Error: --model-path not provided and MODEL_PATH environment variable not set")
sys.exit(1)
model_path = Path(model_path_str)
if not model_path.exists():
print(f"Error: Model path does not exist: {model_path}")
sys.exit(1)
if not model_path.is_dir():
print(f"Error: Model path is not a directory: {model_path}")
sys.exit(1)
if args.list:
list_all_tensors(model_path, unique=True)
else:
if args.tensor_name is None:
print("Error: tensor_name is required when not using --list")
sys.exit(1)
print_tensor_info(model_path, args.tensor_name)
if __name__ == "__main__":
main()
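A usage sketch for this script (the script location and model/tensor names are illustrative):
```bash
# list unique tensor name patterns (layer numbers collapsed to '#')
./scripts/utils/inspect-org-model.py -m ~/models/my-model -l
# inspect a single tensor, taking the model path from the environment
MODEL_PATH=~/models/my-model ./scripts/utils/inspect-org-model.py model.embed_tokens.weight
```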

View File

@@ -7,13 +7,12 @@
#include "log.h"
#include "llama.h"
#include <algorithm>
#include <clocale>
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
#include <ctime>
#include <algorithm>
// trim whitespace from the beginning and end of a string
static std::string trim(const std::string & str) {
@@ -154,8 +153,6 @@ static std::vector<std::string> split_string(const std::string& input, char deli
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
srand(1234);
common_params params;

View File

@@ -3,7 +3,6 @@
#include "log.h"
#include "llama.h"
#include <clocale>
#include <cmath>
#include <cstdio>
#include <string>
@@ -17,8 +16,6 @@ static void print_usage(int, char ** argv) {
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
params.n_junk = 250;

View File

@@ -4,7 +4,6 @@
#include "llama.h"
#include <algorithm>
#include <clocale>
#include <fstream>
#include <iostream> // TODO: remove me
@@ -113,8 +112,6 @@ static void batch_process(llama_context * ctx, llama_batch & batch, float * outp
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {

View File

@@ -2,21 +2,15 @@
#include "common.h"
#include "llama.h"
#include <clocale>
#include <vector>
#include <cstdio>
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
params.prompt = "The quick brown fox";
params.sampling.seed = 1234;
const std::string_view state_file = "dump_state.bin";
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}
@@ -59,16 +53,35 @@ int main(int argc, char ** argv) {
// tokenize prompt
auto tokens = common_tokenize(ctx, params.prompt, true);
const bool save_state = true;
if (!common_prompt_batch_decode(ctx, tokens, n_past, params.n_batch, state_file, save_state)) {
return 1;
// prepare the batch
llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
for (size_t i = 0; i < tokens.size(); i++) {
common_batch_add(batch, tokens[i], i, {0}, false);
}
batch.logits[batch.n_tokens - 1] = true; // generate next token
// evaluate prompt
llama_decode(ctx, batch);
n_past += batch.n_tokens;
// save state (rng, logits, embedding and kv_cache) to file
{
std::vector<uint8_t> state_mem(llama_state_get_size(ctx));
const size_t written = llama_state_get_data(ctx, state_mem.data(), state_mem.size());
FILE *fp_write = fopen("dump_state.bin", "wb");
fwrite(state_mem.data(), 1, written, fp_write);
fclose(fp_write);
fprintf(stderr, "%s : serialized state into %zd out of a maximum of %zd bytes\n", __func__, written, state_mem.size());
}
// save state (last tokens)
const auto n_past_saved = n_past;
// first run
printf("\nfirst run: %s", params.prompt.c_str());
llama_batch batch = llama_batch_init(1, 0, 1);
for (auto i = 0; i < params.n_predict; i++) {
auto next_token = llama_sampler_sample(smpl, ctx, -1);
auto next_token_str = common_token_to_piece(ctx, next_token);
@@ -98,23 +111,27 @@ int main(int argc, char ** argv) {
printf("\nsecond run: %s", params.prompt.c_str());
// load state from file
std::vector<llama_token> unused_sts(tokens.size()); // unused session tokens.
size_t n_token_count_out = 0;
// load state (rng, logits, embedding and kv_cache) from file
{
std::vector<uint8_t> state_mem;
if (!llama_state_load_file(ctx2, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
fprintf(stderr, "\n%s : failed to load state\n", __func__);
return 1;
FILE * fp_read = fopen("dump_state.bin", "rb");
fseek(fp_read, 0, SEEK_END);
state_mem.resize(ftell(fp_read));
fseek(fp_read, 0, SEEK_SET);
const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
fclose(fp_read);
if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) {
fprintf(stderr, "\n%s : failed to read state\n", __func__);
return 1;
}
fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
}
fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
// restore state (last tokens)
n_past = n_token_count_out;
if (!common_replay_last_token(ctx2, tokens.back(), n_past)) {
return 1;
}
++n_past;
n_past = n_past_saved;
// second run
for (auto i = 0; i < params.n_predict; i++) {
@@ -143,9 +160,7 @@ int main(int argc, char ** argv) {
}
// make new context
auto params_ctx3 = common_context_params_to_llama(params);
params_ctx3.n_seq_max = 2;
llama_context * ctx3 = llama_init_from_model(model, params_ctx3);
llama_context * ctx3 = llama_init_from_model(model, common_context_params_to_llama(params));
llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
@@ -154,21 +169,26 @@ int main(int argc, char ** argv) {
printf("\nsingle seq run: %s", params.prompt.c_str());
// load state (rng, logits, embedding and kv_cache) from file
n_token_count_out = 0;
{
std::vector<uint8_t> state_mem;
if (!llama_state_load_file(ctx3, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
fprintf(stderr, "\n%s : failed to load state\n", __func__);
return 1;
FILE * fp_read = fopen("dump_state.bin", "rb");
fseek(fp_read, 0, SEEK_END);
state_mem.resize(ftell(fp_read));
fseek(fp_read, 0, SEEK_SET);
const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
fclose(fp_read);
if (read != llama_state_set_data(ctx3, state_mem.data(), state_mem.size())) {
fprintf(stderr, "\n%s : failed to read state\n", __func__);
return 1;
}
fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
}
fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
// restore state (last tokens)
n_past = n_token_count_out;
if (!common_replay_last_token(ctx3, tokens.back(), n_past)) {
return 1;
}
++n_past;
n_past = n_past_saved;
// save seq 0 and load into seq 1
{

View File

@@ -1,5 +1,4 @@
#include "llama.h"
#include <clocale>
#include <cstdio>
#include <cstring>
#include <iostream>
@@ -13,8 +12,6 @@ static void print_usage(int, char ** argv) {
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
std::string model_path;
int ngl = 99;
int n_ctx = 2048;

View File

@@ -1,5 +1,4 @@
#include "llama.h"
#include <clocale>
#include <cstdio>
#include <cstring>
#include <string>
@@ -12,8 +11,6 @@ static void print_usage(int, char ** argv) {
}
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
// path to the model gguf file
std::string model_path;
// prompt to generate text from

View File

@@ -5,15 +5,12 @@
#include "log.h"
#include "llama.h"
#include <clocale>
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {

View File

@@ -5,7 +5,6 @@
#include "llama.h"
#include <algorithm>
#include <clocale>
#include <cstdio>
#include <cstring>
#include <random>
@@ -31,8 +30,6 @@ struct seq_draft {
};
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
// needed to get candidate probs even for temp <= 0.0

View File

@@ -6,10 +6,8 @@
#include "ggml-sycl.h"
#include <clocale>
int main() {
std::setlocale(LC_NUMERIC, "C");
ggml_backend_sycl_print_sycl_devices();
return 0;
}

View File

@@ -3,7 +3,6 @@
#include "log.h"
#include "llama.h"
#include <clocale>
#include <cmath>
#include <cstdio>
#include <cstring>
@@ -15,8 +14,6 @@
#endif
int main(int argc, char ** argv) {
std::setlocale(LC_NUMERIC, "C");
common_params params;
params.escape = false;

View File

@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
### GGML Version
set(GGML_VERSION_MAJOR 0)
set(GGML_VERSION_MINOR 9)
set(GGML_VERSION_PATCH 7)
set(GGML_VERSION_PATCH 5)
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)

View File

@@ -68,7 +68,7 @@ extern "C" {
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API void ggml_backend_tensor_copy(const struct ggml_tensor * src, struct ggml_tensor * dst);
//
// Backend (stream)
@@ -109,7 +109,18 @@ extern "C" {
// the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
// asynchronous tensor shuffle
// - src1, dst1 belong to backend_1
// - src2, dst2 belong to backend_2
// - src1 is copied to dst2
// - src2 is copied to dst1
// - both backends wait until both copies have completed
GGML_API void ggml_backend_tensor_shfl_async(
ggml_backend_t backend_1, ggml_backend_t backend_2,
const struct ggml_tensor * src1, const struct ggml_tensor * src2,
struct ggml_tensor * dst1, struct ggml_tensor * dst2);
GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
@@ -135,7 +146,9 @@ extern "C" {
// integrated GPU device using host memory
GGML_BACKEND_DEVICE_TYPE_IGPU,
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_ACCEL
GGML_BACKEND_DEVICE_TYPE_ACCEL,
// "meta" device wrapping multiple other devices for tensor parallelism
GGML_BACKEND_DEVICE_TYPE_META,
};
// functionality supported by the device
@@ -211,6 +224,52 @@ extern "C" {
};
typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
//
// Meta backend
//
enum ggml_backend_meta_split_state {
// tensor split by tensor dimensions:
GGML_BACKEND_SPLIT_STATE_BY_NE0 = 0,
GGML_BACKEND_SPLIT_STATE_BY_NE1 = 1,
GGML_BACKEND_SPLIT_STATE_BY_NE2 = 2,
GGML_BACKEND_SPLIT_STATE_BY_NE3 = 3,
GGML_BACKEND_SPLIT_STATE_MIRRORED = 10, // all values on all backends
GGML_BACKEND_SPLIT_STATE_PARTIAL = 11, // each backend has a partial sum
// for internal bookkeeping only:
GGML_BACKEND_SPLIT_STATE_NONE = 98,
GGML_BACKEND_SPLIT_STATE_UNKNOWN = 99,
};
// function to assign split states to statically allocated tensors; split states of computed tensors will be assigned to be compatible:
typedef enum ggml_backend_meta_split_state (*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
GGML_API bool ggml_backend_dev_is_meta(ggml_backend_dev_t dev);
GGML_API size_t ggml_backend_meta_dev_n_devs(ggml_backend_dev_t meta_dev);
GGML_API ggml_backend_dev_t ggml_backend_meta_dev_simple_dev(ggml_backend_dev_t meta_dev, size_t index);
// create a new meta device from "simple" devices; the meta buffer type/buffer/backend is then derived from it:
GGML_API ggml_backend_dev_t ggml_backend_meta_device(
ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
GGML_API bool ggml_backend_buft_is_meta(ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_meta_buft_n_bufts(ggml_backend_buffer_type_t meta_buft);
GGML_API ggml_backend_buffer_type_t ggml_backend_meta_buft_simple_buft(ggml_backend_buffer_type_t meta_buft, size_t index);
GGML_API bool ggml_backend_buffer_is_meta(ggml_backend_buffer_t buf);
GGML_API size_t ggml_backend_meta_buffer_n_bufs(ggml_backend_buffer_t meta_buf);
GGML_API ggml_backend_buffer_t ggml_backend_meta_buffer_simple_buffer(ggml_backend_buffer_t meta_buf, size_t index);
GGML_API struct ggml_tensor * ggml_backend_meta_buffer_simple_tensor(const struct ggml_tensor * tensor, size_t index);
GGML_API bool ggml_backend_is_meta(ggml_backend_t backend);
GGML_API size_t ggml_backend_meta_n_backends(ggml_backend_t meta_backend);
GGML_API ggml_backend_t ggml_backend_meta_simple_backend(ggml_backend_t meta_backend, size_t index);
GGML_API enum ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync);
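For example, enumerating the simple backends wrapped by a meta backend (a sketch; assumes <stdio.h>):

    if (ggml_backend_is_meta(backend)) {
        for (size_t i = 0; i < ggml_backend_meta_n_backends(backend); i++) {
            ggml_backend_t simple = ggml_backend_meta_simple_backend(backend, i);
            printf("simple backend %zu: %s\n", i, ggml_backend_name(simple));
        }
    }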
//
// Backend registry
//

View File

@@ -730,6 +730,10 @@ extern "C" {
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
GGML_DEPRECATED(
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
"use ggml_row_size() instead");
GGML_API const char * ggml_type_name(enum ggml_type type);
GGML_API const char * ggml_op_name (enum ggml_op op);
GGML_API const char * ggml_op_symbol(enum ggml_op op);
@@ -748,7 +752,6 @@ extern "C" {
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_empty (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_view (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);

View File

@@ -200,6 +200,7 @@ add_library(ggml-base
ggml.cpp
ggml-alloc.c
ggml-backend.cpp
ggml-backend-meta.cpp
ggml-opt.cpp
ggml-threading.cpp
ggml-threading.h

View File

@@ -1,5 +1,6 @@
#include "ggml-alloc.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml.h"
#include "ggml-impl.h"
#include <assert.h>
@@ -17,6 +18,11 @@
//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
#define AT_PRINTF(...)
static bool ggml_is_view(const struct ggml_tensor * t) {
return t->view_src != NULL;
}
// ops that return true for this function must not use restrict pointers for their backend implementations
bool ggml_op_can_inplace(enum ggml_op op) {
switch (op) {
@@ -622,7 +628,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
GGML_ASSERT(buffer_id >= 0);
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_impl_is_view(node)) {
if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
hn->allocated = true;
assert(hn->addr.offset == 0);
@@ -653,7 +659,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
if (p_hn->n_children == 1 && p_hn->n_views == 0) {
if (ggml_impl_is_view(parent)) {
if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = parent->view_src;
struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
@@ -734,7 +740,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
// GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
// control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
// itself is never used and should not be considered a dependency
if (ggml_impl_is_view(node) && node->op != GGML_OP_NONE) {
if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
struct ggml_tensor * view_src = node->view_src;
ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
}
@@ -801,7 +807,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
if (ggml_impl_is_view(parent)) {
if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = parent->view_src;
struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
view_src_hn->n_views -= 1;
@@ -1235,6 +1241,9 @@ size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx,
}
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
if (ggml_backend_buft_is_meta(buft)) {
return ggml_backend_meta_alloc_ctx_tensors_from_buft(ctx, buft);
}
size_t nbytes_total = 0;
return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
}
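The meta case is routed to ggml_backend_meta_alloc_ctx_tensors_from_buft, the "temporary workaround" declared in ggml-backend-impl.h below, since the generic path cannot allocate a tensor that has to be materialized once per underlying simple buffer.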

View File

@@ -2,7 +2,9 @@
// ggml-backend internal header
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
@@ -90,9 +92,16 @@ extern "C" {
void (*free)(ggml_backend_t backend);
// (optional) asynchronous tensor data access
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
void (*set_tensor_async) (ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async) (ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
void (*set_tensor_2d_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
void (*get_tensor_2d_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
bool (*shfl_tensor_async)(ggml_backend_t backend_1, ggml_backend_t backend_2,
const struct ggml_tensor * src1, const struct ggml_tensor * src2, struct ggml_tensor * dst1, struct ggml_tensor * dst2);
// (optional) backend-specific AllReduce operation for meta backend
bool (*allreduce_tensor_async)(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
// (optional) complete all pending operations (required if the backend supports async operations)
void (*synchronize)(ggml_backend_t backend);
@@ -250,6 +259,9 @@ extern "C" {
# define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
#endif
// temporary workaround to statically allocate tensors from a context in a deduplicated way:
GGML_API struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large

View File

@@ -471,10 +471,9 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
int best_score = 0;
fs::path best_path;
std::error_code ec;
for (const auto & search_path : search_paths) {
if (!fs::exists(search_path, ec)) {
if (std::error_code ec; !fs::exists(search_path, ec)) {
if (ec) {
GGML_LOG_DEBUG("%s: posix_stat(%s) failure, error-message: %s\n", __func__, path_str(search_path).c_str(), ec.message().c_str());
} else {
@@ -484,7 +483,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
}
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
for (const auto & entry : dir_it) {
if (entry.is_regular_file(ec)) {
if (entry.is_regular_file()) {
auto filename = entry.path().filename();
auto ext = entry.path().extension();
if (filename.native().find(file_prefix) == 0 && ext == file_extension) {

View File

@@ -123,7 +123,7 @@ size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
GGML_ASSERT(buffer);
// get_base is optional if the buffer is zero-sized
if (buffer->size == 0) {
if (!ggml_backend_buffer_is_meta(buffer) && buffer->size == 0) {
return NULL;
}
@@ -388,7 +388,7 @@ ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
// backend copy
void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
void ggml_backend_tensor_copy(const struct ggml_tensor * src, struct ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
if (src == dst) {
@@ -402,7 +402,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
} else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
#endif
#endif // NDEBUG
size_t nbytes = ggml_nbytes(src);
void * data = malloc(nbytes);
ggml_backend_tensor_get(src, data, 0, nbytes);
@@ -411,7 +411,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
}
}
void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
if (src == dst) {
@@ -432,6 +432,20 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
ggml_backend_tensor_copy(src, dst);
}
void ggml_backend_tensor_shfl_async(
ggml_backend_t backend_1, ggml_backend_t backend_2,
const struct ggml_tensor * src1, const struct ggml_tensor * src2,
struct ggml_tensor * dst1, struct ggml_tensor * dst2) {
GGML_ASSERT(ggml_are_same_layout(src1, dst1) && "cannot shuffle tensors with different layouts");
GGML_ASSERT(ggml_are_same_layout(src2, dst2) && "cannot shuffle tensors with different layouts");
if (backend_1->iface.shfl_tensor_async != NULL) {
if (backend_1->iface.shfl_tensor_async(backend_1, backend_2, src1, src2, dst1, dst2)) {
return;
}
}
ggml_backend_tensor_copy_async(backend_1, backend_2, src1, dst2);
ggml_backend_tensor_copy_async(backend_2, backend_1, src2, dst1);
}
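If backend_1 does not provide shfl_tensor_async (or it declines the pair), the shuffle degrades gracefully into two plain async copies, and each of those in turn falls back to a synchronized ggml_backend_tensor_copy when cpy_tensor_async is unavailable, so the call stays correct and only loses the combined-synchronization optimization.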
// events
ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device) {
@@ -500,6 +514,7 @@ enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
}
void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
GGML_ASSERT(device);
memset(props, 0, sizeof(*props));
device->iface.get_props(device, props);
}
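Zero-initializing props before dispatch means any fields a backend's get_props does not fill in (for example, fields added to the struct after the backend was written) read as zero instead of uninitialized stack memory.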
@@ -1455,6 +1470,10 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
int split_backend_id = split->backend_id;
ggml_backend_t split_backend = sched->backends[split_backend_id];
if (sched->events[split_backend_id][sched->cur_copy] == NULL) {
ggml_backend_synchronize(split_backend);
}
// copy the input tensors to the split backend
for (int input_id = 0; input_id < split->n_inputs; input_id++) {
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
@@ -1465,16 +1484,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
ggml_backend_tensor_copy(input, input_cpy);
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
} else {
// wait for the split backend to finish using the input before overwriting it
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
// when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used
@@ -1578,6 +1593,10 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
}
if (sched->events[split_backend_id][sched->cur_copy] == NULL) {
ggml_backend_synchronize(split_backend);
}
if (!sched->callback_eval) {
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
if (ec != GGML_STATUS_SUCCESS) {
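Taken together with the hunk above, the synchronization policy becomes: when no per-copy event exists for the split backend, synchronize it once before copying the inputs and once before launching the graph, rather than re-synchronizing inside the per-input loop; the event-based path is unchanged.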
@@ -1899,8 +1918,9 @@ enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct
GGML_ASSERT(tensor->data == NULL);
GGML_ASSERT(tensor->view_src == NULL);
GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
(char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
GGML_ASSERT(ggml_backend_buffer_is_meta(buffer) ||
(char *) addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
(char *) ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
tensor->buffer = buffer;
tensor->data = addr;

View File

@@ -260,8 +260,12 @@ static struct ggml_backend_i blas_backend_i = {
/* .get_name = */ ggml_backend_blas_get_name,
/* .free = */ ggml_backend_blas_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -3286,223 +3286,130 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context & ctx, ggml_tensor
}
/**
* @brief Performs quantized matrix multiplication for Mixture of Experts (MoE)
* models using the CANN backend.
* @brief Performs expert-specific matrix multiplication (MoE) with
* quantized precision using the CANN backend.
*
* This function implements MUL_MAT_ID operation for quantized weight matrices
* (Q4_0 and Q8_0 formats). It selects expert-specific weight matrices based on
* the provided expert indices, and computes matrix multiplication using CANN's
* WeightQuantBatchMatmulV2 operator.
* This function executes a matrix multiplication operation tailored for
* Mixture of Experts (MoE) models, where the input tensor is multiplied
* with expert-specific quantized weight matrices. It leverages the CANN
* backend to perform efficient low-precision computations and stores the
* quantized result in the destination tensor `dst`.
*
* The function performs the following steps:
* 1. Converts input/output tensors to F16 format if necessary
* 2. Uses IndexSelect to extract expert-specific weights and scales based on indices
* 3. Performs quantized matrix multiplication for each expert using WeightQuantBatchMatmulV2
* 4. Converts output back to the target type if needed
* Quantization techniques reduce memory footprint and improve performance
* by using lower-bit representations (e.g., int8) instead of floating-point.
* This function is designed to work with such formats and may incorporate
* optimizations like identity-based fast paths or routing masks for sparse
* expert selection.
*
* Tensor shapes:
* - dst: [M, K, N, 1] - output tensor
* - src0: [D, M, A, 1] - quantized weight matrices (Q4_0 or Q8_0)
* - src1: [D, B, N, 1] - input activations (B = K for per-expert input, or B = 1 for broadcast)
* - ids: [K, N] - expert indices for routing
* @param ctx The context for executing CANN backend operations.
* @param dst The destination tensor where the quantized MoE multiplication result
* will be stored.
*
* @param ctx The CANN backend context for operation execution.
* @param dst The destination tensor where the multiplication result will be stored.
*
* @note Only Q4_0 and Q8_0 quantization formats are supported.
* @note The function handles automatic type conversion to/from F16 as needed by the hardware.
* @note This function assumes quantized data types and is designed for
* MoE architectures with potential sparse expert routing.
*/
static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
// dst: [M, K, N, 1]
// src0: [D, M, A, 1] - quantized weights
// src1: [D, B, N, 1] - input activations, B = K or B = 1
// ids: [K, N] - expert indices
ggml_tensor * src0 = dst->src[0];
ggml_tensor * src1 = dst->src[1];
ggml_tensor * ids = dst->src[2];
// TODO: Use aclnnGroupedMatMul
// dst  [M, K, N, 1]
ggml_tensor * src0 = dst->src[0]; // src0 [D, M, A, 1]
ggml_tensor * src1 = dst->src[1]; // src1 [D, B, N, 1], B = K or B = 1
ggml_tensor * ids = dst->src[2]; // ids  [K, N]
GGML_ASSERT(src0->ne[3] == 1);
GGML_ASSERT(src1->ne[3] == 1);
GGML_ASSERT(dst->ne[3] == 1);
GGML_ASSERT(src1->ne[2] == ids->ne[1]);
GGML_TENSOR_BINARY_OP_LOCALS
const int64_t n_batches = ids->ne[1];
const int64_t n_select_experts = ids->ne[0];
const enum ggml_type type = src0->type;
// copy the expert indices from NPU to CPU
int64_t n_as = ne02; // A
int64_t n_ids = ids->ne[0]; // K
const int32_t group_size = QK8_0; // Both Q4_0 and Q8_0 use group size of 32
GGML_ASSERT(group_size == QK4_0);
std::vector<char> ids_host(ggml_nbytes(ids));
ACL_CHECK(aclrtMemcpyAsync(ids_host.data(), ggml_nbytes(ids), ids->data, ggml_nbytes(ids),
ACL_MEMCPY_DEVICE_TO_HOST, ctx.stream()));
ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
// Calculate element size for quantized weights
const float weight_elem_size =
(type == GGML_TYPE_Q4_0) ? 0.5f :
(type == GGML_TYPE_Q8_0) ? 1.0f :
(GGML_ABORT("MUL_MAT_ID only supports Q4_0 and Q8_0"), 0.0f);
char * src0_original = (char *) src0->data;
char * src1_original = (char *) src1->data;
char * dst_original = (char *) dst->data;
// Calculate scale offset in memory
const size_t weight_size = src0->ne[0] * src0->ne[1] * src0->ne[2] * weight_elem_size;
const size_t scale_elem_size = sizeof(uint16_t);
char * scale_data = (char *) src0->data + weight_size;
ggml_tensor src0_row = *src0;
ggml_tensor src1_row = *src1;
ggml_tensor dst_row = *dst;
// Allocate buffers for selected expert weights and scales
const size_t selected_weight_size = src0->ne[0] * src0->ne[1] * n_select_experts * weight_elem_size;
ggml_cann_pool_alloc selected_weight_alloc(ctx.pool(), selected_weight_size);
void * selected_weight_buffer = selected_weight_alloc.get();
const size_t selected_scale_size = (src0->ne[0] / group_size) * src0->ne[1] * n_select_experts * scale_elem_size;
ggml_cann_pool_alloc selected_scale_alloc(ctx.pool(), selected_scale_size);
void * selected_scale_buffer = selected_scale_alloc.get();
// Helper lambda to allocate and cast tensor to F16 if needed
constexpr size_t f16_elem_size = sizeof(uint16_t);
auto prepare_f16_buffer = [&](ggml_tensor * tensor, ggml_cann_pool_alloc & allocator,
bool need_cast = false) -> void * {
if (tensor->type == GGML_TYPE_F16) {
return tensor->data;
}
size_t total_size = f16_elem_size;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
total_size *= tensor->ne[i];
}
void * buffer = allocator.alloc(total_size);
if (need_cast == false) {
return buffer;
}
int64_t ne[GGML_MAX_DIMS];
size_t nb[GGML_MAX_DIMS] = { f16_elem_size };
for (int i = 0; i < GGML_MAX_DIMS; i++) {
ne[i] = tensor->ne[i];
if (i > 0) {
nb[i] = nb[i - 1] * ne[i - 1];
}
}
acl_tensor_ptr src_tensor = ggml_cann_create_tensor(tensor);
acl_tensor_ptr f16_tensor = ggml_cann_create_tensor(buffer, ACL_FLOAT16, f16_elem_size, ne, nb, GGML_MAX_DIMS);
aclnn_cast(ctx, src_tensor.get(), f16_tensor.get(), ACL_FLOAT16);
return buffer;
};
// Prepare input and output buffers
ggml_cann_pool_alloc input_alloc(ctx.pool());
void * input_buffer = prepare_f16_buffer(src1, input_alloc, true);
ggml_cann_pool_alloc output_alloc(ctx.pool());
void * output_buffer = prepare_f16_buffer(dst, output_alloc, false);
// Process each batch
for (int64_t batch_idx = 0; batch_idx < n_batches; batch_idx++) {
// Create index tensor for current batch
const size_t index_offset = batch_idx * ids->nb[1];
acl_tensor_ptr batch_indices = ggml_cann_create_tensor(ids, ids->ne, ids->nb, 1, ACL_FORMAT_ND, index_offset);
// Select quantized weights using expert indices
// Q4_0 stores 2 values per byte, Q8_0 stores 1 value per byte
const int64_t weight_d = (type == GGML_TYPE_Q4_0) ? src0->ne[0] / 2 : src0->ne[0];
const int64_t weight_m = src0->ne[1];
const int64_t weight_n_experts = src0->ne[2];
int64_t weight_ne[3] = { weight_d, weight_m, weight_n_experts };
size_t weight_nb[3] = { sizeof(int8_t), weight_d * sizeof(int8_t), weight_d * weight_m * sizeof(int8_t) };
acl_tensor_ptr all_weights =
ggml_cann_create_tensor(src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb, 3);
int64_t selected_weight_ne[3] = { weight_d, weight_m, n_select_experts };
size_t selected_weight_nb[3] = { sizeof(int8_t), weight_d * sizeof(int8_t),
weight_d * weight_m * sizeof(int8_t) };
acl_tensor_ptr selected_weights = ggml_cann_create_tensor(selected_weight_buffer, ACL_INT8, sizeof(int8_t),
selected_weight_ne, selected_weight_nb, 3);
GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, all_weights.get(), 0, batch_indices.get(), selected_weights.get());
// Select scales using the same expert indices
const int64_t scale_d = src0->ne[0] / group_size;
int64_t scale_ne[3] = { scale_d, weight_m, weight_n_experts };
size_t scale_nb[3] = { scale_elem_size, scale_d * scale_elem_size, scale_d * weight_m * scale_elem_size };
acl_tensor_ptr all_scales =
ggml_cann_create_tensor(scale_data, ACL_FLOAT16, scale_elem_size, scale_ne, scale_nb, 3);
int64_t selected_scale_ne[3] = { scale_d, weight_m, n_select_experts };
size_t selected_scale_nb[3] = { scale_elem_size, scale_d * scale_elem_size,
scale_d * weight_m * scale_elem_size };
acl_tensor_ptr selected_scales = ggml_cann_create_tensor(selected_scale_buffer, ACL_FLOAT16, scale_elem_size,
selected_scale_ne, selected_scale_nb, 3);
GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, all_scales.get(), 0, batch_indices.get(), selected_scales.get());
// Process each expert for current batch
// IndexSelect output layout: [D, M, K] in contiguous format
// WeightQuantBatchMatmulV2 expects: [M, D] with row-major stride
for (int64_t expert_idx = 0; expert_idx < n_select_experts; expert_idx++) {
// Determine input offset: broadcast if src1->ne[1]==1, otherwise use per-expert input
const size_t input_offset =
(batch_idx * src1->ne[1] + (src1->ne[1] == 1 ? 0 : expert_idx)) * src1->ne[0] * f16_elem_size;
const size_t output_offset = (batch_idx * dst->ne[1] + expert_idx) * dst->ne[0] * f16_elem_size;
// Create weight view for current expert: [D, M, K] -> [M, D]
int64_t weight_view_ne[2] = { weight_m, src0->ne[0] };
float weight_view_nb[2] = { src0->ne[0] * weight_elem_size, weight_elem_size };
const size_t weight_view_offset = expert_idx * selected_weight_nb[2];
acl_tensor_ptr weight_view =
ggml_cann_create_tensor(selected_weight_buffer, ggml_cann_type_mapping(type), weight_elem_size,
weight_view_ne, weight_view_nb, 2, ACL_FORMAT_ND, weight_view_offset);
// Create scale view for current expert: [D, M, K] -> [M, D]
int64_t scale_view_ne[2] = { weight_m, scale_d };
size_t scale_view_nb[2] = { selected_scale_nb[1], selected_scale_nb[0] };
const size_t scale_view_offset = expert_idx * selected_scale_nb[2];
acl_tensor_ptr scale_view =
ggml_cann_create_tensor(selected_scale_buffer, ACL_FLOAT16, scale_elem_size, scale_view_ne,
scale_view_nb, 2, ACL_FORMAT_ND, scale_view_offset);
// Create input activation tensor [D, 1]
int64_t input_ne[2] = { src1->ne[0], 1 };
size_t input_nb[2] = { f16_elem_size, src1->ne[0] * f16_elem_size };
acl_tensor_ptr input_tensor = ggml_cann_create_tensor(input_buffer, ACL_FLOAT16, f16_elem_size, input_ne,
input_nb, 2, ACL_FORMAT_ND, input_offset);
// Create output tensor [M, 1]
int64_t output_ne[2] = { dst->ne[0], 1 };
size_t output_nb[2] = { f16_elem_size, dst->ne[0] * f16_elem_size };
acl_tensor_ptr output_tensor = ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, f16_elem_size, output_ne,
output_nb, 2, ACL_FORMAT_ND, output_offset);
// Perform quantized matrix multiplication
GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, input_tensor.get(), weight_view.get(),
scale_view.get(), nullptr, nullptr, nullptr, nullptr, group_size,
output_tensor.get());
}
const enum ggml_type type = dst->src[0]->type;
float weight_elem_size;
if (type == GGML_TYPE_Q4_0) {
weight_elem_size = float(sizeof(uint8_t)) / 2;
} else if (type == GGML_TYPE_Q8_0) {
weight_elem_size = float(sizeof(uint8_t));
} else {
GGML_ABORT("MUL_MAT_ID only support quant type Q4_0 and Q8_0 ");
}
// Cast output back to original type if we used a temporary F16 buffer
if (dst->type != GGML_TYPE_F16) {
int64_t ne[GGML_MAX_DIMS];
size_t nb[GGML_MAX_DIMS] = { f16_elem_size };
for (int i = 0; i < GGML_MAX_DIMS; i++) {
ne[i] = dst->ne[i];
if (i > 0) {
nb[i] = nb[i - 1] * ne[i - 1];
}
// src0_row [D, M, 1, 1] weight without permute
src0_row.ne[2] = 1;
src0_row.ne[3] = 1;
src0_row.nb[0] = weight_elem_size;
src0_row.nb[1] = weight_elem_size * ne00;
src0_row.nb[2] = weight_elem_size * ne00;
src0_row.nb[3] = weight_elem_size * ne00;
size_t weight_stride = ne00 * ne01 * weight_elem_size;
size_t weight_size = weight_stride * ne02 * ne03;
// scale [D, M, 1, 1] -> scale && permute
size_t scale_elem_size = sizeof(uint16_t);
size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
// src1_row [D, 1, 1, 1] -> input
src1_row.ne[1] = 1;
src1_row.ne[2] = 1;
src1_row.ne[3] = 1;
src1_row.nb[2] = nb11;
src1_row.nb[3] = nb11;
// dst_row [M, 1, 1, 1] -> out
dst_row.ne[1] = 1;
dst_row.ne[2] = 1;
dst_row.ne[3] = 1;
dst_row.nb[2] = nb1;
dst_row.nb[3] = nb1;
// create a staging buffer for one expert's weights and scales
ggml_cann_pool_alloc weight_allocator(ctx.pool());
void * weight_buffer = weight_allocator.alloc(nb02);
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
for (int64_t id = 0; id < n_ids; id++) {
// expert index
int32_t i02 = *(int32_t *) (ids_host.data() + iid1 * ids->nb[1] + id * ids->nb[0]);
GGML_ASSERT(i02 >= 0 && i02 < n_as);
// If B = 1 (broadcast), always use 0; otherwise, use id.
int64_t i11 = (ne11 == 1 ? 0 : id);
int64_t i12 = iid1;
int64_t i1 = id;
int64_t i2 = i12;
void * src0_tmp_ptr = src0_original + i02 * weight_stride;
void * scale_tmp_ptr = src0_original + weight_size + i02 * scale_stride;
void * src1_tmp_ptr = src1_original + i11 * nb11 + i12 * nb12;
void * dst_tmp_ptr = dst_original + i1 * nb1 + i2 * nb2;
// copy the selected expert's weights and scales into the staging buffer
ACL_CHECK(aclrtMemcpyAsync(weight_buffer, weight_stride, src0_tmp_ptr, weight_stride,
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
void * scale_buffer = (char *) weight_buffer + weight_stride;
ACL_CHECK(aclrtMemcpyAsync(scale_buffer, scale_stride, scale_tmp_ptr, scale_stride,
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
src0_row.data = weight_buffer;
src1_row.data = src1_tmp_ptr;
dst_row.data = dst_tmp_ptr;
dst_row.src[0] = &src0_row;
dst_row.src[1] = &src1_row;
ggml_cann_mul_mat(ctx, &dst_row);
}
acl_tensor_ptr f16_output =
ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, f16_elem_size, ne, nb, GGML_MAX_DIMS);
acl_tensor_ptr dst_tensor = ggml_cann_create_tensor(dst);
aclnn_cast(ctx, f16_output.get(), dst_tensor.get(), ggml_cann_type_mapping(dst->type));
}
return;
}
void ggml_cann_mul_mat_id(ggml_backend_cann_context & ctx, ggml_tensor * dst) {

View File

@@ -794,44 +794,19 @@ struct ggml_backend_cann_buffer_context {
~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
};
// cann buffer type
/**
* @brief Structure representing context information for a specific backend
* buffer type.
* @brief Check if a buffer is a CANN buffer.
*
* This function checks if a given buffer is a CANN buffer by comparing its
* `get_name` function pointer to `ggml_backend_cann_buffer_get_name`.
*
* @param buffer The buffer to check.
* @return true if the buffer is a CANN buffer, false otherwise.
*/
struct ggml_backend_cann_buffer_type_context {
int32_t device; /**< Device identifier associated with the buffer context. */
std::string name; /**< Name associated with the buffer context. */
};
static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
/**
* @brief Retrieves the name associated with a CANN buffer type.
*
* This function returns the descriptive name associated with the specified
* CANN buffer type context.
*
* @param buft Pointer to the buffer type context.
* @return Const pointer to the C-style string containing the name.
*/
static const char * ggml_backend_cann_buffer_type_name(ggml_backend_buffer_type_t buft) {
ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
return buft_ctx->name.c_str();
}
/**
* @brief Checks if the backend buffer type is associated with the CANN backend.
*
* This function checks whether the provided backend buffer type is associated
* with the CANN backend based on the comparison of its name retrieval function
* pointer.
*
* @param buft Pointer to the backend buffer type to check.
* @return bool Returns true if the buffer type is associated with the CANN
* backend, otherwise false.
*/
static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_is_cann(buffer->buft);
}
/**
@@ -1296,7 +1271,7 @@ static void ggml_backend_cann_buffer_get_tensor(ggml_backend_buffer_t buffer,
static bool ggml_backend_cann_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
const ggml_tensor * src,
ggml_tensor * dst) {
if (ggml_backend_buft_is_cann(src->buffer->buft)) {
if (ggml_backend_buffer_is_cann(src->buffer)) {
ggml_backend_cann_buffer_context * src_ctx = (ggml_backend_cann_buffer_context *) src->buffer->context;
ggml_backend_cann_buffer_context * dst_ctx = (ggml_backend_cann_buffer_context *) buffer->context;
@@ -1360,6 +1335,31 @@ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
/* .reset = */ NULL,
};
// cann buffer type
/**
* @brief Structure representing context information for a specific backend
* buffer type.
*/
struct ggml_backend_cann_buffer_type_context {
int32_t device; /**< Device identifier associated with the buffer context. */
std::string name; /**< Name associated with the buffer context. */
};
/**
* @brief Retrieves the name associated with a CANN buffer type.
*
* This function returns the descriptive name associated with the specified
* CANN buffer type context.
*
* @param buft Pointer to the buffer type context.
* @return Const pointer to the C-style string containing the name.
*/
static const char * ggml_backend_cann_buffer_type_name(ggml_backend_buffer_type_t buft) {
ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
return buft_ctx->name.c_str();
}
/**
* @brief Allocates a new CANN buffer of the specified type and size.
*
@@ -1997,7 +1997,7 @@ static bool ggml_backend_cann_cpy_tensor_async(ggml_backend_t backend_src,
GGML_ASSERT(!is_matmul_weight((const ggml_tensor *) src));
if (!ggml_backend_buft_is_cann(src->buffer->buft) || !ggml_backend_buft_is_cann(dst->buffer->buft)) {
if (!ggml_backend_buffer_is_cann(src->buffer) || !ggml_backend_buffer_is_cann(dst->buffer)) {
return false;
}
@@ -2523,6 +2523,21 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
GGML_UNUSED(dev);
}
/**
* @brief Checks if the backend buffer type is associated with the CANN backend.
*
* This function checks whether the provided backend buffer type is associated
* with the CANN backend based on the comparison of its name retrieval function
* pointer.
*
* @param buft Pointer to the backend buffer type to check.
* @return bool Returns true if the buffer type is associated with the CANN
* backend, otherwise false.
*/
static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
}
/**
* @brief Records an event on the CANN backend stream.
*
@@ -2567,7 +2582,11 @@ static const ggml_backend_i ggml_backend_cann_interface = {
/* .free = */ ggml_backend_cann_free,
/* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
/* .get_tensor_2d_async = */ NULL,
/* .set_tensor_2d_async = */ NULL,
/* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
/* .shfl_tensor_async = */ NULL,
/* .allreduce_tensor_async = */ NULL,
/* .synchronize = */ ggml_backend_cann_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,

View File

@@ -9,11 +9,6 @@ function(ggml_add_cpu_backend_features cpu_name arch)
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
# Disable LTO for the feature detection code to prevent cross-module optimization
# from inlining architecture-specific instructions into the score function.
# Without this, LTO can cause SIGILL when loading backends on older CPUs
# (e.g., loading power10 backend on power9 crashes before feature check runs).
target_compile_options(${GGML_CPU_FEATS_NAME} PRIVATE -fno-lto)
target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME})
endfunction()
@@ -566,32 +561,35 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# Fetch KleidiAI sources:
include(FetchContent)
set(KLEIDIAI_COMMIT_TAG "v1.22.0")
set(KLEIDIAI_COMMIT_TAG "v1.16.0")
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
set(KLEIDIAI_ARCHIVE_MD5 "54049037570ab0ee0a0d126b2ba5ece1")
set(KLEIDIAI_ARCHIVE_MD5 "0a9e9008adb6031f9e8cf70dff4a3321")
if (POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)
endif()
# TODO: Use FetchContent_MakeAvailable with EXCLUDE_FROM_ALL after bumping minimum CMake version to 3.28+
# Using FetchContent_Populate instead to avoid EXCLUDE_FROM_ALL which requires CMake 3.28
FetchContent_Declare(KleidiAI_Download
URL ${KLEIDIAI_DOWNLOAD_URL}
DOWNLOAD_EXTRACT_TIMESTAMP NEW
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})
FetchContent_MakeAvailable(KleidiAI_Download)
FetchContent_GetProperties(KleidiAI_Download
SOURCE_DIR KLEIDIAI_SRC
POPULATED KLEIDIAI_POPULATED)
if (NOT KLEIDIAI_POPULATED)
FetchContent_Populate(KleidiAI_Download)
FetchContent_GetProperties(KleidiAI_Download SOURCE_DIR KLEIDIAI_SRC)
message(FATAL_ERROR "KleidiAI source download failed.")
endif()
add_compile_definitions(GGML_USE_CPU_KLEIDIAI)
# Remove kleidiai target after fetching it
if (TARGET kleidiai)
set_target_properties(kleidiai PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif()
list(APPEND GGML_CPU_SOURCES
ggml-cpu/kleidiai/kleidiai.cpp
ggml-cpu/kleidiai/kernels.cpp
@@ -608,7 +606,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_f16p_qsi4c32p/
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
@@ -649,6 +646,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
if (NOT SME_ENABLED MATCHES -1)
list(APPEND GGML_KLEIDIAI_SOURCES
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa_asm.S
@@ -656,13 +654,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot_asm.S
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_asm.S
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_f16p_qsi4c32p/kai_matmul_clamp_f32_f16p1vlx2_qsi4c32p4vlx2_1vlx4vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_f16p_qsi4c32p/kai_matmul_clamp_f32_f16p1vlx2_qsi4c32p4vlx2_1vlx4vl_sme2_mopa_asm.S
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_f16pmrx2_f32_neon.c
${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2+sme2+fp16")
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
endif()
if (NOT SVE_ENABLED MATCHES -1)

View File

@@ -141,50 +141,27 @@ static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_typ
namespace ggml::cpu::amx {
class extra_buffer_type : ggml::cpu::extra_buffer_type {
bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
if (op->op != GGML_OP_MUL_MAT) {
return false;
}
auto * src0 = op->src[0];
auto * src1 = op->src[1];
// handle only 2d gemm for now
auto is_contiguous_2d = [](const struct ggml_tensor * t) {
return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
};
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
return false;
}
if (!src0->buffer || src0->buffer->buft != ggml_backend_amx_buffer_type()) {
return false;
}
if (src1->buffer && !ggml_backend_buft_is_host(src1->buffer->buft)) {
return false;
}
if (op->ne[0] % (TILE_N * 2)) {
return false;
}
int alignment;
switch (src0->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q8_0:
alignment = TILE_K;
break;
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ4_XS:
alignment = 256; // QK_K
break;
case GGML_TYPE_F16:
alignment = 16;
break;
default:
if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous
is_contiguous_2d(op->src[1]) && // src1 must be contiguous
op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
op->src[0]->ne[0] % (TILE_K * 2 * 32) == 0 && // TODO: not sure if correct (https://github.com/ggml-org/llama.cpp/pull/16315)
op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x
(qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
// src1 must be host buffer
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
return false;
}
// src1 must be float32
if (op->src[1]->type == GGML_TYPE_F32) {
return true;
}
}
if (src0->ne[0] % alignment) {
return false;
}
if (src1->type != GGML_TYPE_F32) {
return false;
}
return true;
return false;
}
ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {

View File

@@ -9,8 +9,6 @@
#if defined(GGML_USE_OPENMP)
#include <omp.h>
#else
#include <thread>
#endif
#define TILE_M 16
@@ -58,40 +56,18 @@ inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
}
template <typename func_t>
inline void parallel_for(int n, const func_t & f) {
if (n <= 0) {
return;
}
inline void parallel_for(int n, const func_t& f) {
#if defined(GGML_USE_OPENMP)
#pragma omp parallel
{
int nth = omp_get_num_threads();
int ith = omp_get_thread_num();
int tbegin, tend;
balance211(n, nth, ith, tbegin, tend);
f(tbegin, tend);
}
#pragma omp parallel
{
int nth = omp_get_num_threads();
int ith = omp_get_thread_num();
int tbegin, tend;
balance211(n, nth, ith, tbegin, tend);
f(tbegin, tend);
}
#else
int nth = std::thread::hardware_concurrency();
if (nth <= 1) {
f(0, n);
return;
}
if (nth > n) {
nth = n;
}
std::vector<std::thread> threads;
threads.reserve(nth);
for (int ith = 0; ith < nth; ++ith) {
threads.emplace_back([&f, n, ith, nth] {
int tbegin, tend;
balance211(n, nth, ith, tbegin, tend);
f(tbegin, tend);
});
}
for (auto & t : threads) {
t.join();
}
f(0, n);
#endif
}
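A usage sketch (the per-index work is hypothetical and must be thread-safe); both variants in this diff keep the same call shape, handing each worker a [begin, end) range of [0, n):

    parallel_for(1024, [&](int begin, int end) {
        for (int i = begin; i < end; ++i) {
            process_row(i); // hypothetical per-row work
        }
    });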

View File

@@ -1,3 +1,4 @@
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wpedantic"
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
@@ -201,27 +202,35 @@ struct tile_config_t{
// advanced-matrix-extensions-intrinsics-functions.html
//
inline void ggml_tile_config_init(void) {
static thread_local bool done = false;
#define TC_CONFIG_TILE(i, r, cb) tc.rows[i] = r; tc.colsb[i] = cb
void ggml_tile_config_init(void) {
static thread_local bool is_first_time = true;
if (done) {
if (!is_first_time) {
return;
}
alignas(64) tile_config_t tc = {};
tc.palette_id = 1;
tc.start_row = 0;
tc.rows[0] = 8; tc.colsb[0] = 64;
tc.rows[1] = 8; tc.colsb[1] = 64;
tc.rows[2] = 16; tc.colsb[2] = 32;
tc.rows[3] = 16; tc.colsb[3] = 32;
tc.rows[4] = 16; tc.colsb[4] = 64;
tc.rows[5] = 16; tc.colsb[5] = 64;
tc.rows[6] = 16; tc.colsb[6] = 64;
tc.rows[7] = 16; tc.colsb[7] = 64;
static thread_local tile_config_t tc;
tile_config_t current_tc;
_tile_storeconfig(&current_tc);
_tile_loadconfig(&tc);
done = true;
// load only when config changes
if (tc.palette_id == 0 || (memcmp(&current_tc.colsb, &tc.colsb, sizeof(uint16_t) * 8) != 0 &&
memcmp(&current_tc.rows, &tc.rows, sizeof(uint8_t) * 8) != 0)) {
tc.palette_id = 1;
tc.start_row = 0;
TC_CONFIG_TILE(TMM0, 8, 64);
TC_CONFIG_TILE(TMM1, 8, 64);
TC_CONFIG_TILE(TMM2, 16, 32);
TC_CONFIG_TILE(TMM3, 16, 32);
TC_CONFIG_TILE(TMM4, 16, 64);
TC_CONFIG_TILE(TMM5, 16, 64);
TC_CONFIG_TILE(TMM6, 16, 64);
TC_CONFIG_TILE(TMM7, 16, 64);
_tile_loadconfig(&tc);
}
is_first_time = false;
}
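Both variants cache per-thread state so the comparatively expensive _tile_loadconfig is not reissued on every GEMM call; the variant that reads the live configuration back with _tile_storeconfig additionally skips the reload when the active tile shape already matches.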
// we need an extra 16 * 4B (TILE_N * int32_t) for each NB/KB block for compensation.
@@ -259,6 +268,33 @@ int get_row_size(int K) {
return row_size;
}
// vectorized dtype conversion
inline float FP16_TO_FP32(ggml_half val) {
__m256i v = _mm256_setr_epi16(
val, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
__m512 o = _mm512_cvtph_ps(v);
return _mm512_cvtss_f32(o);
}
inline __m512 FP16_TO_FP32_VEC(ggml_half val) {
__m256i v = _mm256_set1_epi16(val);
return _mm512_cvtph_ps(v);
}
// horizontal reduce
inline float _mm512_reduce_max_ps(const __m512 x) {
__m512 v = x;
__m512 v1 = _mm512_shuffle_f32x4(v, v, 0x4E);
v = _mm512_max_ps(v, v1);
v1 = _mm512_shuffle_f32x4(v, v, 0xB1);
v = _mm512_max_ps(v, v1);
v1 = _mm512_shuffle_ps(v, v, 0x4E);
v = _mm512_max_ps(v, v1);
v1 = _mm512_shuffle_ps(v, v, 0xB1);
v = _mm512_max_ps(v, v1);
return _mm512_cvtss_f32(v);
}
// transpose utils
#define SHUFFLE_EPI32(a, b, mask) \
_mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), mask))
@@ -1334,9 +1370,9 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
#define LAUNCH_TINYGEMM_KERNEL_AVX(MB_SIZE, NB_SIZE) \
tinygemm_kernel_avx<float, type, float, MB_SIZE, NB_SIZE, blck_size>::apply( \
K, (const float *)src1->data + src1_offset + mb_start * K, \
(const type *)src0->data + src0_offset + nb_start * K, \
(float *)dst->data + dst_offset + mb_start * ldc + nb_start, ldc)
K, (const float *)src1->data + mb_start * K, \
(const type *)src0->data + nb_start * K, \
(float *)dst->data + mb_start * ldc + nb_start, ldc);
// re-organize in the format {NB, KB, TILE_SIZE}:
@@ -1983,11 +2019,11 @@ struct tinygemm_kernel_vnni<block_q8_K, block_iq4_xs, float, BLOCK_M, BLOCK_N, B
}
};
#define LAUNCH_TINYGEMM_KERNEL_VNNI(NB_SIZE) \
tinygemm_kernel_vnni<vec_dot_type, type, float, 1, NB_SIZE, blck_size>::apply( \
KB, wdata_batch, \
(const char *)src0->data + src0_offset + PACKED_INDEX(nb * kTilesN, 0, KB, TILE_SIZE), \
(float *) dst->data + dst_offset + nb_start, ldc)
#define LAUNCH_TINYGEMM_KERNEL_VNNI(NB_SIZE) \
tinygemm_kernel_vnni<vec_dot_type, type, float, 1, NB_SIZE, blck_size>::apply( \
KB, (const char *)wdata + 0 * row_size_A, \
(const char *)src0->data + PACKED_INDEX(nb * kTilesN, 0, KB, TILE_SIZE), \
(float *) dst->data + 0 * N + nb_start, ldc)
template <typename TA, typename TB, typename TC, int BLOCK_K,
typename std::enable_if<!is_type_qkk<TB>::value, int>::type = 0>
@@ -2043,7 +2079,7 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v
_tile_stored(TMM5, Tile5(C_pre), TILE_N * sizeof(int32_t));
if (need_unpack) {
unpack_B<TB>(Tile1, B_blk1);
unpack_B<TB>(Tile1, B_blk0);
_tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK);
} else {
_tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK);
@@ -2300,13 +2336,6 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
});
}
// ne2 is passed explicitly to help compiler optimize repeated calls
inline int64_t ggml_batch_offset(const ggml_tensor * t, int64_t batch_idx, int64_t ne2) {
const int64_t i2 = batch_idx % ne2;
const int64_t i3 = batch_idx / ne2;
return i3 * t->nb[3] + i2 * t->nb[2];
}
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) {
struct ggml_tensor * src0 = dst->src[0];
@@ -2319,13 +2348,12 @@ size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) {
const int M = dst->ne[1];
const int K = src0->ne[0];
const int64_t n_batch = dst->ne[2] * dst->ne[3];
size_t desired_wsize = 0;
GGML_DISPATCH_QTYPES(TYPE, [&] {
const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
desired_wsize = n_batch * M * row_size_A;
desired_wsize = M * row_size_A;
});
return desired_wsize;
@@ -2337,7 +2365,7 @@ size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) {
// src1: input in shape of {M, K}, float32
// dst: output in shape of {M, N}, float32
//
// the function performs: dst = src1 @ src0.T for each batch
// the function performs: dst = src1 @ src0.T
//
void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_tensor * dst) {
struct ggml_tensor * src0 = dst->src[0];
@@ -2354,26 +2382,17 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
const int K = src0->ne[0];
const int ldc = dst->nb[1] / dst->nb[0];
const int64_t ne2 = dst->ne[2];
const int64_t n_batch = ne2 * dst->ne[3];
if (is_floating_type) {
constexpr int BLOCK_M = 4;
constexpr int BLOCK_N = 6;
const int MB = div_up(M, BLOCK_M);
const int NB = div_up(N, BLOCK_N);
parallel_for_ggml(params, n_batch * MB * NB, [&](int begin, int end) {
parallel_for_ggml(params, MB * NB, [&](int begin, int end) {
GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] {
for (int i = begin; i < end; ++i) {
int batch_idx = i / (MB * NB);
int remaining = i % (MB * NB);
int mb = remaining / NB;
int nb = remaining % NB;
int64_t src0_offset = ggml_batch_offset(src0, batch_idx, ne2);
int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2);
int64_t dst_offset = ggml_batch_offset(dst, batch_idx, ne2);
int mb = i / NB;
int nb = i % NB;
int mb_start = mb * BLOCK_M;
int mb_size = std::min(BLOCK_M, M - mb_start);
@@ -2405,10 +2424,10 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
void * wdata = params->wdata;
//TODO: performance improvement: merge quant A
// if (params->ith == 0) {
if (params->ith == 0) {
GGML_DISPATCH_QTYPES(TYPE, [&] {
const size_t row_size_A = K / blck_size * sizeof(vec_dot_type);
const size_t desired_wsize = n_batch * M * row_size_A;
const size_t desired_wsize = M * row_size_A;
if (params->wsize < desired_wsize) {
GGML_ABORT("insufficient work space size");
}
@@ -2417,19 +2436,12 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
// Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);
parallel_for_ggml(params, n_batch, [&](int begin, int end) {
for (int batch_idx = begin; batch_idx < end; ++batch_idx) {
int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2);
const float * A_data = (const float *)((const char *)src1->data + src1_offset);
char * wdata_batch = (char *)wdata + batch_idx * M * row_size_A;
for (int m = 0; m < M; ++m) {
from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
}
}
});
const float * A_data = static_cast<const float *>(src1->data);
for (int m = 0; m < M; ++m) {
from_float<vec_dot_type>(A_data + m * K, (char *)wdata + m * row_size_A, K);
}
});
// }
}
ggml_barrier(params->threadpool);
@@ -2439,19 +2451,13 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
constexpr int BLOCK_N = TILE_N * kTilesN;
const int NB = div_up(N, BLOCK_N);
parallel_for_ggml(params, n_batch * NB, [&](int begin, int end) {
parallel_for_ggml(params, NB, [&](int begin, int end) {
GGML_DISPATCH_QTYPES(TYPE, [&] {
const int KB = K / blck_size;
const int TILE_SIZE = get_tile_size<type>();
const int row_size_A = KB * sizeof(vec_dot_type);
for (int i = begin; i < end; ++i) {
int batch_idx = i / NB;
int nb = i % NB;
int64_t src0_offset = ggml_batch_offset(src0, batch_idx, ne2);
int64_t dst_offset = ggml_batch_offset(dst, batch_idx, ne2);
const char * wdata_batch = (const char *)wdata + batch_idx * row_size_A;
int nb = i;
int nb_start = nb * BLOCK_N;
int nb_size = std::min(BLOCK_N, N - nb_start); // 32, 64, 96
@@ -2475,7 +2481,7 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
const int MB = div_up(M, BLOCK_M);
const int NB = div_up(N, BLOCK_N);
parallel_for_ggml(params, n_batch * MB * NB, [&](int begin, int end) {
parallel_for_ggml(params, MB * NB, [&](int begin, int end) {
// init tile config for each thread
ggml_tile_config_init();
@@ -2485,14 +2491,8 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
const int row_size_A = KB * sizeof(vec_dot_type);
for (int i = begin; i < end; ++i) {
int batch_idx = i / (MB * NB);
int remaining = i % (MB * NB);
int mb = remaining / NB;
int nb = remaining % NB;
int64_t src0_offset = ggml_batch_offset(src0, batch_idx, ne2);
int64_t dst_offset = ggml_batch_offset(dst, batch_idx, ne2);
const char * wdata_batch = (const char *)wdata + batch_idx * M * row_size_A;
int mb = i / NB;
int nb = i % NB;
int mb_start = mb * BLOCK_M;
int mb_size = std::min(BLOCK_M, M - mb_start);
@@ -2501,9 +2501,9 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
tinygemm_kernel_amx<vec_dot_type, type, float, blck_size>(
mb_size, nb_size, KB,
wdata_batch + mb_start * row_size_A,
(const char *)src0->data + src0_offset + PACKED_INDEX(nb * 2, 0, KB, TILE_SIZE),
(float *) dst->data + dst_offset + mb_start * N + nb_start, ldc);
(const char *)wdata + mb_start * row_size_A,
(const char *)src0->data + PACKED_INDEX(nb * 2, 0, KB, TILE_SIZE),
(float *) dst->data + mb_start * N + nb_start, ldc);
}
});
});

View File

@@ -42,14 +42,10 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@@ -58,14 +54,10 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
# define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
@@ -73,10 +65,8 @@
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
// repack.cpp
@@ -85,23 +75,17 @@
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__POWERPC__) || defined(__powerpc__)
@@ -122,14 +106,10 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@@ -138,14 +118,10 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__loongarch64)
@@ -166,14 +142,10 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@@ -182,22 +154,24 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__riscv)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
@@ -211,14 +185,10 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@@ -226,14 +196,10 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__s390x__)
@@ -260,14 +226,10 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@@ -276,14 +238,10 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__wasm__)
@@ -312,14 +270,10 @@
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q5_K_8x4_q8_K_generic ggml_gemv_q5_K_8x4_q8_K
#define ggml_gemv_q5_K_8x8_q8_K_generic ggml_gemv_q5_K_8x8_q8_K
#define ggml_gemv_q6_K_8x4_q8_K_generic ggml_gemv_q6_K_8x4_q8_K
#define ggml_gemv_q6_K_8x8_q8_K_generic ggml_gemv_q6_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_mxfp4_4x4_q8_0_generic ggml_gemv_mxfp4_4x4_q8_0
#define ggml_gemv_mxfp4_8x8_q8_0_generic ggml_gemv_mxfp4_8x8_q8_0
#define ggml_gemv_q8_0_4x4_q8_0_generic ggml_gemv_q8_0_4x4_q8_0
#define ggml_gemv_q8_0_4x8_q8_0_generic ggml_gemv_q8_0_4x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
@@ -328,14 +282,10 @@
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q5_K_8x4_q8_K_generic ggml_gemm_q5_K_8x4_q8_K
#define ggml_gemm_q5_K_8x8_q8_K_generic ggml_gemm_q5_K_8x8_q8_K
#define ggml_gemm_q6_K_8x4_q8_K_generic ggml_gemm_q6_K_8x4_q8_K
#define ggml_gemm_q6_K_8x8_q8_K_generic ggml_gemm_q6_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_mxfp4_4x4_q8_0_generic ggml_gemm_mxfp4_4x4_q8_0
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q8_0_4x4_q8_0_generic ggml_gemm_q8_0_4x4_q8_0
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#endif
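
Each `_generic` alias above renames the portable implementation to the public symbol at preprocessing time, so an architecture without a tuned kernel for a given layout exports the generic code directly instead of going through a wrapper call. A minimal sketch of the same pattern (hypothetical names, not part of the diff):

/* sketch: how the _generic aliasing resolves, with made-up names */
#if defined(__SOME_SIMD_ARCH__)
/* a tuned kernel is provided elsewhere under the public name */
void my_gemv(int n, float * s, const void * vx, const void * vy);
#else
/* no tuned kernel: compile the generic body directly as the public symbol */
#define my_gemv_generic my_gemv
#endif

void my_gemv_generic(int n, float * s, const void * vx, const void * vy) {
    /* portable reference implementation would go here */
    (void)n; (void)s; (void)vx; (void)vy;
}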

File diff suppressed because it is too large


@@ -1954,773 +1954,3 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
#endif
}
static const uint8_t sign_gather_indices_arr[64] = {
0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,
4,4,4,4,4,4,4,4, 5,5,5,5,5,5,5,5, 6,6,6,6,6,6,6,6, 7,7,7,7,7,7,7,7
};
static const uint8_t sign_bit_masks_arr[64] = {
1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128,
1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128
};
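// Lookup tables for expanding 8 packed sign bytes to 64 lanes: lane i gathers
// sign byte i/8 (sign_gather_indices_arr) and isolates bit i%8
// (sign_bit_masks_arr), yielding one negate flag per weight in two vector ops.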
static void ggml_vec_dot_iq2_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);
const block_iq2_s * GGML_RESTRICT x = vx;
const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
const uint64_t * grid64 = (const uint64_t *)iq2s_grid;
// --- Pre-load Constants ---
uint16_t gather_qh_arr[8] = {0, 0, 0, 0, 1, 1, 1, 1};
vuint16mf2_t v_gather_qh = __riscv_vle16_v_u16mf2(gather_qh_arr, 8);
uint16_t shift_qh_arr[8] = {11, 9, 7, 5, 11, 9, 7, 5};
vuint16mf2_t v_shift_qh = __riscv_vle16_v_u16mf2(shift_qh_arr, 8);
// Constants for sign extraction
vuint8m2_t v_sign_gather_indices = __riscv_vle8_v_u8m2(sign_gather_indices_arr, 64);
vuint8m2_t v_sign_masks = __riscv_vle8_v_u8m2(sign_bit_masks_arr, 64);
float sumf = 0.0f;
for (int i = 0; i < nb; ++i) {
const float combined_scale = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT qs = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const uint8_t * GGML_RESTRICT scales = x[i].scales;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
const uint8_t * signs_ptr = qs + 32;
float sum_block = 0.0f;
for (int ib = 0; ib < 4; ++ib) {
// Combine low + high bits
vuint8mf4_t v_qs_u8 = __riscv_vle8_v_u8mf4(qs, 8);
qs += 8;
uint16_t qh_val;
memcpy(&qh_val, qh, 2);
qh += 2;
vuint8mf8_t v_qh_raw = __riscv_vle8_v_u8mf8((const uint8_t*)&qh_val, 2);
vuint16mf4_t v_qh_u16 = __riscv_vwcvtu_x_x_v_u16mf4(v_qh_raw, 2);
vuint16mf2_t v_qh_u16_ext = __riscv_vlmul_ext_v_u16mf4_u16mf2(v_qh_u16);
vuint16mf2_t v_qh_expanded = __riscv_vrgather_vv_u16mf2(v_qh_u16_ext, v_gather_qh, 8);
v_qh_expanded = __riscv_vsll_vv_u16mf2(v_qh_expanded, v_shift_qh, 8);
// Mask: We want bits 11-12. 0x1800 = 0001 1000 0000 0000
v_qh_expanded = __riscv_vand_vx_u16mf2(v_qh_expanded, 0x1800, 8);
vuint16mf2_t v_qs_u16 = __riscv_vwcvtu_x_x_v_u16mf2(v_qs_u8, 8);
// Multiply by 8 to get byte offset, instead of element offset
v_qs_u16 = __riscv_vsll_vx_u16mf2(v_qs_u16, 3, 8);
vuint16mf2_t v_grid_offsets = __riscv_vor_vv_u16mf2(v_qs_u16, v_qh_expanded, 8);
// Lookup Grid using Byte Offsets
vuint64m2_t v_grid_vals = __riscv_vluxei16_v_u64m2(grid64, v_grid_offsets, 8);
vuint8m2_t v_grid_u8 = __riscv_vreinterpret_v_u64m2_u8m2(v_grid_vals);
vint8m2_t v_grid_i8 = __riscv_vreinterpret_v_u8m2_i8m2(v_grid_u8);
// Load signs and generate sign mask
vuint8mf4_t v_signs_raw = __riscv_vle8_v_u8mf4(signs_ptr, 8);
signs_ptr += 8;
vuint8m2_t v_signs_source = __riscv_vlmul_ext_v_u8mf4_u8m2(v_signs_raw);
vuint8m2_t v_signs_bcast = __riscv_vrgather_vv_u8m2(v_signs_source, v_sign_gather_indices, 64);
vuint8m2_t v_sign_bits = __riscv_vand_vv_u8m2(v_signs_bcast, v_sign_masks, 64);
vbool4_t m_negative = __riscv_vmsne_vx_u8m2_b4(v_sign_bits, 0, 64);
vint8m2_t v_q8 = __riscv_vle8_v_i8m2(q8, 64);
q8 += 64;
vint8m2_t v_q8_signed = __riscv_vrsub_vx_i8m2_mu(m_negative, v_q8, v_q8, 0, 64);
vint16m4_t v_dot = __riscv_vwmul_vv_i16m4(v_grid_i8, v_q8_signed, 64);
vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1);
int32_t s0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
__riscv_vget_v_i16m4_i16m1(v_dot, 0), v_zero, 16));
int32_t s1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
__riscv_vget_v_i16m4_i16m1(v_dot, 1), v_zero, 16));
int32_t s2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
__riscv_vget_v_i16m4_i16m1(v_dot, 2), v_zero, 16));
int32_t s3 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(
__riscv_vget_v_i16m4_i16m1(v_dot, 3), v_zero, 16));
uint8_t sc0 = scales[0];
uint8_t sc1 = scales[1];
scales += 2;
sum_block += s0 * (2 * (sc0 & 0xF) + 1);
sum_block += s1 * (2 * (sc0 >> 4) + 1);
sum_block += s2 * (2 * (sc1 & 0xF) + 1);
sum_block += s3 * (2 * (sc1 >> 4) + 1);
}
sumf += sum_block * combined_scale;
}
*s = 0.125f * sumf;
}
static void ggml_vec_dot_iq2_s_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs);
const block_iq2_s * GGML_RESTRICT x = vx;
const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
const uint64_t * grid64 = (const uint64_t *)iq2s_grid;
// Pre-load Constants
vuint8m2_t v_ids = __riscv_vid_v_u8m2(32);
vuint8m2_t v_sign_gather_indices = __riscv_vsrl_vx_u8m2(v_ids, 3, 32);
vuint8m2_t v_ones = __riscv_vmv_v_x_u8m2(1, 32);
vuint8m2_t v_shift_amts = __riscv_vand_vx_u8m2(v_ids, 7, 32);
vuint8m2_t v_sign_masks = __riscv_vsll_vv_u8m2(v_ones, v_shift_amts, 32);
uint16_t shift_qh_arr[4] = {11, 9, 7, 5};
vuint16mf2_t v_shift_qh = __riscv_vle16_v_u16mf2(shift_qh_arr, 4);
float sumf = 0.0f;
for (int i = 0; i < nb; ++i) {
const float combined_scale = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * GGML_RESTRICT qs = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const uint8_t * GGML_RESTRICT scales = x[i].scales;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
const uint8_t * signs_ptr = qs + 32;
float sum_block = 0.0f;
for (int ib = 0; ib < 8; ++ib) {
// Load Low Bits [4 bytes]
vuint8mf4_t v_qs_u8 = __riscv_vle8_v_u8mf4(qs, 4);
qs += 4;
// Load 1 byte. It contains bits for 4 mini-blocks.
uint8_t qh_val = *qh++;
// Combine Low + High bits of 10bit indices
vuint8mf4_t v_qh_raw = __riscv_vmv_v_x_u8mf4(qh_val, 4);
vuint16mf2_t v_qh_u16 = __riscv_vwcvtu_x_x_v_u16mf2(v_qh_raw, 4);
vuint16mf2_t v_qh_mf2 = __riscv_vsll_vv_u16mf2(v_qh_u16, v_shift_qh, 4);
v_qh_mf2 = __riscv_vand_vx_u16mf2(v_qh_mf2, 0x1800, 4);
vuint16mf2_t v_qs_u16_mf2 = __riscv_vwcvtu_x_x_v_u16mf2(v_qs_u8, 4);
vuint16mf2_t v_qs_u16 = __riscv_vsll_vx_u16mf2(v_qs_u16_mf2, 3, 4);
vuint16mf2_t v_grid_offsets = __riscv_vor_vv_u16mf2(v_qs_u16, v_qh_mf2, 4);
// Lookup Grid
vint8m2_t v_grid_i8 = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vreinterpret_v_u64m2_u8m2(__riscv_vluxei16_v_u64m2(grid64, v_grid_offsets, 4)));
vuint8mf4_t v_signs_raw = __riscv_vle8_v_u8mf4(signs_ptr, 4);
signs_ptr += 4;
vuint8m2_t v_signs_source = __riscv_vlmul_ext_v_u8mf4_u8m2(v_signs_raw);
vuint8m2_t v_signs_bcast = __riscv_vrgather_vv_u8m2(v_signs_source, v_sign_gather_indices, 32);
// generating sign mask
vuint8m2_t v_sign_bits = __riscv_vand_vv_u8m2(v_signs_bcast, v_sign_masks, 32);
vbool4_t m_negative = __riscv_vmsne_vx_u8m2_b4(v_sign_bits, 0, 32);
vint8m2_t v_q8 = __riscv_vle8_v_i8m2(q8, 32);
q8 += 32;
// apply signs
vint8m2_t v_q8_signed = __riscv_vrsub_vx_i8m2_mu(m_negative, v_q8, v_q8, 0, 32);
vint16m4_t v_dot = __riscv_vwmul_vv_i16m4(v_grid_i8, v_q8_signed, 32);
// Reduction
vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1);
// Reduce 0-15 (First Half)
int32_t s0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(
__riscv_vget_v_i16m4_i16m2(v_dot, 0), v_zero, 16));
// Reduce 16-31 (Second Half)
int32_t s1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(
__riscv_vget_v_i16m4_i16m2(v_dot, 1), v_zero, 16));
// Apply sub Scales
uint8_t sc = *scales++;
sum_block += s0 * (2 * (sc & 0xF) + 1);
sum_block += s1 * (2 * (sc >> 4) + 1);
}
sumf += sum_block * combined_scale;
}
*s = 0.125f * sumf;
}
void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined __riscv_v_intrinsic
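// __riscv_vlenb() returns the vector register width in bytes at run time;
// dispatch to the kernel tuned for that VLEN and fall back to the portable
// generic code on any other width. The same pattern is used by the other
// dispatchers below.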
switch (__riscv_vlenb() * 8) {
case 128:
ggml_vec_dot_iq2_s_q8_K_vl128(n, s, bs, vx, bx, vy, by, nrc);
break;
case 256:
ggml_vec_dot_iq2_s_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
break;
default:
ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
break;
}
#else
ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
static void ggml_vec_dot_iq3_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_iq3_s * GGML_RESTRICT x = vx;
const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
const uint64_t * grid64 = (const uint64_t *)iq3s_grid;
// --- Pre-load Constants ---
const uint16_t qh_bit_shifts_arr[16] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
};
vuint8m2_t v_sign_gather_indices = __riscv_vle8_v_u8m2(sign_gather_indices_arr, 64);
vuint8m2_t v_sign_masks = __riscv_vle8_v_u8m2(sign_bit_masks_arr, 64);
vuint16m1_t v_qh_shifts = __riscv_vle16_v_u16m1(qh_bit_shifts_arr, 16);
float sumf = 0.0f;
for (int i = 0; i < nb; ++i) {
const float d = GGML_CPU_FP16_TO_FP32(x[i].d);
const float combined_scale = d * y[i].d;
const uint8_t * GGML_RESTRICT qs = x[i].qs;
const uint8_t * GGML_RESTRICT qh = x[i].qh;
const uint8_t * GGML_RESTRICT scales = x[i].scales;
const uint8_t * GGML_RESTRICT signs = x[i].signs;
const int8_t * GGML_RESTRICT q8 = y[i].qs;
float sum_block = 0.0f;
// Loop: Process 64 weights (16 mini-blocks of 4) per iteration
for (int ib = 0; ib < 4; ++ib) {
vuint8mf2_t v_qs_u8 = __riscv_vle8_v_u8mf2(qs, 16);
qs += 16;
uint16_t qh_val;
memcpy(&qh_val, qh, 2);
qh += 2;
vuint16m1_t v_qh_val = __riscv_vmv_v_x_u16m1(qh_val, 16);
// Extract bits: (qh >> i) & 1
v_qh_val = __riscv_vsrl_vv_u16m1(v_qh_val, v_qh_shifts, 16);
v_qh_val = __riscv_vand_vx_u16m1(v_qh_val, 1, 16);
vuint16m1_t v_qs_u16 = __riscv_vwcvtu_x_x_v_u16m1(v_qs_u8, 16);
v_qs_u16 = __riscv_vsll_vx_u16m1(v_qs_u16, 2, 16);
v_qh_val = __riscv_vsll_vx_u16m1(v_qh_val, 10, 16);
vuint16m1_t v_grid_offsets = __riscv_vor_vv_u16m1(v_qs_u16, v_qh_val, 16);
// Grid value is 4xuint8
vuint32m2_t v_grid_packed = __riscv_vluxei16_v_u32m2((const uint32_t *)grid64, v_grid_offsets, 16);
vuint8m2_t v_grid_u8 = __riscv_vreinterpret_v_u32m2_u8m2(v_grid_packed);
vuint8mf4_t v_signs_raw = __riscv_vle8_v_u8mf4(signs, 8);
signs += 8;
// Generate sign mask
vuint8m2_t v_signs_source = __riscv_vlmul_ext_v_u8mf4_u8m2(v_signs_raw);
vuint8m2_t v_signs_bcast = __riscv_vrgather_vv_u8m2(v_signs_source, v_sign_gather_indices, 64);
vuint8m2_t v_sign_bits = __riscv_vand_vv_u8m2(v_signs_bcast, v_sign_masks, 64);
vbool4_t m_negative = __riscv_vmsne_vx_u8m2_b4(v_sign_bits, 0, 64);
vint8m2_t v_q8 = __riscv_vle8_v_i8m2(q8, 64);
q8 += 64;
// Apply Signs
vint8m2_t v_q8_signed = __riscv_vrsub_vx_i8m2_mu(m_negative, v_q8, v_q8, 0, 64);
vint16m4_t v_dot = __riscv_vwmulsu_vv_i16m4(v_q8_signed, v_grid_u8, 64);
// Reduction
vint16m2_t v_dot_lo = __riscv_vget_v_i16m4_i16m2(v_dot, 0);
vint16m2_t v_dot_hi = __riscv_vget_v_i16m4_i16m2(v_dot, 1);
vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, 1);
int32_t s_lo = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(v_dot_lo, v_zero, 32));
int32_t s_hi = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(v_dot_hi, v_zero, 32));
// Apply sub-scales
uint8_t sc_byte = *scales++;
int sc_lo = (sc_byte & 0xF) * 2 + 1;
int sc_hi = (sc_byte >> 4) * 2 + 1;
sum_block += s_lo * sc_lo + s_hi * sc_hi;
}
sumf += sum_block * combined_scale;
}
*s = 0.125f * sumf;
}
void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined __riscv_v_intrinsic
switch (__riscv_vlenb() * 8) {
case 256:
ggml_vec_dot_iq3_s_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
break;
default:
ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
break;
}
#else
ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
static void ggml_vec_dot_tq1_0_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_tq1_0 * GGML_RESTRICT x = vx;
const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
float sumf = 0.0f;
uint8_t pow[16] = {1, 1, 1, 1, 3, 3, 3, 3, 9, 9, 9, 9, 27, 27, 27, 27};
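// tq1_0 stores 5 ternary digits per qs byte in fixed-point base 3: multiplying
// the byte by 3^k rotates digit k to the top, and ((uint8_t)(q * 3^k) * 3) >> 8
// recovers it as 0..2 (mapped to -1..1 by the "- 1" below). The main loops use
// explicit 3^k constants; the pow[] table above replicates 3^0..3^3 per 4-byte
// group for the qh tail, which packs 4 trits per byte.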
for (int i = 0; i < nb; i++) {
// First loop.
vint32m4_t suml1;
{
const int vl = 32;
vuint8m1_t tq = __riscv_vle8_v_u8m1(x[i].qs, vl);
vuint16m2_t tq0 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(tq, 3, vl), 8, vl);
vuint16m2_t tq1 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 3, vl), 3, vl), 8, vl);
vuint16m2_t tq2 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 9, vl), 3, vl), 8, vl);
vuint16m2_t tq3 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 27, vl), 3, vl), 8, vl);
vuint16m2_t tq4 = __riscv_vsrl_vx_u16m2(__riscv_vwmulu_vx_u16m2(__riscv_vmul_vx_u8m1(tq, 81, vl), 3, vl), 8, vl);
vint16m2_t q80 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 0, vl), vl);
vint16m2_t q81 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 32, vl), vl);
vint16m2_t q82 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 64, vl), vl);
vint16m2_t q83 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 96, vl), vl);
vint16m2_t q84 = __riscv_vwcvt_x_x_v_i16m2(__riscv_vle8_v_i8m1(y[i].qs + 128, vl), vl);
vint16m2_t sum0 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq0, 1, vl)), q80, vl);
vint16m2_t sum1 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq1, 1, vl)), q81, vl);
vint16m2_t sum2 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq2, 1, vl)), q82, vl);
vint16m2_t sum3 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq3, 1, vl)), q83, vl);
vint16m2_t sum4 = __riscv_vmul_vv_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vsub_vx_u16m2(tq4, 1, vl)), q84, vl);
vint32m4_t sumi0 = __riscv_vwadd_vv_i32m4(sum0, sum1, vl);
vint32m4_t sumi1 = __riscv_vwadd_vv_i32m4(sum2, sum3, vl);
suml1 = __riscv_vadd_vv_i32m4(__riscv_vwcvt_x_x_v_i32m4(sum4, vl), __riscv_vadd_vv_i32m4(sumi0, sumi1, vl), vl);
}
// Second loop.
vint32m2_t suml2;
{
const int vl = 16;
vuint8mf2_t tq = __riscv_vle8_v_u8mf2(x[i].qs + 32, vl);
vuint16m1_t tq0 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(tq, 3 * 1, vl), 8, vl);
vuint16m1_t tq1 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 3, vl), 3, vl), 8, vl);
vuint16m1_t tq2 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 9, vl), 3, vl), 8, vl);
vuint16m1_t tq3 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 27, vl), 3, vl), 8, vl);
vuint16m1_t tq4 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vx_u8mf2(tq, 81, vl), 3, vl), 8, vl);
vint16m1_t q80 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 160, vl), vl);
vint16m1_t q81 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 176, vl), vl);
vint16m1_t q82 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 192, vl), vl);
vint16m1_t q83 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 208, vl), vl);
vint16m1_t q84 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 224, vl), vl);
vint16m1_t sum0 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq0, 1, vl)), q80, vl);
vint16m1_t sum1 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq1, 1, vl)), q81, vl);
vint16m1_t sum2 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq2, 1, vl)), q82, vl);
vint16m1_t sum3 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq3, 1, vl)), q83, vl);
vint16m1_t sum4 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq4, 1, vl)), q84, vl);
vint32m2_t sumi0 = __riscv_vwadd_vv_i32m2(sum0, sum1, vl);
vint32m2_t sumi1 = __riscv_vwadd_vv_i32m2(sum2, sum3, vl);
suml2 = __riscv_vadd_vv_i32m2(__riscv_vwcvt_x_x_v_i32m2(sum4, vl), __riscv_vadd_vv_i32m2(sumi0, sumi1, vl), vl);
}
// Third loop.
vint32m2_t suml3;
{
const int vl = 16;
uint32_t qh;
memcpy(&qh, &x[i].qh[0], 4);
// Prevent fusion with vmv.
__asm__ __volatile__("" : "+r"(qh));
vuint8mf2_t tq = __riscv_vreinterpret_v_u32mf2_u8mf2(__riscv_vmv_v_x_u32mf2(qh, vl / 4));
vuint8mf2_t p = __riscv_vle8_v_u8mf2(pow, vl);
vuint16m1_t tq0 = __riscv_vsrl_vx_u16m1(__riscv_vwmulu_vx_u16m1(__riscv_vmul_vv_u8mf2(tq, p, vl), 3, vl), 8, vl);
vint16m1_t q80 = __riscv_vwcvt_x_x_v_i16m1(__riscv_vle8_v_i8mf2(y[i].qs + 240, vl), vl);
vint16m1_t sum0 = __riscv_vmul_vv_i16m1(__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vsub_vx_u16m1(tq0, 1, vl)), q80, vl);
suml3 = __riscv_vwcvt_x_x_v_i32m2(sum0, vl);
}
vint32m2_t sumb = __riscv_vadd_vv_i32m2(__riscv_vget_v_i32m4_i32m2(suml1, 0), __riscv_vget_v_i32m4_i32m2(suml1, 1), 16);
sumb = __riscv_vadd_vv_i32m2(sumb, suml2, 16);
sumb = __riscv_vadd_vv_i32m2(sumb, suml3, 16);
vint32m1_t sum = __riscv_vredsum_vs_i32m2_i32m1(sumb, __riscv_vmv_v_x_i32m1(0, 1), 16);
sumf += __riscv_vmv_x_s_i32m1_i32(sum) * y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
}
*s = sumf;
}
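For reference, the trit-extraction identity those loops rely on can be checked in a few lines of scalar C. A self-contained sketch, not part of the diff, assuming the usual ceil(v * 256 / 243) fixed-point packing:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    static const int pow3[5] = {1, 3, 9, 27, 81};
    for (int v = 0; v < 243; ++v) {                    // v encodes trits t0..t4, t0 most significant
        uint8_t q = (uint8_t)((v * 256 + 242) / 243);  // fixed-point base-3 byte
        int rest = v;
        for (int k = 0; k < 5; ++k) {
            int expect = rest / pow3[4 - k];           // k-th trit, 0..2
            rest      %= pow3[4 - k];
            int got = ((uint8_t)(q * pow3[k]) * 3) >> 8; // the kernel's extraction step
            assert(got == expect);
        }
    }
    puts("trit extraction holds for all 243 encodings");
    return 0;
}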
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined __riscv_v_intrinsic
switch (__riscv_vlenb() * 8) {
case 256:
ggml_vec_dot_tq1_0_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
break;
default:
ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
break;
}
#else
ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
static void ggml_vec_dot_tq2_0_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_tq2_0 * GGML_RESTRICT x = vx;
const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
float sumf = 0.0f;
for (int i = 0; i < nb; ++i) {
int32_t sumi = 0;
for (size_t j = 0; j < sizeof(x[0].qs); j += 32) {
const int8_t * py0 = &y[i].qs[j * 4 + 0 * 32];
const int8_t * py1 = &y[i].qs[j * 4 + 1 * 32];
const int8_t * py2 = &y[i].qs[j * 4 + 2 * 32];
const int8_t * py3 = &y[i].qs[j * 4 + 3 * 32];
const uint8_t * px = &x[i].qs[j];
size_t vlmax_16m2 = __riscv_vsetvl_e16m2(32);
vint16m2_t vacc16 = __riscv_vmv_v_x_i16m2(0, vlmax_16m2);
size_t vl = __riscv_vsetvl_e8m1(32);
vuint8m1_t vx_u8 = __riscv_vle8_v_u8m1(px, vl);
vint8m1_t vy0 = __riscv_vle8_v_i8m1(py0, vl);
vint8m1_t vy1 = __riscv_vle8_v_i8m1(py1, vl);
vint8m1_t vy2 = __riscv_vle8_v_i8m1(py2, vl);
vint8m1_t vy3 = __riscv_vle8_v_i8m1(py3, vl);
// l=0 (bits 1:0)
vuint8m1_t t0 = __riscv_vand_vx_u8m1(vx_u8, 0x03, vl);
vint8m1_t vq0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t0), 1, vl);
// l=1 (bits 3:2)
vuint8m1_t t1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(vx_u8, 2, vl), 0x03, vl);
vint8m1_t vq1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t1), 1, vl);
// l=2 (bits 5:4)
vuint8m1_t t2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(vx_u8, 4, vl), 0x03, vl);
vint8m1_t vq2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t2), 1, vl);
// l=3 (bits 7:6)
vuint8m1_t t3 = __riscv_vsrl_vx_u8m1(vx_u8, 6, vl); // No final AND needed as vsrl shifts in zeros
vint8m1_t vq3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(t3), 1, vl);
// 4. Multiply and accumulate
vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq0, vy0, vl);
vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq1, vy1, vl);
vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq2, vy2, vl);
vacc16 = __riscv_vwmacc_vv_i16m2(vacc16, vq3, vy3, vl);
vlmax_16m2 = __riscv_vsetvl_e16m2(32);
vint32m1_t vzero32 = __riscv_vmv_v_x_i32m1(0, 1);
vint32m1_t vred32 = __riscv_vwredsum_vs_i16m2_i32m1(vacc16, vzero32, vlmax_16m2);
sumi += __riscv_vmv_x_s_i32m1_i32(vred32);
}
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
sumf += (float)sumi * d;
}
*s = sumf;
}
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined __riscv_v_intrinsic
switch (__riscv_vlenb() * 8) {
case 256:
ggml_vec_dot_tq2_0_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
break;
default:
ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
break;
}
#else
ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
static void ggml_vec_dot_iq1_s_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_iq1_s * GGML_RESTRICT x = vx;
const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
float sumf = 0;
for (int i = 0; i < nb; ++i) {
// Load qh once for the entire superblock.
vuint16mf2_t qh = __riscv_vle16_v_u16mf2(x[i].qh, 8);
// Calculate ls.
vuint16mf2_t temp = __riscv_vsrl_vx_u16mf2(qh, 12, 8);
temp = __riscv_vand_vx_u16mf2(temp, 7, 8);
vint32m1_t ls = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vwmulu_vx_u32m1(temp, 2, 8));
ls = __riscv_vadd_vx_i32m1(ls, 1, 8);
// Calculate delta.
vbool32_t mask = __riscv_vmseq_vx_u16mf2_b32(__riscv_vand_vx_u16mf2(qh, 0x8000, 8), 0, 8);
vint32m1_t delta_neg = __riscv_vmv_v_x_i32m1(-1, 8);
vint32m1_t delta_pos = __riscv_vmv_v_x_i32m1(1, 8);
vint32m1_t delta = __riscv_vmerge_vvm_i32m1(delta_neg, delta_pos, mask, 8);
// Load qs.
vuint8m1_t qs = __riscv_vle8_v_u8m1(x[i].qs, 32);
// Prepare the indices.
const uint64_t shift = 0x0009000600030000;
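// The 16-bit lanes of `shift` repeat {0, 3, 6, 9} per 64-bit group, so each qh
// word is shifted to expose the 3 high index bits of each of its four
// 8-element groups before they are OR-ed with the low byte from qs.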
vuint16m2_t qh_shift = __riscv_vreinterpret_v_u64m2_u16m2(__riscv_vmv_v_x_u64m2(shift, 8));
vuint16m2_t qh_gather_index = __riscv_vreinterpret_v_i16m2_u16m2(
__riscv_vdiv_vx_i16m2(__riscv_vreinterpret_v_u16m2_i16m2(__riscv_vid_v_u16m2(32)), 4, 32));
vuint16m2_t qh_ext = __riscv_vlmul_ext_v_u16m1_u16m2(__riscv_vlmul_ext_v_u16mf2_u16m1(qh));
vuint16m2_t qh_index = __riscv_vrgather_vv_u16m2(qh_ext, qh_gather_index, 32);
qh_index = __riscv_vsrl_vv_u16m2(qh_index, qh_shift, 32);
qh_index = __riscv_vand_vx_u16m2(qh_index, 7, 32);
qh_index = __riscv_vsll_vx_u16m2(qh_index, 8, 32);
qh_index = __riscv_vor_vv_u16m2(qh_index, __riscv_vzext_vf2_u16m2(qs, 32), 32);
vuint16m2_t index = __riscv_vsll_vx_u16m2(qh_index, 3, 32);
// Final lsums.
int32_t lsums_s[8];
vint32m1_t zero_scalar = __riscv_vmv_v_x_i32m1(0, 1);
// Sub-blocks 1-4
{
vuint16m1_t grid_index0 = __riscv_vget_v_u16m2_u16m1(index, 0);
vint8m4_t grid0 = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vluxei16_v_i64m4((const int64_t*)iq1s_grid, grid_index0, 16));
vint8m4_t q80 = __riscv_vle8_v_i8m4(y[i].qs, 128);
vint16m8_t lsum0 = __riscv_vwmul_vv_i16m8(grid0, q80, 128);
lsums_s[0] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 0), zero_scalar, 32));
lsums_s[1] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 1), zero_scalar, 32));
lsums_s[2] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 2), zero_scalar, 32));
lsums_s[3] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum0, 3), zero_scalar, 32));
}
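// Compiler barrier: keeps the two 128-lane halves as separate live ranges so
// their vector registers do not overlap (same purpose after the next block).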
__asm__ __volatile__("" ::: "memory");
// Sub-blocks 5-8
{
vuint16m1_t grid_index1 = __riscv_vget_v_u16m2_u16m1(index, 1);
vint8m4_t grid1 = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vluxei16_v_i64m4((const int64_t*)iq1s_grid, grid_index1, 16));
vint8m4_t q81 = __riscv_vle8_v_i8m4(&y[i].qs[128], 128);
vint16m8_t lsum1 = __riscv_vwmul_vv_i16m8(grid1, q81, 128);
lsums_s[4] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 0), zero_scalar, 32));
lsums_s[5] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 1), zero_scalar, 32));
lsums_s[6] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 2), zero_scalar, 32));
lsums_s[7] = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(lsum1, 3), zero_scalar, 32));
}
__asm__ __volatile__("" ::: "memory");
vint32m1_t lsums = __riscv_vle32_v_i32m1(&lsums_s[0], 8);
// Calculate the bsums.
vint16m1_t bsums_0 = __riscv_vle16_v_i16m1(y[i].bsums, 16);
const vuint32m1_t bsums_i32 = __riscv_vreinterpret_v_u16m1_u32m1(__riscv_vreinterpret_v_i16m1_u16m1(bsums_0));
const vint16mf2_t bsums_i32_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(bsums_i32, 0, 8));
const vint16mf2_t bsums_i32_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(bsums_i32, 16, 8));
const vint32m1_t bsums = __riscv_vwadd_vv_i32m1(bsums_i32_0, bsums_i32_1, 8);
// Accumulation.
vint32m1_t sumi_v = __riscv_vmul_vv_i32m1(ls, lsums, 8);
vint32m1_t sumi1_v = __riscv_vmul_vv_i32m1(__riscv_vmul_vv_i32m1(ls, delta, 8), bsums, 8);
// Update sumf.
int sumi = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m1_i32m1(sumi_v, __riscv_vmv_v_x_i32m1(0, 1), 8));
int sumi1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m1_i32m1(sumi1_v, __riscv_vmv_v_x_i32m1(0, 1), 8));
sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
}
*s = sumf;
}
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined __riscv_v_intrinsic
switch (__riscv_vlenb() * 8) {
case 256:
ggml_vec_dot_iq1_s_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
break;
default:
ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
break;
}
#else
ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
static void ggml_vec_dot_iq1_m_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_iq1_m * GGML_RESTRICT x = vx;
const block_q8_K * GGML_RESTRICT y = vy;
const int nb = n / QK_K;
iq1m_scale_t scale;
float sumf = 0.0f;
for (int i = 0; i < nb; ++i) {
const int8_t * q8 = y[i].qs;
const uint8_t * qs = x[i].qs;
const uint8_t * qh = x[i].qh;
const uint16_t * sc = (const uint16_t *)x[i].scales;
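// Reassemble the fp16 superblock scale from the top nibbles of the four
// 16-bit scale words.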
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
// Accumulators.
vint32m2_t acc1 = __riscv_vmv_v_x_i32m2(0, 16);
vint32m2_t acc2 = __riscv_vmv_v_x_i32m2(0, 16);
// We process 4 sub-blocks together.
for (int ib = 0; ib < QK_K/128; ib++) {
// Load qh for 4 sub-blocks.
const vuint8mf4_t qh_8 = __riscv_vle8_v_u8mf4(qh, 8);
const vuint16mf2_t qh_16_lo = __riscv_vzext_vf2_u16mf2(qh_8, 8);
const vuint16mf2_t qh_16_hi = __riscv_vsll_vx_u16mf2(qh_16_lo, 8, 8);
const vuint16m1_t qhb = __riscv_vzext_vf2_u16m1(
__riscv_vreinterpret_v_u16mf2_u8mf2(__riscv_vor_vv_u16mf2(qh_16_lo, qh_16_hi, 8)), 16);
qh += 8;
// Prepare grid indices.
const vuint16m1_t qsb = __riscv_vzext_vf2_u16m1(__riscv_vle8_v_u8mf2(&qs[0], 16), 16);
const vuint16m1_t shift = __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vmv_v_x_u32m1(0x00040008, 8));
vuint16m1_t index = __riscv_vor_vv_u16m1(qsb, __riscv_vand_vx_u16m1(__riscv_vsll_vv_u16m1(qhb, shift, 16), 0x700, 16), 16);
index = __riscv_vsll_vx_u16m1(index, 3, 16);
qs += 16;
// Load the grid.
const vint8m4_t iq1b = __riscv_vreinterpret_v_i64m4_i8m4(__riscv_vreinterpret_v_u64m4_i64m4(
__riscv_vluxei16_v_u64m4(iq1s_grid, index, 16)));
// Prepare the deltas.
const vbool16_t mask = __riscv_vmsgtu_vx_u16m1_b16(
__riscv_vand_vv_u16m1(qhb, __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vmv_v_x_u32m1(0x00800008, 8)), 16), 0, 16);
const vint64m4_t delta_pos = __riscv_vmv_v_x_i64m4(0x0101010101010101, 16);
const vint64m4_t delta_neg = __riscv_vmv_v_x_i64m4(0xffffffffffffffff, 16);
const vint8m4_t delta = __riscv_vreinterpret_v_i64m4_i8m4(
__riscv_vmerge_vvm_i64m4(delta_pos, delta_neg, mask, 16));
// Load q8 for sub-blocks.
const vint8m4_t q8b = __riscv_vle8_v_i8m4(q8, 128);
q8 += 128;
// Calculate the lsums.
const vint16m8_t lsum1 = __riscv_vwmul_vv_i16m8(iq1b, q8b, 128);
const vint16m8_t lsum2 = __riscv_vwmul_vv_i16m8(delta, q8b, 128);
// Prepare the scales.
const int16_t ls_0_0 = 2*((sc[0] >> 0) & 0x7) + 1;
const int16_t ls_0_1 = 2*((sc[0] >> 3) & 0x7) + 1;
const int16_t ls_1_0 = 2*((sc[0] >> 6) & 0x7) + 1;
const int16_t ls_1_1 = 2*((sc[0] >> 9) & 0x7) + 1;
const int16_t ls_2_0 = 2*((sc[1] >> 0) & 0x7) + 1;
const int16_t ls_2_1 = 2*((sc[1] >> 3) & 0x7) + 1;
const int16_t ls_3_0 = 2*((sc[1] >> 6) & 0x7) + 1;
const int16_t ls_3_1 = 2*((sc[1] >> 9) & 0x7) + 1;
sc += 2;
// Accumulate in acc0 and acc1 for each sub-block.
acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_0_0, __riscv_vget_v_i16m8_i16m1(lsum1, 0), 16);
acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_0_1, __riscv_vget_v_i16m8_i16m1(lsum1, 1), 16);
acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_0_0, __riscv_vget_v_i16m8_i16m1(lsum2, 0), 16);
acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_0_1, __riscv_vget_v_i16m8_i16m1(lsum2, 1), 16);
//
acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_1_0, __riscv_vget_v_i16m8_i16m1(lsum1, 2), 16);
acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_1_1, __riscv_vget_v_i16m8_i16m1(lsum1, 3), 16);
acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_1_0, __riscv_vget_v_i16m8_i16m1(lsum2, 2), 16);
acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_1_1, __riscv_vget_v_i16m8_i16m1(lsum2, 3), 16);
//
acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_2_0, __riscv_vget_v_i16m8_i16m1(lsum1, 4), 16);
acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_2_1, __riscv_vget_v_i16m8_i16m1(lsum1, 5), 16);
acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_2_0, __riscv_vget_v_i16m8_i16m1(lsum2, 4), 16);
acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_2_1, __riscv_vget_v_i16m8_i16m1(lsum2, 5), 16);
//
acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_3_0, __riscv_vget_v_i16m8_i16m1(lsum1, 6), 16);
acc1 = __riscv_vwmacc_vx_i32m2(acc1, ls_3_1, __riscv_vget_v_i16m8_i16m1(lsum1, 7), 16);
acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_3_0, __riscv_vget_v_i16m8_i16m1(lsum2, 6), 16);
acc2 = __riscv_vwmacc_vx_i32m2(acc2, ls_3_1, __riscv_vget_v_i16m8_i16m1(lsum2, 7), 16);
}
// Reduce and accumulate in `sumf`.
vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, 1);
int sumi1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m2_i32m1(acc1, zero, 16));
int sumi2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vredsum_vs_i32m2_i32m1(acc2, zero, 16));
sumf += y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16) * (sumi1 + IQ1M_DELTA * sumi2);
}
*s = sumf;
}
void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined __riscv_v_intrinsic
switch (__riscv_vlenb() * 8) {
case 256:
ggml_vec_dot_iq1_m_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
break;
default:
ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
break;
}
#else
ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}


@@ -181,11 +181,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
const int16x8_t v_xylso = vec_mulo(v_xls, v_yl);
-const int16x8_t v_xyl = vec_meadd(v_xls, v_yl, v_xylso);
+const int16x8_t v_xylse = vec_mule(v_xls, v_yl);
const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh);
-const int16x8_t v_xyh = vec_meadd(v_xhs, v_yh, v_xyhso);
+const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh);
-int16x8_t v_xy_ = v_xyl + v_xyh; v_xy_ += vec_reve(v_xy_);
+int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_));
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
@@ -890,7 +890,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int16x8_t v_minsh = (int16x8_t)vec_unpackh((uint8x16_t)v_mins8);
const int32x4_t v_minso = vec_mulo(v_ysums, v_minsh);
-const int32x4_t v_mins = vec_meadd(v_ysums, v_minsh, v_minso);
+const int32x4_t v_minse = vec_mule(v_ysums, v_minsh);
+const int32x4_t v_mins = v_minso + v_minse;
sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]);
const uint8_t * scales = (const uint8_t *)utmp;
@@ -1003,7 +1004,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int16x8_t v_minsh = (int16x8_t)vec_unpackh(v_mins8);
const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
-const int32x4_t v_mins = vec_meadd(v_ysums, v_minsh, v_minsho);
+const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
+const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
const int32_t mins = vec_hsum_i32x4(v_mins);
const uint8_t * scales = (const uint8_t *)utmp;
@@ -1108,10 +1110,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int16x8_t v_scaleh = vec_unpackl(v_scale);
const int32x4_t v_minslo = vec_mulo(v_ysumsl, v_scalel);
-const int32x4_t v_minsl = vec_meadd(v_ysumsl, v_scalel, v_minslo);
+const int32x4_t v_minsle = vec_mule(v_ysumsl, v_scalel);
const int32x4_t v_minsho = vec_mulo(v_ysumsh, v_scaleh);
-const int32x4_t v_minsh = vec_meadd(v_ysumsh, v_scaleh, v_minsho);
-const int32x4_t v_mins = vec_add(v_minsl, v_minsh);
+const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
+const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
const int32_t mins = vec_hsum_i32x4(v_mins);
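
These hunks unfuse the vec_meadd calls into separate vec_mule plus vector adds. They rely on a simple identity: a widening dot product splits into an even-lane half (vec_mule) and an odd-lane half (vec_mulo), and summing both reproduces the fused multiply-even-add result. A standalone scalar check of that identity (sketch, not part of the diff):

#include <assert.h>
#include <stdint.h>

int main(void) {
    int8_t a[16], b[16];
    for (int i = 0; i < 16; ++i) { a[i] = (int8_t)(i * 7 - 50); b[i] = (int8_t)(90 - i * 11); }
    int32_t direct = 0, even = 0, odd = 0;
    for (int i = 0; i < 16; ++i)     direct += a[i] * b[i]; // full dot product
    for (int i = 0; i < 16; i += 2)  even   += a[i] * b[i]; // vec_mule lanes
    for (int i = 1; i < 16; i += 2)  odd    += a[i] * b[i]; // vec_mulo lanes
    assert(direct == even + odd);
    return 0;
}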

Some files were not shown because too many files have changed in this diff.