mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-05-28 17:27:26 +03:00
Compare commits
29 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6b4e4bd582 | ||
|
|
9f0e4b14d2 | ||
|
|
b3a739c9b6 | ||
|
|
4d8cc0c56f | ||
|
|
0d227ec358 | ||
|
|
1d971bba36 | ||
|
|
9777256c31 | ||
|
|
7085492c6f | ||
|
|
b4c0549a49 | ||
|
|
0d18aaa9d1 | ||
|
|
08bc21b459 | ||
|
|
35a74c8fb9 | ||
|
|
5190c2ea8d | ||
|
|
7799d31e68 | ||
|
|
3a3ed153d9 | ||
|
|
ef66bfab68 | ||
|
|
678d43d720 | ||
|
|
ef41a69179 | ||
|
|
3dc7684f39 | ||
|
|
dbe9c0c8ce | ||
|
|
6fe90deffa | ||
|
|
581d020b12 | ||
|
|
7623de11d9 | ||
|
|
c9d98295a3 | ||
|
|
1506d39e76 | ||
|
|
54121f7325 | ||
|
|
192d8ae8b8 | ||
|
|
35c9b1f39e | ||
|
|
4bead4e30d |
6
.github/workflows/build-3rd-party.yml
vendored
6
.github/workflows/build-3rd-party.yml
vendored
@@ -22,9 +22,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-llguidance:
|
||||
|
||||
@@ -31,7 +31,7 @@ jobs:
|
||||
android-ndk-snapdragon:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.6'
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.7'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -61,7 +61,7 @@ jobs:
|
||||
linux-iot-snapdragon:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.6'
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.7'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
6
.github/workflows/build-android.yml
vendored
6
.github/workflows/build-android.yml
vendored
@@ -27,9 +27,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
android:
|
||||
|
||||
92
.github/workflows/build-apple.yml
vendored
92
.github/workflows/build-apple.yml
vendored
@@ -32,12 +32,12 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
macOS-latest-ios:
|
||||
macos-latest-arm64:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
@@ -48,7 +48,79 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-ios
|
||||
key: macos-latest-arm64
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=OFF \
|
||||
-DGGML_METAL_SHADER_DEBUG=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main -E "test-llama-archs" --verbose --timeout 900
|
||||
|
||||
macos-latest-x64:
|
||||
runs-on: macos-15-intel
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macos-latest-x64
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_METAL=OFF \
|
||||
-DGGML_RPC=ON \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
|
||||
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
macos-latest-ios:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macos-latest-ios
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
@@ -117,7 +189,7 @@ jobs:
|
||||
xcodebuild -downloadPlatform iOS
|
||||
xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
|
||||
|
||||
macOS-latest-tvos:
|
||||
macos-latest-tvos:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
@@ -128,7 +200,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-tvos
|
||||
key: macos-latest-tvos
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
@@ -150,7 +222,7 @@ jobs:
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
macOS-latest-visionos:
|
||||
macos-latest-visionos:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
@@ -176,7 +248,7 @@ jobs:
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
macOS-latest-swift:
|
||||
macos-latest-swift:
|
||||
runs-on: macos-latest
|
||||
needs: macos-latest-ios-xcode
|
||||
|
||||
@@ -192,7 +264,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-swift
|
||||
key: macos-latest-swift
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
|
||||
8
.github/workflows/build-cache.yml
vendored
8
.github/workflows/build-cache.yml
vendored
@@ -28,7 +28,7 @@ jobs:
|
||||
id: cache-sdk
|
||||
with:
|
||||
path: ./vulkan_sdk
|
||||
key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||
key: cache-gha-vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup Vulkan SDK
|
||||
if: steps.cache-sdk.outputs.cache-hit != 'true'
|
||||
@@ -54,7 +54,7 @@ jobs:
|
||||
# id: cache-toolchain
|
||||
# with:
|
||||
# path: ./spacemit_toolchain
|
||||
# key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
# key: cache-gha-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
|
||||
# - name: Setup SpacemiT Toolchain
|
||||
# if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
||||
@@ -81,7 +81,7 @@ jobs:
|
||||
id: cache-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
|
||||
- name: Setup OpenVINO Toolkit
|
||||
if: steps.cache-openvino.outputs.cache-hit != 'true'
|
||||
@@ -108,7 +108,7 @@ jobs:
|
||||
id: cache-rocm
|
||||
with:
|
||||
path: C:\Program Files\AMD\ROCm
|
||||
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup ROCm
|
||||
if: steps.cache-rocm.outputs.cache-hit != 'true'
|
||||
|
||||
140
.github/workflows/build-cann.yml
vendored
140
.github/workflows/build-cann.yml
vendored
@@ -29,74 +29,76 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
openEuler-latest-cann:
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -el {0}
|
||||
strategy:
|
||||
matrix:
|
||||
arch: [x86, aarch64]
|
||||
chip_type: ['910b', '310p']
|
||||
build: ['Release']
|
||||
use_acl_graph: ['on', 'off']
|
||||
exclude:
|
||||
# 310P does not support USE_ACL_GRAPH=on
|
||||
- chip_type: '310p'
|
||||
use_acl_graph: 'on'
|
||||
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Free up disk space
|
||||
uses: ggml-org/free-disk-space@v1.3.1
|
||||
with:
|
||||
tool-cache: true
|
||||
|
||||
- name: Set container image
|
||||
id: cann-image
|
||||
run: |
|
||||
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Pull container image
|
||||
run: docker pull "${{ steps.cann-image.outputs.image }}"
|
||||
|
||||
- name: Build
|
||||
env:
|
||||
BUILD_TYPE: ${{ matrix.build }}
|
||||
SOC_TYPE: ascend${{ matrix.chip_type }}
|
||||
USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
|
||||
run: |
|
||||
HOST_UID=$(id -u)
|
||||
HOST_GID=$(id -g)
|
||||
|
||||
docker run --rm \
|
||||
-v "${PWD}:/workspace" \
|
||||
-w /workspace \
|
||||
-e SOC_TYPE=${SOC_TYPE} \
|
||||
-e BUILD_TYPE=${BUILD_TYPE} \
|
||||
-e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
|
||||
"${{ steps.cann-image.outputs.image }}" \
|
||||
bash -lc '
|
||||
set -e
|
||||
yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
|
||||
yum clean all && rm -rf /var/cache/yum
|
||||
git config --global --add safe.directory "/workspace"
|
||||
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
-DGGML_CANN=on \
|
||||
-DSOC_TYPE=${SOC_TYPE} \
|
||||
-DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
|
||||
cmake --build build -j $(nproc)
|
||||
|
||||
chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
|
||||
'
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# openEuler-latest-cann:
|
||||
# defaults:
|
||||
# run:
|
||||
# shell: bash -el {0}
|
||||
# strategy:
|
||||
# matrix:
|
||||
# arch: [x86, aarch64]
|
||||
# chip_type: ['910b', '310p']
|
||||
# build: ['Release']
|
||||
# use_acl_graph: ['on', 'off']
|
||||
# exclude:
|
||||
# # 310P does not support USE_ACL_GRAPH=on
|
||||
# - chip_type: '310p'
|
||||
# use_acl_graph: 'on'
|
||||
# runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
# steps:
|
||||
# - name: Checkout
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# fetch-depth: 0
|
||||
#
|
||||
# - name: Free up disk space
|
||||
# uses: ggml-org/free-disk-space@v1.3.1
|
||||
# with:
|
||||
# tool-cache: true
|
||||
#
|
||||
# - name: Set container image
|
||||
# id: cann-image
|
||||
# run: |
|
||||
# image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
# echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
#
|
||||
# - name: Pull container image
|
||||
# run: docker pull "${{ steps.cann-image.outputs.image }}"
|
||||
#
|
||||
# - name: Build
|
||||
# env:
|
||||
# BUILD_TYPE: ${{ matrix.build }}
|
||||
# SOC_TYPE: ascend${{ matrix.chip_type }}
|
||||
# USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
|
||||
# run: |
|
||||
# HOST_UID=$(id -u)
|
||||
# HOST_GID=$(id -g)
|
||||
#
|
||||
# docker run --rm \
|
||||
# -v "${PWD}:/workspace" \
|
||||
# -w /workspace \
|
||||
# -e SOC_TYPE=${SOC_TYPE} \
|
||||
# -e BUILD_TYPE=${BUILD_TYPE} \
|
||||
# -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
|
||||
# "${{ steps.cann-image.outputs.image }}" \
|
||||
# bash -lc '
|
||||
# set -e
|
||||
# yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
|
||||
# yum clean all && rm -rf /var/cache/yum
|
||||
# git config --global --add safe.directory "/workspace"
|
||||
# export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||
# cmake -S . -B build \
|
||||
# -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
# -DGGML_CANN=on \
|
||||
# -DSOC_TYPE=${SOC_TYPE} \
|
||||
# -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
|
||||
# cmake --build build -j $(nproc)
|
||||
#
|
||||
# chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
|
||||
# '
|
||||
|
||||
2
.github/workflows/build-cross.yml
vendored
2
.github/workflows/build-cross.yml
vendored
@@ -287,7 +287,7 @@ jobs:
|
||||
# id: cache-toolchain
|
||||
# with:
|
||||
# path: ./spacemit_toolchain
|
||||
# key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
# key: cache-gha-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup SpacemiT Toolchain
|
||||
#if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
||||
|
||||
8
.github/workflows/build-hip.yml
vendored
8
.github/workflows/build-hip.yml
vendored
@@ -31,9 +31,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
|
||||
@@ -93,7 +93,7 @@ jobs:
|
||||
id: cache-rocm
|
||||
with:
|
||||
path: C:\Program Files\AMD\ROCm
|
||||
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup ROCm
|
||||
if: steps.cache-rocm.outputs.cache-hit != 'true'
|
||||
|
||||
6
.github/workflows/build-ibm.yml
vendored
6
.github/workflows/build-ibm.yml
vendored
@@ -29,9 +29,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
|
||||
|
||||
6
.github/workflows/build-msys.yml
vendored
6
.github/workflows/build-msys.yml
vendored
@@ -15,9 +15,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
windows-msys2:
|
||||
|
||||
6
.github/workflows/build-opencl.yml
vendored
6
.github/workflows/build-opencl.yml
vendored
@@ -30,9 +30,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
|
||||
|
||||
8
.github/workflows/build-openvino.yml
vendored
8
.github/workflows/build-openvino.yml
vendored
@@ -29,9 +29,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-openvino:
|
||||
@@ -84,7 +84,7 @@ jobs:
|
||||
id: cache-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
|
||||
- name: Setup OpenVINO Toolkit
|
||||
if: steps.cache-openvino.outputs.cache-hit != 'true'
|
||||
|
||||
34
.github/workflows/build-riscv.yml
vendored
34
.github/workflows/build-riscv.yml
vendored
@@ -29,9 +29,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-cpu-riscv64-native:
|
||||
@@ -58,17 +58,20 @@ jobs:
|
||||
ldd --version
|
||||
cmake --version
|
||||
rustc --version
|
||||
env
|
||||
echo "nproc=$(nproc)"
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
|
||||
with:
|
||||
key: ubuntu-cpu-riscv64-native
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
# note: sparing some ccache since these jobs run on dedicated runners that are not part of the organitzation
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
|
||||
# with:
|
||||
# key: ubuntu-cpu-riscv64-native
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
@@ -132,12 +135,13 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
|
||||
with:
|
||||
key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
# note: sparing some ccache since these jobs run on dedicated runners that are not part of the organitzation
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
|
||||
# with:
|
||||
# key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
|
||||
6
.github/workflows/build-rpc.yml
vendored
6
.github/workflows/build-rpc.yml
vendored
@@ -29,9 +29,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
|
||||
|
||||
62
.github/workflows/build-sanitize.yml
vendored
62
.github/workflows/build-sanitize.yml
vendored
@@ -22,66 +22,78 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-latest-sanitizer:
|
||||
runs-on: ubuntu-latest
|
||||
ctest:
|
||||
runs-on: [self-hosted, X64, CPU, Linux]
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
build_type: [Debug]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }}
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
#- name: Dependencies
|
||||
# id: depends
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get install build-essential libssl-dev
|
||||
|
||||
# with UNDEFINED sanitizer, we have to build in Debug to avoid GCC 13 false-positive warnings
|
||||
- name: Build (undefined)
|
||||
id: cmake_build_undefined
|
||||
if: ${{ matrix.sanitizer == 'UNDEFINED' }}
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libssl-dev
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_TYPE=Debug \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
|
||||
cmake --build build --config Debug -j $(nproc)
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
if: ${{ matrix.sanitizer != 'THREAD' }}
|
||||
if: ${{ matrix.sanitizer == 'ADDRESS' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
cmake --build build --config RelWithDebInfo -j $(nproc)
|
||||
|
||||
- name: Build (no OpenMP)
|
||||
id: cmake_build_no_openmp
|
||||
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DGGML_OPENMP=OFF
|
||||
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
cmake --build build --config RelWithDebInfo -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
# skip run in Debug - very slow
|
||||
if: ${{ matrix.sanitizer != 'UNDEFINED' }}
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
ctest -L main -E tokenizer --verbose --timeout 900
|
||||
|
||||
63
.github/workflows/build-self-hosted.yml
vendored
63
.github/workflows/build-self-hosted.yml
vendored
@@ -50,9 +50,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ggml-ci-nvidia-cuda:
|
||||
@@ -297,8 +297,8 @@ jobs:
|
||||
source ./openvino_toolkit/setupvars.sh
|
||||
GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-arm64-cpu-low-perf:
|
||||
runs-on: [self-hosted, Linux, ARM64, CPU]
|
||||
ggml-ci-cpu-low-perf:
|
||||
runs-on: [self-hosted, CPU]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -310,8 +310,8 @@ jobs:
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-arm64-cpu-high-perf:
|
||||
runs-on: [self-hosted, Linux, ARM64, CPU]
|
||||
ggml-ci-cpu-high-perf:
|
||||
runs-on: [self-hosted, CPU]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -354,3 +354,52 @@ jobs:
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-arm64-cpu-kleidiai-graviton4:
|
||||
runs-on: ah-ubuntu_22_04-c8g_8x
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
set -euxo pipefail
|
||||
sudo apt-get update
|
||||
sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
|
||||
apt-get install -y \
|
||||
build-essential \
|
||||
python3-venv \
|
||||
gpg \
|
||||
wget \
|
||||
time \
|
||||
git-lfs
|
||||
|
||||
git lfs install
|
||||
|
||||
# install the latest cmake
|
||||
sudo install -d /usr/share/keyrings
|
||||
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
|
||||
| gpg --dearmor \
|
||||
| sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
|
||||
echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
|
||||
| sudo tee /etc/apt/sources.list.d/kitware.list
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y cmake
|
||||
|
||||
# note: sparing some ccache since these jobs run on dedicated runners that are not part of the organitzation
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: ggml-ci-arm64-cpu-kleidiai-graviton4
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
GG_BUILD_KLEIDIAI=1 \
|
||||
GG_BUILD_EXTRA_TESTS_0=1 \
|
||||
bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
250
.github/workflows/build-sycl.yml
vendored
250
.github/workflows/build-sycl.yml
vendored
@@ -29,130 +29,134 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
|
||||
ubuntu-24-sycl:
|
||||
strategy:
|
||||
matrix:
|
||||
build: [fp32]
|
||||
include:
|
||||
- build: fp32
|
||||
fp16: OFF
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# ubuntu-24-sycl:
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [fp32]
|
||||
# include:
|
||||
# - build: fp32
|
||||
# fp16: OFF
|
||||
#
|
||||
# runs-on: ubuntu-24.04
|
||||
#
|
||||
# env:
|
||||
# ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
# LEVEL_ZERO_VERSION: "1.28.2"
|
||||
# LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
#
|
||||
# continue-on-error: true
|
||||
#
|
||||
# steps:
|
||||
# - uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
# sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: bash
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
# sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
#
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: ubuntu-24-sycl-${{ matrix.build }}
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: |
|
||||
# source /opt/intel/oneapi/setvars.sh
|
||||
# cmake -B build \
|
||||
# -G "Ninja" \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_SYCL=ON \
|
||||
# -DCMAKE_C_COMPILER=icx \
|
||||
# -DCMAKE_CXX_COMPILER=icpx \
|
||||
# -DLLAMA_OPENSSL=OFF \
|
||||
# -DGGML_NATIVE=OFF \
|
||||
# -DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
# time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
env:
|
||||
ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
LEVEL_ZERO_VERSION: "1.28.2"
|
||||
LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-sycl-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
windows-latest-sycl:
|
||||
runs-on: windows-2022
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: pwsh
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
"LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: windows-latest-sycl
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: examples/sycl/win-build-sycl.bat
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# windows-latest-sycl:
|
||||
# runs-on: windows-2022
|
||||
#
|
||||
# defaults:
|
||||
# run:
|
||||
# shell: bash
|
||||
#
|
||||
# env:
|
||||
# WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
# WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
# LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
# ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: pwsh
|
||||
# run: |
|
||||
# Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
# Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
# "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: windows-latest-sycl
|
||||
# variant: ccache
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#
|
||||
# # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: examples/sycl/win-build-sycl.bat
|
||||
|
||||
8
.github/workflows/build-vulkan.yml
vendored
8
.github/workflows/build-vulkan.yml
vendored
@@ -31,9 +31,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-vulkan-llvmpipe:
|
||||
@@ -68,7 +68,7 @@ jobs:
|
||||
id: cache-sdk
|
||||
with:
|
||||
path: ./vulkan_sdk
|
||||
key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||
key: cache-gha-vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup Vulkan SDK
|
||||
if: steps.cache-sdk.outputs.cache-hit != 'true'
|
||||
|
||||
37
.github/workflows/build-webgpu.yml
vendored
37
.github/workflows/build-webgpu.yml
vendored
@@ -30,13 +30,12 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
|
||||
macOS-latest-arm64-webgpu:
|
||||
macos-latest-webgpu:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
@@ -47,7 +46,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-arm64-webgpu
|
||||
key: macos-latest-webgpu
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
@@ -100,25 +99,6 @@ jobs:
|
||||
sudo apt-get install -y build-essential mesa-vulkan-drivers \
|
||||
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
|
||||
|
||||
- name: Get latest Vulkan SDK version
|
||||
id: vulkan_sdk_version
|
||||
run: |
|
||||
echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Use Vulkan SDK Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sdk
|
||||
with:
|
||||
path: ./vulkan_sdk
|
||||
key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup Vulkan SDK
|
||||
if: steps.cache-sdk.outputs.cache-hit != 'true'
|
||||
uses: ./.github/actions/linux-setup-vulkan
|
||||
with:
|
||||
path: ./vulkan_sdk
|
||||
version: ${{ env.VULKAN_SDK_VERSION }}
|
||||
|
||||
- name: Dawn Dependency
|
||||
id: dawn-depends
|
||||
run: |
|
||||
@@ -157,6 +137,13 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-webgpu-wasm
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Install Emscripten
|
||||
run: |
|
||||
git clone https://github.com/emscripten-core/emsdk.git
|
||||
|
||||
243
.github/workflows/build.yml
vendored
243
.github/workflows/build.yml
vendored
@@ -52,86 +52,14 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
build-cmake-pkg:
|
||||
uses: ./.github/workflows/build-cmake-pkg.yml
|
||||
|
||||
macOS-latest-arm64:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-arm64
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=OFF \
|
||||
-DGGML_METAL_SHADER_DEBUG=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main -E "test-llama-archs" --verbose --timeout 900
|
||||
|
||||
macOS-latest-x64:
|
||||
runs-on: macos-15-intel
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-x64
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_METAL=OFF \
|
||||
-DGGML_RPC=ON \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
|
||||
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
ubuntu-cpu:
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -253,16 +181,16 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'cpu-x64 (static)'
|
||||
- build: 'x64-cpu-static'
|
||||
arch: 'x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF'
|
||||
- build: 'openblas-x64'
|
||||
- build: 'x64-openblas'
|
||||
arch: 'x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'vulkan-x64'
|
||||
- build: 'x64-vulkan'
|
||||
arch: 'x64'
|
||||
defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
|
||||
- build: 'llvm-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
|
||||
- build: 'arm64'
|
||||
arch: 'arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
|
||||
|
||||
@@ -281,7 +209,7 @@ jobs:
|
||||
|
||||
- name: Download OpenBLAS
|
||||
id: get_openblas
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
if: ${{ matrix.build == 'x64-openblas' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
|
||||
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
|
||||
@@ -294,7 +222,7 @@ jobs:
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'vulkan-x64' }}
|
||||
if: ${{ matrix.build == 'x64-vulkan' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
|
||||
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
||||
@@ -315,7 +243,7 @@ jobs:
|
||||
|
||||
- name: Add libopenblas.dll
|
||||
id: add_libopenblas_dll
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
if: ${{ matrix.build == 'x64-openblas' }}
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
|
||||
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
|
||||
@@ -428,31 +356,33 @@ jobs:
|
||||
|
||||
# TODO: simplify the following workflows using a matrix
|
||||
# TODO: run lighter CI on PRs and the full CI only on master (if needed)
|
||||
ggml-ci-x64-cpu-low-perf:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ggml-ci-x64-cpu-low-perf
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
# note: moved to build-self-hosted.yml - can remove from here when everything is stable
|
||||
# ggml-ci-x64-cpu-low-perf:
|
||||
# runs-on: ubuntu-22.04
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: ggml-ci-x64-cpu-low-perf
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#
|
||||
# - name: Dependencies
|
||||
# id: depends
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get install build-essential
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
# note: moved to build-self-hosted.yml - can remove from here when everything is stable
|
||||
# ggml-ci-arm64-cpu-low-perf:
|
||||
@@ -481,31 +411,32 @@ jobs:
|
||||
# run: |
|
||||
# LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
ggml-ci-x64-cpu-high-perf:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ggml-ci-x64-cpu-high-perf
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
# note: moved to build-self-hosted.yml - can remove from here when everything is stable
|
||||
# ggml-ci-x64-cpu-high-perf:
|
||||
# runs-on: ubuntu-22.04
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: ggml-ci-x64-cpu-high-perf
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#
|
||||
# - name: Dependencies
|
||||
# id: depends
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get install build-essential
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
# note: moved to build-self-hosted.yml - can remove from here when everything is stable
|
||||
# ggml-ci-arm64-cpu-high-perf:
|
||||
@@ -585,51 +516,3 @@ jobs:
|
||||
id: ggml-ci
|
||||
run: |
|
||||
GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
ggml-ci-arm64-cpu-kleidiai-graviton4:
|
||||
runs-on: ah-ubuntu_22_04-c8g_8x
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
set -euxo pipefail
|
||||
sudo apt-get update
|
||||
sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
|
||||
apt-get install -y \
|
||||
build-essential \
|
||||
python3-venv \
|
||||
gpg \
|
||||
wget \
|
||||
time \
|
||||
git-lfs
|
||||
|
||||
git lfs install
|
||||
|
||||
# install the latest cmake
|
||||
sudo install -d /usr/share/keyrings
|
||||
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
|
||||
| gpg --dearmor \
|
||||
| sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
|
||||
echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
|
||||
| sudo tee /etc/apt/sources.list.d/kitware.list
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y cmake
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ggml-ci-arm64-cpu-kleidiai-graviton4
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
GG_BUILD_KLEIDIAI=1 \
|
||||
GG_BUILD_EXTRA_TESTS_0=1 \
|
||||
bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
6
.github/workflows/hip-quality-check.yml
vendored
6
.github/workflows/hip-quality-check.yml
vendored
@@ -28,9 +28,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-22-hip-quality-check:
|
||||
|
||||
665
.github/workflows/release.yml
vendored
665
.github/workflows/release.yml
vendored
@@ -37,8 +37,30 @@ env:
|
||||
|
||||
jobs:
|
||||
|
||||
macOS-cpu:
|
||||
check_release:
|
||||
runs-on: [self-hosted, fast]
|
||||
|
||||
outputs:
|
||||
should_release: ${{ steps.check.outputs.should_release }}
|
||||
|
||||
steps:
|
||||
- id: check
|
||||
run: |
|
||||
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
|
||||
echo "should_release=true" >> $GITHUB_OUTPUT
|
||||
elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/master" ]]; then
|
||||
if echo "${{ github.event.head_commit.message }}" | grep -q '\[no release\]'; then
|
||||
echo "should_release=false" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "should_release=true" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
else
|
||||
echo "should_release=false" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
macos-cpu:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
@@ -76,7 +98,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-${{ matrix.arch }}
|
||||
key: macos-latest-${{ matrix.arch }}
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Build
|
||||
@@ -109,7 +131,8 @@ jobs:
|
||||
name: llama-bin-macos-${{ matrix.build }}.tar.gz
|
||||
|
||||
ubuntu-cpu:
|
||||
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
@@ -186,6 +209,8 @@ jobs:
|
||||
name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
|
||||
|
||||
ubuntu-vulkan:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -262,6 +287,8 @@ jobs:
|
||||
name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
|
||||
|
||||
android-arm64:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -339,6 +366,8 @@ jobs:
|
||||
name: llama-bin-android-arm64.tar.gz
|
||||
|
||||
ubuntu-24-openvino:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
@@ -385,7 +414,7 @@ jobs:
|
||||
id: cache-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
|
||||
- name: Setup OpenVINO Toolkit
|
||||
if: steps.cache-openvino.outputs.cache-hit != 'true'
|
||||
@@ -427,6 +456,8 @@ jobs:
|
||||
name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
|
||||
|
||||
windows-cpu:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: windows-2025
|
||||
|
||||
@@ -487,6 +518,8 @@ jobs:
|
||||
name: llama-bin-win-cpu-${{ matrix.arch }}.zip
|
||||
|
||||
windows:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: windows-2025
|
||||
|
||||
@@ -577,6 +610,8 @@ jobs:
|
||||
name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
|
||||
|
||||
windows-cuda:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: windows-2022
|
||||
|
||||
@@ -655,212 +690,218 @@ jobs:
|
||||
path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
|
||||
name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
|
||||
|
||||
windows-sycl:
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# windows-sycl:
|
||||
#
|
||||
# runs-on: windows-2022
|
||||
#
|
||||
# defaults:
|
||||
# run:
|
||||
# shell: bash
|
||||
#
|
||||
# env:
|
||||
# WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
# WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
# LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
# ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: pwsh
|
||||
# run: |
|
||||
# Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
# Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
# "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
#
|
||||
# - name: Setup Node.js
|
||||
# uses: actions/setup-node@v6
|
||||
# with:
|
||||
# node-version: "24"
|
||||
# cache: "npm"
|
||||
# cache-dependency-path: "tools/ui/package-lock.json"
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: windows-latest-sycl
|
||||
# variant: ccache
|
||||
# evict-old-files: 1d
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# shell: cmd
|
||||
# run: |
|
||||
# call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
# cmake -G "Ninja" -B build ^
|
||||
# -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
|
||||
# -DCMAKE_BUILD_TYPE=Release ^
|
||||
# -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
|
||||
# -DGGML_CPU=OFF -DGGML_SYCL=ON ^
|
||||
# -DLLAMA_BUILD_BORINGSSL=ON
|
||||
# cmake --build build --target ggml-sycl -j
|
||||
#
|
||||
# - name: Build the release package
|
||||
# id: pack_artifacts
|
||||
# run: |
|
||||
# echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||
# ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
|
||||
# if [ -n "$ZE_LOADER_DLL" ]; then
|
||||
# echo "Using Level Zero loader: $ZE_LOADER_DLL"
|
||||
# cp "$ZE_LOADER_DLL" ./build/bin
|
||||
# else
|
||||
# echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
|
||||
# fi
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
|
||||
#
|
||||
# echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
# 7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
|
||||
#
|
||||
# - name: Upload the release package
|
||||
# uses: actions/upload-artifact@v6
|
||||
# with:
|
||||
# path: llama-bin-win-sycl-x64.zip
|
||||
# name: llama-bin-win-sycl-x64.zip
|
||||
|
||||
runs-on: windows-2022
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: pwsh
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
"LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: "24"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: windows-latest-sycl
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
run: |
|
||||
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
cmake -G "Ninja" -B build ^
|
||||
-DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
|
||||
-DCMAKE_BUILD_TYPE=Release ^
|
||||
-DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
|
||||
-DGGML_CPU=OFF -DGGML_SYCL=ON ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --target ggml-sycl -j
|
||||
|
||||
- name: Build the release package
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||
ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
|
||||
if [ -n "$ZE_LOADER_DLL" ]; then
|
||||
echo "Using Level Zero loader: $ZE_LOADER_DLL"
|
||||
cp "$ZE_LOADER_DLL" ./build/bin
|
||||
else
|
||||
echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
|
||||
fi
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
|
||||
|
||||
echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload the release package
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-bin-win-sycl-x64.zip
|
||||
name: llama-bin-win-sycl-x64.zip
|
||||
|
||||
ubuntu-24-sycl:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
build: [fp32]
|
||||
include:
|
||||
- build: fp32
|
||||
fp16: OFF
|
||||
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
env:
|
||||
ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
LEVEL_ZERO_VERSION: "1.28.2"
|
||||
LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: "24"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-sycl-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# ubuntu-24-sycl:
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [fp32]
|
||||
# include:
|
||||
# - build: fp32
|
||||
# fp16: OFF
|
||||
#
|
||||
# runs-on: ubuntu-24.04
|
||||
#
|
||||
# env:
|
||||
# ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
# LEVEL_ZERO_VERSION: "1.28.2"
|
||||
# LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# fetch-depth: 0
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
# sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: bash
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
# sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
#
|
||||
# - name: Setup Node.js
|
||||
# uses: actions/setup-node@v6
|
||||
# with:
|
||||
# node-version: "24"
|
||||
# cache: "npm"
|
||||
# cache-dependency-path: "tools/ui/package-lock.json"
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: ubuntu-24-sycl-${{ matrix.build }}
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: |
|
||||
# source /opt/intel/oneapi/setvars.sh
|
||||
# cmake -B build \
|
||||
# -G "Ninja" \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_SYCL=ON \
|
||||
# -DCMAKE_C_COMPILER=icx \
|
||||
# -DCMAKE_CXX_COMPILER=icpx \
|
||||
# -DLLAMA_OPENSSL=OFF \
|
||||
# -DGGML_NATIVE=OFF \
|
||||
# -DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
# time cmake --build build --config Release -j $(nproc)
|
||||
#
|
||||
# - name: Determine tag name
|
||||
# id: tag
|
||||
# uses: ./.github/actions/get-tag-name
|
||||
#
|
||||
# - name: Pack artifacts
|
||||
# id: pack_artifacts
|
||||
# run: |
|
||||
# cp LICENSE ./build/bin/
|
||||
# tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
|
||||
#
|
||||
# - name: Upload artifacts
|
||||
# uses: actions/upload-artifact@v6
|
||||
# with:
|
||||
# path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
# name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
|
||||
ubuntu-22-rocm:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
@@ -972,6 +1013,8 @@ jobs:
|
||||
name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
|
||||
|
||||
windows-hip:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: windows-2022
|
||||
|
||||
@@ -1008,7 +1051,7 @@ jobs:
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: C:\Program Files\AMD\ROCm
|
||||
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
@@ -1086,6 +1129,8 @@ jobs:
|
||||
name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
|
||||
|
||||
ios-xcode-build:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
runs-on: macos-15
|
||||
|
||||
steps:
|
||||
@@ -1141,98 +1186,101 @@ jobs:
|
||||
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
|
||||
name: llama-${{ steps.tag.outputs.name }}-xcframework.zip
|
||||
|
||||
|
||||
openEuler-cann:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
# 910b with aclgraph (both architectures)
|
||||
- arch: x86
|
||||
chip_type: '910b'
|
||||
build: 'Release'
|
||||
use_acl_graph: 'on'
|
||||
- arch: aarch64
|
||||
chip_type: '910b'
|
||||
build: 'Release'
|
||||
use_acl_graph: 'on'
|
||||
# 310p without aclgraph (both architectures)
|
||||
- arch: x86
|
||||
chip_type: '310p'
|
||||
build: 'Release'
|
||||
use_acl_graph: 'off'
|
||||
- arch: aarch64
|
||||
chip_type: '310p'
|
||||
build: 'Release'
|
||||
use_acl_graph: 'off'
|
||||
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Free up disk space
|
||||
uses: ggml-org/free-disk-space@v1.3.1
|
||||
with:
|
||||
tool-cache: true
|
||||
|
||||
- name: Set container image
|
||||
id: cann-image
|
||||
run: |
|
||||
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Pull container image
|
||||
run: docker pull "${{ steps.cann-image.outputs.image }}"
|
||||
|
||||
- name: Build
|
||||
env:
|
||||
BUILD_TYPE: ${{ matrix.build }}
|
||||
SOC_TYPE: ascend${{ matrix.chip_type }}
|
||||
USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
|
||||
run: |
|
||||
HOST_UID=$(id -u)
|
||||
HOST_GID=$(id -g)
|
||||
|
||||
docker run --rm \
|
||||
-v "${PWD}:/workspace" \
|
||||
-w /workspace \
|
||||
-e SOC_TYPE=${SOC_TYPE} \
|
||||
-e BUILD_TYPE=${BUILD_TYPE} \
|
||||
-e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
|
||||
"${{ steps.cann-image.outputs.image }}" \
|
||||
bash -lc '
|
||||
set -e
|
||||
yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
|
||||
yum clean all && rm -rf /var/cache/yum
|
||||
git config --global --add safe.directory "/workspace"
|
||||
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
-DGGML_CANN=on \
|
||||
-DSOC_TYPE=${SOC_TYPE} \
|
||||
-DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
|
||||
cmake --build build -j $(nproc)
|
||||
|
||||
chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
|
||||
'
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# openEuler-cann:
|
||||
# strategy:
|
||||
# matrix:
|
||||
# include:
|
||||
# # 910b with aclgraph (both architectures)
|
||||
# - arch: x86
|
||||
# chip_type: '910b'
|
||||
# build: 'Release'
|
||||
# use_acl_graph: 'on'
|
||||
# - arch: aarch64
|
||||
# chip_type: '910b'
|
||||
# build: 'Release'
|
||||
# use_acl_graph: 'on'
|
||||
# # 310p without aclgraph (both architectures)
|
||||
# - arch: x86
|
||||
# chip_type: '310p'
|
||||
# build: 'Release'
|
||||
# use_acl_graph: 'off'
|
||||
# - arch: aarch64
|
||||
# chip_type: '310p'
|
||||
# build: 'Release'
|
||||
# use_acl_graph: 'off'
|
||||
# runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
# steps:
|
||||
# - name: Checkout
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# fetch-depth: 0
|
||||
#
|
||||
# - name: Free up disk space
|
||||
# uses: ggml-org/free-disk-space@v1.3.1
|
||||
# with:
|
||||
# tool-cache: true
|
||||
#
|
||||
# - name: Set container image
|
||||
# id: cann-image
|
||||
# run: |
|
||||
# image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
# echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
#
|
||||
# - name: Pull container image
|
||||
# run: docker pull "${{ steps.cann-image.outputs.image }}"
|
||||
#
|
||||
# - name: Build
|
||||
# env:
|
||||
# BUILD_TYPE: ${{ matrix.build }}
|
||||
# SOC_TYPE: ascend${{ matrix.chip_type }}
|
||||
# USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
|
||||
# run: |
|
||||
# HOST_UID=$(id -u)
|
||||
# HOST_GID=$(id -g)
|
||||
#
|
||||
# docker run --rm \
|
||||
# -v "${PWD}:/workspace" \
|
||||
# -w /workspace \
|
||||
# -e SOC_TYPE=${SOC_TYPE} \
|
||||
# -e BUILD_TYPE=${BUILD_TYPE} \
|
||||
# -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
|
||||
# "${{ steps.cann-image.outputs.image }}" \
|
||||
# bash -lc '
|
||||
# set -e
|
||||
# yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
|
||||
# yum clean all && rm -rf /var/cache/yum
|
||||
# git config --global --add safe.directory "/workspace"
|
||||
# export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||
# cmake -S . -B build \
|
||||
# -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
# -DGGML_CANN=on \
|
||||
# -DSOC_TYPE=${SOC_TYPE} \
|
||||
# -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
|
||||
# cmake --build build -j $(nproc)
|
||||
#
|
||||
# chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
|
||||
# '
|
||||
#
|
||||
# - name: Determine tag name
|
||||
# id: tag
|
||||
# uses: ./.github/actions/get-tag-name
|
||||
#
|
||||
# - name: Pack artifacts
|
||||
# run: |
|
||||
# cp LICENSE ./build/bin/
|
||||
# tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
|
||||
#
|
||||
# - name: Upload artifacts
|
||||
# uses: actions/upload-artifact@v6
|
||||
# with:
|
||||
# path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
# name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
|
||||
ui-build:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
uses: ./.github/workflows/ui-build.yml
|
||||
|
||||
release:
|
||||
@@ -1249,17 +1297,17 @@ jobs:
|
||||
- windows
|
||||
- windows-cpu
|
||||
- windows-cuda
|
||||
- windows-sycl
|
||||
#- windows-sycl
|
||||
- windows-hip
|
||||
- ubuntu-22-rocm
|
||||
- ubuntu-cpu
|
||||
- ubuntu-vulkan
|
||||
- ubuntu-24-openvino
|
||||
- ubuntu-24-sycl
|
||||
#- ubuntu-24-sycl
|
||||
- android-arm64
|
||||
- macOS-cpu
|
||||
- macos-cpu
|
||||
- ios-xcode-build
|
||||
- openEuler-cann
|
||||
#- openEuler-cann
|
||||
- ui-build
|
||||
|
||||
outputs:
|
||||
@@ -1360,7 +1408,7 @@ jobs:
|
||||
- [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
|
||||
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
|
||||
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
|
||||
- [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
|
||||
- Ubuntu x64 (SYCL FP32) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
|
||||
**Android:**
|
||||
- [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
|
||||
@@ -1371,14 +1419,15 @@ jobs:
|
||||
- [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
|
||||
- [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip) - [CUDA 13.1 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.1-x64.zip)
|
||||
- [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
|
||||
- [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
|
||||
- Windows x64 (SYCL) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
- [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
|
||||
|
||||
**openEuler:**
|
||||
- [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz)
|
||||
- [openEuler x86 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86-aclgraph.tar.gz)
|
||||
- [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
|
||||
- [openEuler aarch64 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64-aclgraph.tar.gz)
|
||||
- [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
- openEuler x86 (310p)
|
||||
- openEuler x86 (910b, ACL Graph)
|
||||
- openEuler aarch64 (310p)
|
||||
- openEuler aarch64 (910b, ACL Graph)
|
||||
|
||||
**UI:**
|
||||
- [UI](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-ui.tar.gz)
|
||||
|
||||
36
.github/workflows/server-sanitize.yml
vendored
36
.github/workflows/server-sanitize.yml
vendored
@@ -26,10 +26,10 @@ on:
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
@@ -37,7 +37,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
server:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: [self-hosted, CPU, Linux, llama-server]
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -46,19 +46,19 @@ jobs:
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get -y install \
|
||||
build-essential \
|
||||
xxd \
|
||||
git \
|
||||
cmake \
|
||||
curl \
|
||||
wget \
|
||||
language-pack-en \
|
||||
libssl-dev
|
||||
#- name: Dependencies
|
||||
# id: depends
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get -y install \
|
||||
# build-essential \
|
||||
# xxd \
|
||||
# git \
|
||||
# cmake \
|
||||
# curl \
|
||||
# wget \
|
||||
# language-pack-en \
|
||||
# libssl-dev
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
|
||||
8
.github/workflows/server-self-hosted.yml
vendored
8
.github/workflows/server-self-hosted.yml
vendored
@@ -29,10 +29,10 @@ on:
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
|
||||
49
.github/workflows/server.yml
vendored
49
.github/workflows/server.yml
vendored
@@ -44,25 +44,20 @@ on:
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
ui-build:
|
||||
name: Build Web UI
|
||||
uses: ./.github/workflows/ui-build.yml
|
||||
|
||||
server:
|
||||
ubuntu:
|
||||
runs-on: ubuntu-latest
|
||||
needs: ui-build
|
||||
|
||||
name: server (${{ matrix.wf_name }})
|
||||
name: ubuntu (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
build_type: [Release]
|
||||
@@ -98,17 +93,17 @@ jobs:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Download built UI
|
||||
uses: actions/download-artifact@v7
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
name: ui-build
|
||||
path: tools/ui/dist
|
||||
key: server-ubuntu-default
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
|
||||
@@ -135,8 +130,8 @@ jobs:
|
||||
export ${{ matrix.extra_args }}
|
||||
SLOW_TESTS=1 pytest -v -x
|
||||
|
||||
server-windows:
|
||||
runs-on: windows-2022
|
||||
windows:
|
||||
runs-on: windows-2025
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -146,16 +141,24 @@ jobs:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
node-version: "24"
|
||||
key: server-windows-default
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
run: |
|
||||
cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
|
||||
cmake -B build -G "Ninja Multi-Config" ^
|
||||
-DCMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake ^
|
||||
-DCMAKE_BUILD_TYPE=Release ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON ^
|
||||
-DGGML_SCHED_NO_REALLOC=ON
|
||||
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||
cmake --build build --config Release -j %NINJA_JOBS% --target llama-server
|
||||
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
|
||||
8
.github/workflows/ui-self-hosted.yml
vendored
8
.github/workflows/ui-self-hosted.yml
vendored
@@ -30,10 +30,10 @@ on:
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
|
||||
8
.github/workflows/ui.yml
vendored
8
.github/workflows/ui.yml
vendored
@@ -26,10 +26,10 @@ on:
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
|
||||
@@ -63,6 +63,7 @@ After submitting your PR:
|
||||
- Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
|
||||
- Let other maintainers merge their own PRs
|
||||
- When merging a PR, make sure you have a good understanding of the changes
|
||||
- If a PR does not warrant a new release, add `[no release]` in the squashed commit to spare CI resources
|
||||
- Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)
|
||||
|
||||
Maintainers reserve the right to decline review or close pull requests for any reason, without any questions, particularly under any of the following conditions:
|
||||
|
||||
13
ci/run.sh
13
ci/run.sh
@@ -66,6 +66,8 @@ fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
||||
else
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||
@@ -114,10 +116,7 @@ fi
|
||||
if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
|
||||
|
||||
# if on Mac, disable METAL
|
||||
if [[ "$OSTYPE" == "darwin"* ]]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
|
||||
|
||||
MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION="/usr/local/lib/cmake/vulkan"
|
||||
MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION="${MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION}/SPIRV-Headers/SPIRV-HeadersConfig.cmake"
|
||||
if [[ -f "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" || -h "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" ]]; then
|
||||
@@ -133,7 +132,7 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_WEBGPU} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1 -DGGML_METAL=OFF -DGGML_BLAS=OFF"
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"
|
||||
|
||||
if [ ! -z "${GG_BUILD_WEBGPU_DAWN_PREFIX}" ]; then
|
||||
if [ -z "${CMAKE_PREFIX_PATH}" ]; then
|
||||
@@ -167,6 +166,8 @@ fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_BLAS} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=${GG_BUILD_BLAS_VENDOR:-OpenBLAS}"
|
||||
else
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=OFF"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_OPENVINO} ]; then
|
||||
@@ -700,8 +701,8 @@ function gg_sum_test_backend_ops_cpu {
|
||||
|
||||
## main
|
||||
|
||||
export LLAMA_LOG_PREFIX=1
|
||||
export LLAMA_LOG_TIMESTAMPS=1
|
||||
export LLAMA_ARG_LOG_PREFIX=1
|
||||
export LLAMA_ARG_LOG_TIMESTAMPS=1
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models
|
||||
|
||||
@@ -3026,7 +3026,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.default_template_kwargs[item.key()] = item.value().dump();
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CHAT_TEMPLATE_KWARGS"));
|
||||
add_opt(common_arg(
|
||||
{"-to", "--timeout"}, "N",
|
||||
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
|
||||
@@ -3327,7 +3327,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params &, const std::string & value) {
|
||||
common_log_set_file(common_log_main(), value.c_str());
|
||||
}
|
||||
).set_env("LLAMA_LOG_FILE"));
|
||||
).set_env("LLAMA_ARG_LOG_FILE"));
|
||||
add_opt(common_arg(
|
||||
{"--log-colors"}, "[on|off|auto]",
|
||||
"Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
|
||||
@@ -3344,7 +3344,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
|
||||
}
|
||||
}
|
||||
).set_env("LLAMA_LOG_COLORS"));
|
||||
).set_env("LLAMA_ARG_LOG_COLORS"));
|
||||
add_opt(common_arg(
|
||||
{"-v", "--verbose", "--log-verbose"},
|
||||
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
|
||||
@@ -3359,7 +3359,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.offline = true;
|
||||
}
|
||||
).set_env("LLAMA_OFFLINE"));
|
||||
).set_env("LLAMA_ARG_OFFLINE"));
|
||||
add_opt(common_arg(
|
||||
{"-lv", "--verbosity", "--log-verbosity"}, "N",
|
||||
string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
|
||||
@@ -3374,7 +3374,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.verbosity = value;
|
||||
common_log_set_verbosity_thold(value);
|
||||
}
|
||||
).set_env("LLAMA_LOG_VERBOSITY"));
|
||||
).set_env("LLAMA_ARG_LOG_VERBOSITY"));
|
||||
add_opt(common_arg(
|
||||
{"--log-prefix"},
|
||||
{"--no-log-prefix"},
|
||||
|
||||
@@ -74,6 +74,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"Gemma3nForCausalLM": "gemma",
|
||||
"Gemma3nForConditionalGeneration": "gemma",
|
||||
"Gemma4ForConditionalGeneration": "gemma",
|
||||
"Gemma4ForCausalLM": "gemma",
|
||||
"GemmaForCausalLM": "gemma",
|
||||
"Glm4ForCausalLM": "glm",
|
||||
"Glm4MoeForCausalLM": "glm",
|
||||
@@ -215,6 +216,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"T5EncoderModel": "t5",
|
||||
"T5ForConditionalGeneration": "t5",
|
||||
"T5WithLMHeadModel": "t5",
|
||||
"TalkieForCausalLM": "talkie",
|
||||
"UMT5ForConditionalGeneration": "t5",
|
||||
"UMT5Model": "t5",
|
||||
"UltravoxModel": "ultravox",
|
||||
|
||||
@@ -1622,6 +1622,12 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57":
|
||||
# ref: https://huggingface.co/sarvamai/sarvam-30b
|
||||
res = "sarvam-moe"
|
||||
if chkhsh == "f728162c1315c26e40249849799b4ba3fe584c32084b4795b03eb295e63cb5af":
|
||||
# ref: https://huggingface.co/lewtun/talkie-1930-13b-it-hf
|
||||
res = "talkie"
|
||||
if chkhsh == "36f3066e97b7f3994b379aaacde306c1444c6ae84e81a5ae3cd2b7ed3b8c42d4":
|
||||
# ref: https://huggingface.co/openbmb/MiniCPM5-1B
|
||||
res = "minicpm5"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
|
||||
@@ -614,7 +614,7 @@ class Gemma3NModel(Gemma3Model):
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Gemma4ForConditionalGeneration")
|
||||
@ModelBase.register("Gemma4ForConditionalGeneration", "Gemma4ForCausalLM")
|
||||
class Gemma4Model(Gemma3Model):
|
||||
model_arch = gguf.MODEL_ARCH.GEMMA4
|
||||
|
||||
|
||||
53
conversion/talkie.py
Normal file
53
conversion/talkie.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Iterable, TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from torch import Tensor
|
||||
|
||||
from .base import LazyTorchTensor, ModelBase, TextModel, gguf
|
||||
|
||||
|
||||
@ModelBase.register("TalkieForCausalLM")
|
||||
class TalkieModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.TALKIE
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
# Talkie used F.rms_norm without an explicit eps
|
||||
self.gguf_writer.add_layer_norm_rms_eps(torch.finfo(torch.float32).eps)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
prefix = f"model.blocks.{bid}." if bid is not None else ""
|
||||
suffix = name.removeprefix(prefix)
|
||||
|
||||
if suffix == "attn_gain.a_g":
|
||||
yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid, ".scale"), data_torch
|
||||
return
|
||||
elif suffix == "mlp_gain.a_g":
|
||||
yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid, ".scale"), data_torch
|
||||
return
|
||||
elif suffix == "lm_head_gain.w_g":
|
||||
self.gguf_writer.add_logit_scale(LazyTorchTensor.to_eager(data_torch).item())
|
||||
return
|
||||
elif suffix in ("attn.attn_query.weight", "attn.attn_key.weight"):
|
||||
# absorb inverse rope
|
||||
head_dim = self.hparams["head_dim"]
|
||||
shape = data_torch.shape
|
||||
data_torch = torch.reshape(data_torch, (-1, head_dim, shape[-1]))
|
||||
signs = torch.ones((1, head_dim, 1), dtype=data_torch.dtype)
|
||||
signs[:, head_dim // 2 :, :] = -1
|
||||
if self.lazy:
|
||||
signs = LazyTorchTensor.from_eager(signs)
|
||||
# (n_head, head_dim, n_in) -> (n_out, n_in)
|
||||
data_torch = torch.reshape(data_torch * signs, shape)
|
||||
elif suffix == "attn.head_gain.head_g":
|
||||
# allow head gain to broadcast
|
||||
data_torch = data_torch.unsqueeze(-1)
|
||||
|
||||
if not name.endswith(".weight"):
|
||||
name += ".weight"
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
@@ -156,6 +156,8 @@ models = [
|
||||
{"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
|
||||
{"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
|
||||
{"name": "sarvam-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
|
||||
{"name": "talkie", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/lewtun/talkie-1930-13b-it-hf", },
|
||||
{"name": "minicpm5", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM5-1B"},
|
||||
]
|
||||
|
||||
# some models are known to be broken upstream, so we will skip them as exceptions
|
||||
|
||||
@@ -208,6 +208,16 @@ class LoraTorchTensor:
|
||||
def to(self, *args, **kwargs):
|
||||
return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))
|
||||
|
||||
def __mul__(self, other) -> LoraTorchTensor:
|
||||
# Only output-side multiplication for now
|
||||
# W = B @ A, so M_out * W == (M_out * B) @ A
|
||||
if not isinstance(other, (int, float)) and other.shape and other.shape[-1] != 1:
|
||||
raise NotImplementedError
|
||||
return LoraTorchTensor(self._lora_A, self._lora_B * other)
|
||||
|
||||
def __rmul__(self, other) -> LoraTorchTensor:
|
||||
return self * other
|
||||
|
||||
@classmethod
|
||||
def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
|
||||
del types # unused
|
||||
|
||||
@@ -459,7 +459,7 @@ Each returned parser is wrapped by `wrap_for_generation_prompt()`, which prepend
|
||||
|
||||
- Usage: `./bin/llama-template-analysis path/to/template.jinja`
|
||||
|
||||
**Debug Logging**: Enable with `LLAMA_LOG_VERBOSITY=2`
|
||||
**Debug Logging**: Enable with `LLAMA_ARG_LOG_VERBOSITY=2`
|
||||
|
||||
- Shows detailed analysis steps, pattern extraction results, and generated parser structure
|
||||
|
||||
|
||||
@@ -743,6 +743,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. |
|
||||
| GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. |
|
||||
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
|
||||
| GGML_SYCL_ENABLE_VMM | 0 or 1 (default) | Enable the virtual-memory device pool. |
|
||||
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
|
||||
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Allow SYCL/Unified Runtime Level Zero device allocations larger than 4 GiB. llama.cpp's direct Level Zero allocation path requests the relaxed maximum-size limit itself when GGML_SYCL_ENABLE_LEVEL_ZERO=1. |
|
||||
|
||||
@@ -753,6 +754,7 @@ Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spo
|
||||
| Name | Function |
|
||||
|-----------------|----------------------------------------------------------------------------------|
|
||||
| DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. |
|
||||
| DEBUG_SYCL_MALLOC | Enable verbose per-call logging of device pool alloc/free operations. |
|
||||
|
||||
## Design Rule
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.
|
||||
This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop.
|
||||
|
||||
```
|
||||
~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.6
|
||||
~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.7
|
||||
[d]/> cd /workspace
|
||||
```
|
||||
|
||||
|
||||
@@ -176,7 +176,7 @@ Note that currently you cannot quantize the visual encoder because granite visio
|
||||
|
||||
|
||||
### 5. Running the Model in Llama cpp
|
||||
Build llama cpp normally; you should have a target binary named `llama-mtmd-cli`, which you can pass two binaries to. As an example, we pass the the llama.cpp banner.
|
||||
Build llama cpp normally; you should have a target binary named `llama-mtmd-cli`, which you can pass two binaries to. As an example, we pass the llama.cpp banner.
|
||||
|
||||
```bash
|
||||
$ ./build/bin/llama-mtmd-cli -m $LLM_GGUF_PATH \
|
||||
|
||||
@@ -335,7 +335,7 @@ $ make perplexity-run-full QUANTIZED_MODEL=~/path/to/quantized/model-Qxx.gguf LO
|
||||
|
||||
## HuggingFace utilities
|
||||
The following targets are useful for creating collections and model repositories
|
||||
on Hugging Face in the the ggml-org. These can be used when preparing a release
|
||||
on Hugging Face in the ggml-org. These can be used when preparing a release
|
||||
to script the process for new model releases.
|
||||
|
||||
For the following targets a `HF_TOKEN` environment variable is required.
|
||||
|
||||
@@ -19,6 +19,7 @@ __global__ void fwht_cuda(const float * src, float * dst, const int64_t n_rows,
|
||||
float reg[el_w];
|
||||
const int lane = threadIdx.x;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
#pragma unroll
|
||||
for (int i = 0; i < el_w; ++i) {
|
||||
reg[i] = src[i * warp_size + lane] * scale;
|
||||
@@ -57,10 +58,11 @@ __global__ void fwht_cuda(const float * src, float * dst, const int64_t n_rows,
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
bool ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
GGML_ASSERT(ggml_are_same_shape(src, dst));
|
||||
GGML_ASSERT(ggml_is_contiguous(src));
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
if (!ggml_is_contiguous(src) || !ggml_is_contiguous(dst)) {
|
||||
return false;
|
||||
}
|
||||
const int n = src->ne[0];
|
||||
const int64_t rows = ggml_nrows(src);
|
||||
|
||||
@@ -68,7 +70,6 @@ void ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src,
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
|
||||
GGML_ASSERT(n % warp_size == 0);
|
||||
const int rows_per_block = 4;
|
||||
|
||||
const int64_t num_blocks = (rows + rows_per_block - 1) / rows_per_block;
|
||||
@@ -83,26 +84,18 @@ void ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src,
|
||||
|
||||
switch (n) {
|
||||
case 64:
|
||||
{
|
||||
ggml_cuda_kernel_launch(fwht_cuda<64>, launch_params, src_d, dst_d, rows, scale);
|
||||
break;
|
||||
}
|
||||
ggml_cuda_kernel_launch(fwht_cuda<64>, launch_params, src_d, dst_d, rows, scale);
|
||||
return true;
|
||||
case 128:
|
||||
{
|
||||
ggml_cuda_kernel_launch(fwht_cuda<128>, launch_params, src_d, dst_d, rows, scale);
|
||||
break;
|
||||
}
|
||||
ggml_cuda_kernel_launch(fwht_cuda<128>, launch_params, src_d, dst_d, rows, scale);
|
||||
return true;
|
||||
case 256:
|
||||
{
|
||||
ggml_cuda_kernel_launch(fwht_cuda<256>, launch_params, src_d, dst_d, rows, scale);
|
||||
break;
|
||||
}
|
||||
ggml_cuda_kernel_launch(fwht_cuda<256>, launch_params, src_d, dst_d, rows, scale);
|
||||
return true;
|
||||
case 512:
|
||||
{
|
||||
ggml_cuda_kernel_launch(fwht_cuda<512>, launch_params, src_d, dst_d, rows, scale);
|
||||
break;
|
||||
}
|
||||
ggml_cuda_kernel_launch(fwht_cuda<512>, launch_params, src_d, dst_d, rows, scale);
|
||||
return true;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst);
|
||||
// Returns whether the Fast Walsh-Hadamard transform could be used.
|
||||
bool ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst);
|
||||
|
||||
@@ -2596,9 +2596,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32;
|
||||
|
||||
const int32_t hint = ggml_get_op_params_i32(dst, 1);
|
||||
if (hint == GGML_HINT_SRC0_IS_HADAMARD) {
|
||||
GGML_ASSERT(!split);
|
||||
ggml_cuda_op_fwht(ctx, src1, dst);
|
||||
if (hint == GGML_HINT_SRC0_IS_HADAMARD && !split && ggml_cuda_op_fwht(ctx, src1, dst)) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -2874,6 +2874,7 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
|
||||
case GGML_OP_NORM: return HTP_OP_NORM;
|
||||
case GGML_OP_L2_NORM: return HTP_OP_L2_NORM;
|
||||
case GGML_OP_RMS_NORM: return HTP_OP_RMS_NORM;
|
||||
case GGML_OP_CONCAT: return HTP_OP_CONCAT;
|
||||
case GGML_OP_SCALE: return HTP_OP_SCALE;
|
||||
case GGML_OP_SQR: return HTP_OP_SQR;
|
||||
case GGML_OP_SQRT: return HTP_OP_SQRT;
|
||||
@@ -3286,6 +3287,25 @@ static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * se
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_concat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
int dim = ((const int32_t *) op->op_params)[0];
|
||||
if (dim < 0 || dim >= GGML_MAX_DIMS) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < GGML_MAX_SRC; ++i) {
|
||||
const struct ggml_tensor * src = op->src[i];
|
||||
if (!src) {
|
||||
continue;
|
||||
}
|
||||
if (src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_I32 && src->type != GGML_TYPE_F16) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_fill(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * dst = op;
|
||||
|
||||
@@ -3434,6 +3454,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
||||
supp = ggml_hexagon_supported_cumsum(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_CONCAT:
|
||||
supp = ggml_hexagon_supported_concat(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_FILL:
|
||||
supp = ggml_hexagon_supported_fill(sess, op);
|
||||
break;
|
||||
|
||||
@@ -35,6 +35,7 @@ add_library(${HTP_LIB} SHARED
|
||||
ssm-conv.c
|
||||
cumsum-ops.c
|
||||
fill-ops.c
|
||||
concat-ops.c
|
||||
diag-ops.c
|
||||
solve-tri-ops.c
|
||||
gated-delta-net-ops.c
|
||||
|
||||
275
ggml/src/ggml-hexagon/htp/concat-ops.c
Normal file
275
ggml/src/ggml-hexagon/htp/concat-ops.c
Normal file
@@ -0,0 +1,275 @@
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-ops.h"
|
||||
#include "hexagon_types.h"
|
||||
#include "hexagon_protos.h"
|
||||
#include "hvx_hexagon_protos.h"
|
||||
#include "hex-dma.h"
|
||||
#include "vtcm-utils.h"
|
||||
#include "hvx-utils.h"
|
||||
#include "hex-fastdiv.h"
|
||||
#include <string.h>
|
||||
|
||||
struct htp_concat_context {
|
||||
struct htp_ops_context * octx;
|
||||
uint32_t dim;
|
||||
uint32_t nrows_per_thread;
|
||||
struct fastdiv_values div_ne0;
|
||||
struct fastdiv_values div_ne1;
|
||||
struct fastdiv_values div_ne2;
|
||||
};
|
||||
|
||||
static void concat_2d_f32_transposed(unsigned int nth, unsigned int ith, void * data) {
|
||||
struct htp_concat_context * cctx = (struct htp_concat_context *) data;
|
||||
struct htp_ops_context * octx = cctx->octx;
|
||||
|
||||
const struct htp_tensor * src0 = octx->src[0];
|
||||
const struct htp_tensor * src1 = octx->src[1];
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
const uint32_t src0_ne0 = src0->ne[0];
|
||||
const uint32_t src1_ne0 = src1->ne[0];
|
||||
const uint32_t ne1 = dst->ne[1];
|
||||
|
||||
const uint32_t start_i = ith * cctx->nrows_per_thread;
|
||||
const uint32_t end_i = (start_i + cctx->nrows_per_thread < ne1) ? (start_i + cctx->nrows_per_thread) : ne1;
|
||||
if (start_i >= end_i) return;
|
||||
|
||||
dma_queue * q = octx->ctx->dma[ith];
|
||||
|
||||
uint8_t * spad0_base = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread;
|
||||
uint8_t * spad1_base = octx->src1_spad.data + ith * octx->src1_spad.size_per_thread;
|
||||
|
||||
const uint32_t block_i = 32;
|
||||
const uint32_t spad1_stride = block_i * sizeof(float);
|
||||
|
||||
int32_t offsets[32] __attribute__((aligned(128)));
|
||||
for(int k=0; k<32; k++) {
|
||||
offsets[k] = k * spad1_stride;
|
||||
}
|
||||
HVX_Vector vv = *(HVX_Vector*)offsets;
|
||||
const uint32_t src1_ne0_padded = hex_round_up(src1_ne0, 32);
|
||||
const uint32_t spad0_row_bytes = hex_round_up((src0_ne0 + src1_ne0_padded) * sizeof(float), VLEN);
|
||||
uint32_t mu = src1_ne0_padded * spad1_stride;
|
||||
|
||||
for (uint32_t i = start_i; i < end_i; i += block_i) {
|
||||
uint32_t current_block_i = (end_i - i < block_i) ? (end_i - i) : block_i;
|
||||
|
||||
uint32_t src1_width_bytes = current_block_i * sizeof(float);
|
||||
uint8_t * src1_ptr = (uint8_t *)src1->data + i * src1->nb[1];
|
||||
dma_queue_push(q, dma_make_ptr(spad1_base, src1_ptr), spad1_stride, src1->nb[0], src1_width_bytes, src1_ne0);
|
||||
|
||||
uint32_t src0_row_bytes = src0_ne0 * sizeof(float);
|
||||
uint8_t * src0_ptr = (uint8_t *)src0->data + i * src0->nb[1];
|
||||
dma_queue_push(q, dma_make_ptr(spad0_base, src0_ptr), spad0_row_bytes, src0->nb[1], src0_row_bytes, current_block_i);
|
||||
|
||||
dma_queue_pop(q); // src1
|
||||
|
||||
HVX_Vector * vtcm_tmp = (HVX_Vector *)(spad1_base + src1_ne0_padded * spad1_stride);
|
||||
|
||||
for (uint32_t j = 0; j < src1_ne0_padded; j += 32) {
|
||||
#pragma unroll(4)
|
||||
for (uint32_t ii = 0; ii < current_block_i; ii++) {
|
||||
size_t rt = (size_t)(spad1_base + j * spad1_stride + ii * sizeof(float));
|
||||
Q6_vgather_ARMVw(&vtcm_tmp[ii], rt, mu, vv);
|
||||
uint8_t * dst_ptr = spad0_base + ii * spad0_row_bytes + (src0_ne0 + j) * sizeof(float);
|
||||
hvx_vmemu(dst_ptr) = vtcm_tmp[ii];
|
||||
}
|
||||
}
|
||||
|
||||
dma_queue_pop(q); // src0
|
||||
|
||||
uint8_t * dst_ptr = (uint8_t *)dst->data + i * dst->nb[1];
|
||||
dma_queue_push(q, dma_make_ptr(dst_ptr, spad0_base), dst->nb[1], spad0_row_bytes, (src0_ne0 + src1_ne0) * sizeof(float), current_block_i);
|
||||
|
||||
dma_queue_pop(q);
|
||||
}
|
||||
}
|
||||
|
||||
static void concat_2d_f16_transposed(unsigned int nth, unsigned int ith, void * data) {
|
||||
struct htp_concat_context * cctx = (struct htp_concat_context *) data;
|
||||
struct htp_ops_context * octx = cctx->octx;
|
||||
|
||||
const struct htp_tensor * src0 = octx->src[0];
|
||||
const struct htp_tensor * src1 = octx->src[1];
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
const uint32_t src0_ne0 = src0->ne[0];
|
||||
const uint32_t src1_ne0 = src1->ne[0];
|
||||
const uint32_t ne1 = dst->ne[1];
|
||||
|
||||
const uint32_t start_i = ith * cctx->nrows_per_thread;
|
||||
const uint32_t end_i = (start_i + cctx->nrows_per_thread < ne1) ? (start_i + cctx->nrows_per_thread) : ne1;
|
||||
if (start_i >= end_i) return;
|
||||
|
||||
dma_queue * q = octx->ctx->dma[ith];
|
||||
|
||||
uint8_t * spad0_base = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread;
|
||||
uint8_t * spad1_base = octx->src1_spad.data + ith * octx->src1_spad.size_per_thread;
|
||||
|
||||
const uint32_t block_i = 64;
|
||||
const uint32_t spad1_stride = block_i * sizeof(__fp16);
|
||||
|
||||
int16_t offsets[64] __attribute__((aligned(128)));
|
||||
for(int k=0; k<64; k++) {
|
||||
offsets[k] = k * spad1_stride;
|
||||
}
|
||||
HVX_Vector vv = *(HVX_Vector*)offsets;
|
||||
const uint32_t src1_ne0_padded = hex_round_up(src1_ne0, 64);
|
||||
const uint32_t spad0_row_bytes = hex_round_up((src0_ne0 + src1_ne0_padded) * sizeof(__fp16), VLEN);
|
||||
uint32_t mu = src1_ne0_padded * spad1_stride;
|
||||
|
||||
for (uint32_t i = start_i; i < end_i; i += block_i) {
|
||||
uint32_t current_block_i = (end_i - i < block_i) ? (end_i - i) : block_i;
|
||||
|
||||
uint32_t src1_width_bytes = current_block_i * sizeof(__fp16);
|
||||
uint8_t * src1_ptr = (uint8_t *)src1->data + i * src1->nb[1];
|
||||
dma_queue_push(q, dma_make_ptr(spad1_base, src1_ptr), spad1_stride, src1->nb[0], src1_width_bytes, src1_ne0);
|
||||
|
||||
uint32_t src0_row_bytes = src0_ne0 * sizeof(__fp16);
|
||||
uint8_t * src0_ptr = (uint8_t *)src0->data + i * src0->nb[1];
|
||||
dma_queue_push(q, dma_make_ptr(spad0_base, src0_ptr), spad0_row_bytes, src0->nb[1], src0_row_bytes, current_block_i);
|
||||
|
||||
dma_queue_pop(q); // src1
|
||||
|
||||
HVX_Vector * vtcm_tmp = (HVX_Vector *)(spad1_base + src1_ne0_padded * spad1_stride);
|
||||
|
||||
for (uint32_t j = 0; j < src1_ne0_padded; j += 64) {
|
||||
#pragma unroll(4)
|
||||
for (uint32_t ii = 0; ii < current_block_i; ii++) {
|
||||
size_t rt = (size_t)(spad1_base + j * spad1_stride + ii * sizeof(__fp16));
|
||||
Q6_vgather_ARMVh(&vtcm_tmp[ii], rt, mu, vv);
|
||||
uint8_t * dst_ptr = spad0_base + ii * spad0_row_bytes + (src0_ne0 + j) * sizeof(__fp16);
|
||||
hvx_vmemu(dst_ptr) = vtcm_tmp[ii];
|
||||
}
|
||||
}
|
||||
|
||||
dma_queue_pop(q); // src0
|
||||
|
||||
uint8_t * dst_ptr = (uint8_t *)dst->data + i * dst->nb[1];
|
||||
dma_queue_push(q, dma_make_ptr(dst_ptr, spad0_base), dst->nb[1], spad0_row_bytes, (src0_ne0 + src1_ne0) * sizeof(__fp16), current_block_i);
|
||||
|
||||
dma_queue_pop(q);
|
||||
}
|
||||
}
|
||||
|
||||
static void concat_generic(unsigned int nth, unsigned int ith, void * data) {
|
||||
struct htp_concat_context * cctx = (struct htp_concat_context *) data;
|
||||
struct htp_ops_context * octx = cctx->octx;
|
||||
|
||||
const struct htp_tensor * src0 = octx->src[0];
|
||||
const struct htp_tensor * src1 = octx->src[1];
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
const int dim = cctx->dim;
|
||||
const uint32_t type_size = (dst->type == HTP_TYPE_F32 || dst->type == HTP_TYPE_I32) ? 4 : 2;
|
||||
|
||||
const uint32_t ne[4] = {dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]};
|
||||
const uint32_t total_elements = ne[0] * ne[1] * ne[2] * ne[3];
|
||||
const uint32_t chunk_size = (total_elements + nth - 1) / nth;
|
||||
|
||||
const uint32_t start_idx = MIN(ith * chunk_size, total_elements);
|
||||
const uint32_t end_idx = MIN(start_idx + chunk_size, total_elements);
|
||||
|
||||
// Naive scalar element-wise copy
|
||||
for (uint32_t idx = start_idx; idx < end_idx; idx++) {
|
||||
uint32_t idx_div_ne0 = fastdiv(idx, &cctx->div_ne0);
|
||||
uint32_t i0 = idx - idx_div_ne0 * ne[0];
|
||||
|
||||
uint32_t idx_div_ne01 = fastdiv(idx_div_ne0, &cctx->div_ne1);
|
||||
uint32_t i1 = idx_div_ne0 - idx_div_ne01 * ne[1];
|
||||
|
||||
uint32_t idx_div_ne012 = fastdiv(idx_div_ne01, &cctx->div_ne2);
|
||||
uint32_t i2 = idx_div_ne01 - idx_div_ne012 * ne[2];
|
||||
uint32_t i3 = idx_div_ne012;
|
||||
|
||||
uint8_t * dst_ptr = (uint8_t *)dst->data + i3 * dst->nb[3] + i2 * dst->nb[2] + i1 * dst->nb[1] + i0 * dst->nb[0];
|
||||
|
||||
uint32_t idx_dim = 0;
|
||||
if (dim == 0) idx_dim = i0;
|
||||
else if (dim == 1) idx_dim = i1;
|
||||
else if (dim == 2) idx_dim = i2;
|
||||
else if (dim == 3) idx_dim = i3;
|
||||
|
||||
const struct htp_tensor * src = (idx_dim < src0->ne[dim]) ? src0 : src1;
|
||||
|
||||
uint32_t s0 = i0;
|
||||
uint32_t s1 = i1;
|
||||
uint32_t s2 = i2;
|
||||
uint32_t s3 = i3;
|
||||
|
||||
if (dim == 0 && src == src1) s0 -= src0->ne[0];
|
||||
if (dim == 1 && src == src1) s1 -= src0->ne[1];
|
||||
if (dim == 2 && src == src1) s2 -= src0->ne[2];
|
||||
if (dim == 3 && src == src1) s3 -= src0->ne[3];
|
||||
|
||||
uint8_t * src_ptr = (uint8_t *)src->data + s3 * src->nb[3] + s2 * src->nb[2] + s1 * src->nb[1] + s0 * src->nb[0];
|
||||
|
||||
if (type_size == 4) {
|
||||
*(float*)dst_ptr = *(float*)src_ptr;
|
||||
} else {
|
||||
*(__fp16*)dst_ptr = *(__fp16*)src_ptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int op_concat(struct htp_ops_context * octx) {
|
||||
const struct htp_tensor * src0 = octx->src[0];
|
||||
const struct htp_tensor * src1 = octx->src[1];
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
int dim = octx->op_params[0];
|
||||
|
||||
bool is_2d = dst->ne[2] == 1 && dst->ne[3] == 1;
|
||||
|
||||
const uint32_t type_size = (dst->type == HTP_TYPE_F32 || dst->type == HTP_TYPE_I32) ? 4 : 2;
|
||||
bool is_src1_transposed = (src1->nb[0] > src1->nb[1]);
|
||||
bool is_src0_transposed = (src0->nb[0] > src0->nb[1]);
|
||||
|
||||
uint32_t n_threads = octx->n_threads;
|
||||
struct htp_concat_context cctx;
|
||||
cctx.octx = octx;
|
||||
cctx.dim = dim;
|
||||
cctx.div_ne0 = init_fastdiv_values(dst->ne[0]);
|
||||
cctx.div_ne1 = init_fastdiv_values(dst->ne[1]);
|
||||
cctx.div_ne2 = init_fastdiv_values(dst->ne[2]);
|
||||
|
||||
void (*worker_func)(unsigned int, unsigned int, void *) = concat_generic;
|
||||
|
||||
if (dim == 0 && is_2d && is_src1_transposed && !is_src0_transposed) {
|
||||
n_threads = MIN(dst->ne[1], n_threads);
|
||||
if (n_threads < 1) {
|
||||
n_threads = 1;
|
||||
}
|
||||
uint32_t block_i = (type_size == 4) ? 32 : 64;
|
||||
|
||||
cctx.nrows_per_thread = hmx_ceil_div(dst->ne[1], n_threads);
|
||||
|
||||
// Allocate VTCM
|
||||
uint32_t spad1_stride = block_i * type_size;
|
||||
|
||||
uint32_t src1_ne0_padded = hex_round_up(src1->ne[0], block_i);
|
||||
uint32_t spad0_row_bytes = hex_round_up((src0->ne[0] + src1_ne0_padded) * type_size, VLEN);
|
||||
|
||||
octx->src0_spad.size_per_thread = block_i * spad0_row_bytes;
|
||||
octx->src1_spad.size_per_thread = src1_ne0_padded * spad1_stride + block_i * VLEN;
|
||||
|
||||
octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
|
||||
octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread;
|
||||
|
||||
if (octx->src0_spad.size + octx->src1_spad.size > octx->ctx->vtcm_size) {
|
||||
return HTP_STATUS_VTCM_TOO_SMALL;
|
||||
}
|
||||
|
||||
octx->src0_spad.data = octx->ctx->vtcm_base;
|
||||
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
|
||||
|
||||
if (type_size == 4) {
|
||||
worker_func = concat_2d_f32_transposed;
|
||||
} else {
|
||||
worker_func = concat_2d_f16_transposed;
|
||||
}
|
||||
}
|
||||
|
||||
worker_pool_run_func(octx->ctx->worker_pool, worker_func, &cctx, n_threads);
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
@@ -28,158 +28,170 @@ struct htp_copy_context {
|
||||
uint32_t dst_blocks_per_row;
|
||||
|
||||
uint32_t src0_nrows_per_thread;
|
||||
|
||||
void (*copy)(struct htp_copy_context * ct, struct htp_ops_context * octx, int nth, int ith);
|
||||
};
|
||||
|
||||
#define cpy_preamble \
|
||||
const struct htp_tensor *src0 = octx->src[0]; \
|
||||
const struct htp_tensor *dst = octx->dst; \
|
||||
\
|
||||
const uint32_t ne00 = src0->ne[0]; \
|
||||
const uint32_t ne01 = src0->ne[1]; \
|
||||
const uint32_t ne02 = src0->ne[2]; \
|
||||
const uint32_t ne03 = src0->ne[3]; \
|
||||
\
|
||||
const uint32_t nb00 = src0->nb[0]; \
|
||||
const uint32_t nb01 = src0->nb[1]; \
|
||||
const uint32_t nb02 = src0->nb[2]; \
|
||||
const uint32_t nb03 = src0->nb[3]; \
|
||||
\
|
||||
const uint32_t ne0 = dst->ne[0]; \
|
||||
const uint32_t ne1 = dst->ne[1]; \
|
||||
const uint32_t ne2 = dst->ne[2]; \
|
||||
const uint32_t ne3 = dst->ne[3]; \
|
||||
\
|
||||
const uint32_t nb0 = dst->nb[0]; \
|
||||
const uint32_t nb1 = dst->nb[1]; \
|
||||
const uint32_t nb2 = dst->nb[2]; \
|
||||
const uint32_t nb3 = dst->nb[3]; \
|
||||
\
|
||||
const uint32_t ne00 = src0->ne[0]; \
|
||||
const uint32_t ne01 = src0->ne[1]; \
|
||||
const uint32_t ne02 = src0->ne[2]; \
|
||||
const uint32_t ne03 = src0->ne[3]; \
|
||||
\
|
||||
const uint32_t nb00 = src0->nb[0]; \
|
||||
const uint32_t nb01 = src0->nb[1]; \
|
||||
const uint32_t nb02 = src0->nb[2]; \
|
||||
const uint32_t nb03 = src0->nb[3]; \
|
||||
\
|
||||
const uint32_t ne0 = dst->ne[0]; \
|
||||
const uint32_t ne1 = dst->ne[1]; \
|
||||
const uint32_t ne2 = dst->ne[2]; \
|
||||
const uint32_t ne3 = dst->ne[3]; \
|
||||
\
|
||||
const uint32_t nb0 = dst->nb[0]; \
|
||||
const uint32_t nb1 = dst->nb[1]; \
|
||||
const uint32_t nb2 = dst->nb[2]; \
|
||||
const uint32_t nb3 = dst->nb[3]; \
|
||||
\
|
||||
const uint32_t nr = ne01;
|
||||
|
||||
static void cpy_thread_sametype_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) {
|
||||
cpy_preamble;
|
||||
|
||||
// parallelize by src0 rows
|
||||
const uint32_t dr = ct->src0_nrows_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
|
||||
|
||||
// copy by rows
|
||||
for (uint32_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (uint32_t i02 = 0; i02 < ne02; i02++) {
|
||||
#pragma unroll(2)
|
||||
for (uint32_t i01 = ir0; i01 < ir1; i01++) {
|
||||
uint8_t* dst_ptr = (uint8_t*) dst->data + i01*nb1 + i02*nb2 + i03*nb3;
|
||||
uint8_t* src0_ptr = (uint8_t*) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
|
||||
hex_l2fetch(src0_ptr, ne00 * ct->src0_type_size, nb01, 2);
|
||||
hvx_copy_uu(dst_ptr, src0_ptr, ne00, ct->src0_type_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
#define DEFINE_CPY_SAMESHAPE(NAME, ELEM_TYPE, ELEM_SIZE) \
|
||||
static void cpy_thread_##NAME##_sameshape(unsigned int nth, unsigned int ith, void * data) { \
|
||||
struct htp_copy_context * ct = (struct htp_copy_context *) data; \
|
||||
struct htp_ops_context * octx = ct->octx; \
|
||||
cpy_preamble; \
|
||||
const uint32_t dr = ct->src0_nrows_per_thread; \
|
||||
const uint32_t ir0 = dr * ith; \
|
||||
const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr; \
|
||||
if (ir0 >= nr) return; \
|
||||
for (uint32_t i03 = 0; i03 < ne03; i03++) { \
|
||||
for (uint32_t i02 = 0; i02 < ne02; i02++) { \
|
||||
_Pragma("unroll(4)") \
|
||||
for (uint32_t i01 = ir0; i01 < ir1; i01++) { \
|
||||
uint8_t* dst_ptr = (uint8_t*) dst->data + i01*nb1 + i02*nb2 + i03*nb3; \
|
||||
uint8_t* src0_ptr = (uint8_t*) src0->data + i01*nb01 + i02*nb02 + i03*nb03; \
|
||||
hex_l2fetch(src0_ptr, ne00 * ELEM_SIZE, nb01, 2); \
|
||||
hvx_copy_uu(dst_ptr, src0_ptr, ne00, ELEM_SIZE); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
static void cpy_thread_sametype_reshape(struct htp_copy_context * ct, struct htp_ops_context * octx, int nth, int ith) {
|
||||
cpy_preamble;
|
||||
DEFINE_CPY_SAMESHAPE(f32, float, 4)
|
||||
DEFINE_CPY_SAMESHAPE(f16, __fp16, 2)
|
||||
|
||||
// parallelize by src0 rows
|
||||
const uint32_t dr = ct->src0_nrows_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
|
||||
|
||||
// Fast path: when both src0 and dst are contiguous in memory
|
||||
// Replace the element-by-element loop with a single bulk HVX copy per (i03, i02) slice.
|
||||
const bool src0_contig = (nb00 == ct->src0_type_size) &&
|
||||
(nb01 == ne00 * nb00) &&
|
||||
(nb02 == ne01 * nb01) &&
|
||||
(nb03 == ne02 * nb02);
|
||||
const bool dst_contig = (nb0 == ct->dst_type_size) &&
|
||||
(nb1 == ne0 * nb0) &&
|
||||
(nb2 == ne1 * nb1) &&
|
||||
(nb3 == ne2 * nb2);
|
||||
|
||||
if (src0_contig && dst_contig) {
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
uint8_t * src_ptr = (uint8_t *) src0->data + i03*nb03 + i02*nb02 + ir0*nb01;
|
||||
uint32_t flat = ((i03*ne02 + i02)*ne01 + ir0) * ne00;
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + flat * ct->src0_type_size;
|
||||
hvx_copy_uu(dst_ptr, src_ptr, (ir1 - ir0) * ne00, ct->src0_type_size);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// dst counters
|
||||
int64_t k10 = 0;
|
||||
int64_t i11 = 0;
|
||||
int64_t i12 = 0;
|
||||
int64_t i13 = 0;
|
||||
|
||||
// number of blocks in a row
|
||||
const int64_t nk00 = ct->src0_blocks_per_row;
|
||||
const int64_t nk0 = ct->dst_blocks_per_row;
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
k10 += nk00 * ir0;
|
||||
while (k10 >= nk0) {
|
||||
k10 -= nk0;
|
||||
if (++i11 == ne1) {
|
||||
i11 = 0;
|
||||
if (++i12 == ne2) {
|
||||
i12 = 0;
|
||||
if (++i13 == ne3) {
|
||||
i13 = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int64_t i01 = ir0; i01 < ir1; i01++) {
|
||||
for (int64_t k00 = 0; k00 < nk00; k00++) {
|
||||
const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
char * dst_ptr = ((char *) dst->data + k10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
|
||||
memcpy(dst_ptr, src0_ptr, ct->dst_type_size);
|
||||
|
||||
if (++k10 == nk0) {
|
||||
k10 = 0;
|
||||
if (++i11 == ne1) {
|
||||
i11 = 0;
|
||||
if (++i12 == ne2) {
|
||||
i12 = 0;
|
||||
if (++i13 == ne3) {
|
||||
i13 = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
k10 += nk00 * (ne01 - ir1);
|
||||
while (k10 >= nk0) {
|
||||
k10 -= nk0;
|
||||
if (++i11 == ne1) {
|
||||
i11 = 0;
|
||||
if (++i12 == ne2) {
|
||||
i12 = 0;
|
||||
if (++i13 == ne3) {
|
||||
i13 = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#define DEFINE_CPY_RESHAPE(NAME, ELEM_TYPE, ELEM_SIZE) \
|
||||
static void cpy_thread_##NAME##_reshape(unsigned int nth, unsigned int ith, void * data) { \
|
||||
struct htp_copy_context * ct = (struct htp_copy_context *) data; \
|
||||
struct htp_ops_context * octx = ct->octx; \
|
||||
cpy_preamble; \
|
||||
const uint32_t dr = ct->src0_nrows_per_thread; \
|
||||
const uint32_t ir0 = dr * ith; \
|
||||
const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr; \
|
||||
if (ir0 >= nr) return; \
|
||||
const bool src0_contig = (nb00 == ELEM_SIZE) && \
|
||||
(nb01 == ne00 * nb00) && \
|
||||
(nb02 == ne01 * nb01) && \
|
||||
(nb03 == ne02 * nb02); \
|
||||
const bool dst_contig = (nb0 == ELEM_SIZE) && \
|
||||
(nb1 == ne0 * nb0) && \
|
||||
(nb2 == ne1 * nb1) && \
|
||||
(nb3 == ne2 * nb2); \
|
||||
if (src0_contig && dst_contig) { \
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) { \
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) { \
|
||||
uint8_t * src_ptr = (uint8_t *) src0->data + i03*nb03 + i02*nb02 + ir0*nb01; \
|
||||
uint32_t flat = ((i03*ne02 + i02)*ne01 + ir0) * ne00; \
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + flat * ELEM_SIZE; \
|
||||
hvx_copy_uu(dst_ptr, src_ptr, (ir1 - ir0) * ne00, ELEM_SIZE); \
|
||||
} \
|
||||
} \
|
||||
return; \
|
||||
} \
|
||||
const bool reshape_flat_fast = (ne03 == 1 && ne2 == 1 && ne3 == 1) && \
|
||||
(ne0 == ne00 * ne01) && (ne1 == ne02) && \
|
||||
(nb00 == ELEM_SIZE) && (nb0 == ELEM_SIZE); \
|
||||
if (reshape_flat_fast) { \
|
||||
for (uint32_t i02 = 0; i02 < ne02; i02++) { \
|
||||
for (uint32_t i01 = ir0; i01 < ir1; i01++) { \
|
||||
uint8_t * src0_ptr = (uint8_t *) src0->data + i01 * nb01 + i02 * nb02; \
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + i01 * ne00 * ELEM_SIZE + i02 * nb1; \
|
||||
hvx_copy_uu(dst_ptr, src0_ptr, ne00, ELEM_SIZE); \
|
||||
} \
|
||||
} \
|
||||
return; \
|
||||
} \
|
||||
int64_t k10 = 0; \
|
||||
int64_t i11 = 0; \
|
||||
int64_t i12 = 0; \
|
||||
int64_t i13 = 0; \
|
||||
const int64_t nk00 = ct->src0_blocks_per_row; \
|
||||
const int64_t nk0 = ct->dst_blocks_per_row; \
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) { \
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) { \
|
||||
k10 += nk00 * ir0; \
|
||||
while (k10 >= nk0) { \
|
||||
k10 -= nk0; \
|
||||
if (++i11 == ne1) { \
|
||||
i11 = 0; \
|
||||
if (++i12 == ne2) { \
|
||||
i12 = 0; \
|
||||
if (++i13 == ne3) { \
|
||||
i13 = 0; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
for (int64_t i01 = ir0; i01 < ir1; i01++) { \
|
||||
for (int64_t k00 = 0; k00 < nk00; k00++) { \
|
||||
const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); \
|
||||
char * dst_ptr = ((char *) dst->data + k10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); \
|
||||
memcpy(dst_ptr, src0_ptr, ELEM_SIZE); \
|
||||
if (++k10 == nk0) { \
|
||||
k10 = 0; \
|
||||
if (++i11 == ne1) { \
|
||||
i11 = 0; \
|
||||
if (++i12 == ne2) { \
|
||||
i12 = 0; \
|
||||
if (++i13 == ne3) { \
|
||||
i13 = 0; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
k10 += nk00 * (ne01 - ir1); \
|
||||
while (k10 >= nk0) { \
|
||||
k10 -= nk0; \
|
||||
if (++i11 == ne1) { \
|
||||
i11 = 0; \
|
||||
if (++i12 == ne2) { \
|
||||
i12 = 0; \
|
||||
if (++i13 == ne3) { \
|
||||
i13 = 0; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
static void cpy_thread_f16_f32_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) {
|
||||
DEFINE_CPY_RESHAPE(f32, float, 4)
|
||||
DEFINE_CPY_RESHAPE(f16, __fp16, 2)
|
||||
|
||||
static void cpy_thread_f16_f32_sameshape(unsigned int nth, unsigned int ith, void * data) {
|
||||
struct htp_copy_context * ct = (struct htp_copy_context *) data;
|
||||
struct htp_ops_context * octx = ct->octx;
|
||||
cpy_preamble;
|
||||
|
||||
// parallelize by src0 rows
|
||||
const uint32_t dr = ct->src0_nrows_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
|
||||
if (ir0 >= nr) return;
|
||||
|
||||
// copy by rows
|
||||
for (uint32_t i03 = 0; i03 < ne03; i03++) {
|
||||
@@ -195,13 +207,16 @@ static void cpy_thread_f16_f32_sameshape(struct htp_copy_context * ct, struct ht
|
||||
}
|
||||
}
|
||||
|
||||
static void cpy_thread_f32_f16_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) {
|
||||
static void cpy_thread_f32_f16_sameshape(unsigned int nth, unsigned int ith, void * data) {
|
||||
struct htp_copy_context * ct = (struct htp_copy_context *) data;
|
||||
struct htp_ops_context * octx = ct->octx;
|
||||
cpy_preamble;
|
||||
|
||||
// parallelize by src0 rows
|
||||
const uint32_t dr = ct->src0_nrows_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
|
||||
if (ir0 >= nr) return;
|
||||
|
||||
// copy by rows
|
||||
for (uint32_t i03 = 0; i03 < ne03; i03++) {
|
||||
@@ -217,11 +232,6 @@ static void cpy_thread_f32_f16_sameshape(struct htp_copy_context * ct, struct ht
|
||||
}
|
||||
}
|
||||
|
||||
static void cpy_work_func(unsigned int n, unsigned int i, void *data) {
|
||||
struct htp_copy_context *ct = (struct htp_copy_context *) data;
|
||||
ct->copy(ct, ct->octx, n, i);
|
||||
}
|
||||
|
||||
int op_cpy(struct htp_ops_context * octx) {
|
||||
cpy_preamble;
|
||||
|
||||
@@ -254,22 +264,32 @@ int op_cpy(struct htp_ops_context * octx) {
|
||||
|
||||
ct.src0_nrows_per_thread = (nr + n_threads - 1) / n_threads;
|
||||
|
||||
worker_callback_t copy_fun;
|
||||
|
||||
if (sametype && sameshape) {
|
||||
ct.copy = cpy_thread_sametype_sameshape;
|
||||
if (src0->type == HTP_TYPE_F32) {
|
||||
copy_fun = cpy_thread_f32_sameshape;
|
||||
} else {
|
||||
copy_fun = cpy_thread_f16_sameshape;
|
||||
}
|
||||
} else if (sameshape) {
|
||||
/**/ if (dst->type == HTP_TYPE_F16 && src0->type == HTP_TYPE_F32)
|
||||
ct.copy = cpy_thread_f16_f32_sameshape;
|
||||
copy_fun = cpy_thread_f16_f32_sameshape;
|
||||
else if (dst->type == HTP_TYPE_F32 && src0->type == HTP_TYPE_F16)
|
||||
ct.copy = cpy_thread_f32_f16_sameshape;
|
||||
copy_fun = cpy_thread_f32_f16_sameshape;
|
||||
else
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
} else if (sametype) {
|
||||
ct.copy = cpy_thread_sametype_reshape;
|
||||
if (src0->type == HTP_TYPE_F32) {
|
||||
copy_fun = cpy_thread_f32_reshape;
|
||||
} else {
|
||||
copy_fun = cpy_thread_f16_reshape;
|
||||
}
|
||||
} else {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
worker_pool_run_func(octx->ctx->worker_pool, cpy_work_func, &ct, n_threads);
|
||||
worker_pool_run_func(octx->ctx->worker_pool, copy_fun, &ct, n_threads);
|
||||
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
@@ -17,9 +17,13 @@
|
||||
|
||||
struct get_rows_context {
|
||||
struct htp_ops_context * octx;
|
||||
uint32_t src1_nrows_per_thread;
|
||||
uint32_t tasks_per_thread;
|
||||
uint32_t total_tasks;
|
||||
uint32_t chunks_per_row;
|
||||
uint32_t chunk_size;
|
||||
struct fastdiv_values get_rows_div_ne10;
|
||||
struct fastdiv_values get_rows_div_ne10_ne11;
|
||||
struct fastdiv_values get_rows_div_chunks_per_row;
|
||||
};
|
||||
|
||||
#define get_rows_preamble \
|
||||
@@ -52,20 +56,23 @@ struct get_rows_context {
|
||||
\
|
||||
const uint32_t nr = ne10 * ne11 * ne12;
|
||||
|
||||
static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *data) {
|
||||
static void get_rows_thread_f32_f32_dma(unsigned int nth, unsigned int ith, void *data) {
|
||||
struct get_rows_context * grctx = (struct get_rows_context *)data;
|
||||
struct htp_ops_context * octx = grctx->octx;
|
||||
get_rows_preamble;
|
||||
|
||||
uint64_t qt = HAP_perf_get_qtimer_count();
|
||||
|
||||
// parallelize by src1 elements (which correspond to dst rows)
|
||||
const uint32_t dr = grctx->src1_nrows_per_thread;
|
||||
const uint32_t dr = grctx->tasks_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
|
||||
if (ir0 >= grctx->total_tasks) {
|
||||
return;
|
||||
}
|
||||
const uint32_t ir1 = MIN(ir0 + dr, grctx->total_tasks);
|
||||
|
||||
const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
|
||||
|
||||
dma_queue * dma_queue = octx->ctx->dma[ith];
|
||||
for (uint32_t i = ir0; i < ir1; ++i) {
|
||||
const uint32_t i12 = fastdiv(i, &grctx->get_rows_div_ne10_ne11);
|
||||
const uint32_t rem = i - i12 * ne11 * ne10;
|
||||
@@ -73,29 +80,77 @@ static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da
|
||||
const uint32_t i10 = rem - i11 * ne10;
|
||||
|
||||
const uintptr_t src1_addr = octx->src[1]->data + i10*nb10 + i11*nb11 + i12*nb12;
|
||||
|
||||
uint32_t i01 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr;
|
||||
|
||||
if (i01 >= ne01) {
|
||||
// invalid index, skip for now to avoid crash
|
||||
continue;
|
||||
}
|
||||
|
||||
const uintptr_t src0_ptr = octx->src[0]->data + i01*nb01 + i11*nb02 + i12*nb03;
|
||||
const uintptr_t dst_ptr = octx->dst->data + i10*nb1 + i11*nb2 + i12*nb3;
|
||||
hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00);
|
||||
|
||||
while (!dma_queue_push(dma_queue, dma_make_ptr((void *)dst_ptr, (const void *)src0_ptr), nb1, nb01, ne00 * sizeof(float), 1)) {
|
||||
dma_queue_pop(dma_queue);
|
||||
}
|
||||
}
|
||||
dma_queue_flush(dma_queue);
|
||||
|
||||
qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt);
|
||||
FARF(HIGH, "get-rows-f32-f32-dma %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
|
||||
ne00, ne01, ne02, ne03, ir0, ir1, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) qt);
|
||||
}
|
||||
|
||||
static void get_rows_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void *data) {
|
||||
struct get_rows_context * grctx = (struct get_rows_context *)data;
|
||||
struct htp_ops_context * octx = grctx->octx;
|
||||
get_rows_preamble;
|
||||
|
||||
uint64_t qt = HAP_perf_get_qtimer_count();
|
||||
|
||||
const uint32_t dr = grctx->tasks_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
if (ir0 >= grctx->total_tasks) {
|
||||
return;
|
||||
}
|
||||
const uint32_t ir1 = MIN(ir0 + dr, grctx->total_tasks);
|
||||
|
||||
const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
|
||||
|
||||
const uint32_t chunks_per_row = grctx->chunks_per_row;
|
||||
const uint32_t chunk_size = grctx->chunk_size;
|
||||
for (uint32_t i = ir0; i < ir1; ++i) {
|
||||
const uint32_t row_idx = fastdiv(i, &grctx->get_rows_div_chunks_per_row);
|
||||
const uint32_t chunk_idx = i - row_idx * chunks_per_row;
|
||||
|
||||
const uint32_t i12 = fastdiv(row_idx, &grctx->get_rows_div_ne10_ne11);
|
||||
const uint32_t rem = row_idx - i12 * ne11 * ne10;
|
||||
const uint32_t i11 = fastdiv(rem, &grctx->get_rows_div_ne10);
|
||||
const uint32_t i10 = rem - i11 * ne10;
|
||||
|
||||
const uintptr_t src1_addr = octx->src[1]->data + i10*nb10 + i11*nb11 + i12*nb12;
|
||||
uint32_t i01 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr;
|
||||
|
||||
if (i01 >= ne01) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const uint32_t offset = chunk_idx * chunk_size;
|
||||
if (offset < ne00) {
|
||||
const uint32_t copy_size = MIN(chunk_size, ne00 - offset);
|
||||
const uintptr_t src0_ptr = octx->src[0]->data + i01*nb01 + i11*nb02 + i12*nb03 + offset * sizeof(float);
|
||||
const uintptr_t dst_ptr = octx->dst->data + i10*nb1 + i11*nb2 + i12*nb3 + offset * sizeof(float);
|
||||
hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, copy_size);
|
||||
}
|
||||
}
|
||||
|
||||
qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt);
|
||||
FARF(HIGH, "get-rows-f32-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
|
||||
FARF(HIGH, "get-rows-f32-f32-hvx %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
|
||||
ne00, ne01, ne02, ne03, ir0, ir1, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) qt);
|
||||
}
|
||||
|
||||
int op_get_rows(struct htp_ops_context * octx) {
|
||||
get_rows_preamble;
|
||||
|
||||
const uint32_t n_threads = MIN(nr, octx->n_threads);
|
||||
|
||||
if (octx->src[0]->type != HTP_TYPE_F32) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
@@ -112,13 +167,52 @@ int op_get_rows(struct htp_ops_context * octx) {
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
const uint32_t nb00 = octx->src[0]->nb[0];
|
||||
const uint32_t nb0 = octx->dst->nb[0];
|
||||
|
||||
const bool can_use_dma = (nb00 == sizeof(float)) && (nb0 == sizeof(float));
|
||||
const bool use_dma = can_use_dma && (ne00 >= 2048);
|
||||
|
||||
struct get_rows_context grctx;
|
||||
grctx.octx = octx;
|
||||
grctx.get_rows_div_ne10 = init_fastdiv_values(octx->src[1]->ne[0]);
|
||||
grctx.get_rows_div_ne10_ne11 = init_fastdiv_values(octx->src[1]->ne[0] * octx->src[1]->ne[1]);
|
||||
|
||||
grctx.src1_nrows_per_thread = (nr + n_threads - 1) / n_threads;
|
||||
if (use_dma) {
|
||||
grctx.chunks_per_row = 1;
|
||||
grctx.chunk_size = ne00;
|
||||
grctx.total_tasks = nr;
|
||||
grctx.get_rows_div_chunks_per_row = init_fastdiv_values(1);
|
||||
|
||||
worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32, &grctx, n_threads);
|
||||
const uint32_t n_threads = MIN(nr, octx->n_threads);
|
||||
grctx.tasks_per_thread = (nr + n_threads - 1) / n_threads;
|
||||
|
||||
worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32_dma, &grctx, n_threads);
|
||||
} else {
|
||||
uint32_t chunks_per_row = 1;
|
||||
uint32_t chunk_size = ne00;
|
||||
uint32_t total_tasks = nr;
|
||||
|
||||
if (nr < octx->n_threads) {
|
||||
const uint32_t min_chunk_size = 1024;
|
||||
uint32_t max_chunks = ne00 / min_chunk_size;
|
||||
if (max_chunks == 0) {
|
||||
max_chunks = 1;
|
||||
}
|
||||
chunks_per_row = MIN((octx->n_threads + nr - 1) / nr, max_chunks);
|
||||
chunk_size = (ne00 + chunks_per_row - 1) / chunks_per_row;
|
||||
total_tasks = nr * chunks_per_row;
|
||||
}
|
||||
|
||||
grctx.chunks_per_row = chunks_per_row;
|
||||
grctx.chunk_size = chunk_size;
|
||||
grctx.total_tasks = total_tasks;
|
||||
grctx.get_rows_div_chunks_per_row = init_fastdiv_values(chunks_per_row);
|
||||
|
||||
const uint32_t n_threads = MIN(total_tasks, octx->n_threads);
|
||||
grctx.tasks_per_thread = (total_tasks + n_threads - 1) / n_threads;
|
||||
|
||||
worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32_hvx, &grctx, n_threads);
|
||||
}
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
@@ -50,8 +50,8 @@ static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV,
|
||||
const size_t g_br = hex_align_up(gqa_factor * Br, HMX_FP16_TILE_N_ROWS);
|
||||
const size_t q_tile_size = hex_align_up(g_br * DK * sizeof(__fp16), 4096); // Q: [g_br, DK]
|
||||
const size_t o_tile_size = hex_align_up(g_br * DV * sizeof(__fp16), 4096); // O: [g_br, DV] x2 ping-pong
|
||||
const size_t k_dma_size = hex_align_up(Bc * DK * sizeof(__fp16), 4096); // K DMA: [Bc, DK] x2 double-buf
|
||||
const size_t v_dma_size = hex_align_up(Bc * DV * sizeof(__fp16), 4096); // V DMA: [Bc, DV] x2 double-buf
|
||||
const size_t k_dma_size = hex_align_up(Bc * hex_round_up(DK * sizeof(__fp16), 128), 4096); // K DMA: [Bc, DK] x2 double-buf
|
||||
const size_t v_dma_size = hex_align_up(Bc * hex_round_up(DV * sizeof(__fp16), 128), 4096); // V DMA: [Bc, DV] x2 double-buf
|
||||
const size_t k_tile_size = hex_align_up(Bc * DK * sizeof(__fp16), 4096); // K tiles: [Bc, DK] interleaved
|
||||
const size_t v_tile_size = hex_align_up(Bc * DV * sizeof(__fp16), 4096); // V tiles: [Bc, DV] interleaved
|
||||
const size_t s_tile_size = hex_align_up(g_br * Bc * sizeof(__fp16), 4096); // S/P:[g_br, Bc]
|
||||
@@ -1278,7 +1278,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
struct hmx_fa_context factx;
|
||||
memset(&factx, 0, sizeof(factx));
|
||||
factx.octx = octx;
|
||||
factx.n_threads = octx->ctx->n_threads;
|
||||
factx.n_threads = n_threads;
|
||||
factx.DK = DK;
|
||||
factx.DV = DV;
|
||||
factx.n_kv = nek1;
|
||||
@@ -1328,10 +1328,15 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
factx.m1 = powf(2.0f, -(max_bias / 2.0f) / factx.n_head_log2);
|
||||
|
||||
// ======== VTCM allocation (GQA-aware) ========
|
||||
const size_t size_k_row = DK * sizeof(__fp16);
|
||||
const size_t size_v_row = DV * sizeof(__fp16);
|
||||
const size_t size_k_row_padded = hex_round_up(size_k_row, 128);
|
||||
const size_t size_v_row_padded = hex_round_up(size_v_row, 128);
|
||||
|
||||
const size_t q_tile_bytes = hex_align_up(g_br * DK * sizeof(__fp16), 4096);
|
||||
const size_t o_tile_bytes = hex_align_up(g_br * DV * sizeof(__fp16), 4096);
|
||||
const size_t k_dma_bytes = hex_align_up(Bc * DK * sizeof(__fp16), 4096);
|
||||
const size_t v_dma_bytes = hex_align_up(Bc * DV * sizeof(__fp16), 4096);
|
||||
const size_t k_dma_bytes = hex_align_up(Bc * size_k_row_padded, 4096);
|
||||
const size_t v_dma_bytes = hex_align_up(Bc * size_v_row_padded, 4096);
|
||||
const size_t k_tile_bytes = hex_align_up(Bc * DK * sizeof(__fp16), 4096);
|
||||
const size_t v_tile_bytes = hex_align_up(Bc * DV * sizeof(__fp16), 4096);
|
||||
const size_t s_tile_bytes = hex_align_up(g_br * Bc * sizeof(__fp16), 4096);
|
||||
@@ -1401,11 +1406,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
// ======== DMA setup ========
|
||||
dma_queue * const dma = ctx->dma[0];
|
||||
|
||||
// Padded row sizes for DMA
|
||||
const size_t size_k_row = nek0 * sizeof(__fp16);
|
||||
const size_t size_v_row = nev0 * sizeof(__fp16);
|
||||
const size_t size_k_row_padded = hex_round_up(nek0 * sizeof(__fp16), 128);
|
||||
const size_t size_v_row_padded = hex_round_up(nev0 * sizeof(__fp16), 128);
|
||||
// Padded row sizes for DMA (defined in outer scope)
|
||||
|
||||
const size_t n_row_tiles_g_br = g_br / HMX_FP16_TILE_N_ROWS;
|
||||
const size_t n_tiles_per_bc = Bc / HMX_FP16_TILE_N_COLS;
|
||||
|
||||
@@ -104,6 +104,7 @@ int op_argsort(struct htp_ops_context * octx);
|
||||
int op_ssm_conv(struct htp_ops_context * octx);
|
||||
int op_cumsum(struct htp_ops_context * octx);
|
||||
int op_fill(struct htp_ops_context * octx);
|
||||
int op_concat(struct htp_ops_context * octx);
|
||||
int op_diag(struct htp_ops_context * octx);
|
||||
int op_solve_tri(struct htp_ops_context * octx);
|
||||
int op_gated_delta_net(struct htp_ops_context * octx);
|
||||
|
||||
@@ -89,6 +89,7 @@ enum htp_op_code {
|
||||
HTP_OP_TRI,
|
||||
HTP_OP_PAD,
|
||||
HTP_OP_NORM,
|
||||
HTP_OP_CONCAT,
|
||||
|
||||
HTP_OP_INVALID
|
||||
};
|
||||
|
||||
90
ggml/src/ggml-hexagon/htp/hvx-sin-cos.h
Normal file
90
ggml/src/ggml-hexagon/htp/hvx-sin-cos.h
Normal file
@@ -0,0 +1,90 @@
|
||||
#ifndef HVX_SIN_COS_H
|
||||
#define HVX_SIN_COS_H
|
||||
|
||||
#include "hvx-base.h"
|
||||
#include "hvx-floor.h"
|
||||
|
||||
static inline HVX_Vector hvx_vec_cos_f32(HVX_Vector x) {
|
||||
HVX_Vector const_inv_pi = hvx_vec_splat_f32(0.3183098861837907f);
|
||||
HVX_Vector const_half = hvx_vec_splat_f32(0.5f);
|
||||
HVX_Vector const_pi = hvx_vec_splat_f32(3.141592653589793f);
|
||||
HVX_Vector const_one = hvx_vec_splat_f32(1.0f);
|
||||
HVX_Vector const_neg_one = hvx_vec_splat_f32(-1.0f);
|
||||
|
||||
// n = floor(x * (1/pi) + 0.5)
|
||||
HVX_Vector n_float = hvx_vec_floor_f32(hvx_vec_add_f32_f32(hvx_vec_mul_f32_f32(x, const_inv_pi), const_half));
|
||||
|
||||
// y = x - n * pi
|
||||
HVX_Vector y = hvx_vec_sub_f32_f32(x, hvx_vec_mul_f32_f32(n_float, const_pi));
|
||||
|
||||
// Sign determination: if n is odd, sign is -1.0f, else 1.0f
|
||||
// half_n = n * 0.5f
|
||||
HVX_Vector half_n = hvx_vec_mul_f32_f32(n_float, const_half);
|
||||
// floor_half_n = floor(half_n)
|
||||
HVX_Vector floor_half_n = hvx_vec_floor_f32(half_n);
|
||||
// is_odd = half_n > floor_half_n
|
||||
HVX_VectorPred is_odd = Q6_Q_vcmp_gt_VsfVsf(half_n, floor_half_n);
|
||||
// sign = vmux(is_odd, -1.0f, 1.0f)
|
||||
HVX_Vector sign = Q6_V_vmux_QVV(is_odd, const_neg_one, const_one);
|
||||
|
||||
// z = y^2
|
||||
HVX_Vector z = hvx_vec_mul_f32_f32(y, y);
|
||||
|
||||
// Chebyshev approximation for cos(y)
|
||||
HVX_Vector c4 = hvx_vec_splat_f32(2.3557242013849433e-05f);
|
||||
HVX_Vector c3 = hvx_vec_splat_f32(-0.0013871428263450528f);
|
||||
HVX_Vector c2 = hvx_vec_splat_f32(0.041665895266688284f);
|
||||
HVX_Vector c1 = hvx_vec_splat_f32(-0.4999999360426369f);
|
||||
HVX_Vector c0 = hvx_vec_splat_f32(0.9999999999071725f);
|
||||
|
||||
HVX_Vector cos_y = hvx_vec_add_f32_f32(c3, hvx_vec_mul_f32_f32(z, c4));
|
||||
cos_y = hvx_vec_add_f32_f32(c2, hvx_vec_mul_f32_f32(z, cos_y));
|
||||
cos_y = hvx_vec_add_f32_f32(c1, hvx_vec_mul_f32_f32(z, cos_y));
|
||||
cos_y = hvx_vec_add_f32_f32(c0, hvx_vec_mul_f32_f32(z, cos_y));
|
||||
|
||||
return hvx_vec_mul_f32_f32(cos_y, sign);
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_sin_f32(HVX_Vector x) {
|
||||
HVX_Vector const_inv_pi = hvx_vec_splat_f32(0.3183098861837907f);
|
||||
HVX_Vector const_half = hvx_vec_splat_f32(0.5f);
|
||||
HVX_Vector const_pi = hvx_vec_splat_f32(3.141592653589793f);
|
||||
HVX_Vector const_one = hvx_vec_splat_f32(1.0f);
|
||||
HVX_Vector const_neg_one = hvx_vec_splat_f32(-1.0f);
|
||||
|
||||
// n = floor(x * (1/pi) + 0.5)
|
||||
HVX_Vector n_float = hvx_vec_floor_f32(hvx_vec_add_f32_f32(hvx_vec_mul_f32_f32(x, const_inv_pi), const_half));
|
||||
|
||||
// y = x - n * pi
|
||||
HVX_Vector y = hvx_vec_sub_f32_f32(x, hvx_vec_mul_f32_f32(n_float, const_pi));
|
||||
|
||||
// Sign determination: if n is odd, sign is -1.0f, else 1.0f
|
||||
// half_n = n * 0.5f
|
||||
HVX_Vector half_n = hvx_vec_mul_f32_f32(n_float, const_half);
|
||||
// floor_half_n = floor(half_n)
|
||||
HVX_Vector floor_half_n = hvx_vec_floor_f32(half_n);
|
||||
// is_odd = half_n > floor_half_n
|
||||
HVX_VectorPred is_odd = Q6_Q_vcmp_gt_VsfVsf(half_n, floor_half_n);
|
||||
// sign = vmux(is_odd, -1.0f, 1.0f)
|
||||
HVX_Vector sign = Q6_V_vmux_QVV(is_odd, const_neg_one, const_one);
|
||||
|
||||
// z = y^2
|
||||
HVX_Vector z = hvx_vec_mul_f32_f32(y, y);
|
||||
|
||||
// Chebyshev approximation for sin(y)
|
||||
HVX_Vector s4 = hvx_vec_splat_f32(2.642186986152672e-06f);
|
||||
HVX_Vector s3 = hvx_vec_splat_f32(-0.00019825318964070864f);
|
||||
HVX_Vector s2 = hvx_vec_splat_f32(0.00833326283319605f);
|
||||
HVX_Vector s1 = hvx_vec_splat_f32(-0.16666666082087775f);
|
||||
HVX_Vector s0 = hvx_vec_splat_f32(0.999999999915155f);
|
||||
|
||||
HVX_Vector sin_y = hvx_vec_add_f32_f32(s3, hvx_vec_mul_f32_f32(z, s4));
|
||||
sin_y = hvx_vec_add_f32_f32(s2, hvx_vec_mul_f32_f32(z, sin_y));
|
||||
sin_y = hvx_vec_add_f32_f32(s1, hvx_vec_mul_f32_f32(z, sin_y));
|
||||
sin_y = hvx_vec_add_f32_f32(s0, hvx_vec_mul_f32_f32(z, sin_y));
|
||||
sin_y = hvx_vec_mul_f32_f32(y, sin_y);
|
||||
|
||||
return hvx_vec_mul_f32_f32(sin_y, sign);
|
||||
}
|
||||
|
||||
#endif /* HVX_SIN_COS_H */
|
||||
@@ -14,6 +14,8 @@
|
||||
#include "hvx-sqrt.h"
|
||||
#include "hvx-arith.h"
|
||||
#include "hvx-div.h"
|
||||
#include "hvx-floor.h"
|
||||
#include "hvx-sin-cos.h"
|
||||
#include "hvx-base.h"
|
||||
|
||||
#endif /* HVX_UTILS_H */
|
||||
|
||||
@@ -420,8 +420,7 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
|
||||
|
||||
ctx->n_threads = n_hvx;
|
||||
for (int i = 0; i < ctx->n_threads; i++) {
|
||||
// see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
|
||||
ctx->dma[i] = dma_queue_create(128);
|
||||
ctx->dma[i] = dma_queue_create(256); // queue depth
|
||||
}
|
||||
|
||||
// init worker pool
|
||||
@@ -601,6 +600,9 @@ static int execute_op(struct htp_ops_context * octx) {
|
||||
case HTP_OP_PAD:
|
||||
return op_pad(octx);
|
||||
|
||||
case HTP_OP_CONCAT:
|
||||
return op_concat(octx);
|
||||
|
||||
case HTP_OP_GATED_DELTA_NET:
|
||||
return op_gated_delta_net(octx);
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "hex-dma.h"
|
||||
#include "hvx-utils.h"
|
||||
@@ -75,6 +76,9 @@ struct htp_rope_context {
|
||||
size_t theta_cache_offset;
|
||||
uint32_t src0_nrows;
|
||||
|
||||
struct fastdiv_values div_ne2_ne1;
|
||||
struct fastdiv_values div_ne1;
|
||||
|
||||
uint64_t t_start;
|
||||
};
|
||||
|
||||
@@ -117,13 +121,84 @@ static __attribute__((noinline)) void rope_cache_init(const float theta_base,
|
||||
float * cache,
|
||||
const float theta_scale) {
|
||||
// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
|
||||
float theta = theta_base;
|
||||
#if __HVX_ARCH__ >= 79
|
||||
const bool is_v79_or_newer = true;
|
||||
#else
|
||||
const bool is_v79_or_newer = false;
|
||||
#endif
|
||||
|
||||
for (uint32_t i0 = 0; i0 < ne0; i0 += 2) {
|
||||
const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f;
|
||||
rope_yarn_one(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, cache);
|
||||
if (is_v79_or_newer && ext_factor == 0.0f) {
|
||||
// Fast path: fully vectorized
|
||||
// We process 32 pairs (64 elements) per iteration.
|
||||
const uint32_t n_blocks = ne0 / 64;
|
||||
|
||||
theta *= theta_scale;
|
||||
// Initialize theta scale powers: [1.0f, theta_scale, theta_scale^2, ..., theta_scale^31]
|
||||
float __attribute__((aligned(128))) theta_powers[32];
|
||||
theta_powers[0] = 1.0f;
|
||||
for (int j = 1; j < 32; j++) {
|
||||
theta_powers[j] = theta_powers[j - 1] * theta_scale;
|
||||
}
|
||||
HVX_Vector v_theta_powers = hvx_vmem(theta_powers);
|
||||
|
||||
HVX_Vector v_freq_scale = hvx_vec_splat_f32(freq_scale);
|
||||
HVX_Vector v_mscale = hvx_vec_splat_f32(mscale);
|
||||
|
||||
// Base theta starts at theta_base
|
||||
float theta_block = theta_base;
|
||||
// The scale factor for the next block is theta_scale^32
|
||||
float theta_scale_32 = 1.0f;
|
||||
for (int j = 0; j < 32; j++) {
|
||||
theta_scale_32 *= theta_scale;
|
||||
}
|
||||
|
||||
for (uint32_t b = 0; b < n_blocks; b++) {
|
||||
uint32_t i0 = b * 64;
|
||||
HVX_Vector v_theta_base = hvx_vec_splat_f32(theta_block);
|
||||
HVX_Vector v_theta = hvx_vec_mul_f32_f32(v_theta_base, v_theta_powers);
|
||||
|
||||
if (freq_factors) {
|
||||
// Load 32 elements of freq_factors
|
||||
HVX_Vector v_ff = hvx_vmemu(freq_factors + i0 / 2);
|
||||
HVX_Vector v_inv_ff = hvx_vec_inverse_f32(v_ff);
|
||||
v_theta = hvx_vec_mul_f32_f32(v_theta, v_inv_ff);
|
||||
}
|
||||
|
||||
HVX_Vector v_theta_final = hvx_vec_mul_f32_f32(v_theta, v_freq_scale);
|
||||
|
||||
HVX_Vector vcos = hvx_vec_cos_f32(v_theta_final);
|
||||
HVX_Vector vsin = hvx_vec_sin_f32(v_theta_final);
|
||||
|
||||
vcos = hvx_vec_mul_f32_f32(vcos, v_mscale);
|
||||
vsin = hvx_vec_mul_f32_f32(vsin, v_mscale);
|
||||
|
||||
HVX_VectorPair vstore = Q6_W_vshuff_VVR(vsin, vcos, -4);
|
||||
|
||||
if (((uintptr_t)cache) % 128 == 0) {
|
||||
hvx_vmem(cache + i0 + 0) = Q6_V_lo_W(vstore);
|
||||
hvx_vmem(cache + i0 + 32) = Q6_V_hi_W(vstore);
|
||||
} else {
|
||||
hvx_vec_store_u(cache + i0 + 0, 32 * sizeof(float), Q6_V_lo_W(vstore));
|
||||
hvx_vec_store_u(cache + i0 + 32, 32 * sizeof(float), Q6_V_hi_W(vstore));
|
||||
}
|
||||
|
||||
theta_block *= theta_scale_32;
|
||||
}
|
||||
|
||||
// Leftovers
|
||||
float theta = theta_block;
|
||||
for (uint32_t i0 = n_blocks * 64; i0 < ne0; i0 += 2) {
|
||||
const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f;
|
||||
rope_yarn_one(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, cache);
|
||||
theta *= theta_scale;
|
||||
}
|
||||
} else {
|
||||
// Fallback to original scalar loop
|
||||
float theta = theta_base;
|
||||
for (uint32_t i0 = 0; i0 < ne0; i0 += 2) {
|
||||
const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f;
|
||||
rope_yarn_one(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, cache);
|
||||
theta *= theta_scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -195,24 +270,18 @@ static void rope_corr_dims(int n_dims,
|
||||
}
|
||||
|
||||
static inline void hvx_rope_neox_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) {
|
||||
const HVX_Vector * restrict vsrc = (const HVX_Vector *) src0;
|
||||
const HVX_Vector * restrict vtheta = (const HVX_Vector *) theta_cache;
|
||||
HVX_Vector * restrict vdst = (HVX_Vector *) dst;
|
||||
const uint32_t he = ne / 2;
|
||||
const uint32_t nvec = he / 32;
|
||||
const uint32_t nloe = he % 32;
|
||||
|
||||
uint32_t nvec = (ne / (VLEN_FP32 * 2) * 2); // 2 vecs per loop, step of 2
|
||||
for (uint32_t i = 0; i < nvec; i++) {
|
||||
HVX_Vector v0 = ((const HVX_Vector *) src0)[i];
|
||||
HVX_Vector v1 = hvx_vmemu(src0 + he + i * 32);
|
||||
|
||||
uint32_t he = ne / 2; // half_dims offset in elements
|
||||
uint32_t hv = he / VLEN_FP32; // half_dims offset in vectors
|
||||
HVX_Vector v2 = ((const HVX_Vector *) theta_cache)[i * 2 + 0];
|
||||
HVX_Vector v3 = ((const HVX_Vector *) theta_cache)[i * 2 + 1];
|
||||
|
||||
#pragma unroll(2)
|
||||
for (uint32_t i = 0; i < nvec; i += 2) {
|
||||
HVX_Vector v0 = vsrc[i/2+0];
|
||||
HVX_Vector v1 = vsrc[i/2+hv];
|
||||
|
||||
HVX_Vector v2 = vtheta[i+0];
|
||||
HVX_Vector v3 = vtheta[i+1];
|
||||
|
||||
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta
|
||||
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);
|
||||
|
||||
HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_lo_W(vcos_sin));
|
||||
HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_hi_W(vcos_sin));
|
||||
@@ -222,37 +291,45 @@ static inline void hvx_rope_neox_f32_aa(float * restrict dst, const float * rest
|
||||
HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
|
||||
HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
|
||||
|
||||
vdst[i/2+0] = Q6_Vsf_equals_Vqf32(v4);
|
||||
vdst[i/2+hv] = Q6_Vsf_equals_Vqf32(v5);
|
||||
((HVX_Vector *) dst)[i] = Q6_Vsf_equals_Vqf32(v4);
|
||||
hvx_vmemu(dst + he + i * 32) = Q6_Vsf_equals_Vqf32(v5);
|
||||
}
|
||||
|
||||
for (uint32_t i = nvec * VLEN_FP32; i < ne; i += 2) {
|
||||
const float cos_theta = theta_cache[i+0];
|
||||
const float sin_theta = theta_cache[i+1];
|
||||
float x0 = src0[i/2];
|
||||
float x1 = src0[i/2 + he];
|
||||
dst[i/2] = x0 * cos_theta - x1 * sin_theta;
|
||||
dst[i/2 + he] = x0 * sin_theta + x1 * cos_theta;
|
||||
if (nloe > 0) {
|
||||
HVX_Vector v0 = hvx_vmemu(src0 + nvec * 32);
|
||||
HVX_Vector v1 = hvx_vmemu(src0 + he + nvec * 32);
|
||||
|
||||
HVX_Vector v2 = ((const HVX_Vector *) theta_cache)[nvec * 2 + 0];
|
||||
HVX_Vector v3 = ((const HVX_Vector *) theta_cache)[nvec * 2 + 1];
|
||||
|
||||
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);
|
||||
|
||||
HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_lo_W(vcos_sin));
|
||||
HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_hi_W(vcos_sin));
|
||||
HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(v1, Q6_V_lo_W(vcos_sin));
|
||||
HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(v1, Q6_V_hi_W(vcos_sin));
|
||||
|
||||
HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
|
||||
HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
|
||||
|
||||
hvx_vec_store_u(dst + nvec * 32, nloe * sizeof(float), Q6_Vsf_equals_Vqf32(v4));
|
||||
hvx_vec_store_u(dst + he + nvec * 32, nloe * sizeof(float), Q6_Vsf_equals_Vqf32(v5));
|
||||
}
|
||||
}
|
||||
|
||||
static inline void hvx_rope_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) {
|
||||
const HVX_Vector * restrict vsrc = (const HVX_Vector *) src0;
|
||||
const HVX_Vector * restrict vtheta = (const HVX_Vector *) theta_cache;
|
||||
HVX_Vector * restrict vdst = (HVX_Vector *) dst;
|
||||
const uint32_t nvec = ne / 64;
|
||||
const uint32_t nloe = ne % 64;
|
||||
|
||||
uint32_t nvec = (ne / (VLEN_FP32 * 2)) * 2; // 2 vecs per loop, step of two
|
||||
for (uint32_t i = 0; i < nvec; i++) {
|
||||
HVX_Vector v0 = ((const HVX_Vector *) src0)[i * 2 + 0];
|
||||
HVX_Vector v1 = ((const HVX_Vector *) src0)[i * 2 + 1];
|
||||
|
||||
#pragma unroll(2)
|
||||
for (uint32_t i = 0; i < nvec; i+=2) {
|
||||
HVX_Vector v0 = vsrc[i+0];
|
||||
HVX_Vector v1 = vsrc[i+1];
|
||||
HVX_Vector v2 = ((const HVX_Vector *) theta_cache)[i * 2 + 0];
|
||||
HVX_Vector v3 = ((const HVX_Vector *) theta_cache)[i * 2 + 1];
|
||||
|
||||
HVX_Vector v2 = vtheta[i+0];
|
||||
HVX_Vector v3 = vtheta[i+1];
|
||||
|
||||
HVX_VectorPair vx0_x1 = Q6_W_vdeal_VVR(v1, v0, -4); // vx0_x1[0] = x0, vx0_x1[1] = x1
|
||||
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta
|
||||
HVX_VectorPair vx0_x1 = Q6_W_vdeal_VVR(v1, v0, -4);
|
||||
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);
|
||||
|
||||
HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_lo_W(vcos_sin));
|
||||
HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_hi_W(vcos_sin));
|
||||
@@ -264,17 +341,52 @@ static inline void hvx_rope_f32_aa(float * restrict dst, const float * restrict
|
||||
|
||||
HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4);
|
||||
|
||||
vdst[i+0] = Q6_V_lo_W(vstore);
|
||||
vdst[i+1] = Q6_V_hi_W(vstore);
|
||||
((HVX_Vector *) dst)[i * 2 + 0] = Q6_V_lo_W(vstore);
|
||||
((HVX_Vector *) dst)[i * 2 + 1] = Q6_V_hi_W(vstore);
|
||||
}
|
||||
|
||||
for (uint32_t i = nvec * VLEN_FP32; i < ne; i += 2) {
|
||||
const float cos_theta = theta_cache[i+0];
|
||||
const float sin_theta = theta_cache[i+1];
|
||||
float x0 = src0[i+0];
|
||||
float x1 = src0[i+1];
|
||||
dst[i+0] = x0 * cos_theta - x1 * sin_theta;
|
||||
dst[i+1] = x0 * sin_theta + x1 * cos_theta;
|
||||
if (nloe > 0) {
|
||||
if (nloe <= 32) {
|
||||
HVX_Vector v0 = hvx_vmemu(src0 + nvec * 64);
|
||||
HVX_Vector v2 = hvx_vmemu(theta_cache + nvec * 64);
|
||||
|
||||
HVX_VectorPair vx0_x1 = Q6_W_vdeal_VVR(Q6_V_vzero(), v0, -4);
|
||||
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(Q6_V_vzero(), v2, -4);
|
||||
|
||||
HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_lo_W(vcos_sin));
|
||||
HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_hi_W(vcos_sin));
|
||||
HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_lo_W(vcos_sin));
|
||||
HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_hi_W(vcos_sin));
|
||||
|
||||
HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
|
||||
HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
|
||||
|
||||
HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4);
|
||||
|
||||
hvx_vec_store_u(dst + nvec * 64, nloe * sizeof(float), Q6_V_lo_W(vstore));
|
||||
} else {
|
||||
HVX_Vector v0 = hvx_vmemu(src0 + nvec * 64);
|
||||
HVX_Vector v1 = hvx_vmemu(src0 + nvec * 64 + 32);
|
||||
|
||||
HVX_Vector v2 = hvx_vmemu(theta_cache + nvec * 64);
|
||||
HVX_Vector v3 = hvx_vmemu(theta_cache + nvec * 64 + 32);
|
||||
|
||||
HVX_VectorPair vx0_x1 = Q6_W_vdeal_VVR(v1, v0, -4);
|
||||
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);
|
||||
|
||||
HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_lo_W(vcos_sin));
|
||||
HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_hi_W(vcos_sin));
|
||||
HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_lo_W(vcos_sin));
|
||||
HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_hi_W(vcos_sin));
|
||||
|
||||
HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
|
||||
HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
|
||||
|
||||
HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4);
|
||||
|
||||
((HVX_Vector *) dst)[nvec * 2 + 0] = Q6_V_lo_W(vstore);
|
||||
hvx_vec_store_u(dst + nvec * 64 + 32, (nloe - 32) * sizeof(float), Q6_V_hi_W(vstore));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -348,13 +460,19 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) {
|
||||
const int32_t * pos = (const int32_t *) src1->data;
|
||||
const float * freq_factors = src2 ? (const float *) src2->data : NULL;
|
||||
|
||||
uint32_t ir = 0;
|
||||
const uint32_t i3_start = fastdiv(src0_start_row, &rctx->div_ne2_ne1);
|
||||
const uint32_t rem = fastmodulo(src0_start_row, ne2 * ne1, &rctx->div_ne2_ne1);
|
||||
const uint32_t i2_start = fastdiv(rem, &rctx->div_ne1);
|
||||
const uint32_t i1_start = fastmodulo(rem, ne1, &rctx->div_ne1);
|
||||
|
||||
uint32_t ir = src0_start_row;
|
||||
uint32_t prev_i2 = (uint32_t) -1;
|
||||
|
||||
for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch
|
||||
for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len
|
||||
for (uint32_t i1 = 0; i1 < ne1; ) { // attn-heads
|
||||
if (ir < src0_start_row) { ir++; i1++; continue; }
|
||||
for (uint32_t i3 = i3_start; i3 < ne3; i3++) { // batch
|
||||
const uint32_t i2_init = (i3 == i3_start) ? i2_start : 0;
|
||||
for (uint32_t i2 = i2_init; i2 < ne2; i2++) { // seq-len
|
||||
const uint32_t i1_init = (i3 == i3_start && i2 == i2_start) ? i1_start : 0;
|
||||
for (uint32_t i1 = i1_init; i1 < ne1; ) { // attn-heads
|
||||
if (ir >= src0_end_row) goto done;
|
||||
|
||||
// Rows in this block
|
||||
@@ -407,9 +525,6 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) {
|
||||
ne0, rctx->ext_factor, rctx->attn_factor,
|
||||
theta_cache, rctx->theta_scale);
|
||||
}
|
||||
|
||||
// FARF(HIGH, "rope-theta %u: ir %u i1 %u i2 %u i3 %u cache %p : usec %u", ith, ir, i1, i2, i3, theta_cache,
|
||||
// (unsigned) HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - rctx->t_start));
|
||||
}
|
||||
|
||||
// Skip output DMA transactions from prev block (if any)
|
||||
@@ -489,7 +604,7 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) {
|
||||
// Aligned row sizes for VTCM
|
||||
const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
|
||||
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
|
||||
const size_t theta_cache_size_aligned = hex_round_up(src0->ne[0] * sizeof(float), 128);
|
||||
const size_t theta_cache_size_aligned = hex_round_up(src0->ne[0] * sizeof(float), 256);
|
||||
|
||||
// Calculate spad sizes per thread
|
||||
size_t src0_spad_per_thread = theta_cache_size_aligned + HTP_ROPE_SPAD_NROWS * src0_row_size_aligned;
|
||||
@@ -546,6 +661,11 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) {
|
||||
rctx.src0_nrows = src0_nrows;
|
||||
rctx.src0_nrows_per_thread = (src0_nrows + n_threads - 1) / n_threads;
|
||||
|
||||
if (src0_nrows > 0) {
|
||||
rctx.div_ne2_ne1 = init_fastdiv_values(dst->ne[2] * dst->ne[1]);
|
||||
rctx.div_ne1 = init_fastdiv_values(dst->ne[1]);
|
||||
}
|
||||
|
||||
FARF(HIGH, "rope-f32 n-rows %u n-dims %d ne0 %u ext-factor %.6f theta-scale %.6f attn-factor %.6f\n", rctx.src0_nrows, rctx.n_dims, ne0,
|
||||
rctx.ext_factor, rctx.theta_scale, rctx.attn_factor);
|
||||
|
||||
|
||||
@@ -65,6 +65,9 @@ static void set_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da
|
||||
// parallelize by rows of src0
|
||||
const uint32_t dr = srctx->src0_nrows_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
if (ir0 >= nr) {
|
||||
return;
|
||||
}
|
||||
const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
|
||||
|
||||
const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
|
||||
@@ -109,6 +112,9 @@ static void set_rows_thread_f16_f32(unsigned int nth, unsigned int ith, void *da
|
||||
// parallelize by rows of src0
|
||||
const uint32_t dr = srctx->src0_nrows_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
if (ir0 >= nr) {
|
||||
return;
|
||||
}
|
||||
const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
|
||||
|
||||
const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
|
||||
|
||||
@@ -207,7 +207,7 @@ static void hvx_fast_norm_f32(const uint8_t * restrict src,
|
||||
|
||||
// scale = rsqrt(variance + epsilon), mean_x broadcast for subtraction
|
||||
HVX_Vector scale_v = hvx_vec_rsqrt_f32(Q6_Vsf_equals_Vqf32(var_epsilon_v));
|
||||
HVX_Vector mean_x_b = hvx_vec_splat_f32(hvx_vec_get_f32(Q6_Vsf_equals_Vqf32(mean_x_v)));
|
||||
HVX_Vector mean_x_b = hvx_vec_repl_f32(Q6_Vsf_equals_Vqf32(mean_x_v));
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < nvec; i++) {
|
||||
|
||||
@@ -215,6 +215,30 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets);
|
||||
// device
|
||||
//
|
||||
|
||||
enum ggml_metal_device_id {
|
||||
GGML_METAL_DEVICE_GENERIC = 0,
|
||||
|
||||
GGML_METAL_DEVICE_M1,
|
||||
GGML_METAL_DEVICE_M1_PRO,
|
||||
GGML_METAL_DEVICE_M1_MAX,
|
||||
GGML_METAL_DEVICE_M1_ULTRA,
|
||||
GGML_METAL_DEVICE_M2,
|
||||
GGML_METAL_DEVICE_M2_PRO,
|
||||
GGML_METAL_DEVICE_M2_MAX,
|
||||
GGML_METAL_DEVICE_M2_ULTRA,
|
||||
GGML_METAL_DEVICE_M3,
|
||||
GGML_METAL_DEVICE_M3_PRO,
|
||||
GGML_METAL_DEVICE_M3_MAX,
|
||||
GGML_METAL_DEVICE_M3_ULTRA,
|
||||
GGML_METAL_DEVICE_M4,
|
||||
GGML_METAL_DEVICE_M4_PRO,
|
||||
GGML_METAL_DEVICE_M4_MAX,
|
||||
GGML_METAL_DEVICE_M5,
|
||||
GGML_METAL_DEVICE_M5_PRO,
|
||||
GGML_METAL_DEVICE_M5_MAX,
|
||||
GGML_METAL_DEVICE_M5_ULTRA,
|
||||
};
|
||||
|
||||
struct ggml_metal_device_props {
|
||||
int device;
|
||||
char name[128];
|
||||
@@ -234,6 +258,8 @@ struct ggml_metal_device_props {
|
||||
|
||||
bool supports_gpu_family_apple7;
|
||||
|
||||
enum ggml_metal_device_id device_id;
|
||||
|
||||
int op_offload_min_batch_size;
|
||||
};
|
||||
|
||||
|
||||
@@ -628,6 +628,50 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) {
|
||||
free(rsets);
|
||||
}
|
||||
|
||||
static enum ggml_metal_device_id ggml_metal_device_id_parse(const char * name) {
|
||||
if (!name) {
|
||||
return GGML_METAL_DEVICE_GENERIC;
|
||||
}
|
||||
|
||||
static const char prefix[] = "Apple ";
|
||||
if (strncmp(name, prefix, sizeof(prefix) - 1) != 0) {
|
||||
return GGML_METAL_DEVICE_GENERIC;
|
||||
}
|
||||
const char * suffix = name + sizeof(prefix) - 1;
|
||||
|
||||
static const struct {
|
||||
const char * name;
|
||||
enum ggml_metal_device_id id;
|
||||
} table[] = {
|
||||
{"M1", GGML_METAL_DEVICE_M1},
|
||||
{"M1 Pro", GGML_METAL_DEVICE_M1_PRO},
|
||||
{"M1 Max", GGML_METAL_DEVICE_M1_MAX},
|
||||
{"M1 Ultra", GGML_METAL_DEVICE_M1_ULTRA},
|
||||
{"M2", GGML_METAL_DEVICE_M2},
|
||||
{"M2 Pro", GGML_METAL_DEVICE_M2_PRO},
|
||||
{"M2 Max", GGML_METAL_DEVICE_M2_MAX},
|
||||
{"M2 Ultra", GGML_METAL_DEVICE_M2_ULTRA},
|
||||
{"M3", GGML_METAL_DEVICE_M3},
|
||||
{"M3 Pro", GGML_METAL_DEVICE_M3_PRO},
|
||||
{"M3 Max", GGML_METAL_DEVICE_M3_MAX},
|
||||
{"M3 Ultra", GGML_METAL_DEVICE_M3_ULTRA},
|
||||
{"M4", GGML_METAL_DEVICE_M4},
|
||||
{"M4 Pro", GGML_METAL_DEVICE_M4_PRO},
|
||||
{"M4 Max", GGML_METAL_DEVICE_M4_MAX},
|
||||
{"M5", GGML_METAL_DEVICE_M5},
|
||||
{"M5 Pro", GGML_METAL_DEVICE_M5_PRO},
|
||||
{"M5 Max", GGML_METAL_DEVICE_M5_MAX},
|
||||
{"M5 Ultra", GGML_METAL_DEVICE_M5_ULTRA},
|
||||
};
|
||||
|
||||
for (size_t i = 0; i < sizeof(table)/sizeof(table[0]); ++i) {
|
||||
if (strcmp(suffix, table[i].name) == 0) {
|
||||
return table[i].id;
|
||||
}
|
||||
}
|
||||
return GGML_METAL_DEVICE_GENERIC;
|
||||
}
|
||||
|
||||
ggml_metal_device_t ggml_metal_device_init(int device) {
|
||||
ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device));
|
||||
|
||||
@@ -795,6 +839,8 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
|
||||
|
||||
dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
|
||||
|
||||
dev->props.device_id = ggml_metal_device_id_parse([[dev->mtl_device name] UTF8String]);
|
||||
|
||||
dev->props.op_offload_min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
|
||||
|
||||
dev->props.max_buffer_size = dev->mtl_device.maxBufferLength;
|
||||
|
||||
@@ -224,6 +224,7 @@ struct sycl_device_info {
|
||||
int max_wg_per_cu; // max work groups per compute unit - refer to
|
||||
// cudaOccupancyMaxActiveBlocksPerMultiprocessor
|
||||
bool vmm; // virtual memory support
|
||||
size_t vmm_granularity; // granularity of virtual memory
|
||||
size_t total_vram;
|
||||
sycl_hw_info hw_info;
|
||||
optimize_feature opt_feature;
|
||||
@@ -244,6 +245,8 @@ struct ggml_sycl_device_info {
|
||||
|
||||
const ggml_sycl_device_info & ggml_sycl_info();
|
||||
|
||||
static constexpr size_t SYCL_BUFFER_ALIGNMENT = 128;
|
||||
|
||||
struct ggml_sycl_pool {
|
||||
virtual ~ggml_sycl_pool() = default;
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#include <cstdlib>
|
||||
#include <float.h>
|
||||
#include <limits>
|
||||
#include <optional>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <vector>
|
||||
@@ -37,6 +38,11 @@
|
||||
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
|
||||
# include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
|
||||
#endif
|
||||
#if SYCL_EXT_ONEAPI_VIRTUAL_MEM
|
||||
# include <sycl/ext/oneapi/virtual_mem/physical_mem.hpp>
|
||||
# include <sycl/ext/oneapi/virtual_mem/virtual_mem.hpp>
|
||||
# define GGML_SYCL_USE_VMM
|
||||
#endif
|
||||
#include <sycl/half_type.hpp>
|
||||
|
||||
#include "ggml.h"
|
||||
@@ -70,6 +76,7 @@ int g_ggml_sycl_debug = 0;
|
||||
int g_ggml_sycl_disable_optimize = 0;
|
||||
int g_ggml_sycl_disable_graph = 0;
|
||||
int g_ggml_sycl_disable_dnn = 0;
|
||||
int g_ggml_sycl_enable_vmm = 1;
|
||||
int g_ggml_sycl_prioritize_dmmv = 0;
|
||||
int g_ggml_sycl_use_async_mem_op = 0;
|
||||
int g_ggml_sycl_use_async_mem_op_requested = 1;
|
||||
@@ -96,13 +103,30 @@ static ggml_sycl_device_info ggml_sycl_init() {
|
||||
// GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
|
||||
// #endif
|
||||
for (int i = 0; i < info.device_count; ++i) {
|
||||
info.devices[i].vmm = 0;
|
||||
dpct::device_info prop;
|
||||
auto & device = dpct::dev_mgr::instance().get_device(i);
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
||||
prop, device)));
|
||||
|
||||
#if !defined(GGML_SYCL_USE_VMM)
|
||||
info.devices[i].vmm = 0;
|
||||
#else
|
||||
info.devices[i].vmm = device.has(sycl::aspect::ext_oneapi_virtual_mem);
|
||||
if (info.devices[i].vmm) {
|
||||
// NB: SYCL's get_mem_granularity always returns the _minimum_ granularity,
|
||||
// but the L0 API requires a larger page size for allocs above 2 MiB and
|
||||
// rejects non-multiples with UR_RESULT_ERROR_INVALID_VALUE [sic].
|
||||
// Here we clamp it to 2 MiB for simplicity, but other devices may require
|
||||
// calling zeVirtualMemQueryPageSize or yet unexposed public API.
|
||||
const size_t physical_page = 2ull << 20; // 2 MiB
|
||||
info.devices[i].vmm_granularity = std::max<size_t>(
|
||||
sycl::ext::oneapi::experimental::get_mem_granularity(
|
||||
device, sycl::context(device)),
|
||||
physical_page);
|
||||
}
|
||||
#endif
|
||||
|
||||
info.default_tensor_split[i] = total_vram;
|
||||
total_vram += prop.get_global_mem_size();
|
||||
|
||||
@@ -234,6 +258,7 @@ static void ggml_check_sycl() try {
|
||||
g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
|
||||
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
|
||||
g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
|
||||
g_ggml_sycl_enable_vmm = get_sycl_env("GGML_SYCL_ENABLE_VMM", 1);
|
||||
g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", ggml_sycl_info().ext_oneapi_level_zero);
|
||||
@@ -275,6 +300,11 @@ static void ggml_check_sycl() try {
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n");
|
||||
#endif
|
||||
#if defined(GGML_SYCL_USE_VMM)
|
||||
GGML_LOG_INFO(" GGML_SYCL_USE_VMM: yes\n");
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_USE_VMM: no\n");
|
||||
#endif
|
||||
|
||||
GGML_LOG_INFO("Running with Environment Variables:\n");
|
||||
GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
|
||||
@@ -293,6 +323,11 @@ static void ggml_check_sycl() try {
|
||||
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n");
|
||||
#endif
|
||||
#if defined(GGML_SYCL_USE_VMM)
|
||||
GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: %d\n", g_ggml_sycl_enable_vmm);
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: virtual memory extension is not available\n");
|
||||
#endif
|
||||
GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
|
||||
g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1);
|
||||
@@ -754,7 +789,7 @@ catch (sycl::exception const &exc) {
|
||||
}
|
||||
|
||||
static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
return 128;
|
||||
return SYCL_BUFFER_ALIGNMENT;
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
@@ -1177,7 +1212,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(gg
|
||||
}
|
||||
|
||||
static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
return 128;
|
||||
return SYCL_BUFFER_ALIGNMENT;
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
@@ -1462,6 +1497,121 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
||||
}
|
||||
};
|
||||
|
||||
// pool with virtual memory management
|
||||
#if defined(GGML_SYCL_USE_VMM)
|
||||
struct ggml_sycl_pool_vmm : public ggml_sycl_pool {
|
||||
static const size_t SYCL_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
||||
|
||||
int device;
|
||||
sycl::context ctx;
|
||||
sycl::device dev;
|
||||
|
||||
uintptr_t pool_addr = 0;
|
||||
size_t pool_used = 0;
|
||||
size_t pool_size = 0;
|
||||
size_t granularity;
|
||||
|
||||
// physical_mem owns the commits (unlike cuMemMap)
|
||||
struct mapping {
|
||||
sycl::ext::oneapi::experimental::physical_mem phys;
|
||||
void * map_ptr;
|
||||
};
|
||||
std::vector<mapping> mappings;
|
||||
|
||||
explicit ggml_sycl_pool_vmm(queue_ptr qptr_, int device_) :
|
||||
device(device_),
|
||||
ctx(qptr_->get_context()),
|
||||
dev(qptr_->get_device()),
|
||||
granularity(ggml_sycl_info().devices[device_].vmm_granularity) {
|
||||
}
|
||||
|
||||
~ggml_sycl_pool_vmm() {
|
||||
if (pool_addr == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Per spec, unmap must (a) match the exact (ptr, size) of an earlier
|
||||
// physical_mem::map() call and (b) precede destruction of the
|
||||
// physical_mem objects (their dtors won't unmap).
|
||||
for (auto & m : mappings) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::unmap(
|
||||
m.map_ptr, m.phys.size(), ctx)));
|
||||
}
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::free_virtual_mem(
|
||||
pool_addr, SYCL_POOL_VMM_MAX_SIZE, ctx)));
|
||||
}
|
||||
|
||||
void * alloc(size_t size, size_t * actual_size) override {
|
||||
// round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
|
||||
size = GGML_PAD(size, SYCL_BUFFER_ALIGNMENT);
|
||||
|
||||
size_t avail = pool_size - pool_used;
|
||||
|
||||
if (size > avail) {
|
||||
// round up to the next multiple of the granularity
|
||||
size_t reserve_size = GGML_PAD(size - avail, granularity);
|
||||
|
||||
GGML_ASSERT(pool_size + reserve_size <= SYCL_POOL_VMM_MAX_SIZE);
|
||||
|
||||
// allocate more physical memory
|
||||
std::optional<sycl::ext::oneapi::experimental::physical_mem> phys;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(phys.emplace(dev, ctx, reserve_size)));
|
||||
|
||||
// reserve virtual address space (if not already reserved)
|
||||
if (pool_addr == 0) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
pool_addr = sycl::ext::oneapi::experimental::reserve_virtual_mem(
|
||||
SYCL_POOL_VMM_MAX_SIZE, ctx)));
|
||||
}
|
||||
|
||||
// map at the end of the pool
|
||||
void * map_ptr = nullptr;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
map_ptr = phys->map(pool_addr + pool_size, reserve_size,
|
||||
sycl::ext::oneapi::experimental::address_access_mode::read_write)));
|
||||
|
||||
// stash these so we could unmap this exact range in dtor
|
||||
mappings.push_back({
|
||||
std::move(*phys),
|
||||
map_ptr,
|
||||
});
|
||||
|
||||
// add to the pool
|
||||
pool_size += reserve_size;
|
||||
|
||||
#ifdef DEBUG_SYCL_MALLOC
|
||||
GGML_LOG_INFO("sycl pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
|
||||
device, (unsigned long long) (pool_size/1024/1024),
|
||||
(unsigned long long) (reserve_size/1024/1024));
|
||||
#endif
|
||||
}
|
||||
|
||||
GGML_ASSERT(pool_addr != 0);
|
||||
|
||||
void * ptr = reinterpret_cast<void *>(pool_addr + pool_used);
|
||||
*actual_size = size;
|
||||
pool_used += size;
|
||||
|
||||
#ifdef DEBUG_SYCL_MALLOC
|
||||
GGML_LOG_INFO("sycl pool[%d]: allocated %llu bytes at %p\n", device, (unsigned long long) size, ptr);
|
||||
#endif
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void free(void * ptr, size_t size) override {
|
||||
#ifdef DEBUG_SYCL_MALLOC
|
||||
GGML_LOG_INFO("sycl pool[%d]: freed %llu bytes at %p\n", device, (unsigned long long) size, ptr);
|
||||
#endif
|
||||
|
||||
pool_used -= size;
|
||||
|
||||
// all deallocations must be in reverse order of the allocations
|
||||
GGML_ASSERT(ptr == reinterpret_cast<void *>(pool_addr + pool_used));
|
||||
}
|
||||
};
|
||||
#endif // defined(GGML_SYCL_USE_VMM)
|
||||
|
||||
struct ggml_sycl_pool_host : public ggml_sycl_pool {
|
||||
queue_ptr qptr;
|
||||
int device;
|
||||
@@ -1542,20 +1692,19 @@ std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_host(que
|
||||
}
|
||||
|
||||
std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
|
||||
// TBD: NO VMM support
|
||||
// if (ggml_sycl_info().devices[device].vmm) {
|
||||
// return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(device));
|
||||
// }
|
||||
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
|
||||
#if defined(GGML_SYCL_USE_VMM)
|
||||
if (g_ggml_sycl_enable_vmm && ggml_sycl_info().devices[device].vmm) {
|
||||
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(qptr, device));
|
||||
}
|
||||
#endif // defined(GGML_SYCL_USE_VMM)
|
||||
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
|
||||
}
|
||||
|
||||
|
||||
std::unique_ptr<ggml_sycl_fattn_kv_buffers> ggml_backend_sycl_context::new_fattn_kv_buffers(queue_ptr qptr, int device) {
|
||||
return std::unique_ptr<ggml_sycl_fattn_kv_buffers>(new ggml_sycl_fattn_kv_buffers(qptr, device));
|
||||
}
|
||||
|
||||
// TBD pool with virtual memory management
|
||||
// struct ggml_sycl_pool_vmm : public ggml_sycl_pool
|
||||
|
||||
/// kernels
|
||||
typedef void (*ggml_sycl_op_mul_mat_t)(
|
||||
ggml_backend_sycl_context & ctx,
|
||||
|
||||
@@ -398,6 +398,7 @@ enum vk_conv_shapes {
|
||||
CONV_SHAPE_128x128,
|
||||
CONV_SHAPE_64x32,
|
||||
CONV_SHAPE_32x256,
|
||||
CONV_SHAPE_64x128,
|
||||
CONV_SHAPE_COUNT,
|
||||
};
|
||||
|
||||
@@ -412,6 +413,7 @@ vk_conv_block_size vk_conv_block_sizes[CONV_SHAPE_COUNT] = {
|
||||
{ 128, 128, 16 }, // CONV_SHAPE_128x128
|
||||
{ 64, 32, 32 }, // CONV_SHAPE_64x32
|
||||
{ 32, 256, 16 }, // CONV_SHAPE_32x256
|
||||
{ 64, 128, 16 }, // CONV_SHAPE_64x128
|
||||
};
|
||||
|
||||
enum dmmv_wg_sizes {
|
||||
@@ -447,14 +449,16 @@ struct vk_fa_pipeline_state {
|
||||
};
|
||||
|
||||
struct vk_conv2d_pipeline_state {
|
||||
vk_conv2d_pipeline_state(uint32_t s0, uint32_t s1, uint32_t p0, uint32_t p1, uint32_t d0, uint32_t d1, uint32_t KW, uint32_t KH)
|
||||
: s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), KW(KW), KH(KH) {}
|
||||
vk_conv2d_pipeline_state(uint32_t s0, uint32_t s1, uint32_t p0, uint32_t p1, uint32_t d0, uint32_t d1, uint32_t KW, uint32_t KH, uint32_t aligned)
|
||||
: s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), KW(KW), KH(KH), aligned(aligned) {}
|
||||
|
||||
uint32_t s0, s1, p0, p1, d0, d1, KW, KH;
|
||||
// when set, shader can skip K/CRS/NPQ bounds checks and address clamps
|
||||
uint32_t aligned;
|
||||
|
||||
bool operator<(const vk_conv2d_pipeline_state &b) const {
|
||||
return std::tie(s0, s1, p0, p1, d0, d1, KW, KH) <
|
||||
std::tie(b.s0, b.s1, b.p0, b.p1, b.d0, b.d1, b.KW, b.KH);
|
||||
return std::tie(s0, s1, p0, p1, d0, d1, KW, KH, aligned) <
|
||||
std::tie(b.s0, b.s1, b.p0, b.p1, b.d0, b.d1, b.KW, b.KH, b.aligned);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -4934,7 +4938,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
// conv2d, conv_transpose_2d
|
||||
for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) {
|
||||
uint32_t conv2d_WG_SIZE = 256;
|
||||
// smaller WG for the small-tile fallback gives more concurrent WGs per SM
|
||||
uint32_t conv2d_WG_SIZE = (s == CONV_SHAPE_64x32) ? 128 : 256;
|
||||
uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
|
||||
uint32_t conv2d_TS_K = (s == CONV_SHAPE_64x32) ? 4 : 8;
|
||||
uint32_t conv2d_SHMEM_PAD = 4;
|
||||
@@ -4973,18 +4978,77 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
conv2d_BS.CRS); // CRS block size should be capped at subgroup size for correctness when shuffle is used.
|
||||
}
|
||||
|
||||
uint32_t conv2d_shmem_req =
|
||||
(conv2d_BS.K * (conv2d_BS.CRS + conv2d_SHMEM_PAD) + conv2d_BS.CRS * (conv2d_BS.NPQ + conv2d_SHMEM_PAD)) * sizeof(float);
|
||||
if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
|
||||
// cm1 is used only when cm2 is unavailable; capped at 64x128 (due to shared memory size).
|
||||
// Requires 16x16x16 f16-acc since that's the fragment shape hard-coded in the shader.
|
||||
// Subgroup size must be 32 or 64 (to keep WG_SIZE sane) and we need
|
||||
// subgroup_size_control to force the driver to actually use it.
|
||||
bool conv2d_use_cm1 = false;
|
||||
#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
conv2d_use_cm1 = !device->coopmat2 &&
|
||||
device->coopmat_support && device->coopmat_support_16x16x16_f16acc &&
|
||||
device->subgroup_size_control &&
|
||||
(device->subgroup_size == 32 || device->subgroup_size == 64) &&
|
||||
s != CONV_SHAPE_128x128;
|
||||
#endif
|
||||
|
||||
const uint32_t conv2d_cm1_shmem_pad = 8;
|
||||
|
||||
auto shmem_req = [&](uint32_t pad, bool csh_store, bool fp16_shmem) {
|
||||
const uint32_t elem_size = fp16_shmem ? (uint32_t)sizeof(uint16_t) : (uint32_t)sizeof(float);
|
||||
const uint32_t csh_elems = csh_store ? conv2d_BS.K * conv2d_BS.NPQ : 0u;
|
||||
return (conv2d_BS.K * (conv2d_BS.CRS + pad) + conv2d_BS.CRS * (conv2d_BS.NPQ + pad) + csh_elems) * elem_size;
|
||||
};
|
||||
|
||||
// coopmat1 needs to store the output through shared memory, so check up front
|
||||
// whether it'll fit and disable it before applying coopmat1 parameters.
|
||||
if (conv2d_use_cm1 && device->properties.limits.maxComputeSharedMemorySize < shmem_req(conv2d_cm1_shmem_pad, true, true)) {
|
||||
conv2d_use_cm1 = false;
|
||||
}
|
||||
|
||||
uint32_t conv2d_WM = 16, conv2d_WN = 16; // cm1 subgroup tile, ignored otherwise
|
||||
if (conv2d_use_cm1) {
|
||||
conv2d_SHMEM_PAD = conv2d_cm1_shmem_pad;
|
||||
// 16x16x16 fragments; pick WM/WN to keep WG_SIZE at 256
|
||||
// (i.e. 8 subgroups for sg=32, 4 subgroups for sg=64).
|
||||
const bool sg64 = (device->subgroup_size == 64);
|
||||
switch (s) {
|
||||
case CONV_SHAPE_64x32: conv2d_WM = sg64 ? 32 : 16; conv2d_WN = 16; break;
|
||||
case CONV_SHAPE_64x128: conv2d_WM = 32; conv2d_WN = sg64 ? 64 : 32; break;
|
||||
case CONV_SHAPE_32x256: conv2d_WM = sg64 ? 16 : 32; conv2d_WN = sg64 ? 128 : 32; break;
|
||||
default: break;
|
||||
}
|
||||
const uint32_t warps_M = conv2d_BS.K / conv2d_WM;
|
||||
const uint32_t warps_N = conv2d_BS.NPQ / conv2d_WN;
|
||||
conv2d_WG_SIZE = warps_M * warps_N * device->subgroup_size;
|
||||
}
|
||||
|
||||
// stage cm2 accumulator through shmem for coalesced global stores;
|
||||
// skipped on 128x128 where the extra Csh footprint hurts occupancy.
|
||||
// cm1 always uses the staged path.
|
||||
uint32_t conv2d_csh_store = (device->coopmat2 && s != CONV_SHAPE_128x128) ? 1u : 0u;
|
||||
if (conv2d_use_cm1) {
|
||||
conv2d_csh_store = 1;
|
||||
}
|
||||
|
||||
// shmem is fp16 on cm2/cm1 (matches Csh), fp32 on scalar
|
||||
const bool conv2d_use_fp16_shmem = device->coopmat2 || conv2d_use_cm1;
|
||||
|
||||
// shrink CRS if the non-cm1 config still doesn't fit
|
||||
if (device->properties.limits.maxComputeSharedMemorySize < shmem_req(conv2d_SHMEM_PAD, conv2d_csh_store, conv2d_use_fp16_shmem)) {
|
||||
GGML_ASSERT(!conv2d_use_cm1);
|
||||
conv2d_BS.CRS = 8;
|
||||
if (use_collectives) {
|
||||
conv2d_BS.CRS = std::min(device->subgroup_size, conv2d_BS.CRS);
|
||||
}
|
||||
conv2d_csh_store = 0;
|
||||
}
|
||||
|
||||
std::array<uint32_t, 3> wg_denoms = { conv2d_BS.K, 1, 1 };
|
||||
std::vector<uint32_t> spec_constants = { conv2d_WG_SIZE, conv2d_BS.K, conv2d_BS.CRS, conv2d_BS.NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD };
|
||||
|
||||
// cm1 needs a fixed subgroup width to match the WG_SIZE we computed
|
||||
const uint32_t conv2d_required_subgroup_size = conv2d_use_cm1 ? device->subgroup_size : 0;
|
||||
|
||||
#define CREATE_CONV(name, type_suffix, spv_suffix) \
|
||||
for (auto &c : device->pipeline_##name##type_suffix[s]) { \
|
||||
const vk_conv2d_pipeline_state &state = c.first; \
|
||||
@@ -4997,10 +5061,14 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
spec_constants_cpy.push_back(state.d1); \
|
||||
spec_constants_cpy.push_back(state.KW); \
|
||||
spec_constants_cpy.push_back(state.KH); \
|
||||
spec_constants_cpy.push_back(state.aligned); \
|
||||
spec_constants_cpy.push_back(conv2d_csh_store); \
|
||||
spec_constants_cpy.push_back(conv2d_WM); \
|
||||
spec_constants_cpy.push_back(conv2d_WN); \
|
||||
ggml_vk_create_pipeline( \
|
||||
device, c.second, #name #type_suffix, \
|
||||
name##type_suffix##spv_suffix##_len, name##type_suffix##spv_suffix##_data, "main", 3, \
|
||||
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants_cpy, 1, true, use_collectives); \
|
||||
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants_cpy, 1, true, use_collectives || conv2d_required_subgroup_size, conv2d_required_subgroup_size); \
|
||||
}
|
||||
#define CREATE_CONVS(spv_suffix) \
|
||||
CREATE_CONV(conv2d, _f32, spv_suffix) \
|
||||
@@ -5011,6 +5079,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
if (device->coopmat2) {
|
||||
CREATE_CONVS(_cm2)
|
||||
} else
|
||||
#endif
|
||||
#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
if (conv2d_use_cm1) {
|
||||
CREATE_CONVS(_cm1)
|
||||
} else
|
||||
#endif
|
||||
if (conv2d_UNROLL) {
|
||||
CREATE_CONVS(_unroll)
|
||||
@@ -5768,8 +5841,12 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
|
||||
ggml_vk_load_shaders(device);
|
||||
|
||||
// Only use transfer queue on AMD non-GCN, when the graphics queue is not enabled
|
||||
const bool prefers_transfer_queue = device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != AMD_GCN && !allow_graphics_queue;
|
||||
// Prefer a dedicated transfer queue on AMD dGPUs (non-GCN) when graphics queue use is disabled.
|
||||
const bool prefers_transfer_queue =
|
||||
device->vendor_id == VK_VENDOR_ID_AMD &&
|
||||
device->architecture != AMD_GCN &&
|
||||
!device->uma &&
|
||||
!allow_graphics_queue;
|
||||
|
||||
if (!device->single_queue) {
|
||||
const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
|
||||
@@ -9473,10 +9550,23 @@ static vk_conv_shapes ggml_vk_conv_select_shape(ggml_backend_vk_context * ctx, u
|
||||
// so small convolutions will still choose a smaller tile.
|
||||
const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32;
|
||||
|
||||
if (K > 64 && n_tiles(CONV_SHAPE_128x128) >= shader_core_count * 2) {
|
||||
// 128x128 isn't used with cm1 due to shared memory size; fall through to a smaller tile.
|
||||
bool allow_128x128 = true;
|
||||
#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
if (!ctx->device->coopmat2 && ctx->device->coopmat_support && ctx->device->coopmat_support_16x16x16_f16acc) {
|
||||
allow_128x128 = false;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (allow_128x128 && K > 64 && n_tiles(CONV_SHAPE_128x128) >= shader_core_count * 2) {
|
||||
return CONV_SHAPE_128x128;
|
||||
} else if (K <= 32 && n_tiles(CONV_SHAPE_32x256) >= shader_core_count * 2) {
|
||||
return CONV_SHAPE_32x256;
|
||||
} else if (K <= 64 && n_tiles(CONV_SHAPE_64x128) >= shader_core_count * 2) {
|
||||
return CONV_SHAPE_64x128;
|
||||
} else if (!allow_128x128 && K > 64 && n_tiles(CONV_SHAPE_64x128) >= shader_core_count * 2) {
|
||||
// cm1 fallback for large K when 128x128 isn't available
|
||||
return CONV_SHAPE_64x128;
|
||||
} else {
|
||||
return CONV_SHAPE_64x32;
|
||||
}
|
||||
@@ -10008,7 +10098,18 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
uint32_t p1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 3) : 0;
|
||||
uint32_t d0 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 4) : 1;
|
||||
uint32_t d1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 5) : 1;
|
||||
vk_conv2d_pipeline_state conv2d_pipeline_state(s0, s1, p0, p1, d0, d1, KW, KH);
|
||||
|
||||
// tile-aligned shapes let the shader skip bounds checks
|
||||
const uint32_t Cin = (uint32_t)src1->ne[2];
|
||||
const uint32_t CRS = Cin * KW * KH;
|
||||
const uint32_t BS_K = vk_conv_block_sizes[shape].K;
|
||||
const uint32_t BS_CRS = vk_conv_block_sizes[shape].CRS;
|
||||
const uint32_t BS_NPQ = vk_conv_block_sizes[shape].NPQ;
|
||||
const uint32_t aligned = ((K % BS_K == 0) &&
|
||||
(CRS % BS_CRS == 0) &&
|
||||
(NPQ % BS_NPQ == 0)) ? 1u : 0u;
|
||||
|
||||
vk_conv2d_pipeline_state conv2d_pipeline_state(s0, s1, p0, p1, d0, d1, KW, KH, aligned);
|
||||
|
||||
std::map<vk_conv2d_pipeline_state, vk_pipeline> *pipelines = nullptr;
|
||||
if (op == GGML_OP_CONV_2D) {
|
||||
|
||||
@@ -7,6 +7,13 @@
|
||||
#extension GL_KHR_memory_scope_semantics : enable
|
||||
#endif
|
||||
|
||||
#ifdef COOPMAT
|
||||
#extension GL_KHR_cooperative_matrix : enable
|
||||
#extension GL_KHR_shader_subgroup_basic : enable
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
|
||||
#extension GL_KHR_memory_scope_semantics : enable
|
||||
#endif
|
||||
|
||||
#ifdef USE_COLLECTIVES
|
||||
# extension GL_KHR_shader_subgroup_shuffle : enable
|
||||
#endif
|
||||
@@ -77,6 +84,39 @@ layout(constant_id = 12) const uint d1 = 1;
|
||||
// Kernel spatial sizes
|
||||
layout(constant_id = 13) const uint KW = 1;
|
||||
layout(constant_id = 14) const uint KH = 1;
|
||||
// when set, skip bounds checks and address clamps (K/CRS/NPQ are tile-aligned)
|
||||
layout(constant_id = 15) const uint aligned = 0;
|
||||
// stage cm2 result through shmem (Csh) for coalesced stores. cm1 always does this.
|
||||
layout(constant_id = 16) const uint csh_store = 0;
|
||||
|
||||
#ifdef COOPMAT
|
||||
// cm1 subgroup tile: each subgroup computes a WM x WN region as a grid of
|
||||
// TM x TN x TK fragments. Requires WM%TM == WN%TN == BS_K%WM == BS_NPQ%WN ==
|
||||
// BS_CRS%TK == 0, and WG_SIZE == (BS_K/WM) * (BS_NPQ/WN) * subgroup_size.
|
||||
layout(constant_id = 17) const uint WM = 32;
|
||||
layout(constant_id = 18) const uint WN = 32;
|
||||
const uint TM = 16;
|
||||
const uint TN = 16;
|
||||
const uint TK = 16;
|
||||
const uint cms_per_row = WM / TM;
|
||||
const uint cms_per_col = WN / TN;
|
||||
const uint warps_M = BS_K / WM;
|
||||
const uint warps_N = BS_NPQ / WN;
|
||||
#endif
|
||||
|
||||
// without padding, H_idx/W_idx are in bounds by construction (non-TRANSPOSE only)
|
||||
#ifdef TRANSPOSE
|
||||
const bool hw_in_bounds = false;
|
||||
#else
|
||||
const bool hw_in_bounds = (p0 == 0) && (p1 == 0);
|
||||
#endif
|
||||
|
||||
// TRANSPOSE stride alignment is trivially satisfied for stride 1
|
||||
#ifdef TRANSPOSE
|
||||
const bool stride_in_bounds = (s0 == 1) && (s1 == 1);
|
||||
#else
|
||||
const bool stride_in_bounds = true;
|
||||
#endif
|
||||
|
||||
uint32_t tid = gl_LocalInvocationID.x;
|
||||
const uint32_t WG_SIZE = gl_WorkGroupSize.x;
|
||||
@@ -94,7 +134,7 @@ uint32_t n_elems_out = K * NPQ;
|
||||
// Number of blocktiles per input
|
||||
uint32_t NB_CRS = splitWork(CRS, BS_CRS);
|
||||
|
||||
#ifdef COOPMAT2
|
||||
#if defined(COOPMAT2) || defined(COOPMAT)
|
||||
#define SHMEM_TYPE float16_t
|
||||
#else
|
||||
#define SHMEM_TYPE float
|
||||
@@ -112,6 +152,17 @@ const uint32_t Bsh_len = BS_CRS * Bsh_stride;
|
||||
shared SHMEM_TYPE Ash[Ash_len]; // K x CRS
|
||||
shared SHMEM_TYPE Bsh[Bsh_len]; // CRS x NPQ
|
||||
|
||||
#if defined(COOPMAT2) || defined(COOPMAT)
|
||||
// stage matC through shmem so global stores are row-major (NPQ-contiguous)
|
||||
const uint32_t Csh_stride = BS_NPQ;
|
||||
#ifdef COOPMAT
|
||||
const uint32_t Csh_len = BS_K * Csh_stride;
|
||||
#else
|
||||
const uint32_t Csh_len = csh_store != 0 ? BS_K * Csh_stride : 1;
|
||||
#endif
|
||||
shared SHMEM_TYPE Csh[Csh_len]; // K x NPQ
|
||||
#endif
|
||||
|
||||
// Threadtile sizes
|
||||
const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K;
|
||||
|
||||
@@ -161,7 +212,7 @@ ACC_TYPE perElemOpStore(const in uint32_t r, const in uint32_t c, const in ACC_T
|
||||
uint32_t OH_idx = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL); // divide by p.OW;
|
||||
uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
|
||||
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
|
||||
if (K_idx < K && NPQ_idx < NPQ) {
|
||||
if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) {
|
||||
dst_data[dst_idx] = D_TYPE(elem);
|
||||
}
|
||||
return elem;
|
||||
@@ -176,6 +227,13 @@ void main() {
|
||||
#ifdef COOPMAT2
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator> matC;
|
||||
matC = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator>(0.0);
|
||||
#elif defined(COOPMAT)
|
||||
coopmat<float16_t, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> sums[cms_per_row * cms_per_col];
|
||||
[[unroll]] for (uint i = 0; i < cms_per_row * cms_per_col; i++) {
|
||||
sums[i] = coopmat<float16_t, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(0.0);
|
||||
}
|
||||
const uint warp_r = gl_SubgroupID / warps_N;
|
||||
const uint warp_c = gl_SubgroupID % warps_N;
|
||||
#else
|
||||
float regC[TS_K][TS_NPQ];
|
||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||
@@ -228,12 +286,15 @@ void main() {
|
||||
uint32_t B_lx = Ac;
|
||||
uint32_t K_idx = B_idx_K * BS_K + B_ly; /* Global K_idx (row index of A)*/
|
||||
#ifdef TRANSPOSE
|
||||
uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + K_idx * p.nb02 + Cin_idx_a * p.nb03, K * CRS - 1);
|
||||
uint32_t knl_idx = KW_idx_a + KH_idx_a * p.nb01 + K_idx * p.nb02 + Cin_idx_a * p.nb03;
|
||||
#else
|
||||
uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + Cin_idx_a * p.nb02 + K_idx * p.nb03, K * CRS - 1);
|
||||
uint32_t knl_idx = KW_idx_a + KH_idx_a * p.nb01 + Cin_idx_a * p.nb02 + K_idx * p.nb03;
|
||||
#endif
|
||||
if (aligned == 0) {
|
||||
knl_idx = min(knl_idx, K * CRS - 1);
|
||||
}
|
||||
float val = knl_data[knl_idx];
|
||||
if (K_idx >= K || CRS_idx_a >= CRS) {
|
||||
if (aligned == 0 && (K_idx >= K || CRS_idx_a >= CRS)) {
|
||||
val = 0.0;
|
||||
}
|
||||
Ash[B_ly * Ash_stride + B_lx] = SHMEM_TYPE(val);
|
||||
@@ -282,15 +343,27 @@ void main() {
|
||||
uint32_t H_idx = OH_idx * s1 + KH_idx_b * d1 - p1;
|
||||
uint32_t W_idx = OW_idx * s0 + KW_idx_b * d0 - p0;
|
||||
#endif
|
||||
uint32_t src_idx =
|
||||
min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1);
|
||||
uint32_t src_idx = W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13;
|
||||
// skip clamp when address can't go OOB
|
||||
if (aligned == 0 || !hw_in_bounds || !stride_in_bounds) {
|
||||
src_idx = min(max(src_idx, 0), p.Cin * p.N * p.W * p.H - 1);
|
||||
}
|
||||
float val = src_data[src_idx];
|
||||
if (CRS_idx_b >= CRS || NPQ_idx >= NPQ
|
||||
|| H_idx >= p.H || W_idx >= p.W // Lower bound checks aren't necessary. (idx >= 0x80000000 for such case)
|
||||
bool oob = false;
|
||||
if (aligned == 0 && (CRS_idx_b >= CRS || NPQ_idx >= NPQ)) {
|
||||
oob = true;
|
||||
}
|
||||
// also catches lower-bound underflow (idx wraps to 0x80000000+)
|
||||
if (!hw_in_bounds && (H_idx >= p.H || W_idx >= p.W)) {
|
||||
oob = true;
|
||||
}
|
||||
#ifdef TRANSPOSE
|
||||
|| (H_idx_x_s1 - H_idx * s1 != 0) || (W_idx_x_s0 - W_idx * s0 != 0)
|
||||
if (!stride_in_bounds &&
|
||||
((H_idx_x_s1 - H_idx * s1 != 0) || (W_idx_x_s0 - W_idx * s0 != 0))) {
|
||||
oob = true;
|
||||
}
|
||||
#endif
|
||||
) {
|
||||
if (oob) {
|
||||
val = 0.0;
|
||||
}
|
||||
Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val);
|
||||
@@ -303,6 +376,23 @@ void main() {
|
||||
coopMatLoad(matA, Ash, 0, Ash_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
coopMatLoad(matB, Bsh, 0, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
matC = coopMatMulAdd(matA, matB, matC);
|
||||
#elif defined(COOPMAT)
|
||||
// each subgroup multiplies its grid of fragments per TK-sized CRS chunk
|
||||
[[unroll]] for (uint k_step = 0; k_step < BS_CRS / TK; k_step++) {
|
||||
coopmat<float16_t, gl_ScopeSubgroup, TM, TK, gl_MatrixUseA> cache_a[cms_per_row];
|
||||
[[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
|
||||
const uint a_off = (warp_r * WM + cm_row * TM) * Ash_stride + k_step * TK;
|
||||
coopMatLoad(cache_a[cm_row], Ash, a_off, Ash_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
}
|
||||
[[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
|
||||
coopmat<float16_t, gl_ScopeSubgroup, TK, TN, gl_MatrixUseB> cache_b;
|
||||
const uint b_off = k_step * TK * Bsh_stride + warp_c * WN + cm_col * TN;
|
||||
coopMatLoad(cache_b, Bsh, b_off, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
[[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
|
||||
sums[cm_col * cms_per_row + cm_row] = coopMatMulAdd(cache_a[cm_row], cache_b, sums[cm_col * cms_per_row + cm_row]);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (T_y * TS_K < K) {
|
||||
UNROLL for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) {
|
||||
@@ -325,8 +415,51 @@ void main() {
|
||||
barrier();
|
||||
}
|
||||
/* Save C* */
|
||||
#if defined(COOPMAT2) || defined(COOPMAT)
|
||||
// stage matC into Csh, then write to dst with coalesced NPQ-contiguous stores
|
||||
#ifdef COOPMAT
|
||||
const bool use_staged_store = true;
|
||||
#else
|
||||
const bool use_staged_store = (csh_store != 0);
|
||||
#endif
|
||||
if (use_staged_store) {
|
||||
#ifdef COOPMAT
|
||||
// cm1: each subgroup stores its fragment grid into its Csh slot
|
||||
[[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
|
||||
[[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
|
||||
const uint csh_off = (warp_r * WM + cm_row * TM) * Csh_stride + warp_c * WN + cm_col * TN;
|
||||
coopMatStore(sums[cm_col * cms_per_row + cm_row], Csh, csh_off, Csh_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
}
|
||||
}
|
||||
#else
|
||||
coopMatStore(matC, Csh, 0, Csh_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
#endif
|
||||
barrier();
|
||||
|
||||
// cooperative shmem->global: WG threads spread across BS_NPQ (the
|
||||
// contiguous direction of dst), each iter covers store_rows_per_iter K-rows
|
||||
const uint32_t store_rows_per_iter = WG_SIZE / BS_NPQ;
|
||||
const uint32_t store_iters = BS_K / store_rows_per_iter;
|
||||
const uint32_t k_thread_offset = tid / BS_NPQ;
|
||||
const uint32_t npq_thread = tid % BS_NPQ;
|
||||
[[unroll]] for (uint32_t i = 0; i < store_iters; i++) {
|
||||
uint32_t k_local = i * store_rows_per_iter + k_thread_offset;
|
||||
uint32_t K_idx = B_idx_K * BS_K + k_local;
|
||||
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + npq_thread;
|
||||
uint32_t N_idx = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL);
|
||||
uint32_t OH_idx = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL);
|
||||
uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
|
||||
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
|
||||
if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) {
|
||||
dst_data[dst_idx] = D_TYPE(Csh[k_local * Csh_stride + npq_thread]);
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef COOPMAT2
|
||||
coopMatPerElementNV(matC, matC, perElemOpStore);
|
||||
else {
|
||||
coopMatPerElementNV(matC, matC, perElemOpStore);
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
if (T_y * TS_K < K) {
|
||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||
@@ -337,7 +470,7 @@ void main() {
|
||||
uint32_t OH_idx = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL); // divide by p.OW;
|
||||
uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
|
||||
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
|
||||
if (K_idx < K && NPQ_idx < NPQ) {
|
||||
if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) {
|
||||
dst_data[dst_idx] = regC[T_ly][T_lx];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -984,8 +984,16 @@ void process_shaders() {
|
||||
string_to_spv(name + (unroll ? "_unroll" : ""), "conv2d_mm.comp", defines);
|
||||
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
if (unroll) {
|
||||
defines["COOPMAT2"] = "1";
|
||||
string_to_spv(name, "conv2d_mm.comp", defines, true, false, true);
|
||||
auto cm2_defines = defines;
|
||||
cm2_defines["COOPMAT2"] = "1";
|
||||
string_to_spv(name, "conv2d_mm.comp", cm2_defines, true, false, true);
|
||||
}
|
||||
#endif
|
||||
#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
if (unroll) {
|
||||
auto cm1_defines = defines;
|
||||
cm1_defines["COOPMAT"] = "1";
|
||||
string_to_spv(name, "conv2d_mm.comp", cm1_defines, true, true, false);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -52,7 +52,7 @@
|
||||
#define WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG 4
|
||||
#define WEBGPU_MUL_MAT_VEC_K_Q_OUTPUTS_PER_WG 4
|
||||
|
||||
// default size for legacy matrix multiplication
|
||||
// default size for reg-tile matrix multiplication
|
||||
#define WEBGPU_MUL_MAT_WG_SIZE 256
|
||||
|
||||
// Same hash combine function as in boost
|
||||
@@ -93,6 +93,8 @@ struct ggml_webgpu_shader_lib_context {
|
||||
uint32_t sg_mat_k = 0;
|
||||
uint32_t min_subgroup_size = 0;
|
||||
uint32_t max_subgroup_size = 0;
|
||||
bool supports_dot_product = false;
|
||||
std::string vendor;
|
||||
};
|
||||
|
||||
struct webgpu_pipeline {
|
||||
@@ -850,31 +852,15 @@ inline ggml_webgpu_flash_attn_decisions ggml_webgpu_flash_attn_get_decisions(
|
||||
|
||||
/** Matrix Multiplication **/
|
||||
|
||||
struct ggml_webgpu_legacy_mul_mat_pipeline_key {
|
||||
ggml_type src0_type;
|
||||
ggml_type src1_type;
|
||||
|
||||
bool operator==(const ggml_webgpu_legacy_mul_mat_pipeline_key & other) const {
|
||||
return src0_type == other.src0_type && src1_type == other.src1_type;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_webgpu_legacy_mul_mat_pipeline_key_hash {
|
||||
size_t operator()(const ggml_webgpu_legacy_mul_mat_pipeline_key & key) const {
|
||||
size_t seed = 0;
|
||||
ggml_webgpu_hash_combine(seed, key.src0_type);
|
||||
ggml_webgpu_hash_combine(seed, key.src1_type);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_webgpu_mul_mat_vec_pipeline_key {
|
||||
ggml_type src0_type;
|
||||
ggml_type src1_type;
|
||||
int vectorized;
|
||||
bool use_mmvq;
|
||||
|
||||
bool operator==(const ggml_webgpu_mul_mat_vec_pipeline_key & other) const {
|
||||
return src0_type == other.src0_type && src1_type == other.src1_type && vectorized == other.vectorized;
|
||||
return src0_type == other.src0_type && src1_type == other.src1_type && vectorized == other.vectorized &&
|
||||
use_mmvq == other.use_mmvq;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -884,6 +870,7 @@ struct ggml_webgpu_mul_mat_vec_pipeline_key_hash {
|
||||
ggml_webgpu_hash_combine(seed, key.src0_type);
|
||||
ggml_webgpu_hash_combine(seed, key.src1_type);
|
||||
ggml_webgpu_hash_combine(seed, key.vectorized);
|
||||
ggml_webgpu_hash_combine(seed, key.use_mmvq);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
@@ -894,6 +881,20 @@ struct ggml_webgpu_mul_mat_vec_shader_decisions {
|
||||
uint32_t vec_size;
|
||||
};
|
||||
|
||||
struct ggml_webgpu_quantize_q8_pipeline_key {
|
||||
ggml_type src0_type;
|
||||
|
||||
bool operator==(const ggml_webgpu_quantize_q8_pipeline_key & other) const { return src0_type == other.src0_type; }
|
||||
};
|
||||
|
||||
struct ggml_webgpu_quantize_q8_pipeline_key_hash {
|
||||
size_t operator()(const ggml_webgpu_quantize_q8_pipeline_key & key) const {
|
||||
size_t seed = 0;
|
||||
ggml_webgpu_hash_combine(seed, key.src0_type);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_webgpu_mul_mat_pipeline_key {
|
||||
ggml_type src0_type;
|
||||
ggml_type src1_type;
|
||||
@@ -1051,6 +1052,36 @@ struct ggml_webgpu_soft_max_pipeline_key_hash {
|
||||
}
|
||||
};
|
||||
|
||||
/** MMVQ **/
|
||||
|
||||
inline bool ggml_webgpu_can_use_mmvq(const ggml_tensor * src0,
|
||||
const ggml_tensor * src1,
|
||||
bool supports_dot_product,
|
||||
const std::string & vendor) {
|
||||
if (src1->ne[1] == 1) {
|
||||
bool supports_dp4a = vendor == "amd" || vendor == "intel" || vendor == "nvidia";
|
||||
if (supports_dp4a && supports_dot_product) {
|
||||
switch (src1->type) {
|
||||
case GGML_TYPE_F32:
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
return src0->ne[0] % 4 == 0;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
class ggml_webgpu_shader_lib {
|
||||
wgpu::Device device;
|
||||
pre_wgsl::Preprocessor preprocessor;
|
||||
@@ -1099,14 +1130,12 @@ class ggml_webgpu_shader_lib {
|
||||
webgpu_pipeline,
|
||||
ggml_webgpu_flash_attn_blk_pipeline_key_hash>
|
||||
flash_attn_blk_pipelines;
|
||||
std::unordered_map<ggml_webgpu_legacy_mul_mat_pipeline_key,
|
||||
webgpu_pipeline,
|
||||
ggml_webgpu_legacy_mul_mat_pipeline_key_hash>
|
||||
mul_mat_legacy_pipelines; // legacy mul_mat (non-subgroup/non-regtile/non-vec)
|
||||
std::unordered_map<ggml_webgpu_mul_mat_vec_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_vec_pipeline_key_hash>
|
||||
mul_mat_vec_pipelines; // fast mat-vec (n==1)
|
||||
std::unordered_map<ggml_webgpu_mul_mat_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_pipeline_key_hash>
|
||||
mul_mat_fast_pipelines; // fast mat-mat (reg-tile or subgroup)
|
||||
std::unordered_map<ggml_webgpu_quantize_q8_pipeline_key, webgpu_pipeline, ggml_webgpu_quantize_q8_pipeline_key_hash>
|
||||
quantize_q8_pipelines;
|
||||
std::unordered_map<int, webgpu_pipeline> mul_mat_id_gather_pipelines; // key is fixed
|
||||
std::unordered_map<ggml_webgpu_mul_mat_id_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_id_pipeline_key_hash>
|
||||
mul_mat_id_pipelines; // src0_type/src1_type
|
||||
@@ -1631,7 +1660,7 @@ class ggml_webgpu_shader_lib {
|
||||
key.type = context.dst->type;
|
||||
key.d_state = (int) context.src0->ne[0];
|
||||
key.xbc_overlap = ggml_webgpu_tensor_overlap(context.src1, context.src4) &&
|
||||
ggml_webgpu_tensor_overlap(context.src1, context.src5);
|
||||
ggml_webgpu_tensor_overlap(context.src1, context.src5);
|
||||
|
||||
auto it = ssm_scan_pipelines.find(key);
|
||||
if (it != ssm_scan_pipelines.end()) {
|
||||
@@ -1744,6 +1773,44 @@ class ggml_webgpu_shader_lib {
|
||||
return pad_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_quantize_q8_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_quantize_q8_pipeline_key key = {};
|
||||
key.src0_type = context.src0->type;
|
||||
|
||||
auto it = quantize_q8_pipelines.find(key);
|
||||
if (it != quantize_q8_pipelines.end()) {
|
||||
return it->second;
|
||||
}
|
||||
const char * shader_src = wgsl_quantize_q8;
|
||||
std::vector<std::string> defines;
|
||||
std::string variant = "quantize_q8";
|
||||
|
||||
uint32_t wg_size = WEBGPU_MUL_MAT_VEC_WG_SIZE;
|
||||
|
||||
defines.push_back("SRC1_INNER_TYPE=f32");
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
|
||||
|
||||
const struct ggml_type_traits * src0_traits = ggml_get_type_traits(context.src0->type);
|
||||
std::string src0_name = src0_traits->type_name;
|
||||
std::string type_upper = src0_name;
|
||||
variant += "_" + src0_name;
|
||||
std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper);
|
||||
|
||||
defines.push_back("MUL_ACC_" + type_upper);
|
||||
defines.push_back("Q8_1_T");
|
||||
|
||||
defines.push_back(context.supports_subgroups ? "USE_SUBGROUP_REDUCTION" : "USE_WORKGROUP_REDUCTION");
|
||||
variant += context.supports_subgroups ? "_sg_reduce" : "_wg_reduce";
|
||||
|
||||
auto processed = preprocessor.preprocess(shader_src, defines);
|
||||
auto decisions = std::make_shared<ggml_webgpu_generic_shader_decisions>();
|
||||
decisions->wg_size = wg_size;
|
||||
webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant);
|
||||
pipeline.context = decisions;
|
||||
quantize_q8_pipelines[key] = pipeline;
|
||||
return quantize_q8_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_mul_mat_vec_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_mul_mat_vec_pipeline_key key = {};
|
||||
key.src0_type = context.src0->type;
|
||||
@@ -1752,6 +1819,8 @@ class ggml_webgpu_shader_lib {
|
||||
(context.src0->type == GGML_TYPE_F32 || context.src0->type == GGML_TYPE_F16)) ?
|
||||
1 :
|
||||
0;
|
||||
key.use_mmvq =
|
||||
ggml_webgpu_can_use_mmvq(context.src0, context.src1, context.supports_dot_product, context.vendor);
|
||||
|
||||
auto it = mul_mat_vec_pipelines.find(key);
|
||||
if (it != mul_mat_vec_pipelines.end()) {
|
||||
@@ -1788,6 +1857,19 @@ class ggml_webgpu_shader_lib {
|
||||
defines.push_back("U32_DEQUANT_HELPERS");
|
||||
defines.push_back("SRC0_INNER_TYPE=u32");
|
||||
switch (context.src0->type) {
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
if (key.use_mmvq) {
|
||||
defines.push_back("LEGACY_QUANTS");
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
if (key.use_mmvq) {
|
||||
defines.push_back("K_QUANTS");
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ1_M:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
@@ -1840,6 +1922,11 @@ class ggml_webgpu_shader_lib {
|
||||
outputs_per_wg = WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG;
|
||||
}
|
||||
|
||||
if (key.use_mmvq) {
|
||||
defines.push_back("MMVQ");
|
||||
defines.push_back("Q8_1_T");
|
||||
}
|
||||
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
|
||||
defines.push_back(std::string("OUTPUTS_PER_WG=") + std::to_string(outputs_per_wg));
|
||||
defines.push_back(context.supports_subgroups ? "USE_SUBGROUP_REDUCTION" : "USE_WORKGROUP_REDUCTION");
|
||||
@@ -2018,100 +2105,6 @@ class ggml_webgpu_shader_lib {
|
||||
return mul_mat_fast_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_mul_mat_legacy_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_legacy_mul_mat_pipeline_key key = {};
|
||||
key.src0_type = context.src0->type;
|
||||
key.src1_type = context.src1->type;
|
||||
|
||||
auto it = mul_mat_legacy_pipelines.find(key);
|
||||
if (it != mul_mat_legacy_pipelines.end()) {
|
||||
return it->second;
|
||||
}
|
||||
|
||||
std::vector<std::string> defines;
|
||||
std::string variant = "mul_mat";
|
||||
|
||||
switch (context.src1->type) {
|
||||
case GGML_TYPE_F32:
|
||||
defines.push_back("SRC1_TYPE=f32");
|
||||
variant += "_f32";
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
defines.push_back("SRC1_TYPE=f16");
|
||||
variant += "_f16";
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("Unsupported src1 type for mul_mat legacy shader");
|
||||
}
|
||||
|
||||
const struct ggml_type_traits * src0_traits = ggml_get_type_traits(context.src0->type);
|
||||
const char * src0_name = src0_traits->type_name;
|
||||
|
||||
switch (context.src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
defines.push_back("SRC0_TYPE=f32");
|
||||
defines.push_back("FLOAT");
|
||||
variant += "_f32";
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
defines.push_back("SRC0_TYPE=f16");
|
||||
defines.push_back("FLOAT");
|
||||
variant += "_f16";
|
||||
break;
|
||||
default:
|
||||
{
|
||||
std::string type_upper = src0_name;
|
||||
std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper);
|
||||
|
||||
switch (context.src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_MXFP4:
|
||||
{
|
||||
// Quantized types using u32 buffers for portability.
|
||||
defines.push_back("SRC0_TYPE=u32");
|
||||
defines.push_back("U32_DEQUANT_HELPERS");
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
defines.push_back(std::string("SRC0_TYPE=") + src0_name);
|
||||
}
|
||||
}
|
||||
|
||||
defines.push_back("BYTE_HELPERS");
|
||||
defines.push_back(type_upper + "_T");
|
||||
defines.push_back(type_upper);
|
||||
defines.push_back(type_upper + "_SCALE_MIN");
|
||||
defines.push_back(type_upper + "_TABLES");
|
||||
defines.push_back(type_upper + "_GRID");
|
||||
|
||||
variant += std::string("_") + src0_name;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
auto processed = preprocessor.preprocess(wgsl_mul_mat, defines);
|
||||
|
||||
auto decisions = std::make_shared<ggml_webgpu_generic_shader_decisions>();
|
||||
decisions->wg_size = WEBGPU_MUL_MAT_WG_SIZE;
|
||||
|
||||
webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant);
|
||||
pipeline.context = decisions;
|
||||
mul_mat_legacy_pipelines[key] = pipeline;
|
||||
return mul_mat_legacy_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_mul_mat_id_gather_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
auto it = mul_mat_id_gather_pipelines.find(1);
|
||||
if (it != mul_mat_id_gather_pipelines.end()) {
|
||||
|
||||
@@ -181,6 +181,7 @@ struct webgpu_capabilities {
|
||||
wgpu::Limits limits;
|
||||
bool supports_subgroups = false;
|
||||
bool supports_subgroup_matrix = false;
|
||||
bool supports_dot_product = false;
|
||||
|
||||
uint32_t sg_mat_m = 0;
|
||||
uint32_t sg_mat_n = 0;
|
||||
@@ -210,6 +211,8 @@ struct webgpu_global_context_struct {
|
||||
wgpu::Buffer memset_params_buf;
|
||||
webgpu_pipeline memset_pipeline;
|
||||
|
||||
std::string vendor;
|
||||
|
||||
// TODO: We should rework the CPU profiling time handling to make it more useful. ref: https://github.com/ggml-org/llama.cpp/pull/22050
|
||||
#ifdef GGML_WEBGPU_CPU_PROFILE
|
||||
// Profiling: labeled CPU time in ms (total)
|
||||
@@ -259,6 +262,7 @@ struct webgpu_context_struct {
|
||||
wgpu::Buffer set_rows_host_error_buf;
|
||||
wgpu::CommandEncoder active_command_encoder;
|
||||
wgpu::ComputePassEncoder active_compute_pass;
|
||||
bool batch_compute_passes = true;
|
||||
|
||||
size_t memset_bytes_per_thread;
|
||||
|
||||
@@ -590,9 +594,18 @@ static webgpu_encoded_op ggml_backend_webgpu_build_multi(webgpu_context &
|
||||
}
|
||||
#else
|
||||
for (size_t i = 0; i < dispatches.size(); i++) {
|
||||
ctx->active_compute_pass.SetPipeline(dispatches[i].pipeline.pipeline);
|
||||
ctx->active_compute_pass.SetBindGroup(0, bind_groups[i]);
|
||||
ctx->active_compute_pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second, 1);
|
||||
if (ctx->batch_compute_passes) {
|
||||
ctx->active_compute_pass.SetPipeline(dispatches[i].pipeline.pipeline);
|
||||
ctx->active_compute_pass.SetBindGroup(0, bind_groups[i]);
|
||||
ctx->active_compute_pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second,
|
||||
1);
|
||||
} else {
|
||||
wgpu::ComputePassEncoder pass = ctx->active_command_encoder.BeginComputePass();
|
||||
pass.SetPipeline(dispatches[i].pipeline.pipeline);
|
||||
pass.SetBindGroup(0, bind_groups[i]);
|
||||
pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second, 1);
|
||||
pass.End();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1384,6 +1397,58 @@ static webgpu_encoded_op ggml_webgpu_get_rows(webgpu_context & ctx,
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
|
||||
}
|
||||
|
||||
static void ggml_webgpu_quantize_q8_dispatch(webgpu_context & ctx,
|
||||
ggml_tensor * src0,
|
||||
ggml_tensor * src1,
|
||||
ggml_tensor * dst,
|
||||
std::vector<webgpu_dispatch_desc> & dispatches) {
|
||||
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
|
||||
|
||||
shader_lib_ctx.src0 = src0;
|
||||
shader_lib_ctx.src1 = src1;
|
||||
shader_lib_ctx.dst = dst;
|
||||
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
|
||||
shader_lib_ctx.supports_subgroups = ctx->global_ctx->capabilities.supports_subgroups;
|
||||
|
||||
webgpu_pipeline qq8_pipeline = ctx->shader_lib->get_quantize_q8_pipeline(shader_lib_ctx);
|
||||
|
||||
// quantize_q8 pipeline
|
||||
const size_t dst_offset = ggml_webgpu_tensor_offset(dst);
|
||||
const size_t q8_src1_align_offset = ROUNDUP_POW2(
|
||||
dst_offset + ggml_nbytes(dst), ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment);
|
||||
const size_t q8_src1_binding_size =
|
||||
ROUNDUP_POW2(src1->ne[3] * src1->ne[2] * (36 /* sizeof(q8_1) */ * (src1->ne[0] / /* block_size */ 32)),
|
||||
WEBGPU_STORAGE_BUF_BINDING_MULT);
|
||||
|
||||
std::vector<uint32_t> q8_params = {
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
|
||||
(uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
|
||||
(uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),
|
||||
(uint32_t) src1->ne[0],
|
||||
(uint32_t) src1->ne[2],
|
||||
(uint32_t) src1->ne[3],
|
||||
};
|
||||
|
||||
std::vector<wgpu::BindGroupEntry> q8_entries = {
|
||||
ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src1),
|
||||
ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(dst), q8_src1_align_offset, q8_src1_binding_size)
|
||||
};
|
||||
|
||||
auto q8_decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(qq8_pipeline.context.get());
|
||||
|
||||
uint32_t q8_wg_size = q8_decisions->wg_size;
|
||||
uint32_t q8_wg_x = 1;
|
||||
uint32_t q8_wg_y = 1;
|
||||
const uint32_t wg_per_vec = (src0->ne[0] / 4 + (q8_wg_size - 1)) / q8_wg_size;
|
||||
const uint32_t q8_total_wg = src1->ne[2] * src1->ne[3] * wg_per_vec;
|
||||
const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
|
||||
compute_2d_workgroups(q8_total_wg, max_wg_per_dim, q8_wg_x, q8_wg_y);
|
||||
|
||||
dispatches.push_back({
|
||||
qq8_pipeline, std::move(q8_params), std::move(q8_entries), { q8_wg_x, q8_wg_y }
|
||||
});
|
||||
}
|
||||
|
||||
static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
|
||||
ggml_tensor * src0,
|
||||
ggml_tensor * src1,
|
||||
@@ -1391,47 +1456,9 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
|
||||
// Determine if this is a mat-vec operation
|
||||
bool is_vec = (dst->ne[1] == 1);
|
||||
|
||||
// Determine if we should use fast path
|
||||
bool use_fast = false;
|
||||
switch (src1->type) {
|
||||
case GGML_TYPE_F16:
|
||||
use_fast = (src0->type == GGML_TYPE_F16);
|
||||
break;
|
||||
case GGML_TYPE_F32:
|
||||
// TODO: implement better mat-mat for k-quants, mat-vec for all k-quants except q6_K
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ1_M:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_MXFP4:
|
||||
use_fast = true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
// use MMVQ path for mat-vec
|
||||
bool use_mmvq = ggml_webgpu_can_use_mmvq(src0, src1, ctx->global_ctx->capabilities.supports_dot_product,
|
||||
ctx->global_ctx->vendor);
|
||||
|
||||
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
|
||||
|
||||
@@ -1446,16 +1473,20 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
|
||||
shader_lib_ctx.sg_mat_k = ctx->global_ctx->capabilities.sg_mat_k;
|
||||
shader_lib_ctx.min_subgroup_size = ctx->global_ctx->capabilities.min_subgroup_size;
|
||||
shader_lib_ctx.max_subgroup_size = ctx->global_ctx->capabilities.max_subgroup_size;
|
||||
shader_lib_ctx.supports_dot_product = ctx->global_ctx->capabilities.supports_dot_product;
|
||||
shader_lib_ctx.vendor = ctx->global_ctx->vendor;
|
||||
|
||||
// Get or create pipeline
|
||||
webgpu_pipeline pipeline;
|
||||
webgpu_pipeline pipeline;
|
||||
std::vector<webgpu_dispatch_desc> dispatches;
|
||||
|
||||
if (use_fast && is_vec) {
|
||||
if (is_vec) {
|
||||
if (use_mmvq) {
|
||||
ggml_webgpu_quantize_q8_dispatch(ctx, src0, src1, dst, dispatches);
|
||||
}
|
||||
pipeline = ctx->shader_lib->get_mul_mat_vec_pipeline(shader_lib_ctx);
|
||||
} else if (use_fast) {
|
||||
pipeline = ctx->shader_lib->get_mul_mat_fast_pipeline(shader_lib_ctx);
|
||||
} else {
|
||||
pipeline = ctx->shader_lib->get_mul_mat_legacy_pipeline(shader_lib_ctx);
|
||||
pipeline = ctx->shader_lib->get_mul_mat_fast_pipeline(shader_lib_ctx);
|
||||
}
|
||||
|
||||
// Build params
|
||||
@@ -1479,25 +1510,31 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
|
||||
};
|
||||
|
||||
// Build bind group entries
|
||||
std::vector<wgpu::BindGroupEntry> entries = {
|
||||
ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0),
|
||||
ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1),
|
||||
ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst),
|
||||
};
|
||||
std::vector<wgpu::BindGroupEntry> entries = {};
|
||||
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0));
|
||||
if (use_mmvq) {
|
||||
auto & mmvq_qq8_entry = dispatches[0].bind_group_entries[1];
|
||||
entries.push_back(ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(dst), mmvq_qq8_entry.offset,
|
||||
mmvq_qq8_entry.size));
|
||||
} else {
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1));
|
||||
}
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst));
|
||||
|
||||
// Calculate workgroup dimensions
|
||||
uint32_t wg_x = 1;
|
||||
uint32_t wg_y = 1;
|
||||
const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
|
||||
|
||||
if (use_fast && is_vec) {
|
||||
if (is_vec) {
|
||||
auto * decisions = static_cast<ggml_webgpu_mul_mat_vec_shader_decisions *>(pipeline.context.get());
|
||||
|
||||
uint32_t batches = dst->ne[2] * dst->ne[3];
|
||||
uint32_t output_groups = CEIL_DIV(dst->ne[0], decisions->outputs_per_wg);
|
||||
uint32_t total_wg = output_groups * batches;
|
||||
compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
|
||||
} else if (use_fast) {
|
||||
} else {
|
||||
auto * decisions = static_cast<ggml_webgpu_mul_mat_shader_decisions *>(pipeline.context.get());
|
||||
|
||||
// Fast-path tiled/subgroup calculations
|
||||
@@ -1518,15 +1555,13 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
|
||||
}
|
||||
uint32_t total_wg = wg_m * wg_n * dst->ne[2] * dst->ne[3];
|
||||
compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
|
||||
|
||||
} else { // legacy
|
||||
auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
|
||||
uint32_t wg_size = decisions->wg_size;
|
||||
uint32_t total_wg = CEIL_DIV(dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3], wg_size);
|
||||
compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
|
||||
}
|
||||
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
|
||||
dispatches.push_back({
|
||||
pipeline, std::move(params), std::move(entries), { wg_x, wg_y }
|
||||
});
|
||||
|
||||
return ggml_backend_webgpu_build_multi(ctx, dispatches);
|
||||
}
|
||||
|
||||
static webgpu_encoded_op ggml_webgpu_mul_mat_id_vec(webgpu_context & ctx,
|
||||
@@ -1956,10 +1991,10 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
|
||||
std::vector<wgpu::BindGroupEntry> reduce_entries;
|
||||
if (use_vec_reduce) {
|
||||
const uint32_t reduce_sg_size = ctx->global_ctx->capabilities.max_subgroup_size;
|
||||
const uint32_t reduce_wg_size =
|
||||
std::max(reduce_sg_size, (uint32_t) std::min<uint64_t>(
|
||||
(uint64_t) nwg * reduce_sg_size,
|
||||
ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup));
|
||||
const uint32_t reduce_wg_size = std::max(
|
||||
reduce_sg_size,
|
||||
(uint32_t) std::min<uint64_t>((uint64_t) nwg * reduce_sg_size,
|
||||
ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup));
|
||||
ggml_webgpu_shader_lib_context reduce_shader_ctx = shader_lib_ctx;
|
||||
reduce_shader_ctx.max_wg_size = reduce_wg_size;
|
||||
reduce_pipeline = ctx->shader_lib->get_flash_attn_vec_reduce_pipeline(reduce_shader_ctx);
|
||||
@@ -3110,18 +3145,16 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
|
||||
uint32_t num_batched_kernels = 0;
|
||||
uint32_t num_inflight_batches = 0;
|
||||
bool contains_set_rows = false;
|
||||
bool batch_compute_passes = true;
|
||||
int num_encoded_ops = 1;
|
||||
int node_idx = 0;
|
||||
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
ctx->profile_timestamp_query_count = 0;
|
||||
batch_compute_passes = false;
|
||||
std::vector<std::string> profile_pipeline_names;
|
||||
#endif
|
||||
|
||||
ctx->active_command_encoder = ctx->global_ctx->device.CreateCommandEncoder();
|
||||
if (batch_compute_passes) {
|
||||
if (ctx->batch_compute_passes) {
|
||||
ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass();
|
||||
}
|
||||
|
||||
@@ -3148,7 +3181,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
|
||||
|
||||
// reset state for next batch
|
||||
ctx->active_command_encoder = ctx->global_ctx->device.CreateCommandEncoder();
|
||||
if (batch_compute_passes) {
|
||||
if (ctx->batch_compute_passes) {
|
||||
ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass();
|
||||
}
|
||||
ctx->param_arena.reset();
|
||||
@@ -3548,8 +3581,8 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer
|
||||
const uint32_t kv_tile = decisions.kv_tile;
|
||||
|
||||
const uint32_t vec_nwg_cap = ctx->webgpu_global_ctx->capabilities.min_subgroup_size;
|
||||
uint32_t nwg = 1u;
|
||||
const uint64_t kv_span = (uint64_t) std::max(1u, kv_tile);
|
||||
uint32_t nwg = 1u;
|
||||
const uint64_t kv_span = (uint64_t) std::max(1u, kv_tile);
|
||||
while ((2u * nwg * kv_span) < (uint64_t) K->ne[1] && nwg < vec_nwg_cap) {
|
||||
nwg <<= 1;
|
||||
}
|
||||
@@ -3582,6 +3615,22 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer
|
||||
}
|
||||
}
|
||||
break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
{
|
||||
const ggml_tensor * src0 = tensor->src[0];
|
||||
const ggml_tensor * src1 = tensor->src[1];
|
||||
bool use_mmvq =
|
||||
ggml_webgpu_can_use_mmvq(src0, src1, ctx->webgpu_global_ctx->capabilities.supports_dot_product,
|
||||
ctx->webgpu_global_ctx->vendor);
|
||||
if (use_mmvq) {
|
||||
const size_t q8_src1_size =
|
||||
src1->ne[3] * src1->ne[2] * (36 /* sizeof(q8_1) */ * (src1->ne[0] / /* block_size */ 32));
|
||||
res = ROUNDUP_POW2(res + q8_src1_size +
|
||||
ctx->webgpu_global_ctx->capabilities.limits.minStorageBufferOffsetAlignment,
|
||||
WEBGPU_STORAGE_BUF_BINDING_MULT);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
{
|
||||
const ggml_tensor * src0 = tensor->src[0];
|
||||
@@ -3707,12 +3756,16 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
|
||||
ctx->webgpu_global_ctx->adapter.GetInfo(&info);
|
||||
ctx->webgpu_global_ctx->command_submit_batch_size = ggml_backend_webgpu_get_command_submit_batch_size();
|
||||
ctx->webgpu_global_ctx->max_inflight_batches = ggml_backend_webgpu_get_max_inflight_batches();
|
||||
ctx->webgpu_global_ctx->vendor = info.vendor;
|
||||
wgpu::SupportedFeatures features;
|
||||
ctx->webgpu_global_ctx->adapter.GetFeatures(&features);
|
||||
// we require f16 support
|
||||
GGML_ASSERT(ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16));
|
||||
ctx->webgpu_global_ctx->capabilities.supports_subgroups =
|
||||
ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::Subgroups);
|
||||
// for dot4I8packed
|
||||
ctx->webgpu_global_ctx->capabilities.supports_dot_product = ctx->webgpu_global_ctx->instance.HasWGSLLanguageFeature(
|
||||
wgpu::WGSLLanguageFeatureName::Packed4x8IntegerDotProduct);
|
||||
|
||||
bool valid_subgroup_matrix_config = false;
|
||||
#ifndef __EMSCRIPTEN__
|
||||
@@ -3839,6 +3892,7 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) {
|
||||
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "set_rows_host_error_buf");
|
||||
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
webgpu_ctx->batch_compute_passes = false;
|
||||
ggml_webgpu_create_buffer(
|
||||
webgpu_ctx->global_ctx->device, webgpu_ctx->profile_timestamp_dev_buf, WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES,
|
||||
wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc, "profile_timestamp_dev_buf");
|
||||
|
||||
@@ -95,11 +95,10 @@ struct q5_1 {
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef Q8_1_T
|
||||
struct q8_1 {
|
||||
d: f16,
|
||||
m: f16,
|
||||
s: f16, // d * sum(qs[i])
|
||||
qs: array<u32, 8>
|
||||
};
|
||||
#endif
|
||||
|
||||
@@ -1,747 +0,0 @@
|
||||
enable f16;
|
||||
|
||||
#define DECLARE_BYTE_LOADERS_SRC0
|
||||
#include "common_decls.tmpl"
|
||||
|
||||
|
||||
#ifdef FLOAT
|
||||
const BLOCK_SIZE = 1u;
|
||||
|
||||
#elif defined(Q4_0) || defined(Q4_1) || defined(Q5_0) || defined(Q5_1) || defined(Q8_0) || defined(Q8_1) || defined(IQ4_NL)
|
||||
const BLOCK_SIZE = 32u;
|
||||
|
||||
#elif defined(Q2_K) || defined(Q3_K) || defined(Q4_K) || defined(Q5_K) || defined(Q6_K) || defined(IQ2_XXS) || defined(IQ2_XS) || defined(IQ2_S) || defined(IQ3_XXS) || defined(IQ3_S) || defined(IQ1_S) || defined(IQ1_M) || defined(IQ4_XS)
|
||||
const BLOCK_SIZE = 256u;
|
||||
#endif
|
||||
|
||||
#ifdef FLOAT
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
return f32(src0[src0_idx_base + offset]) * f32(src1[src1_idx_base + offset]);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef Q4_0
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block_byte_base = (src0_idx_base + offset) * 18; // Block stride: 18 bytes
|
||||
let d = load_f16_as_f32_at_src0(block_byte_base);
|
||||
var sum: f32 = 0.0;
|
||||
for (var j: u32 = 0; j < 4; j++) {
|
||||
let q_byte_offset = block_byte_base + 2 + j * 4;
|
||||
let q_packed = load_u32_at_src0(q_byte_offset);
|
||||
for (var k: u32 = 0; k < 4; k++) {
|
||||
let q_byte = get_byte(q_packed, k);
|
||||
let q_hi = (f32((q_byte >> 4) & 0xF) - 8.0f) * d;
|
||||
let q_lo = (f32(q_byte & 0xF) - 8.0f) * d;
|
||||
let src1_offset = src1_idx_base + offset * 32 + j * 4 + k;
|
||||
sum += q_lo * f32(src1[src1_offset]);
|
||||
sum += q_hi * f32(src1[src1_offset + 16]);
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef Q4_1
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block_q4_1 = src0[src0_idx_base + offset];
|
||||
let d = f32(block_q4_1.d);
|
||||
let m = f32(block_q4_1.m);
|
||||
var sum: f32 = 0.0;
|
||||
for (var j: u32 = 0; j < 4; j++) {
|
||||
let q_packed = block_q4_1.qs[j];
|
||||
for (var k: u32 = 0; k < 4; k++) {
|
||||
let q_byte = get_byte(q_packed, k);
|
||||
let q_hi = f32((q_byte >> 4) & 0xF) * d + m;
|
||||
let q_lo = f32(q_byte & 0xF) * d + m;
|
||||
let src1_offset = src1_idx_base + offset * 32 + j * 4 + k;
|
||||
sum += q_lo * f32(src1[src1_offset]);
|
||||
sum += q_hi * f32(src1[src1_offset + 16]);
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef Q5_0
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block_byte_base = (src0_idx_base + offset) * 22; // Block stride: 22 bytes
|
||||
let d = load_f16_as_f32_at_src0(block_byte_base);
|
||||
var sum: f32 = 0.0;
|
||||
let qh_packed = load_u32_at_src0(block_byte_base + 2);
|
||||
for (var j: u32 = 0; j < 4; j++) {
|
||||
let q_byte_offset = block_byte_base + 6 + j * 4;
|
||||
let q_packed = load_u32_at_src0(q_byte_offset);
|
||||
for (var k: u32 = 0; k < 4; k++) {
|
||||
let q_byte = get_byte(q_packed, k);
|
||||
let qh_hi = (qh_packed >> (j * 4 + k + 12)) & 0x10;
|
||||
let q_hi = (f32(((q_byte >> 4) & 0xF) | qh_hi) - 16.0) * d;
|
||||
let qh_lo = ((qh_packed >> (j * 4 + k)) << 4) & 0x10;
|
||||
let q_lo = (f32((q_byte & 0xF) | qh_lo) - 16.0) * d;
|
||||
let src1_offset = src1_idx_base + offset * 32 + j * 4 + k;
|
||||
sum += q_lo * f32(src1[src1_offset]);
|
||||
sum += q_hi * f32(src1[src1_offset + 16]);
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef Q5_1
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block_q5_1 = src0[src0_idx_base + offset];
|
||||
let d = f32(block_q5_1.d);
|
||||
let m = f32(block_q5_1.m);
|
||||
var sum: f32 = 0.0;
|
||||
for (var j: u32 = 0; j < 4; j++) {
|
||||
let q_packed = block_q5_1.qs[j];
|
||||
for (var k: u32 = 0; k < 4; k++) {
|
||||
let q_byte = get_byte(q_packed, k);
|
||||
let qh_hi = (block_q5_1.qh >> (j * 4 + k + 12)) & 0x10;
|
||||
let q_hi = f32(((q_byte >> 4) & 0xF) | qh_hi) * d + m;
|
||||
let qh_lo = ((block_q5_1.qh >> (j * 4 + k)) << 4) & 0x10;
|
||||
let q_lo = f32((q_byte & 0xF) | qh_lo) * d + m;
|
||||
let src1_offset = src1_idx_base + offset * 32 + j * 4 + k;
|
||||
sum += q_lo * f32(src1[src1_offset]);
|
||||
sum += q_hi * f32(src1[src1_offset + 16]);
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef Q8_0
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block_byte_base = (src0_idx_base + offset) * 34; // Block stride: 34 bytes
|
||||
let d = load_f16_as_f32_at_src0(block_byte_base);
|
||||
var sum: f32 = 0.0;
|
||||
for (var j: u32 = 0; j < 8; j++) {
|
||||
let q_byte_offset = block_byte_base + 2 + j * 4;
|
||||
let q_packed = load_u32_at_src0(q_byte_offset);
|
||||
for (var k: u32 = 0u; k < 4u; k++) {
|
||||
let q_byte = get_byte_i32(q_packed, k);
|
||||
let q_val = f32(q_byte) * d;
|
||||
let src1_offset = src1_idx_base + offset * 32 + j * 4 + k;
|
||||
sum += q_val * f32(src1[src1_offset]);
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef Q8_1
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block_q8_1 = src0[src0_idx_base + offset];
|
||||
let d = f32(block_q8_1.d);
|
||||
let m = f32(block_q8_1.m);
|
||||
var sum: f32 = 0.0;
|
||||
for (var j: u32 = 0; j < 8; j++) {
|
||||
let q_packed = block_q8_1.qs[j];
|
||||
for (var k: u32 = 0; k < 4; k++) {
|
||||
let q_byte = get_byte_i32(q_packed, k);
|
||||
let q_val = f32(q_byte) * d + m;
|
||||
let src1_offset = src1_idx_base + offset * 32 + j * 4 + k;
|
||||
sum += q_val * f32(src1[src1_offset]);
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef Q2_K
|
||||
// 16 blocks of 16 elements each
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block = src0[src0_idx_base + offset];
|
||||
let d = f32(block.d);
|
||||
let m = f32(block.dmin);
|
||||
var sum = 0.0;
|
||||
var src1_i = src1_idx_base + offset * 256;
|
||||
var is: u32 = 0;
|
||||
// 2 halves of the block (128 elements each)
|
||||
for (var q_b_idx: u32 = 0; q_b_idx < 64; q_b_idx += 32) {
|
||||
// 4 groups (each group has 2 blocks of 16 elements)
|
||||
for (var shift: u32 = 0; shift < 8; shift += 2) {
|
||||
// 2 blocks
|
||||
for (var k: u32 = 0; k < 32; k += 16) {
|
||||
let sc = get_byte(block.scales[is / 4], is % 4);
|
||||
is++;
|
||||
let dl = d * f32(sc & 0xF);
|
||||
let ml = m * f32(sc >> 4);
|
||||
for (var l: u32 = 0u; l < 16; l++) {
|
||||
let q_idx = q_b_idx + k + l;
|
||||
let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4);
|
||||
let qs_val = (q_byte >> shift) & 3;
|
||||
sum += (f32(qs_val) * dl - ml) * src1[src1_i];
|
||||
src1_i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef Q3_K
|
||||
// 16 blocks of 16 elements each
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block_byte_base = (src0_idx_base + offset) * 110; // Block stride: 110 bytes
|
||||
|
||||
// Bytes 108-109: f16 scale 'd'
|
||||
let d = load_f16_as_f32_at_src0(block_byte_base + 108);
|
||||
|
||||
// extract 6-bit scales, which consist of 4-bits from first 8 bytes of scale,
|
||||
// and 2-bits from the last 4 bytes
|
||||
// Bytes 96-107: 12 bytes of scales (3 u32s)
|
||||
let kmask1: u32 = 0x03030303;
|
||||
let kmask2: u32 = 0x0f0f0f0f;
|
||||
var scale_vals: array<u32, 4>;
|
||||
scale_vals[0] = load_u32_at_src0(block_byte_base + 96);
|
||||
scale_vals[1] = load_u32_at_src0(block_byte_base + 100);
|
||||
scale_vals[2] = load_u32_at_src0(block_byte_base + 104);
|
||||
|
||||
var tmp: u32 = scale_vals[2];
|
||||
scale_vals[2] = ((scale_vals[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
|
||||
scale_vals[3] = ((scale_vals[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
|
||||
scale_vals[0] = (scale_vals[0] & kmask2) | ((tmp & kmask1) << 4);
|
||||
scale_vals[1] = (scale_vals[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
|
||||
|
||||
// Bytes 0-31: 32 bytes of hmask (8 u32s)
|
||||
var hmask_vals: array<u32, 8>;
|
||||
for (var i: u32 = 0; i < 8; i++) {
|
||||
hmask_vals[i] = load_u32_at_src0(block_byte_base + i * 4);
|
||||
}
|
||||
|
||||
// Bytes 32-95: 64 bytes of qs (16 u32s)
|
||||
var qs_vals: array<u32, 16>;
|
||||
for (var i: u32 = 0u; i < 16; i++) {
|
||||
qs_vals[i] = load_u32_at_src0(block_byte_base + 32 + i * 4);
|
||||
}
|
||||
|
||||
var sum = 0.0;
|
||||
var src1_i = src1_idx_base + offset * 256;
|
||||
var is: u32 = 0;
|
||||
var m: u32 = 1;
|
||||
// 2 halves of the block (128 elements each)
|
||||
for (var q_b_idx: u32 = 0; q_b_idx < 64; q_b_idx += 32) {
|
||||
// 4 groups (each group has 2 blocks of 16 elements)
|
||||
for (var shift: u32 = 0; shift < 8; shift += 2) {
|
||||
// 2 blocks
|
||||
for (var k: u32 = 0; k < 32; k += 16) {
|
||||
let sc = get_byte(scale_vals[is / 4], is % 4);
|
||||
is++;
|
||||
let dl = d * (f32(sc) - 32.0);
|
||||
for (var l: u32 = 0u; l < 16u; l++) {
|
||||
let q_idx = q_b_idx + k + l;
|
||||
let hm_idx = k + l;
|
||||
let q_byte = get_byte(qs_vals[q_idx / 4], q_idx % 4);
|
||||
let hmask_byte = get_byte(hmask_vals[hm_idx / 4], hm_idx % 4);
|
||||
let hm = select(4.0, 0.0, (hmask_byte & m) != 0);
|
||||
let qs_val = (q_byte >> shift) & 3;
|
||||
sum += ((f32(qs_val) - hm) * dl) * src1[src1_i];
|
||||
src1_i++;
|
||||
}
|
||||
}
|
||||
m <<= 1;
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef Q4_K
|
||||
// 8 blocks of 32 elements each
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block = src0[src0_idx_base + offset];
|
||||
let d = f32(block.d);
|
||||
let m = f32(block.dmin);
|
||||
var sum = 0.0;
|
||||
var src1_i = src1_idx_base + offset * 256;
|
||||
var is: u32 = 0;
|
||||
// 2 blocks each iteration
|
||||
for (var q_b_idx: u32 = 0; q_b_idx < 128; q_b_idx += 32) {
|
||||
for (var shift: u32 = 0; shift < 8; shift += 4) {
|
||||
let scale_min = get_scale_min(is, block.scales);
|
||||
is++;
|
||||
let dl = d * scale_min.x;
|
||||
let ml = m * scale_min.y;
|
||||
for (var l: u32 = 0; l < 32; l++) {
|
||||
let q_idx = q_b_idx + l;
|
||||
let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4);
|
||||
let qs_val = (q_byte >> shift) & 0xF;
|
||||
sum += (f32(qs_val) * dl - ml) * src1[src1_i];
|
||||
src1_i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef Q5_K
|
||||
// 8 blocks of 32 elements each
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block = src0[src0_idx_base + offset];
|
||||
let d = f32(block.d);
|
||||
let m = f32(block.dmin);
|
||||
var sum = 0.0;
|
||||
var src1_i = src1_idx_base + offset * 256;
|
||||
var is: u32 = 0;
|
||||
var u: u32 = 1;
|
||||
// 2 blocks each iteration
|
||||
for (var q_b_idx: u32 = 0; q_b_idx < 128; q_b_idx += 32) {
|
||||
for (var shift: u32 = 0; shift < 8; shift += 4) {
|
||||
let scale_min = get_scale_min(is, block.scales);
|
||||
is++;
|
||||
let dl = d * scale_min.x;
|
||||
let ml = m * scale_min.y;
|
||||
for (var l: u32 = 0; l < 32; l++) {
|
||||
let q_idx = q_b_idx + l;
|
||||
let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4);
|
||||
let qh_byte = get_byte(block.qh[l / 4], l % 4);
|
||||
let qs_val = (q_byte >> shift) & 0xF;
|
||||
let qh_val = select(0.0, 16.0, (qh_byte & u) != 0);
|
||||
sum += ((f32(qs_val) + qh_val) * dl - ml) * src1[src1_i];
|
||||
src1_i++;
|
||||
}
|
||||
u <<= 1;
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef Q6_K
|
||||
// 16 blocks of 16 elements each
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block_byte_base = (src0_idx_base + offset) * 210; // Block stride: 210 bytes
|
||||
|
||||
// Bytes 208-209: f16 scale 'd'
|
||||
let d = load_f16_as_f32_at_src0(block_byte_base + 208);
|
||||
|
||||
// Bytes 0-127: 128 bytes of ql (32 u32s)
|
||||
var ql_vals: array<u32, 32>;
|
||||
for (var i: u32 = 0; i < 32; i++) {
|
||||
ql_vals[i] = load_u32_at_src0(block_byte_base + i * 4);
|
||||
}
|
||||
|
||||
// Bytes 128-191: 64 bytes of qh (16 u32s)
|
||||
var qh_vals: array<u32, 16>;
|
||||
for (var i: u32 = 0; i < 16; i++) {
|
||||
qh_vals[i] = load_u32_at_src0(block_byte_base + 128 + i * 4);
|
||||
}
|
||||
|
||||
// Bytes 192-207: 16 bytes of scales (4 u32s)
|
||||
var scale_vals: array<u32, 4>;
|
||||
for (var i: u32 = 0; i < 4; i++) {
|
||||
scale_vals[i] = load_u32_at_src0(block_byte_base + 192 + i * 4);
|
||||
}
|
||||
|
||||
var sum = 0.0;
|
||||
var src1_i = src1_idx_base + offset * 256;
|
||||
var qh_b_idx: u32 = 0;
|
||||
var sc_b_idx: u32 = 0;
|
||||
for (var ql_b_idx: u32 = 0; ql_b_idx < 128; ql_b_idx += 64) {
|
||||
for (var l: u32 = 0; l < 32; l++) {
|
||||
let ql13_b = get_byte(ql_vals[(ql_b_idx + l) / 4], (ql_b_idx + l) % 4);
|
||||
let ql24_b = get_byte(ql_vals[(ql_b_idx + l + 32) / 4], (ql_b_idx + l + 32) % 4);
|
||||
let qh_b = get_byte(qh_vals[(qh_b_idx + l) / 4], (qh_b_idx + l) % 4);
|
||||
|
||||
let q1 = f32((ql13_b & 0xF) | ((qh_b & 3) << 4)) - 32.0;
|
||||
let q2 = f32((ql24_b & 0xF) | (((qh_b >> 2) & 3) << 4)) - 32.0;
|
||||
let q3 = f32((ql13_b >> 4) | (((qh_b >> 4) & 3) << 4)) - 32.0;
|
||||
let q4 = f32((ql24_b >> 4) | (((qh_b >> 6) & 3) << 4)) - 32.0;
|
||||
|
||||
let is = l/16;
|
||||
let is1 = sc_b_idx + is;
|
||||
let sc1 = get_byte_i32(scale_vals[is1 / 4], is1 % 4);
|
||||
let is2 = sc_b_idx + is + 2;
|
||||
let sc2 = get_byte_i32(scale_vals[is2 / 4], is2 % 4);
|
||||
let is3 = sc_b_idx + is + 4;
|
||||
let sc3 = get_byte_i32(scale_vals[is3 / 4], is3 % 4);
|
||||
let is4 = sc_b_idx + is + 6;
|
||||
let sc4 = get_byte_i32(scale_vals[is4 / 4], is4 % 4);
|
||||
|
||||
sum += d * f32(sc1) * q1 * src1[src1_i + l];
|
||||
sum += d * f32(sc2) * q2 * src1[src1_i + l + 32];
|
||||
sum += d * f32(sc3) * q3 * src1[src1_i + l + 64];
|
||||
sum += d * f32(sc4) * q4 * src1[src1_i + l + 96];
|
||||
}
|
||||
src1_i += 128;
|
||||
qh_b_idx += 32;
|
||||
sc_b_idx += 8;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef IQ2_XXS
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block_byte_base = (src0_idx_base + offset) * 66; // Block stride: 66 bytes
|
||||
let d = load_f16_as_f32_at_src0(block_byte_base);
|
||||
var src1_i = src1_idx_base + offset * 256;
|
||||
var sum = 0.0;
|
||||
for (var ib: u32 = 0; ib < 32; ib += 4) {
|
||||
let aux0_offset = block_byte_base + 2 + ib * 2;
|
||||
let aux1_offset = block_byte_base + 2 + (ib + 2) * 2;
|
||||
let aux0 = load_u32_at_src0(aux0_offset);
|
||||
let aux1 = load_u32_at_src0(aux1_offset);
|
||||
let db = d * (0.5 + f32(aux1 >> 28)) * 0.25;
|
||||
for (var l: u32 = 0; l < 4; l++) {
|
||||
let ig = get_byte(aux0, l) * 8;
|
||||
let is = (aux1 >> (7 * l)) & 127;
|
||||
let signs = get_byte(ksigns_iq2xs[is / 4], is % 4);
|
||||
for (var j: u32 = 0; j < 8; j++) {
|
||||
let g = get_byte(iq2xxs_grid[(ig + j) / 4], (ig + j) % 4);
|
||||
let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0);
|
||||
sum += db * f32(g) * m * src1[src1_i];
|
||||
src1_i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef IQ2_XS
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block_byte_base = (src0_idx_base + offset) * 74; // Block stride: 74 bytes
|
||||
let d = load_f16_as_f32_at_src0(block_byte_base);
|
||||
var src1_i = src1_idx_base + offset * 256;
|
||||
|
||||
var scale_vals = array<u32, 2>(
|
||||
load_u32_at_src0(block_byte_base + 66),
|
||||
load_u32_at_src0(block_byte_base + 70)
|
||||
);
|
||||
|
||||
var sum = 0.0;
|
||||
for (var ib: u32 = 0; ib < 32; ib += 4) {
|
||||
let s = get_byte(scale_vals[ib / 16], (ib % 16) / 4);
|
||||
let db = array<f32, 2>(
|
||||
d * (0.5 + f32(s & 0xF)) * 0.25,
|
||||
d * (0.5 + f32(s >> 4)) * 0.25
|
||||
);
|
||||
for (var l: u32 = 0; l < 4; l++) {
|
||||
let qs_offset = block_byte_base + 2 + (ib + l) * 2;
|
||||
let qs_val = load_u32_at_src0(qs_offset) & 0xFFFF;
|
||||
let ig = (qs_val & 511) * 8;
|
||||
let is = qs_val >> 9;
|
||||
let signs = get_byte(ksigns_iq2xs[is / 4], is % 4);
|
||||
let dl = db[l/2];
|
||||
for (var j: u32 = 0; j < 8; j++) {
|
||||
let g = get_byte(iq2xs_grid[(ig + j) / 4], (ig + j) % 4);
|
||||
let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0);
|
||||
sum += dl * f32(g) * m * src1[src1_i];
|
||||
src1_i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef IQ2_S
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block_byte_base = (src0_idx_base + offset) * 82; // Block stride: 82 bytes
|
||||
let d = load_f16_as_f32_at_src0(block_byte_base);
|
||||
var src1_i = src1_idx_base + offset * 256;
|
||||
|
||||
var qs_vals : array<u32, 16>;
|
||||
for (var i: u32 = 0; i < 16; i++) {
|
||||
qs_vals[i] = load_u32_at_src0(block_byte_base + 2 + i * 4);
|
||||
}
|
||||
|
||||
var qh_vals: array<u32, 2>;
|
||||
qh_vals[0] = load_u32_at_src0(block_byte_base + 66);
|
||||
qh_vals[1] = load_u32_at_src0(block_byte_base + 70);
|
||||
|
||||
var scale_vals: array<u32, 2>;
|
||||
scale_vals[0] = load_u32_at_src0(block_byte_base + 74);
|
||||
scale_vals[1] = load_u32_at_src0(block_byte_base + 78);
|
||||
|
||||
var sum = 0.0;
|
||||
for (var ib: u32 = 0; ib < 8; ib ++) {
|
||||
let s = get_byte(scale_vals[ib / 4], ib % 4);
|
||||
let db = array<f32, 2>(
|
||||
d * (0.5 + f32(s & 0xF)) * 0.25,
|
||||
d * (0.5 + f32(s >> 4)) * 0.25
|
||||
);
|
||||
let qs_w = qs_vals[ib];
|
||||
for (var l: u32 = 0; l < 4; l++) {
|
||||
let qh_b = (get_byte(qh_vals[ib / 4], ib % 4) << (8 - 2 * l)) & 0x300;
|
||||
let ig = (get_byte(qs_w, l) | qh_b) * 8;
|
||||
let signs = get_byte(qs_vals[ib + 8], l);
|
||||
let dl = db[l/2];
|
||||
for (var j: u32 = 0; j < 8; j++) {
|
||||
let g = get_byte(iq2s_grid[(ig + j) / 4], (ig + j) % 4);
|
||||
let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0);
|
||||
sum += dl * f32(g) * m * src1[src1_i];
|
||||
src1_i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef IQ3_XXS
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block_byte_base = (src0_idx_base + offset) * 98; // Block stride: 98 bytes
|
||||
let d = load_f16_as_f32_at_src0(block_byte_base);
|
||||
var src1_i = src1_idx_base + offset * 256;
|
||||
var sum = 0.0;
|
||||
for (var ib: u32 = 0; ib < 16; ib += 2) {
|
||||
let sc_sign_offset = block_byte_base + 2 + (ib + 32) * 2;
|
||||
let sc_sign = load_u32_at_src0(sc_sign_offset);
|
||||
let db = d * (0.5 + f32(sc_sign >> 28)) * 0.5;
|
||||
for (var l: u32 = 0; l < 4; l++) {
|
||||
let is = (sc_sign >> (7 * l)) & 127;
|
||||
let signs = get_byte(ksigns_iq2xs[is / 4], is % 4);
|
||||
let ig_val = load_u32_at_src0(block_byte_base + 2 + (ib * 2 + l) * 2) & 0xFFFF;
|
||||
let ig1 = get_byte(ig_val, 0);
|
||||
let ig2 = get_byte(ig_val, 1);
|
||||
for (var j: u32 = 0; j < 4; j++) {
|
||||
let g1 = get_byte(iq3xxs_grid[ig1], j);
|
||||
let g2 = get_byte(iq3xxs_grid[ig2], j);
|
||||
let m1 = select(1.0, -1.0, (get_byte(kmask_iq2xs[0], j) & signs) != 0);
|
||||
let m2 = select(1.0, -1.0, (get_byte(kmask_iq2xs[1], j) & signs) != 0);
|
||||
sum += db * f32(g1) * m1 * src1[src1_i];
|
||||
sum += db * f32(g2) * m2 * src1[src1_i + 4];
|
||||
src1_i++;
|
||||
}
|
||||
src1_i += 4;
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef IQ3_S
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block_byte_base = (src0_idx_base + offset) * 110; // Block stride: 110 bytes
|
||||
let d = load_f16_as_f32_at_src0(block_byte_base);
|
||||
var src1_i = src1_idx_base + offset * 256;
|
||||
|
||||
var qh_vals = array<u32, 2>(
|
||||
load_u32_at_src0(block_byte_base + 66),
|
||||
load_u32_at_src0(block_byte_base + 70)
|
||||
);
|
||||
|
||||
var sign_vals: array<u32, 8>;
|
||||
for (var i: u32 = 0; i < 8; i++) {
|
||||
sign_vals[i] = load_u32_at_src0(block_byte_base + 74 + i * 4);
|
||||
}
|
||||
|
||||
var scale_vals = load_u32_at_src0(block_byte_base + 106);
|
||||
|
||||
var sum = 0.0;
|
||||
for (var ib: u32 = 0; ib < 4; ib++) {
|
||||
let s = get_byte(scale_vals, ib);
|
||||
let db = array<f32, 2>(
|
||||
d * (1.0 + 2.0 * f32(s & 0xF)),
|
||||
d * (1.0 + 2.0 * f32(s >> 4))
|
||||
);
|
||||
for (var k: u32 = 0; k < 2; k++) {
|
||||
let dl = db[k];
|
||||
let qh_byte = get_byte(qh_vals[ib / 2], (ib % 2) * 2 + k);
|
||||
let sign_w = sign_vals[ib * 2 + k];
|
||||
for (var l: u32 = 0; l < 4; l++) {
|
||||
let signs = get_byte(sign_w, l);
|
||||
let ig_val = load_u32_at_src0(block_byte_base + 2 + (ib * 8 + k * 4 + l) * 2) & 0xFFFF;
|
||||
let ig1 = get_byte(ig_val, 0) | ((qh_byte << ((8 - (2 * l)))) & 256);
|
||||
let ig2 = get_byte(ig_val, 1) | ((qh_byte << ((7 - (2 * l)))) & 256);
|
||||
for (var j: u32 = 0; j < 4; j++) {
|
||||
let g1 = get_byte(iq3s_grid[ig1], j);
|
||||
let g2 = get_byte(iq3s_grid[ig2], j);
|
||||
let m1 = select(1.0, -1.0, (get_byte(kmask_iq2xs[0], j) & signs) != 0);
|
||||
let m2 = select(1.0, -1.0, (get_byte(kmask_iq2xs[1], j) & signs) != 0);
|
||||
sum += dl * f32(g1) * m1 * src1[src1_i];
|
||||
sum += dl * f32(g2) * m2 * src1[src1_i + 4];
|
||||
src1_i++;
|
||||
}
|
||||
src1_i += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef IQ1_S
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block_byte_base = (src0_idx_base + offset) * 50; // Block stride: 50 bytes
|
||||
let d = load_f16_as_f32_at_src0(block_byte_base);
|
||||
var src1_i = src1_idx_base + offset * 256;
|
||||
var sum = 0.0;
|
||||
for (var ib: u32 = 0; ib < 8; ib++) {
|
||||
let qh = load_u32_at_src0(block_byte_base + 34 + ib * 2) & 0xFFFF;
|
||||
let dl = d * (2.0 * f32((qh >> 12) & 7) + 1.0);
|
||||
let delta = select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x8000) != 0);
|
||||
let qs_w = load_u32_at_src0(block_byte_base + 2 + ib * 4);
|
||||
for (var l: u32 = 0; l < 4; l++) {
|
||||
let ig = (get_byte(qs_w, l) | (((qh >> (3 * l)) & 7) << 8)) * 8;
|
||||
for (var j: u32 = 0; j < 8; j++) {
|
||||
let gw = iq1_grid[(ig + j) / 16];
|
||||
let g = (gw >> (((ig + j) % 16) * 2)) & 3;
|
||||
let gs = bitcast<i32>(g << 30) >> 30;
|
||||
sum += dl * (f32(gs) + delta) * src1[src1_i];
|
||||
src1_i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef IQ1_M
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block = src0[src0_idx_base + offset];
|
||||
|
||||
let scale = ((block.scales[0] >> 12) & 0xF) | ((block.scales[0] >> 24) & 0x00F0) | ((block.scales[1] >> 4) & 0x0F00) | ((block.scales[1] >> 16) & 0xF000);
|
||||
let d = f32(bitcast<vec2<f16>>(scale).x);
|
||||
var src1_i = src1_idx_base + offset * 256;
|
||||
var sum = 0.0;
|
||||
for (var ib: u32 = 0; ib < 8; ib++) {
|
||||
let sw = (block.scales[ib / 4] >> (16 * ((ib / 2) % 2))) & 0xFFFF;
|
||||
let s1 : u32 = (sw >> (6 * (ib % 2))) & 0x7;
|
||||
let s2 : u32 = (sw >> (6 * (ib % 2) + 3)) & 0x7;
|
||||
var dl = array<f32, 2>(
|
||||
d * f32(2 * s1 + 1),
|
||||
d * f32(2 * s2 + 1)
|
||||
);
|
||||
|
||||
let qh = block.qh[ib / 2] >> (16 * (ib % 2));
|
||||
var idx = array<u32, 4>(
|
||||
get_byte(block.qs[ib], 0) | ((qh << 8) & 0x700),
|
||||
get_byte(block.qs[ib], 1) | ((qh << 4) & 0x700),
|
||||
get_byte(block.qs[ib], 2) | ((qh) & 0x700),
|
||||
get_byte(block.qs[ib], 3) | ((qh >> 4) & 0x700)
|
||||
);
|
||||
var delta = array<f32, 4>(
|
||||
select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x08) != 0),
|
||||
select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x80) != 0),
|
||||
select(IQ1_DELTA, -IQ1_DELTA, ((qh >> 8) & 0x08) != 0),
|
||||
select(IQ1_DELTA, -IQ1_DELTA, ((qh >> 8) & 0x80) != 0)
|
||||
);
|
||||
for (var l: u32 = 0; l < 4; l++) {
|
||||
let ig = idx[l] * 8;
|
||||
for (var j: u32 = 0; j < 8; j++) {
|
||||
let gw = iq1_grid[(ig + j) / 16];
|
||||
let g = (gw >> (((ig + j) % 16) * 2)) & 3;
|
||||
let gs = bitcast<i32>(g << 30) >> 30;
|
||||
sum += dl[l/2] * (f32(gs) + delta[l]) * src1[src1_i];
|
||||
src1_i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef IQ4_NL
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block_byte_base = (src0_idx_base + offset) * 18; // Block stride: 18 bytes
|
||||
let d = load_f16_as_f32_at_src0(block_byte_base);
|
||||
var src1_i = src1_idx_base + offset * 32;
|
||||
var sum = 0.0;
|
||||
var qs: array<u32, 4>;
|
||||
for (var i: u32 = 0; i < 4; i++) {
|
||||
qs[i] = load_u32_at_src0(block_byte_base + 2 + i * 4);
|
||||
}
|
||||
for (var j: u32 = 0; j < 16; j++) {
|
||||
let qsb = get_byte(qs[j / 4], j % 4);
|
||||
sum += d * f32(kvalues_iq4nl[qsb & 0xF]) * src1[src1_i];
|
||||
sum += d * f32(kvalues_iq4nl[qsb >> 4]) * src1[src1_i + 16];
|
||||
src1_i++;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef IQ4_XS
|
||||
fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
|
||||
let block = src0[src0_idx_base + offset];
|
||||
let d = unpack2x16float(block.d_scales_h)[0];
|
||||
let scales_h = block.d_scales_h >> 16;
|
||||
var src1_i = src1_idx_base + offset * 256;
|
||||
var sum = 0.0;
|
||||
for (var ib: u32 = 0; ib < 8; ib++) {
|
||||
let ls = ((get_byte(block.scales_l, ib / 2) >> (4 * (ib % 2))) & 0xF) | (((scales_h >> (2 * ib)) & 3) << 4);
|
||||
let dl = d * (f32(ls) - 32.0);
|
||||
for (var j: u32 = 0; j < 16; j++) {
|
||||
let iqs = ib * 16 + j;
|
||||
let qsb = get_byte(block.qs[iqs / 4], iqs % 4);
|
||||
sum += dl * f32(kvalues_iq4nl[qsb & 0xF]) * src1[src1_i];
|
||||
sum += dl * f32(kvalues_iq4nl[qsb >> 4]) * src1[src1_i + 16];
|
||||
src1_i++;
|
||||
}
|
||||
src1_i += 16;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
struct MulMatParams {
|
||||
offset_src0: u32, // in elements/blocks
|
||||
offset_src1: u32, // in elements/blocks
|
||||
offset_dst: u32, // in elements/blocks
|
||||
m: u32,
|
||||
n: u32,
|
||||
k: u32,
|
||||
// all strides are in elements/blocks
|
||||
stride_01: u32,
|
||||
stride_11: u32,
|
||||
stride_02: u32,
|
||||
stride_12: u32,
|
||||
stride_03: u32,
|
||||
stride_13: u32,
|
||||
|
||||
bs02: u32,
|
||||
bs03: u32,
|
||||
broadcast2: u32,
|
||||
broadcast3: u32
|
||||
};
|
||||
|
||||
@group(0) @binding(0) var<storage, read_write> src0: array<SRC0_TYPE>; // M rows, K columns
|
||||
@group(0) @binding(1) var<storage, read_write> src1: array<SRC1_TYPE>; // K rows, N columns (transposed)
|
||||
@group(0) @binding(2) var<storage, read_write> dst: array<f32>; // M rows, N columns
|
||||
|
||||
@group(0) @binding(3) var<uniform> params: MulMatParams;
|
||||
|
||||
@compute @workgroup_size(256)
|
||||
fn main(@builtin(local_invocation_id) local_id: vec3<u32>,
|
||||
@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
@builtin(num_workgroups) num_wg: vec3<u32>) {
|
||||
let wg_linear = wg_id.y * num_wg.x + wg_id.x;
|
||||
let global_idx = wg_linear * 256u + local_id.x;
|
||||
|
||||
let total = params.m * params.n * params.bs02 * params.broadcast2 * params.bs03 * params.broadcast3;
|
||||
if (global_idx >= total) {
|
||||
return;
|
||||
}
|
||||
|
||||
let dst2_stride = params.m * params.n;
|
||||
let dst3_stride = dst2_stride * params.bs02 * params.broadcast2;
|
||||
|
||||
let dst3_idx = global_idx / dst3_stride;
|
||||
let src03_idx = dst3_idx / params.broadcast3; // src0 may be broadcast along the third dimension
|
||||
let src13_idx = dst3_idx; // src1 is not broadcast
|
||||
let dst3_rem = global_idx % dst3_stride;
|
||||
|
||||
let dst2_idx = dst3_rem / dst2_stride;
|
||||
let src02_idx = dst2_idx / params.broadcast2; // src0 may also be broadcast along the second dimension
|
||||
let src12_idx = dst2_idx; // src1 is not broadcast
|
||||
|
||||
let dst2_rem = dst3_rem % dst2_stride;
|
||||
|
||||
let row = dst2_rem / params.m; // output row
|
||||
let col = dst2_rem % params.m; // output column
|
||||
|
||||
let src0_idx_base = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02 + col * params.stride_01;
|
||||
let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12 + row * params.stride_11;
|
||||
|
||||
var sum = 0.0;
|
||||
for (var i: u32 = 0u; i < params.k/BLOCK_SIZE; i = i + 1u) {
|
||||
sum += multiply_add(src0_idx_base, src1_idx_base, i);
|
||||
}
|
||||
dst[params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.m + col] = sum;
|
||||
}
|
||||
@@ -3,10 +3,18 @@ enable subgroups;
|
||||
#endif
|
||||
enable f16;
|
||||
|
||||
#ifdef MMVQ
|
||||
requires packed_4x8_integer_dot_product;
|
||||
#endif
|
||||
|
||||
#define DECLARE_BYTE_LOADERS_SRC0
|
||||
#include "common_decls.tmpl"
|
||||
|
||||
#ifdef MMVQ
|
||||
#include "mul_mat_vec_q_acc.tmpl"
|
||||
#else
|
||||
#include "mul_mat_vec_acc.tmpl"
|
||||
#endif
|
||||
|
||||
struct MulMatParams {
|
||||
offset_src0: u32,
|
||||
@@ -28,9 +36,14 @@ struct MulMatParams {
|
||||
};
|
||||
|
||||
@group(0) @binding(0) var<storage, read_write> src0: array<SRC0_TYPE>;
|
||||
@group(0) @binding(1) var<storage, read_write> src1: array<SRC1_TYPE>;
|
||||
@group(0) @binding(2) var<storage, read_write> dst: array<f32>;
|
||||
|
||||
#ifdef MMVQ
|
||||
@group(0) @binding(1) var<storage, read_write> src1q: array<q8_1>;
|
||||
#else
|
||||
@group(0) @binding(1) var<storage, read_write> src1: array<SRC1_TYPE>;
|
||||
#endif
|
||||
|
||||
@group(0) @binding(2) var<storage, read_write> dst: array<f32>;
|
||||
// "mul_mat_vec_acc.tmpl" requires params.k, params.m, params.stride_01
|
||||
@group(0) @binding(3) var<uniform> params: MulMatParams;
|
||||
|
||||
@@ -75,10 +88,15 @@ fn main(
|
||||
let src12_idx = dst2_idx;
|
||||
|
||||
let src0_batch_offset = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02;
|
||||
let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12;
|
||||
let dst_idx_base = params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + row_base;
|
||||
|
||||
#ifdef MMVQ
|
||||
let src1q_idx_base = (src13_idx * params.bs02 * params.broadcast2 + src12_idx) * (params.k / 32u);
|
||||
let acc = accumulate_vec_q_dot(thread_id, row_base, src0_batch_offset, src1q_idx_base);
|
||||
#else
|
||||
let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12;
|
||||
let acc = accumulate_vec_dot(thread_id, row_base, src0_batch_offset, src1_idx_base);
|
||||
#endif
|
||||
|
||||
#ifdef USE_SUBGROUP_REDUCTION
|
||||
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
|
||||
|
||||
@@ -436,7 +436,6 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef MUL_ACC_Q3_K
|
||||
#define BLOCK_SIZE 256
|
||||
#define BLOCK_SIZE_BYTES 110
|
||||
|
||||
303
ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl
Normal file
303
ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl
Normal file
@@ -0,0 +1,303 @@
|
||||
#ifdef U32_DEQUANT_HELPERS
|
||||
#define SRC0_TYPE u32
|
||||
|
||||
fn byte_of(v: u32, b: u32) -> u32 {
|
||||
return (v >> (b * 8u)) & 0xFFu;
|
||||
}
|
||||
|
||||
fn sbyte_of(v: u32, b: u32) -> i32 {
|
||||
let raw = i32((v >> (b * 8u)) & 0xFFu);
|
||||
return select(raw, raw - 256, raw >= 128);
|
||||
}
|
||||
#endif
|
||||
|
||||
#define SRC0_TYPE SRC0_INNER_TYPE
|
||||
#define SRC1_TYPE SRC1_INNER_TYPE
|
||||
|
||||
#ifdef LEGACY_QUANTS
|
||||
#define BLOCK_SIZE 32
|
||||
#define THREADS_PER_BLOCK 4
|
||||
#elif K_QUANTS
|
||||
#define BLOCK_SIZE 256
|
||||
#define THREADS_PER_BLOCK 16
|
||||
#endif
|
||||
|
||||
#define ELEMS_PER_THREAD (BLOCK_SIZE/THREADS_PER_BLOCK)
|
||||
#define Q8_BLOCK_SIZE 32
|
||||
|
||||
#ifdef MUL_ACC_Q4_0
|
||||
#define BLOCK_SIZE_BYTES 18
|
||||
#define B_DS_TYPE vec2<f32>
|
||||
fn repack_a(block_byte_base: u32, inner_id: u32) -> vec2<u32> {
|
||||
let qs_packed = load_u32_at_src0(block_byte_base + 2u + 4u * inner_id);
|
||||
|
||||
return vec2<u32>(
|
||||
qs_packed & 0x0F0F0F0Fu,
|
||||
(qs_packed >> 4u) & 0x0F0F0F0Fu
|
||||
);
|
||||
}
|
||||
fn repack_b_qs(block:u32, inner_id: u32) -> vec2<u32> {
|
||||
return vec2<u32>(
|
||||
src1q[block].qs[inner_id],
|
||||
src1q[block].qs[inner_id + 4u],
|
||||
);
|
||||
}
|
||||
fn repack_b_dm(block: u32) -> B_DS_TYPE {
|
||||
return B_DS_TYPE(
|
||||
f32(src1q[block].d),
|
||||
f32(src1q[block].s)
|
||||
);
|
||||
}
|
||||
fn get_dm(block_byte_base: u32) -> f32 {
|
||||
return f32(load_f16_at_src0(block_byte_base));
|
||||
}
|
||||
fn mul_q8_1(row_sum: i32, da: f32, b_ds: B_DS_TYPE) -> f32 {
|
||||
return f32(row_sum) * (da * b_ds.x) - 8.0 * da * b_ds.y / THREADS_PER_BLOCK;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef MUL_ACC_Q4_1
|
||||
#define BLOCK_SIZE_BYTES 20
|
||||
#define B_DS_TYPE vec2<f32>
|
||||
fn repack_a(block_byte_base: u32, inner_id: u32) -> vec2<u32> {
|
||||
let qs_packed = load_u32_at_src0(block_byte_base + 4u + 4u * inner_id);
|
||||
|
||||
return vec2<u32>(
|
||||
qs_packed & 0x0F0F0F0Fu,
|
||||
(qs_packed >> 4u) & 0x0F0F0F0Fu
|
||||
);
|
||||
}
|
||||
fn repack_b_qs(block:u32, inner_id: u32) -> vec2<u32> {
|
||||
return vec2<u32>(
|
||||
src1q[block].qs[inner_id],
|
||||
src1q[block].qs[inner_id + 4u],
|
||||
);
|
||||
}
|
||||
fn repack_b_dm(block: u32) -> B_DS_TYPE {
|
||||
return B_DS_TYPE(
|
||||
f32(src1q[block].d),
|
||||
f32(src1q[block].s)
|
||||
);
|
||||
}
|
||||
fn get_dm(block_byte_base: u32) -> vec2<f32> {
|
||||
return vec2<f32>(
|
||||
f32(load_f16_at_src0(block_byte_base)),
|
||||
f32(load_f16_at_src0(block_byte_base + 2u))
|
||||
);
|
||||
}
|
||||
fn mul_q8_1(row_sum: i32, dma: vec2<f32>, b_ds: B_DS_TYPE) -> f32 {
|
||||
return f32(row_sum) * (dma.x * b_ds.x) + dma.y * b_ds.y / THREADS_PER_BLOCK;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef MUL_ACC_Q8_0
|
||||
#define BLOCK_SIZE_BYTES 34
|
||||
#define B_DS_TYPE f32
|
||||
fn repack_a(block_byte_base: u32, inner_id: u32) -> vec2<u32> {
|
||||
return vec2<u32>(
|
||||
load_u32_at_src0(block_byte_base + 2u + 4u * (inner_id * 2u)),
|
||||
load_u32_at_src0(block_byte_base + 2u + 4u * (inner_id * 2u + 1))
|
||||
);
|
||||
}
|
||||
fn repack_b_qs(block:u32, inner_id: u32) -> vec2<u32> {
|
||||
return vec2<u32>(
|
||||
src1q[block].qs[inner_id * 2u],
|
||||
src1q[block].qs[inner_id * 2u + 1],
|
||||
);
|
||||
}
|
||||
fn repack_b_dm(block: u32) -> B_DS_TYPE {
|
||||
return B_DS_TYPE(src1q[block].d);
|
||||
}
|
||||
fn get_dm(block_byte_base: u32) -> f32 {
|
||||
return f32(load_f16_at_src0(block_byte_base));
|
||||
}
|
||||
fn mul_q8_1(row_sum: i32, da: f32, b_ds: B_DS_TYPE) -> f32 {
|
||||
return f32(row_sum) * (da * b_ds);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef LEGACY_QUANTS
|
||||
fn mmvq_dot_product(a_byte_base: u32, b_inner_id: u32, b_repacked: vec2<u32>, b_ds: B_DS_TYPE) -> f32 {
|
||||
var row_sum = 0;
|
||||
let a_repacked = repack_a(a_byte_base, b_inner_id);
|
||||
|
||||
row_sum += dot4I8Packed(a_repacked[0], b_repacked[0]);
|
||||
row_sum += dot4I8Packed(a_repacked[1], b_repacked[1]);
|
||||
|
||||
return mul_q8_1(row_sum, get_dm(a_byte_base), b_ds);
|
||||
}
|
||||
|
||||
fn accumulate_vec_q_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1q_idx_base: u32) -> array<f32, OUTPUTS_PER_WG> {
|
||||
var acc: array<f32, OUTPUTS_PER_WG>;
|
||||
|
||||
let num_blocks = params.k / BLOCK_SIZE;
|
||||
|
||||
for (var block = thread_id / THREADS_PER_BLOCK; block < num_blocks; block += WG_SIZE / THREADS_PER_BLOCK) {
|
||||
let b_inner_id = thread_id % THREADS_PER_BLOCK;
|
||||
let b_block_idx = src1q_idx_base + block;
|
||||
|
||||
let b_repacked = repack_b_qs(b_block_idx, b_inner_id);
|
||||
let b_ds = repack_b_dm(b_block_idx);
|
||||
|
||||
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
|
||||
let output_row = row_base + row;
|
||||
if (output_row < params.m) {
|
||||
let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES;
|
||||
acc[row] += mmvq_dot_product(block_byte_base, b_inner_id, b_repacked, b_ds);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return acc;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef MUL_ACC_Q2_K
|
||||
#define BLOCK_SIZE_BYTES 84
|
||||
#define B_DS_TYPE f32
|
||||
fn repack_a(block_byte_base: u32, tid: u32) -> vec4<u32> {
|
||||
let ih2 = tid / 8u;
|
||||
let phase = tid % 2u;
|
||||
let iq4_idx = 2u * ih2 + phase;
|
||||
let qs_byte_base = block_byte_base + 16u + 16u * iq4_idx;
|
||||
let qs_shift = tid & 6u;
|
||||
return vec4<u32>(
|
||||
(load_u32_at_src0_aligned(qs_byte_base) >> qs_shift) & 0x03030303u,
|
||||
(load_u32_at_src0_aligned(qs_byte_base + 4u) >> qs_shift) & 0x03030303u,
|
||||
(load_u32_at_src0_aligned(qs_byte_base + 8u) >> qs_shift) & 0x03030303u,
|
||||
(load_u32_at_src0_aligned(qs_byte_base + 12u) >> qs_shift) & 0x03030303u,
|
||||
);
|
||||
}
|
||||
fn repack_b_qs(q8_block_idx: u32, tid: u32) -> vec4<u32> {
|
||||
let phase = tid % 2u;
|
||||
return vec4<u32>(
|
||||
src1q[q8_block_idx].qs[4u * phase],
|
||||
src1q[q8_block_idx].qs[4u * phase + 1u],
|
||||
src1q[q8_block_idx].qs[4u * phase + 2u],
|
||||
src1q[q8_block_idx].qs[4u * phase + 3u],
|
||||
);
|
||||
}
|
||||
fn repack_b_dm(q8_block_idx: u32) -> B_DS_TYPE {
|
||||
return B_DS_TYPE(src1q[q8_block_idx].d);
|
||||
}
|
||||
fn get_dm(block_byte_base: u32) -> vec2<f32> {
|
||||
return vec2<f32>(
|
||||
f32(load_f16_at_src0(block_byte_base + 80u)),
|
||||
f32(load_f16_at_src0(block_byte_base + 82u)),
|
||||
);
|
||||
}
|
||||
fn get_scale_min(block_byte_base: u32, tid: u32) -> vec2<f32> {
|
||||
let scale_byte = block_byte_base + tid;
|
||||
let scale = byte_of(load_u32_at_src0_aligned(scale_byte), scale_byte & 3u);
|
||||
return vec2<f32>(f32(scale & 0xFu), f32(scale >> 4u));
|
||||
}
|
||||
fn mmvq_dot_product(a_byte_base: u32, tid: u32, b_repacked: vec4<u32>, b_ds: B_DS_TYPE) -> f32 {
|
||||
let a_repacked = repack_a(a_byte_base, tid);
|
||||
let dm = get_dm(a_byte_base);
|
||||
let scale_min = get_scale_min(a_byte_base, tid);
|
||||
|
||||
let scale_q = i32(scale_min.x);
|
||||
let scale_m_i8x4 = u32(scale_min.y) * 0x01010101u;
|
||||
|
||||
let row_sum_d = (dot4I8Packed(b_repacked[0], a_repacked[0]) + dot4I8Packed(b_repacked[1], a_repacked[1])
|
||||
+ dot4I8Packed(b_repacked[2], a_repacked[2]) + dot4I8Packed(b_repacked[3], a_repacked[3])) * scale_q;
|
||||
let row_sum_m = dot4I8Packed(b_repacked[0], scale_m_i8x4) + dot4I8Packed(b_repacked[1], scale_m_i8x4)
|
||||
+ dot4I8Packed(b_repacked[2], scale_m_i8x4) + dot4I8Packed(b_repacked[3], scale_m_i8x4);
|
||||
|
||||
return b_ds * (dm.x * f32(row_sum_d) - dm.y * f32(row_sum_m));
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef MUL_ACC_Q4_K
|
||||
#define BLOCK_SIZE_BYTES 144
|
||||
#define B_DS_TYPE vec2<f32>
|
||||
fn repack_a(block_byte_base: u32, tid: u32) -> vec4<u32> {
|
||||
let iq4 = tid / 4u;
|
||||
let phase = tid % 2u;
|
||||
let nibble = (tid >> 1u) % 2u;
|
||||
let q_qs_byte_base = block_byte_base + 16u + 32u * iq4 + 16u * phase;
|
||||
let qs_shift = 4u * nibble;
|
||||
return vec4<u32>(
|
||||
(load_u32_at_src0_aligned(q_qs_byte_base) >> qs_shift) & 0x0F0F0F0Fu,
|
||||
(load_u32_at_src0_aligned(q_qs_byte_base + 4u) >> qs_shift) & 0x0F0F0F0Fu,
|
||||
(load_u32_at_src0_aligned(q_qs_byte_base + 8u) >> qs_shift) & 0x0F0F0F0Fu,
|
||||
(load_u32_at_src0_aligned(q_qs_byte_base + 12u) >> qs_shift) & 0x0F0F0F0Fu,
|
||||
);
|
||||
}
|
||||
fn repack_b_qs(q8_block_idx: u32, tid: u32) -> vec4<u32> {
|
||||
let phase = tid % 2u;
|
||||
return vec4<u32>(
|
||||
src1q[q8_block_idx].qs[4u * phase],
|
||||
src1q[q8_block_idx].qs[4u * phase + 1u],
|
||||
src1q[q8_block_idx].qs[4u * phase + 2u],
|
||||
src1q[q8_block_idx].qs[4u * phase + 3u],
|
||||
);
|
||||
}
|
||||
fn repack_b_dm(q8_block_idx: u32) -> B_DS_TYPE {
|
||||
return B_DS_TYPE(
|
||||
f32(src1q[q8_block_idx].d),
|
||||
f32(src1q[q8_block_idx].s),
|
||||
);
|
||||
}
|
||||
fn get_dm(block_byte_base: u32) -> vec2<f32> {
|
||||
return vec2<f32>(
|
||||
f32(load_f16_at_src0(block_byte_base + 0u)),
|
||||
f32(load_f16_at_src0(block_byte_base + 2u)),
|
||||
);
|
||||
}
|
||||
fn get_scale_min(block_byte_base: u32, tid: u32) -> vec2<f32> {
|
||||
let sc_m_idx = tid / 2u;
|
||||
let scales_byte_base = block_byte_base + 4u;
|
||||
let scales0_3 = load_u32_at_src0_aligned(scales_byte_base);
|
||||
let scales4_7 = load_u32_at_src0_aligned(scales_byte_base + 4u);
|
||||
let scales8_11 = load_u32_at_src0_aligned(scales_byte_base + 8u);
|
||||
|
||||
let byte_idx = sc_m_idx & 3u;
|
||||
let is_high = sc_m_idx >= 4u;
|
||||
|
||||
let sc_low = byte_of(scales0_3, byte_idx) & 0x3Fu;
|
||||
let sc_high = (byte_of(scales8_11, byte_idx) & 0x0Fu) | ((byte_of(scales0_3, byte_idx) & 0xC0u) >> 2u);
|
||||
let scale = f32(select(sc_low, sc_high, is_high));
|
||||
|
||||
let mn_low = byte_of(scales4_7, byte_idx) & 0x3Fu;
|
||||
let mn_high = (byte_of(scales8_11, byte_idx) >> 4u) | ((byte_of(scales4_7, byte_idx) & 0xC0u) >> 2u);
|
||||
let min_val = f32(select(mn_low, mn_high, is_high));
|
||||
|
||||
return vec2<f32>(scale, min_val);
|
||||
}
|
||||
fn mmvq_dot_product(a_byte_base: u32, tid: u32, b_repacked: vec4<u32>, b_ds: B_DS_TYPE) -> f32 {
|
||||
let a_repacked = repack_a(a_byte_base, tid);
|
||||
let dm = get_dm(a_byte_base);
|
||||
let scale_min = get_scale_min(a_byte_base, tid);
|
||||
|
||||
let row_sum = dot4I8Packed(a_repacked[0], b_repacked[0]) + dot4I8Packed(a_repacked[1], b_repacked[1])
|
||||
+ dot4I8Packed(a_repacked[2], b_repacked[2]) + dot4I8Packed(a_repacked[3], b_repacked[3]);
|
||||
|
||||
// Each thread covers half of the Q8_1 block, so add only b_ds.y/2.
|
||||
return b_ds.x * dm.x * scale_min.x * f32(row_sum) - dm.y * scale_min.y * (b_ds.y / (Q8_BLOCK_SIZE / ELEMS_PER_THREAD));
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef K_QUANTS
|
||||
fn accumulate_vec_q_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1q_idx_base: u32) -> array<f32, OUTPUTS_PER_WG> {
|
||||
var acc: array<f32, OUTPUTS_PER_WG>;
|
||||
|
||||
let tid = thread_id % THREADS_PER_BLOCK;
|
||||
|
||||
for (var block = thread_id / THREADS_PER_BLOCK; block < params.k / BLOCK_SIZE; block += WG_SIZE / THREADS_PER_BLOCK) {
|
||||
let src1q_idx = src1q_idx_base + (block * BLOCK_SIZE + ELEMS_PER_THREAD * tid) / Q8_BLOCK_SIZE;
|
||||
let b_repacked = repack_b_qs(src1q_idx, tid);
|
||||
let b_ds = repack_b_dm(src1q_idx);
|
||||
|
||||
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
|
||||
let output_row = row_base + row;
|
||||
if (output_row < params.m) {
|
||||
let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES;
|
||||
acc[row] += mmvq_dot_product(block_byte_base, tid, b_repacked, b_ds);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return acc;
|
||||
}
|
||||
#endif
|
||||
173
ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl
Normal file
173
ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl
Normal file
@@ -0,0 +1,173 @@
|
||||
#ifdef USE_SUBGROUP_REDUCTION
|
||||
enable subgroups;
|
||||
#endif
|
||||
enable f16;
|
||||
|
||||
requires packed_4x8_integer_dot_product;
|
||||
|
||||
#include "common_decls.tmpl"
|
||||
|
||||
struct Params {
|
||||
offset_src1: u32,
|
||||
stride_12: u32,
|
||||
stride_13: u32,
|
||||
ne0: u32,
|
||||
ne2: u32,
|
||||
ne3: u32,
|
||||
};
|
||||
|
||||
#define SRC1_TYPE vec4<SRC1_INNER_TYPE>
|
||||
|
||||
@group(0) @binding(0) var<storage, read_write> src1: array<SRC1_TYPE>;
|
||||
@group(0) @binding(1) var<storage, read_write> src1q: array<q8_1>;
|
||||
|
||||
@group(0) @binding(2) var<uniform> params: Params;
|
||||
|
||||
#ifdef USE_SUBGROUP_REDUCTION
|
||||
fn cluster_max_8(v: f32) -> f32 {
|
||||
var r = v;
|
||||
r = max(r, subgroupShuffleXor(r, 1u));
|
||||
r = max(r, subgroupShuffleXor(r, 2u));
|
||||
r = max(r, subgroupShuffleXor(r, 4u));
|
||||
return r;
|
||||
}
|
||||
|
||||
#if defined(MUL_ACC_Q4_0) || defined(MUL_ACC_Q4_1) || defined(MUL_ACC_Q4_K)
|
||||
fn cluster_add_i4x8(v: i32) -> i32 {
|
||||
var r= v;
|
||||
r += subgroupShuffleXor(r, 1u);
|
||||
r += subgroupShuffleXor(r, 2u);
|
||||
r += subgroupShuffleXor(r, 4u);
|
||||
return r;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef USE_WORKGROUP_REDUCTION
|
||||
#define CLUSTER_SIZE 8
|
||||
|
||||
var<workgroup> partial_amaxs: array<array<f32, CLUSTER_SIZE>, WG_SIZE / CLUSTER_SIZE>;
|
||||
var<workgroup> partial_sums: array<array<i32, CLUSTER_SIZE>, WG_SIZE / CLUSTER_SIZE>;
|
||||
#endif
|
||||
|
||||
@compute @workgroup_size(WG_SIZE)
|
||||
fn main(
|
||||
@builtin(local_invocation_id) local_id: vec3<u32>,
|
||||
@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
@builtin(num_workgroups) num_wg: vec3<u32>
|
||||
) {
|
||||
let thread_id = local_id.x;
|
||||
let num_vec4 = params.ne0 / 4u;
|
||||
|
||||
let wg_per_vec = (num_vec4 + (WG_SIZE - 1u)) / WG_SIZE;
|
||||
let total_batches = wg_per_vec * params.ne2 * params.ne3;
|
||||
|
||||
let wg_linear = wg_id.y * num_wg.x + wg_id.x;
|
||||
if (wg_linear >= total_batches) {
|
||||
return;
|
||||
}
|
||||
|
||||
let src13_idx = wg_linear / (params.ne2 * wg_per_vec);
|
||||
let src12_idx = (wg_linear - src13_idx * (params.ne2 * wg_per_vec)) / wg_per_vec;
|
||||
let src11_wg_idx = wg_linear % wg_per_vec;
|
||||
let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12;
|
||||
let src1_idx_vec4_base = src1_idx_base / 4u;
|
||||
|
||||
let blocks_per_row = params.ne0 / 32u;
|
||||
let blocks_per_wg = (WG_SIZE * 4u) / 32u;
|
||||
let src1q_idx_base = (src13_idx * params.ne2 + src12_idx) * blocks_per_row;
|
||||
let src1q_idx = src1q_idx_base + src11_wg_idx * blocks_per_wg + thread_id / 8u;
|
||||
let qs_idx = thread_id % 8u;
|
||||
|
||||
// reduction
|
||||
var q4 = vec4<f32>(0.0);
|
||||
var q4_quants = 0u;
|
||||
var thread_amax = 0.0;
|
||||
|
||||
let src11_vec4_idx = src11_wg_idx * WG_SIZE + thread_id;
|
||||
let is_valid = src11_vec4_idx < num_vec4;
|
||||
|
||||
#ifdef USE_SUBGROUP_REDUCTION
|
||||
|
||||
var d = 0.0;
|
||||
|
||||
if (is_valid) {
|
||||
q4 = src1[src1_idx_vec4_base + src11_vec4_idx];
|
||||
let abs_q4 = abs(q4);
|
||||
thread_amax = max(max(abs_q4[0u], abs_q4[1u]), max(abs_q4[2], abs_q4[3]));
|
||||
}
|
||||
|
||||
d = cluster_max_8(thread_amax) / 127.0;
|
||||
|
||||
if (is_valid) {
|
||||
let id = select(0.0, 1.0 / d, d > 0.0);
|
||||
q4_quants = pack4xI8(vec4<i32>(round(q4 * id)));
|
||||
if (qs_idx == 0u) {
|
||||
src1q[src1q_idx].d = f16(d);
|
||||
}
|
||||
src1q[src1q_idx].qs[qs_idx] = q4_quants;
|
||||
}
|
||||
|
||||
#if defined(MUL_ACC_Q4_0) || defined(MUL_ACC_Q4_1) || defined(MUL_ACC_Q4_K)
|
||||
let q4_quants_sum = dot4I8Packed(q4_quants, 0x01010101u);
|
||||
let s = f16(d * f32(cluster_add_i4x8(q4_quants_sum)));
|
||||
|
||||
if (is_valid) {
|
||||
if (qs_idx == 0u) {
|
||||
src1q[src1q_idx].s = s;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef USE_WORKGROUP_REDUCTION
|
||||
|
||||
var d = 0.0;
|
||||
let cluster_id = thread_id / 8u;
|
||||
|
||||
if (is_valid) {
|
||||
q4 = src1[src1_idx_vec4_base + src11_vec4_idx];
|
||||
let abs_q4 = abs(q4);
|
||||
thread_amax = max(max(abs_q4[0], abs_q4[1]), max(abs_q4[2], abs_q4[3]));
|
||||
partial_amaxs[cluster_id][qs_idx] = thread_amax;
|
||||
}
|
||||
|
||||
workgroupBarrier();
|
||||
|
||||
if (is_valid) {
|
||||
let amax = max(
|
||||
max(
|
||||
max(partial_amaxs[cluster_id][0], partial_amaxs[cluster_id][1]), max(partial_amaxs[cluster_id][2], partial_amaxs[cluster_id][3])),
|
||||
max(
|
||||
max(partial_amaxs[cluster_id][4], partial_amaxs[cluster_id][5]), max(partial_amaxs[cluster_id][6], partial_amaxs[cluster_id][7]))
|
||||
);
|
||||
|
||||
d = amax / 127.0;
|
||||
let id = select(0.0f, 1.0f / d, d > 0.0f);
|
||||
|
||||
q4_quants = pack4xI8(vec4<i32>(round(q4 * id)));
|
||||
src1q[src1q_idx].qs[qs_idx] = q4_quants;
|
||||
|
||||
if (qs_idx == 0u) {
|
||||
src1q[src1q_idx].d = f16(d);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(MUL_ACC_Q4_0) || defined(MUL_ACC_Q4_1) || defined(MUL_ACC_Q4_K)
|
||||
|
||||
partial_sums[cluster_id][qs_idx] = dot4I8Packed(q4_quants, 0x01010101u);
|
||||
|
||||
workgroupBarrier();
|
||||
|
||||
if (is_valid) {
|
||||
if (qs_idx == 0u) {
|
||||
let s = d * f32(partial_sums[cluster_id][0] + partial_sums[cluster_id][1] + partial_sums[cluster_id][2] + partial_sums[cluster_id][3]
|
||||
+ partial_sums[cluster_id][4] + partial_sums[cluster_id][5] + partial_sums[cluster_id][6] + partial_sums[cluster_id][7]);
|
||||
src1q[src1q_idx].s = f16(s);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
}
|
||||
@@ -88,7 +88,7 @@ static bool ggml_zendnn_matmul(ggml_backend_zendnn_context * ctx, int64_t m, int
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_zendnn_sgemm(ggml_backend_zendnn_context * ctx, int64_t m, int64_t n, int64_t k,
|
||||
static bool ggml_zendnn_gemm(ggml_backend_zendnn_context * ctx, int64_t m, int64_t n, int64_t k,
|
||||
const void * A, int64_t lda, const void * B, int64_t ldb, void * C,
|
||||
int64_t ldc, int Atype, int Btype, int Ctype) {
|
||||
|
||||
@@ -200,7 +200,7 @@ static void ggml_zendnn_compute_forward_mul_mat(
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
const void* wdata = (src1->type == vec_dot_type || src0->type == GGML_TYPE_Q8_0) ? src1->data : work_data;
|
||||
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
||||
if (!ggml_zendnn_sgemm(ctx,
|
||||
if (!ggml_zendnn_gemm(ctx,
|
||||
ne01, // m
|
||||
ne11, // n
|
||||
ne10, // k
|
||||
@@ -213,7 +213,7 @@ static void ggml_zendnn_compute_forward_mul_mat(
|
||||
src0->type,
|
||||
src0->type == GGML_TYPE_Q8_0 ? GGML_TYPE_F32 : vec_dot_type,
|
||||
dst->type))
|
||||
GGML_ABORT("%s: ZenDNN sgemm failed\n", __func__);
|
||||
GGML_ABORT("%s: ZenDNN gemm failed\n", __func__);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -355,7 +355,7 @@ static void ggml_zendnn_compute_forward_mul_mat_id(
|
||||
}
|
||||
|
||||
// batched gemm for all tokens in this expert
|
||||
if (!ggml_zendnn_sgemm(ctx,
|
||||
if (!ggml_zendnn_gemm(ctx,
|
||||
ne01, // m
|
||||
cne1, // n
|
||||
ne10, // k
|
||||
@@ -368,7 +368,7 @@ static void ggml_zendnn_compute_forward_mul_mat_id(
|
||||
src0->type,
|
||||
src0->type == GGML_TYPE_Q8_0 ? GGML_TYPE_F32 : vec_dot_type,
|
||||
dst->type)) {
|
||||
GGML_ABORT("%s: ZenDNN sgemm failed\n", __func__);
|
||||
GGML_ABORT("%s: ZenDNN gemm failed\n", __func__);
|
||||
}
|
||||
|
||||
// scatter output rows to destination
|
||||
|
||||
@@ -505,6 +505,7 @@ class MODEL_ARCH(IntEnum):
|
||||
LLAMA_EMBED = auto()
|
||||
MAINCODER = auto()
|
||||
KIMI_LINEAR = auto()
|
||||
TALKIE = auto()
|
||||
|
||||
|
||||
class VISION_PROJECTOR_TYPE(IntEnum):
|
||||
@@ -1021,6 +1022,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.LLAMA_EMBED: "llama-embed",
|
||||
MODEL_ARCH.MAINCODER: "maincoder",
|
||||
MODEL_ARCH.KIMI_LINEAR: "kimi-linear",
|
||||
MODEL_ARCH.TALKIE: "talkie",
|
||||
}
|
||||
|
||||
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
|
||||
@@ -4013,6 +4015,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||
MODEL_TENSOR.FFN_UP_SHEXP,
|
||||
],
|
||||
MODEL_ARCH.TALKIE: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.LAYER_OUT_SCALE,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
|
||||
@@ -34,6 +34,7 @@ class TensorNameMap:
|
||||
"encoder", # neobert
|
||||
"model.transformer.wte", # llada
|
||||
"embed_tokens", # qwen3-embedding
|
||||
"model.embed", # talkie
|
||||
),
|
||||
|
||||
# Token type embeddings
|
||||
@@ -259,6 +260,7 @@ class TensorNameMap:
|
||||
"model.transformer.blocks.{bid}.q_proj", # llada
|
||||
"layers.{bid}.self_attn.q_proj", # qwen3-embedding
|
||||
"backbone.layers.{bid}.mixer.q_proj", # nemotron-h
|
||||
"model.blocks.{bid}.attn.attn_query", # talkie
|
||||
),
|
||||
|
||||
# Attention key
|
||||
@@ -279,6 +281,7 @@ class TensorNameMap:
|
||||
"model.transformer.blocks.{bid}.k_proj", # llada
|
||||
"layers.{bid}.self_attn.k_proj", # qwen3-embedding
|
||||
"backbone.layers.{bid}.mixer.k_proj", # nemotron-h
|
||||
"model.blocks.{bid}.attn.attn_key", # talkie
|
||||
),
|
||||
|
||||
# Attention value
|
||||
@@ -298,6 +301,7 @@ class TensorNameMap:
|
||||
"model.transformer.blocks.{bid}.v_proj", # llada
|
||||
"layers.{bid}.self_attn.v_proj", # qwen3-embedding
|
||||
"backbone.layers.{bid}.mixer.v_proj", # nemotron-h
|
||||
"model.blocks.{bid}.attn.attn_value", # talkie
|
||||
),
|
||||
|
||||
# Attention output
|
||||
@@ -336,6 +340,7 @@ class TensorNameMap:
|
||||
"layers.{bid}.self_attn.o_proj", # qwen3-embedding
|
||||
"backbone.layers.{bid}.mixer.o_proj", # nemotron-h
|
||||
"model.layers.{bid}.self_attn.language_expert_dense", # cogvlm
|
||||
"model.blocks.{bid}.attn.attn_resid", # talkie
|
||||
),
|
||||
|
||||
# Attention output norm
|
||||
@@ -508,6 +513,7 @@ class TensorNameMap:
|
||||
"layers.{bid}.mlp.up_proj", # qwen3-embedding
|
||||
"backbone.layers.{bid}.mixer.up_proj", # nemotron-h
|
||||
"model.layers.{bid}.mlp.language_mlp.up_proj", # cogvlm
|
||||
"model.blocks.{bid}.mlp.mlp_linear", # talkie
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_UP_EXP: (
|
||||
@@ -561,6 +567,7 @@ class TensorNameMap:
|
||||
"model.transformer.blocks.{bid}.ff_proj", # llada
|
||||
"layers.{bid}.mlp.gate_proj", # qwen3-embedding
|
||||
"model.layers.{bid}.mlp.language_mlp.gate_proj", # cogvlm
|
||||
"model.blocks.{bid}.mlp.mlp_gate", # talkie
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_EXP: (
|
||||
@@ -636,6 +643,7 @@ class TensorNameMap:
|
||||
"layers.{bid}.mlp.down_proj", # qwen3-embedding
|
||||
"backbone.layers.{bid}.mixer.down_proj", # nemotron-h
|
||||
"model.layers.{bid}.mlp.language_mlp.down_proj", # cogvlm
|
||||
"model.blocks.{bid}.mlp.mlp_resid", # talkie
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||
@@ -682,6 +690,7 @@ class TensorNameMap:
|
||||
"model.layers.layers.{bid}.mixer.q_norm", # plamo3
|
||||
"layers.{bid}.self_attn.q_norm", # qwen3-embedding
|
||||
"model.layers.{bid}.attention.query_layernorm", # apertus
|
||||
"model.blocks.{bid}.attn.head_gain.head_g", # talkie
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_K_NORM: (
|
||||
@@ -716,6 +725,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.LAYER_OUT_SCALE: (
|
||||
"model.layers.{bid}.layer_scalar", # gemma4
|
||||
"model.blocks.{bid}.embed_skip.a_g", # talkie
|
||||
),
|
||||
|
||||
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: (
|
||||
|
||||
@@ -133,6 +133,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_LLAMA_EMBED, "llama-embed" },
|
||||
{ LLM_ARCH_MAINCODER, "maincoder" },
|
||||
{ LLM_ARCH_KIMI_LINEAR, "kimi-linear" },
|
||||
{ LLM_ARCH_TALKIE, "talkie" },
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
|
||||
@@ -137,6 +137,7 @@ enum llm_arch {
|
||||
LLM_ARCH_LLAMA_EMBED,
|
||||
LLM_ARCH_MAINCODER,
|
||||
LLM_ARCH_KIMI_LINEAR,
|
||||
LLM_ARCH_TALKIE,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
|
||||
@@ -44,6 +44,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
|
||||
return new llama_model_llama_embed(params);
|
||||
case LLM_ARCH_MAINCODER:
|
||||
return new llama_model_maincoder(params);
|
||||
case LLM_ARCH_TALKIE:
|
||||
return new llama_model_talkie(params);
|
||||
case LLM_ARCH_DECI:
|
||||
return new llama_model_deci(params);
|
||||
case LLM_ARCH_BAICHUAN:
|
||||
@@ -2353,6 +2355,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_QWEN3NEXT:
|
||||
case LLM_ARCH_MIMO2:
|
||||
case LLM_ARCH_STEP35:
|
||||
case LLM_ARCH_TALKIE:
|
||||
return LLAMA_ROPE_TYPE_NEOX;
|
||||
|
||||
case LLM_ARCH_QWEN2VL:
|
||||
|
||||
@@ -488,7 +488,7 @@ struct llama_layer {
|
||||
struct ggml_tensor * indexer_attn_k = nullptr;
|
||||
struct ggml_tensor * indexer_attn_q_b = nullptr; // note: for lora a/b, not bias
|
||||
|
||||
// gemma4 layer output scale
|
||||
// gemma4 layer output scale, reused for talkie embedding skip scale
|
||||
struct ggml_tensor * out_scale = nullptr;
|
||||
|
||||
struct llama_layer_posnet posnet;
|
||||
|
||||
@@ -511,6 +511,14 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
};
|
||||
byte_encode = false;
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_MINICPM5:
|
||||
regex_exprs = {
|
||||
// original regex from tokenizer.json (openbmb/MiniCPM5-1B)
|
||||
"\\p{N}{1,3}",
|
||||
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
default:
|
||||
// default regex for BPE tokenization pre-processing
|
||||
regex_exprs = {
|
||||
@@ -2039,6 +2047,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
} else if (tokenizer_pre == "default") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
} else if (tokenizer_pre == "minicpm5") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_MINICPM5;
|
||||
ignore_merges = true;
|
||||
} else if (
|
||||
tokenizer_pre == "llama3" ||
|
||||
tokenizer_pre == "llama-v3" ||
|
||||
@@ -2196,7 +2207,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
} else if (
|
||||
tokenizer_pre == "gpt-4o" ||
|
||||
tokenizer_pre == "llama4" ||
|
||||
tokenizer_pre == "kanana2") {
|
||||
tokenizer_pre == "kanana2" ||
|
||||
tokenizer_pre == "talkie") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
|
||||
clean_spaces = false;
|
||||
} else if (
|
||||
|
||||
@@ -60,6 +60,7 @@ enum llama_vocab_pre_type {
|
||||
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
|
||||
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
|
||||
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
|
||||
LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52,
|
||||
};
|
||||
|
||||
struct LLM_KV;
|
||||
|
||||
@@ -177,9 +177,9 @@ llama_model_mistral3::graph::graph(const llama_model & model, const llm_graph_pa
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
||||
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up_s,
|
||||
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, model.layers[il].ffn_gate_s,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down_s,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
@@ -200,7 +200,11 @@ llama_model_mistral3::graph::graph(const llama_model & model, const llm_graph_pa
|
||||
LLM_FFN_SILU, true,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
il,
|
||||
nullptr, nullptr,
|
||||
model.layers[il].ffn_up_exps_s,
|
||||
model.layers[il].ffn_gate_exps_s,
|
||||
model.layers[il].ffn_down_exps_s);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
}
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
@@ -186,6 +186,19 @@ struct llama_model_maincoder : public llama_model_base {
|
||||
};
|
||||
|
||||
|
||||
struct llama_model_talkie : public llama_model_base {
|
||||
llama_model_talkie(const struct llama_model_params & params) : llama_model_base(params) {}
|
||||
void load_arch_hparams(llama_model_loader & ml) override;
|
||||
void load_arch_tensors(llama_model_loader & ml) override;
|
||||
|
||||
struct graph : public llm_graph_context {
|
||||
graph(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
|
||||
};
|
||||
|
||||
|
||||
struct llama_model_deci : public llama_model_base {
|
||||
llama_model_deci(const struct llama_model_params & params) : llama_model_base(params) {}
|
||||
void load_arch_hparams(llama_model_loader & ml) override;
|
||||
|
||||
149
src/models/talkie.cpp
Normal file
149
src/models/talkie.cpp
Normal file
@@ -0,0 +1,149 @@
|
||||
#include "models.h"
|
||||
|
||||
void llama_model_talkie::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 40: type = LLM_TYPE_13B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
}
|
||||
|
||||
void llama_model_talkie::load_arch_tensors(llama_model_loader &) {
|
||||
LLAMA_LOAD_LOCALS;
|
||||
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
|
||||
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
|
||||
// no k gain
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {1, n_head}, 0);
|
||||
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
||||
|
||||
layer.out_scale = create_tensor(tn(LLM_TENSOR_LAYER_OUT_SCALE, "weight", i), {1}, 0);
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<llm_graph_context> llama_model_talkie::build_arch_graph(const llm_graph_params & params) const {
|
||||
return std::make_unique<graph>(*this, params);
|
||||
}
|
||||
|
||||
llama_model_talkie::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
inpL = build_norm(inpL, nullptr, nullptr, LLM_NORM_RMS, -1);
|
||||
cb(inpL, "inp_norm", -1);
|
||||
|
||||
ggml_tensor * embd_skip = inpL;
|
||||
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv();
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
ggml_tensor * inp_skip = embd_skip;
|
||||
|
||||
cur = build_norm(inpL, nullptr, nullptr, LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
|
||||
n_embd_head, n_head, n_head_kv, il);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
|
||||
// reference applies qknorm after rope
|
||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(Qcur, "Qcur_norm", il);
|
||||
|
||||
Kcur = build_norm(Kcur, nullptr, nullptr, LLM_NORM_RMS, il);
|
||||
cb(Kcur, "Kcur_norm", il);
|
||||
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, nullptr, model.layers[il].wo_s,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
inp_skip = ggml_get_rows(ctx0, inp_skip, inp_out_ids);
|
||||
}
|
||||
|
||||
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
cur = build_norm(ffn_inp, nullptr, nullptr, LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, nullptr, nullptr,
|
||||
model.layers[il].ffn_gate, nullptr, nullptr,
|
||||
model.layers[il].ffn_down, nullptr, model.layers[il].ffn_down_s,
|
||||
nullptr,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
ggml_tensor * skip = ggml_mul(ctx0, inp_skip, model.layers[il].out_scale);
|
||||
cb(skip, "embd_skip", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, skip);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = build_norm(cur, nullptr, nullptr, LLM_NORM_RMS, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
res->t_embd = cur;
|
||||
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = ggml_scale(ctx0, cur, hparams.f_logit_scale);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
res->t_logits = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
@@ -630,10 +630,11 @@ std::optional<gguf_remote_model> gguf_fetch_model_meta(
|
||||
}
|
||||
|
||||
for (int i = 2; i <= model.n_split; i++) {
|
||||
char num_buf[6], total_buf[6];
|
||||
snprintf(num_buf, sizeof(num_buf), "%05d", i);
|
||||
snprintf(total_buf, sizeof(total_buf), "%05d", (int)model.n_split);
|
||||
std::string shard_name = split_prefix + "-" + num_buf + "-of-" + total_buf + ".gguf";
|
||||
char buf_num[32];
|
||||
char buf_tot[32];
|
||||
snprintf(buf_num, sizeof(buf_num), "%05d", i);
|
||||
snprintf(buf_tot, sizeof(buf_tot), "%05d", (int)model.n_split);
|
||||
std::string shard_name = split_prefix + "-" + buf_num + "-of-" + buf_tot + ".gguf";
|
||||
|
||||
auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part, verbose);
|
||||
if (!shard.has_value()) {
|
||||
@@ -704,10 +705,11 @@ gguf_context_ptr gguf_fetch_gguf_ctx(
|
||||
}
|
||||
|
||||
for (int i = 2; i <= model.n_split; i++) {
|
||||
char num_buf[6], total_buf[6];
|
||||
snprintf(num_buf, sizeof(num_buf), "%05d", i);
|
||||
snprintf(total_buf, sizeof(total_buf), "%05d", (int)model.n_split);
|
||||
std::string shard_name = split_prefix + "-" + num_buf + "-of-" + total_buf + ".gguf";
|
||||
char buf_num[32];
|
||||
char buf_tot[32];
|
||||
snprintf(buf_num, sizeof(buf_num), "%05d", i);
|
||||
snprintf(buf_tot, sizeof(buf_tot), "%05d", (int)model.n_split);
|
||||
std::string shard_name = split_prefix + "-" + buf_num + "-of-" + buf_tot + ".gguf";
|
||||
|
||||
auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part, verbose);
|
||||
if (!shard.has_value()) {
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
#include <ggml-cpp.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <array>
|
||||
#include <cfloat>
|
||||
#include <cinttypes>
|
||||
@@ -33,6 +34,7 @@
|
||||
#include <future>
|
||||
#include <fstream>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <random>
|
||||
#include <regex>
|
||||
#include <set>
|
||||
@@ -55,33 +57,24 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
||||
{
|
||||
// parallel initialization
|
||||
static const size_t n_threads = N_THREADS;
|
||||
// static RNG initialization (revisit if n_threads stops being constant)
|
||||
static std::vector<std::default_random_engine> generators = []() {
|
||||
std::random_device rd;
|
||||
std::vector<std::default_random_engine> vec;
|
||||
vec.reserve(n_threads);
|
||||
//for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
|
||||
for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
|
||||
return vec;
|
||||
}();
|
||||
|
||||
auto init_thread = [&](size_t ith, size_t start, size_t end) {
|
||||
auto init_thread = [&](size_t start, size_t end) {
|
||||
thread_local std::default_random_engine gen(std::random_device{}());
|
||||
std::uniform_real_distribution<float> distribution(min, max);
|
||||
auto & gen = generators[ith];
|
||||
for (size_t i = start; i < end; i++) {
|
||||
data[i] = distribution(gen);
|
||||
}
|
||||
};
|
||||
|
||||
if (n_threads == 1) {
|
||||
init_thread(0, 0, nels);
|
||||
init_thread(0, nels);
|
||||
} else {
|
||||
std::vector<std::future<void>> tasks;
|
||||
tasks.reserve(n_threads);
|
||||
for (size_t i = 0; i < n_threads; i++) {
|
||||
size_t start = i*nels/n_threads;
|
||||
size_t end = (i+1)*nels/n_threads;
|
||||
tasks.push_back(std::async(std::launch::async, init_thread, i, start, end));
|
||||
tasks.push_back(std::async(std::launch::async, init_thread, start, end));
|
||||
}
|
||||
for (auto & t : tasks) {
|
||||
t.get();
|
||||
@@ -516,6 +509,25 @@ static bool output_format_from_str(const std::string & s, output_formats & forma
|
||||
return true;
|
||||
}
|
||||
|
||||
static std::string test_time_now() {
|
||||
time_t t = time(NULL);
|
||||
struct tm tm_buf;
|
||||
#ifdef _WIN32
|
||||
if (gmtime_s(&tm_buf, &t) != 0) {
|
||||
return "";
|
||||
}
|
||||
#else
|
||||
if (gmtime_r(&t, &tm_buf) == nullptr) {
|
||||
return "";
|
||||
}
|
||||
#endif
|
||||
char buf[32];
|
||||
if (std::strftime(buf, sizeof(buf), "%FT%TZ", &tm_buf) == 0) {
|
||||
return "";
|
||||
}
|
||||
return buf;
|
||||
}
|
||||
|
||||
// Test result structure for SQL output
|
||||
struct test_result {
|
||||
std::string test_time;
|
||||
@@ -545,11 +557,7 @@ struct test_result {
|
||||
supported = false;
|
||||
passed = false;
|
||||
|
||||
// Set test time
|
||||
time_t t = time(NULL);
|
||||
char buf[32];
|
||||
std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
|
||||
test_time = buf;
|
||||
test_time = test_time_now();
|
||||
|
||||
// Set build info
|
||||
build_commit = ggml_commit();
|
||||
@@ -573,11 +581,7 @@ struct test_result {
|
||||
n_runs(n_runs),
|
||||
device_description(device_description),
|
||||
backend_reg_name(backend_reg_name) {
|
||||
// Set test time
|
||||
time_t t = time(NULL);
|
||||
char buf[32];
|
||||
std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
|
||||
test_time = buf;
|
||||
test_time = test_time_now();
|
||||
|
||||
// Set build info
|
||||
build_commit = ggml_commit();
|
||||
@@ -1110,6 +1114,17 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
|
||||
GGML_ABORT("invalid output format");
|
||||
}
|
||||
|
||||
static std::mutex g_test_output_mutex;
|
||||
|
||||
static void print_test_result_locked(printer * output_printer, const test_result & result) {
|
||||
if (output_printer == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> guard(g_test_output_mutex);
|
||||
output_printer->print_test_result(result);
|
||||
}
|
||||
|
||||
struct test_case {
|
||||
virtual ~test_case() {}
|
||||
|
||||
@@ -1338,9 +1353,7 @@ struct test_case {
|
||||
test_result result(ggml_backend_name(backend1), current_op_name, vars(), "test",
|
||||
false, false, "not supported");
|
||||
|
||||
if (output_printer) {
|
||||
output_printer->print_test_result(result);
|
||||
}
|
||||
print_test_result_locked(output_printer, result);
|
||||
|
||||
ggml_free(ctx);
|
||||
return test_status_t::NOT_SUPPORTED;
|
||||
@@ -1462,9 +1475,7 @@ struct test_case {
|
||||
test_result result(ggml_backend_name(backend1), current_op_name, vars(), "test", supported, test_passed,
|
||||
error_msg);
|
||||
|
||||
if (output_printer) {
|
||||
output_printer->print_test_result(result);
|
||||
}
|
||||
print_test_result_locked(output_printer, result);
|
||||
|
||||
return test_passed ? test_status_t::OK : test_status_t::FAIL;
|
||||
}
|
||||
@@ -9493,8 +9504,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_from_file(const c
|
||||
return test_cases;
|
||||
}
|
||||
|
||||
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_names_filter, const char * params_filter,
|
||||
printer * output_printer, const char * test_file_path) {
|
||||
static bool test_backend(ggml_backend_t backend, ggml_backend_dev_t dev, test_mode mode, const char * op_names_filter, const char * params_filter,
|
||||
printer * output_printer, const char * test_file_path, int parallel_workers) {
|
||||
auto filter_test_cases = [](std::vector<std::unique_ptr<test_case>> & test_cases, const char * params_filter) {
|
||||
if (params_filter == nullptr) {
|
||||
return;
|
||||
@@ -9547,21 +9558,90 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
set_use_ref(backend_cpu, true);
|
||||
}
|
||||
|
||||
size_t n_ok = 0;
|
||||
size_t tests_run = 0;
|
||||
std::atomic<size_t> n_ok = 0;
|
||||
std::atomic<size_t> tests_run = 0;
|
||||
std::vector<std::string> failed_tests;
|
||||
for (auto & test : test_cases) {
|
||||
test_status_t status = test->eval(backend, backend_cpu, op_names_filter, output_printer);
|
||||
if (status == test_status_t::SKIPPED || status == test_status_t::NOT_SUPPORTED) {
|
||||
continue;
|
||||
std::mutex failed_tests_mutex;
|
||||
|
||||
// Each worker grabs a chunk of cases at a time. The chunk shrinks as we
|
||||
// run out of work so that a few slow tests at the tail get spread across
|
||||
// workers instead of landing on one unlucky thread.
|
||||
constexpr size_t MAX_TESTS_PER_ITER = 100;
|
||||
std::atomic<size_t> test_idx = 0;
|
||||
|
||||
const auto & next_chunk = [&](size_t & my_begin, size_t & my_end) {
|
||||
const size_t cur = test_idx.load(std::memory_order_relaxed);
|
||||
const size_t remaining = cur < test_cases.size() ? test_cases.size() - cur : 0;
|
||||
const size_t chunk = std::max<size_t>(1, std::min<size_t>(MAX_TESTS_PER_ITER, remaining / parallel_workers));
|
||||
my_begin = test_idx.fetch_add(chunk);
|
||||
my_end = std::min(my_begin + chunk, test_cases.size());
|
||||
};
|
||||
|
||||
const auto & run_tests = [&](ggml_backend_t b, ggml_backend_t b_cpu) {
|
||||
size_t my_begin, my_end;
|
||||
next_chunk(my_begin, my_end);
|
||||
while (my_begin < test_cases.size()) {
|
||||
for (size_t i = my_begin; i < my_end; ++i) {
|
||||
auto & test = test_cases[i];
|
||||
test_status_t status = test->eval(b, b_cpu, op_names_filter, output_printer);
|
||||
if (status == test_status_t::SKIPPED || status == test_status_t::NOT_SUPPORTED) {
|
||||
continue;
|
||||
}
|
||||
tests_run++;
|
||||
if (status == test_status_t::OK) {
|
||||
n_ok++;
|
||||
} else if (status == test_status_t::FAIL) {
|
||||
std::lock_guard<std::mutex> guard(failed_tests_mutex);
|
||||
failed_tests.push_back(test->current_op_name + "(" + test->vars() + ")");
|
||||
}
|
||||
}
|
||||
next_chunk(my_begin, my_end);
|
||||
}
|
||||
tests_run++;
|
||||
if (status == test_status_t::OK) {
|
||||
n_ok++;
|
||||
} else if (status == test_status_t::FAIL) {
|
||||
failed_tests.push_back(test->current_op_name + "(" + test->vars() + ")");
|
||||
};
|
||||
|
||||
if (parallel_workers <= 1) {
|
||||
// Reuse the outer backend / backend_cpu so we don't pay an
|
||||
// extra CPU backend init.
|
||||
run_tests(backend, backend_cpu);
|
||||
} else {
|
||||
std::atomic<size_t> workers_started = 0;
|
||||
|
||||
const auto & eval_worker = [&]() {
|
||||
ggml_backend_t b = ggml_backend_dev_init(dev, NULL);
|
||||
if (b == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_backend_t b_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
|
||||
if (b_cpu == NULL) {
|
||||
ggml_backend_free(b);
|
||||
return;
|
||||
}
|
||||
|
||||
if (set_use_ref) {
|
||||
set_use_ref(b_cpu, true);
|
||||
}
|
||||
workers_started++;
|
||||
run_tests(b, b_cpu);
|
||||
ggml_backend_free(b_cpu);
|
||||
ggml_backend_free(b);
|
||||
};
|
||||
|
||||
std::vector<std::thread> threads;
|
||||
threads.reserve(parallel_workers);
|
||||
for (int i = 0; i < parallel_workers; ++i) {
|
||||
threads.emplace_back(eval_worker);
|
||||
}
|
||||
for (auto & t : threads) {
|
||||
t.join();
|
||||
}
|
||||
|
||||
if (workers_started == 0 && !test_cases.empty()) {
|
||||
ggml_backend_free(backend_cpu);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
output_printer->print_summary(test_summary_info(n_ok, tests_run, false));
|
||||
output_printer->print_failed_tests(failed_tests);
|
||||
|
||||
@@ -9709,7 +9789,7 @@ static void show_test_coverage() {
|
||||
|
||||
static void usage(char ** argv) {
|
||||
printf("Usage: %s [mode] [-o <op,..>] [-b <backend>] [-p <params regex>] [--output <console|sql|csv>] [--list-ops]", argv[0]);
|
||||
printf(" [--show-coverage] [--test-file <path>]\n");
|
||||
printf(" [--show-coverage] [--test-file <path>] [-j <n>]\n");
|
||||
printf(" valid modes:\n");
|
||||
printf(" - test (default, compare with CPU backend for correctness)\n");
|
||||
printf(" - grad (compare gradients from backpropagation with method of finite differences)\n");
|
||||
@@ -9721,6 +9801,7 @@ static void usage(char ** argv) {
|
||||
printf(" --list-ops lists all available GGML operations\n");
|
||||
printf(" --show-coverage shows test coverage\n");
|
||||
printf(" --test-file reads test operators from a test file generated by llama-export-graph-ops\n");
|
||||
printf(" -j <n> runs tests using <n> parallel worker threads (default: 1, test mode only)\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
@@ -9730,6 +9811,7 @@ int main(int argc, char ** argv) {
|
||||
const char * backend_filter = nullptr;
|
||||
const char * params_filter = nullptr;
|
||||
const char * test_file_path = nullptr;
|
||||
int parallel_workers = 1;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
if (strcmp(argv[i], "test") == 0) {
|
||||
@@ -9784,6 +9866,17 @@ int main(int argc, char ** argv) {
|
||||
usage(argv);
|
||||
return 1;
|
||||
}
|
||||
} else if (strcmp(argv[i], "-j") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
parallel_workers = atoi(argv[++i]);
|
||||
if (parallel_workers < 1) {
|
||||
usage(argv);
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
usage(argv);
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
usage(argv);
|
||||
return 1;
|
||||
@@ -9836,7 +9929,7 @@ int main(int argc, char ** argv) {
|
||||
false, "", ggml_backend_dev_description(dev),
|
||||
total / 1024 / 1024, free / 1024 / 1024, true));
|
||||
|
||||
bool ok = test_backend(backend, mode, op_names_filter, params_filter, output_printer.get(), test_file_path);
|
||||
bool ok = test_backend(backend, dev, mode, op_names_filter, params_filter, output_printer.get(), test_file_path, parallel_workers);
|
||||
|
||||
if (ok) {
|
||||
n_ok++;
|
||||
|
||||
@@ -88,11 +88,11 @@
|
||||
| `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_FILE_V) |
|
||||
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
|
||||
| `--log-disable` | Log disable |
|
||||
| `--log-file FNAME` | Log to file<br/>(env: LLAMA_LOG_FILE) |
|
||||
| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_LOG_COLORS) |
|
||||
| `--log-file FNAME` | Log to file<br/>(env: LLAMA_ARG_LOG_FILE) |
|
||||
| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_ARG_LOG_COLORS) |
|
||||
| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
|
||||
| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_OFFLINE) |
|
||||
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: debug<br/>(default: 3)<br/><br/>(env: LLAMA_LOG_VERBOSITY) |
|
||||
| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_ARG_OFFLINE) |
|
||||
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: trace (more info)<br/> - 5: debug<br/>(default: 3)<br/><br/>(env: LLAMA_ARG_LOG_VERBOSITY) |
|
||||
| `--log-prefix, --no-log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_ARG_LOG_PREFIX) |
|
||||
| `--log-timestamps, --no-log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_ARG_LOG_TIMESTAMPS) |
|
||||
| `--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) |
|
||||
@@ -165,7 +165,7 @@
|
||||
| `--image, --audio FILE` | path to an image or audio file. use with multimodal models, use comma-separated values for multiple files |
|
||||
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
|
||||
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
|
||||
| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
|
||||
| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
|
||||
| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
|
||||
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
|
||||
| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
|
||||
@@ -194,6 +194,7 @@
|
||||
| `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) |
|
||||
| `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) |
|
||||
| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) |
|
||||
| `--spec-draft-backend-sampling, --no-spec-draft-backend-sampling` | offload draft sampling to the backend (default: enabled)<br/>(env: LLAMA_ARG_SPEC_DRAFT_BACKEND_SAMPLING) |
|
||||
| `--spec-draft-device, -devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
|
||||
| `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
|
||||
| `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_MODEL) |
|
||||
|
||||
@@ -171,11 +171,11 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
|
||||
| `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_FILE_V) |
|
||||
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
|
||||
| `--log-disable` | Log disable |
|
||||
| `--log-file FNAME` | Log to file<br/>(env: LLAMA_LOG_FILE) |
|
||||
| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_LOG_COLORS) |
|
||||
| `--log-file FNAME` | Log to file<br/>(env: LLAMA_ARG_LOG_FILE) |
|
||||
| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_ARG_LOG_COLORS) |
|
||||
| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
|
||||
| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_OFFLINE) |
|
||||
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: debug<br/>(default: 3)<br/><br/>(env: LLAMA_LOG_VERBOSITY) |
|
||||
| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_ARG_OFFLINE) |
|
||||
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: trace (more info)<br/> - 5: debug<br/>(default: 3)<br/><br/>(env: LLAMA_ARG_LOG_VERBOSITY) |
|
||||
| `--log-prefix, --no-log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_ARG_LOG_PREFIX) |
|
||||
| `--log-timestamps, --no-log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_ARG_LOG_TIMESTAMPS) |
|
||||
| `--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) |
|
||||
|
||||
@@ -105,11 +105,11 @@ For the full list of features, please refer to [server's changelog](https://gith
|
||||
| `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_FILE_V) |
|
||||
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
|
||||
| `--log-disable` | Log disable |
|
||||
| `--log-file FNAME` | Log to file<br/>(env: LLAMA_LOG_FILE) |
|
||||
| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_LOG_COLORS) |
|
||||
| `--log-file FNAME` | Log to file<br/>(env: LLAMA_ARG_LOG_FILE) |
|
||||
| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_ARG_LOG_COLORS) |
|
||||
| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
|
||||
| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_OFFLINE) |
|
||||
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: debug<br/>(default: 3)<br/><br/>(env: LLAMA_LOG_VERBOSITY) |
|
||||
| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_ARG_OFFLINE) |
|
||||
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: trace (more info)<br/> - 5: debug<br/>(default: 3)<br/><br/>(env: LLAMA_ARG_LOG_VERBOSITY) |
|
||||
| `--log-prefix, --no-log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_ARG_LOG_PREFIX) |
|
||||
| `--log-timestamps, --no-log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_ARG_LOG_TIMESTAMPS) |
|
||||
| `--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) |
|
||||
@@ -204,7 +204,7 @@ For the full list of features, please refer to [server's changelog](https://gith
|
||||
| `--api-key-file FNAME` | path to file containing API keys (default: none) |
|
||||
| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
|
||||
| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
|
||||
| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
|
||||
| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
|
||||
| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
|
||||
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
|
||||
| `--cache-prompt, --no-cache-prompt` | whether to enable prompt caching (default: enabled)<br/>(env: LLAMA_ARG_CACHE_PROMPT) |
|
||||
@@ -249,6 +249,7 @@ For the full list of features, please refer to [server's changelog](https://gith
|
||||
| `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) |
|
||||
| `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) |
|
||||
| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) |
|
||||
| `--spec-draft-backend-sampling, --no-spec-draft-backend-sampling` | offload draft sampling to the backend (default: enabled)<br/>(env: LLAMA_ARG_SPEC_DRAFT_BACKEND_SAMPLING) |
|
||||
| `--spec-draft-device, -devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
|
||||
| `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
|
||||
| `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_MODEL) |
|
||||
|
||||
@@ -99,6 +99,7 @@ bool server_http_context::init(const common_params & params) {
|
||||
srv.reset(
|
||||
new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str())
|
||||
);
|
||||
is_ssl = true;
|
||||
} else {
|
||||
SRV_INF("%s", "running without SSL\n");
|
||||
srv.reset(new httplib::Server());
|
||||
@@ -378,8 +379,8 @@ bool server_http_context::start() {
|
||||
thread = std::thread([this]() { pimpl->srv->listen_after_bind(); });
|
||||
srv->wait_until_ready();
|
||||
|
||||
listening_address = is_sock ? string_format("unix://%s", hostname.c_str())
|
||||
: string_format("http://%s:%d", hostname.c_str(), port);
|
||||
listening_address = is_sock ? string_format("unix://%s", hostname.c_str())
|
||||
: string_format("%s://%s:%d", is_ssl ? "https" : "http", hostname.c_str(), port);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -74,6 +74,7 @@ struct server_http_context {
|
||||
std::string path_prefix;
|
||||
std::string hostname;
|
||||
int port;
|
||||
bool is_ssl = false;
|
||||
|
||||
server_http_context();
|
||||
~server_http_context();
|
||||
|
||||
Reference in New Issue
Block a user