Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-04-30 16:47:31 +03:00)

Compare commits: 45 commits
| SHA1 |
|---|
| 45cac7ca70 |
| b94050e896 |
| a279d0f0f4 |
| 268d61e178 |
| 6990e2f1f7 |
| fcc7508759 |
| 5e6c0e18b6 |
| 30dce2cf29 |
| 089dd41fe3 |
| 85dde8dc4a |
| 4fbdabdc61 |
| e45dbdece8 |
| 4adac43f6f |
| 9db77a020c |
| f772f6e434 |
| b572d1ecd6 |
| 03b3d07798 |
| 3f7c29d318 |
| ae2d34899e |
| 1e796eb41f |
| 5637536517 |
| 90fb96a7b3 |
| 82677a6ede |
| 8612ed18b7 |
| b1be68e8ca |
| 408225bb1a |
| b3d758750a |
| 7e72b38bc1 |
| 20d3bc2cc8 |
| a6206958d2 |
| 014dca49d6 |
| adb541a6ad |
| 80d8770804 |
| 8dc530b86d |
| e1a9a6dcbe |
| e39eba26f3 |
| 5d14e5d19b |
| fae3a28070 |
| c0de6eda72 |
| 707c0b7a6e |
| 1f30ac0cea |
| f4b5bf2f32 |
| aa0f1897b7 |
| be76dd0bb2 |
| 2e05f06ffb |
@@ -18,6 +18,7 @@
vulkan-loader,
openssl,
shaderc,
spirv-headers,
useBlas ?
builtins.all (x: !x) [
useCuda
@@ -145,6 +146,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
ninja
pkg-config
git
spirv-headers
]
++ optionals useCuda [
cudaPackages.cuda_nvcc
@@ -7,7 +7,7 @@ RUN apt update && apt install -y git build-essential cmake wget xz-utils

# Install SSL and Vulkan SDK dependencies
RUN apt install -y libssl-dev curl \
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc spirv-headers

# Build it
WORKDIR /app
.github/workflows/build-android.yml (vendored): 2 changed lines

@@ -51,7 +51,7 @@ jobs:
distribution: zulu

- name: Setup Android SDK
uses: android-actions/setup-android@9fc6c4e9069bf8d3d10b2204b1fb8f6ef7065407 # v3
uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
with:
log-accepted-android-sdk-licenses: false
.github/workflows/build-riscv.yml (vendored): 24 changed lines

@@ -47,22 +47,10 @@ jobs:
steps:
- name: Install dependencies
run: |
sudo apt-get update

# Install necessary packages
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 cmake build-essential wget git-lfs

# Set gcc-14 and g++-14 as the default compilers
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100

if ! which rustc; then
# Install Rust stable version
sudo apt-get install -y rustup
rustup install stable
rustup default stable
fi

git lfs install

- name: GCC version check
@@ -74,12 +62,12 @@ jobs:
id: checkout
uses: actions/checkout@v6

# FIXME: Enable when ggml-org/ccache-action works on riscv64
# - name: ccache
# uses: ggml-org/ccache-action@v1.2.21
# with:
# key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanytizer }}-${{ matrix.build_type }}
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: ccache
uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
with:
key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

- name: Build
id: cmake_build
.github/workflows/build-self-hosted.yml (vendored): 138 changed lines
@@ -97,6 +97,36 @@ jobs:
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
# TODO: investigate slight precision issues in some operations for test-backend-ops on the WebGPU backend.
|
||||
#ggml-ci-nvidia-webgpu:
|
||||
# runs-on: [self-hosted, Linux, NVIDIA]
|
||||
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
|
||||
# - name: Dawn Dependency
|
||||
# id: dawn-depends
|
||||
# run: |
|
||||
# DAWN_VERSION="v20260317.182325"
|
||||
# DAWN_OWNER="google"
|
||||
# DAWN_REPO="dawn"
|
||||
# DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
|
||||
# echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
# curl -L -o artifact.tar.gz \
|
||||
# "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
# mkdir dawn
|
||||
# tar -xvf artifact.tar.gz -C dawn --strip-components=1
|
||||
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# GG_BUILD_WEBGPU=1 \
|
||||
# GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
# GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
|
||||
# bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMX-compatible machine
|
||||
#ggml-ci-cpu-amx:
|
||||
# runs-on: [self-hosted, Linux, CPU, AMX]
|
||||
@@ -141,61 +171,59 @@ jobs:
|
||||
# amd-smi static
|
||||
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
# TODO: sandbox Mac runners
|
||||
# ggml-ci-mac-metal:
|
||||
# runs-on: [self-hosted, macOS, ARM64]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
#
|
||||
# ggml-ci-mac-webgpu:
|
||||
# runs-on: [self-hosted, macOS, ARM64]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Dawn Dependency
|
||||
# id: dawn-depends
|
||||
# run: |
|
||||
# DAWN_VERSION="v2.0.0"
|
||||
# DAWN_OWNER="reeselevine"
|
||||
# DAWN_REPO="dawn"
|
||||
# DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
|
||||
# echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
|
||||
# curl -L -o artifact.zip \
|
||||
# "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
|
||||
# mkdir dawn
|
||||
# unzip artifact.zip
|
||||
# tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
# bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
#
|
||||
# ggml-ci-mac-vulkan:
|
||||
# runs-on: [self-hosted, macOS, ARM64]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# vulkaninfo --summary
|
||||
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
ggml-ci-mac-metal:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-webgpu:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dawn Dependency
|
||||
id: dawn-depends
|
||||
run: |
|
||||
DAWN_VERSION="v20260317.182325"
|
||||
DAWN_OWNER="google"
|
||||
DAWN_REPO="dawn"
|
||||
DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
|
||||
echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
curl -L -o artifact.tar.gz \
|
||||
"https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
mkdir dawn
|
||||
tar -xvf artifact.tar.gz -C dawn --strip-components=1
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-vulkan:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-linux-intel-vulkan:
|
||||
runs-on: [self-hosted, Linux, Intel]
|
||||
|
||||
.github/workflows/build-vulkan.yml (vendored): 3 changed lines

@@ -93,4 +93,5 @@ jobs:
export GGML_VK_DISABLE_F16=1
export GGML_VK_DISABLE_COOPMAT=1
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 4800
# test-backend-ops is too slow on llvmpipe, skip it
ctest -L main -E test-backend-ops --verbose --timeout 900
.github/workflows/build.yml (vendored): 77 changed lines
@@ -267,6 +267,56 @@ jobs:
|
||||
wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
|
||||
./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
android-arm64:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
NDK_VERSION: "29.0.14206865"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: android-arm64
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Set up JDK
|
||||
uses: actions/setup-java@v5
|
||||
with:
|
||||
java-version: 17
|
||||
distribution: temurin
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
|
||||
with:
|
||||
log-accepted-android-sdk-licenses: false
|
||||
|
||||
- name: Install NDK
|
||||
run: |
|
||||
sdkmanager "ndk;${{ env.NDK_VERSION }}"
|
||||
echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
|
||||
-DANDROID_ABI=arm64-v8a \
|
||||
-DANDROID_PLATFORM=android-28 \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_BACKEND_DL=ON \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-latest-rpc:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -318,7 +368,7 @@ jobs:
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev libssl-dev ninja-build
|
||||
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev spirv-headers libssl-dev ninja-build
|
||||
echo "CC=gcc-14" >> "$GITHUB_ENV"
|
||||
echo "CXX=g++-14" >> "$GITHUB_ENV"
|
||||
|
||||
@@ -1001,22 +1051,14 @@ jobs:
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
|
||||
# Install necessary packages
|
||||
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 cmake build-essential libssl-dev wget git-lfs
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y libssl-dev
|
||||
|
||||
# Set gcc-14 and g++-14 as the default compilers
|
||||
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
|
||||
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
|
||||
|
||||
if ! which rustc; then
|
||||
# Install Rust stable version
|
||||
sudo apt-get install -y rustup
|
||||
rustup install stable
|
||||
rustup default stable
|
||||
fi
|
||||
|
||||
git lfs install
|
||||
|
||||
- name: Check environment
|
||||
@@ -1032,13 +1074,12 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# FIXME: Enable when ggml-org/ccache-action works on riscv64
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: ubuntu-cpu-riscv64-native
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
|
||||
with:
|
||||
key: ubuntu-cpu-riscv64-native
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
|
||||
.github/workflows/release.yml (vendored): 75 changed lines
@@ -202,7 +202,7 @@ jobs:
|
||||
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libssl-dev
|
||||
else
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev libssl-dev ninja-build
|
||||
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev spirv-headers libssl-dev ninja-build
|
||||
echo "CC=gcc-14" >> "$GITHUB_ENV"
|
||||
echo "CXX=g++-14" >> "$GITHUB_ENV"
|
||||
fi
|
||||
@@ -236,6 +236,75 @@ jobs:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
|
||||
name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
|
||||
|
||||
android-arm64:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
NDK_VERSION: "29.0.14206865"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: android-arm64
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Set up JDK
|
||||
uses: actions/setup-java@v5
|
||||
with:
|
||||
java-version: 17
|
||||
distribution: temurin
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
|
||||
with:
|
||||
log-accepted-android-sdk-licenses: false
|
||||
|
||||
- name: Install NDK
|
||||
run: |
|
||||
sdkmanager "ndk;${{ env.NDK_VERSION }}"
|
||||
echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
|
||||
-DANDROID_ABI=arm64-v8a \
|
||||
-DANDROID_PLATFORM=android-28 \
|
||||
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
|
||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||
-DGGML_BACKEND_DL=ON \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz
|
||||
name: llama-bin-android-arm64.tar.gz
|
||||
|
||||
ubuntu-24-openvino:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
@@ -971,6 +1040,7 @@ jobs:
|
||||
- ubuntu-cpu
|
||||
- ubuntu-vulkan
|
||||
- ubuntu-24-openvino
|
||||
- android-arm64
|
||||
- macOS-cpu
|
||||
- ios-xcode-build
|
||||
- openEuler-cann
|
||||
@@ -1059,6 +1129,9 @@ jobs:
|
||||
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
|
||||
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
|
||||
|
||||
**Android:**
|
||||
- [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
|
||||
|
||||
**Windows:**
|
||||
- [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
|
||||
- [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
|
||||
|
||||
.github/workflows/server-self-hosted.yml (vendored): 77 changed lines
@@ -84,41 +84,42 @@ jobs:
|
||||
export ${{ matrix.extra_args }}
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
server-cuda:
|
||||
runs-on: [self-hosted, llama-server, Linux, NVIDIA]
|
||||
|
||||
name: server-cuda (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
build_type: [Release]
|
||||
wf_name: ["GPUx1"]
|
||||
include:
|
||||
- build_type: Release
|
||||
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "GPUx1, backend-sampling"
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
export ${{ matrix.extra_args }}
|
||||
pytest -v -x -m "not slow"
|
||||
# TODO: provision CUDA runner
|
||||
# server-cuda:
|
||||
# runs-on: [self-hosted, llama-server, Linux, NVIDIA]
|
||||
#
|
||||
# name: server-cuda (${{ matrix.wf_name }})
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build_type: [Release]
|
||||
# wf_name: ["GPUx1"]
|
||||
# include:
|
||||
# - build_type: Release
|
||||
# extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
# wf_name: "GPUx1, backend-sampling"
|
||||
# fail-fast: false
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# fetch-depth: 0
|
||||
# ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: |
|
||||
# cmake -B build -DGGML_SCHED_NO_REALLOC=ON
|
||||
# cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
|
||||
#
|
||||
# - name: Tests
|
||||
# id: server_integration_tests
|
||||
# if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
# run: |
|
||||
# cd tools/server/tests
|
||||
# python3 -m venv venv
|
||||
# source venv/bin/activate
|
||||
# pip install -r requirements.txt
|
||||
# export ${{ matrix.extra_args }}
|
||||
# pytest -v -x -m "not slow"
|
||||
|
||||
@@ -225,7 +225,7 @@ foreach(FILE_PATH ${EXTRA_LICENSES})
endforeach()

if (LLAMA_BUILD_COMMON)
license_generate(common)
license_generate(llama-common)
endif()

#
@@ -249,6 +249,10 @@ set_target_properties(llama

install(TARGETS llama LIBRARY PUBLIC_HEADER)

if (LLAMA_BUILD_COMMON)
install(TARGETS llama-common LIBRARY)
endif()

configure_package_config_file(
${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
CODEOWNERS: 18 changed lines

@@ -1,5 +1,21 @@
# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
# multiplie collaborators per item can be specified
# multiple collaborators per item can be specified
#
# ggml-org/ci : CISC, danbev, ggerganov, netrunnereve, ngxson, taronaeo
# ggml-org/ggml-cann : hipudding
# ggml-org/ggml-cuda : JohannesGaessler, am17an, IMbackK, ORippler
# ggml-org/ggml-hexagon : lhez, max-krasnyansky
# ggml-org/ggml-metal : ggerganov
# ggml-org/ggml-opencl : lhez, max-krasnyansky
# ggml-org/ggml-rpc : rgerganov
# ggml-org/ggml-sycl : arthw
# ggml-org/ggml-vulkan : 0cc4m, jeffbolznv
# ggml-org/ggml-webgpu : reeselevine
# ggml-org/ggml-zdnn : taronaeo
# ggml-org/llama-common : ggerganov, aldehir, angt, danbev, ngxson, pwilkin
# ggml-org/llama-mtmd : ngxson
# ggml-org/llama-server : ggerganov, ngxson, allozaur, angt, ServeurpersoCom
# ggml-org/llama-webui : allozaur

/.devops/*.Dockerfile @ngxson
/.github/actions/ @ggml-org/ci
@@ -1,9 +1,11 @@
|
||||
# common
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
llama_add_compile_flags()
|
||||
|
||||
#
|
||||
# llama-common-base
|
||||
#
|
||||
|
||||
# Build info header
|
||||
|
||||
if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
|
||||
@@ -33,17 +35,25 @@ endif()
|
||||
|
||||
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
|
||||
set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
|
||||
|
||||
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
||||
|
||||
set(TARGET build_info)
|
||||
add_library(${TARGET} OBJECT ${OUTPUT_FILE})
|
||||
set(TARGET llama-common-base)
|
||||
add_library(${TARGET} STATIC ${OUTPUT_FILE})
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC .)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
|
||||
set(TARGET common)
|
||||
#
|
||||
# llama-common
|
||||
#
|
||||
|
||||
add_library(${TARGET} STATIC
|
||||
set(TARGET llama-common)
|
||||
|
||||
add_library(${TARGET}
|
||||
arg.cpp
|
||||
arg.h
|
||||
base64.hpp
|
||||
@@ -106,17 +116,24 @@ add_library(${TARGET} STATIC
|
||||
jinja/caps.h
|
||||
)
|
||||
|
||||
set_target_properties(${TARGET} PROPERTIES
|
||||
VERSION ${LLAMA_INSTALL_VERSION}
|
||||
SOVERSION 0
|
||||
MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number
|
||||
)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC . ../vendor)
|
||||
target_compile_features (${TARGET} PUBLIC cxx_std_17)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
# TODO: make fine-grained exports in the future
|
||||
set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
||||
endif()
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE
|
||||
build_info
|
||||
cpp-httplib
|
||||
)
|
||||
target_link_libraries(${TARGET} PUBLIC llama-common-base)
|
||||
target_link_libraries(${TARGET} PRIVATE cpp-httplib)
|
||||
|
||||
if (LLAMA_LLGUIDANCE)
|
||||
include(ExternalProject)
|
||||
|
||||
@@ -1,5 +1,6 @@
#include "arg.h"

#include "build-info.h"
#include "chat.h"
#include "common.h"
#include "download.h"
@@ -1044,8 +1045,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--version"},
"show version and build info",
[](common_params &) {
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
fprintf(stderr, "version: %d (%s)\n", llama_build_number(), llama_commit());
fprintf(stderr, "built with %s for %s\n", llama_compiler(), llama_build_target());
exit(0);
}
));
@@ -1,4 +1,35 @@
#include "build-info.h"

#include <cstdio>
#include <string>

int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
char const * LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
char const * LLAMA_COMPILER = "@BUILD_COMPILER@";
char const * LLAMA_BUILD_TARGET = "@BUILD_TARGET@";

int llama_build_number(void) {
    return LLAMA_BUILD_NUMBER;
}

const char * llama_commit(void) {
    return LLAMA_COMMIT;
}

const char * llama_compiler(void) {
    return LLAMA_COMPILER;
}

const char * llama_build_target(void) {
    return LLAMA_BUILD_TARGET;
}

const char * llama_build_info(void) {
    static std::string s = "b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT;
    return s.c_str();
}

void llama_print_build_info(void) {
    fprintf(stderr, "%s: build = %d (%s)\n", __func__, llama_build_number(), llama_commit());
    fprintf(stderr, "%s: built with %s for %s\n", __func__, llama_compiler(), llama_build_target());
}
common/build-info.h (new file): 11 lines

@@ -0,0 +1,11 @@
#pragma once

int llama_build_number(void);

const char * llama_commit(void);
const char * llama_compiler(void);

const char * llama_build_target(void);
const char * llama_build_info(void);

void llama_print_build_info(void);
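For context, here is a small sketch of how these accessors compose in a tool that links the common library; the program itself is hypothetical and only exercises the functions declared above:

```cpp
#include "build-info.h"

#include <cstdio>

int main() {
    // Same information that llama_print_build_info() writes to stderr,
    // assembled here from the individual accessors in common/build-info.h.
    printf("version: %d (%s)\n", llama_build_number(), llama_commit());
    printf("built with %s for %s\n", llama_compiler(), llama_build_target());
    printf("build id: %s\n", llama_build_info());
    return 0;
}
```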
|
||||
@@ -198,10 +198,19 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
|
||||
args_field = format.function_field + "." + args_field;
|
||||
}
|
||||
|
||||
auto tools_parser = p.standard_json_tools(
|
||||
format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
|
||||
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
|
||||
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
|
||||
auto tools_parser = p.eps();
|
||||
if (format.section_start.empty() && !format.per_call_start.empty()) {
|
||||
auto single_tool_parser = p.standard_json_tools(
|
||||
format.per_call_start, format.per_call_end, inputs.tools, inputs.parallel_tool_calls,
|
||||
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
|
||||
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
|
||||
tools_parser = p.trigger_rule("tool-calls", p.one_or_more(single_tool_parser + p.space()));
|
||||
} else {
|
||||
tools_parser = p.standard_json_tools(
|
||||
format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
|
||||
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
|
||||
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
|
||||
}
|
||||
|
||||
// Handle content wrappers if present
|
||||
if (ctx.content && ctx.content->is_always_wrapped()) {
|
||||
|
||||
@@ -308,19 +308,23 @@ struct analyze_tools : analyze_base {
|
||||
|
||||
private:
|
||||
// Extract tool calling 'haystack' for further analysis and delegate further analysis based on format
|
||||
void analyze_tool_calls(const analyze_reasoning & reasoning);
|
||||
void analyze_tool_calls(const analyze_reasoning & reasoning, bool supports_parallel_tool_calls);
|
||||
|
||||
// Analyze format based on position of function and argument name in needle
|
||||
void analyze_tool_call_format(const std::string & haystack,
|
||||
const std::string & fun_name_needle,
|
||||
const std::string & arg_name_needle,
|
||||
const analyze_reasoning & reasoning);
|
||||
const analyze_reasoning & reasoning,
|
||||
bool supports_parallel_tool_calls);
|
||||
|
||||
// Analyze specifics of JSON native format (entire tool call is a JSON object)
|
||||
void analyze_tool_call_format_json_native(const std::string & clean_haystack,
|
||||
const std::string & fun_name_needle,
|
||||
const std::string & arg_name_needle);
|
||||
|
||||
// Check if parallel calls in JSON native format array wrapped or tag wrapped
|
||||
void analyze_json_native_parallel_calls();
|
||||
|
||||
// Analyze specifics of non-JSON native format (tags for function name or for function name and arguments)
|
||||
void analyze_tool_call_format_non_json(const std::string & clean_haystack,
|
||||
const std::string & fun_name_needle);
|
||||
|
||||
@@ -558,7 +558,7 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
|
||||
: analyze_base(tmpl) {
|
||||
LOG_DBG(ANSI_ORANGE "Phase 3: Tool call analysis\n" ANSI_RESET);
|
||||
|
||||
analyze_tool_calls(reasoning);
|
||||
analyze_tool_calls(reasoning, caps.supports_parallel_tool_calls);
|
||||
|
||||
if (format.mode != tool_format::NONE && format.mode != tool_format::JSON_NATIVE) {
|
||||
if (caps.supports_parallel_tool_calls) {
|
||||
@@ -577,7 +577,7 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
|
||||
}
|
||||
}
|
||||
|
||||
void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning) {
|
||||
void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning, bool supports_parallel_tool_calls) {
|
||||
json assistant_no_tools = json{
|
||||
{ "role", "assistant" },
|
||||
{ "content", ASSISTANT_MSG }
|
||||
@@ -611,13 +611,14 @@ void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning) {
|
||||
return;
|
||||
}
|
||||
|
||||
analyze_tool_call_format(tool_section, FUN_FIRST, ARG_FIRST, reasoning);
|
||||
analyze_tool_call_format(tool_section, FUN_FIRST, ARG_FIRST, reasoning, supports_parallel_tool_calls);
|
||||
}
|
||||
|
||||
void analyze_tools::analyze_tool_call_format(const std::string & haystack,
|
||||
const std::string & fun_name_needle,
|
||||
const std::string & arg_name_needle,
|
||||
const analyze_reasoning & reasoning) {
|
||||
const analyze_reasoning & reasoning,
|
||||
bool supports_parallel_tool_calls) {
|
||||
if (fun_name_needle.empty() || arg_name_needle.empty() || haystack.empty()) {
|
||||
return;
|
||||
}
|
||||
@@ -660,6 +661,9 @@ void analyze_tools::analyze_tool_call_format(const std::string & haystack,
|
||||
|
||||
if (format.mode == tool_format::JSON_NATIVE) {
|
||||
analyze_tool_call_format_json_native(clean_haystack, fun_name_needle, arg_name_needle);
|
||||
if (supports_parallel_tool_calls) {
|
||||
analyze_json_native_parallel_calls();
|
||||
}
|
||||
} else {
|
||||
analyze_tool_call_format_non_json(clean_haystack, fun_name_needle);
|
||||
}
|
||||
@@ -668,6 +672,42 @@ void analyze_tools::analyze_tool_call_format(const std::string & haystack,
|
||||
format.per_call_end = trim_whitespace(format.per_call_end);
|
||||
}
|
||||
|
||||
void analyze_tools::analyze_json_native_parallel_calls() {
|
||||
json assistant_one_tool = json{
|
||||
{ "role", "assistant" },
|
||||
{ "content", "" },
|
||||
{ "tool_calls", json::array({ first_tool_call }) }
|
||||
};
|
||||
|
||||
json assistant_two_tools = json{
|
||||
{ "role", "assistant" },
|
||||
{ "content", "" },
|
||||
{ "tool_calls", json::array({ first_tool_call, second_tool_call }) }
|
||||
};
|
||||
|
||||
template_params params;
|
||||
params.messages = json::array({ user_msg, assistant_one_tool });
|
||||
params.tools = tools;
|
||||
params.add_generation_prompt = false;
|
||||
params.enable_thinking = true;
|
||||
|
||||
auto comparison = compare_variants(
|
||||
*tmpl, params, [&](template_params & p) { p.messages = json::array({ user_msg, assistant_two_tools }); });
|
||||
|
||||
if (!comparison) {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Template application failed\n" ANSI_RESET, __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
std::string & second_call = comparison->diff.right;
|
||||
if (!format.section_start.empty() && second_call.find(format.section_start) != std::string::npos) {
|
||||
format.per_call_start = format.section_start;
|
||||
format.per_call_end = format.section_end;
|
||||
format.section_start.clear();
|
||||
format.section_end.clear();
|
||||
}
|
||||
}
|
||||
|
||||
void analyze_tools::analyze_tool_call_format_json_native(const std::string & clean_haystack,
|
||||
const std::string & fun_name_needle,
|
||||
const std::string & arg_name_needle) {
|
||||
|
||||
@@ -676,7 +676,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
|
||||
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
|
||||
|
||||
auto nested_name = literal("\"" + nested_name_field + "\"") + space() + literal(":") + space() +
|
||||
literal("\"") + tool_name(literal(name)) + literal("\"");
|
||||
atomic(literal("\"") + tool_name(literal(name)) + literal("\""));
|
||||
auto nested_args = literal("\"" + nested_args_field + "\"") + space() + literal(":") + space() +
|
||||
tool_args(schema(json(), "tool-" + name + "-schema", params));
|
||||
|
||||
@@ -744,7 +744,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
|
||||
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
|
||||
|
||||
auto tool_name_ = name_key_parser + space() + literal(":") + space() +
|
||||
literal("\"") + tool_name(literal(name)) + literal("\"");
|
||||
atomic(literal("\"") + tool_name(literal(name)) + literal("\""));
|
||||
auto tool_args_ = args_key_parser + space() + literal(":") + space() +
|
||||
tool_args(schema(json(), "tool-" + name + "-schema", params));
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
@@ -372,7 +373,7 @@ void common_init() {
|
||||
const char * build_type = " (debug)";
|
||||
#endif
|
||||
|
||||
LOG_DBG("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
|
||||
LOG_DBG("build: %d (%s) with %s for %s%s\n", llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
|
||||
}
|
||||
|
||||
std::string common_params_get_system_info(const common_params & params) {
|
||||
|
||||
@@ -2,9 +2,10 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#include "ggml-opt.h"
|
||||
#include "ggml.h"
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
@@ -27,11 +28,6 @@
|
||||
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
|
||||
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
|
||||
|
||||
#define print_build_info() do { \
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
|
||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
|
||||
} while(0)
|
||||
|
||||
struct common_time_meas {
|
||||
common_time_meas(int64_t & t_acc, bool disable = false);
|
||||
~common_time_meas();
|
||||
@@ -53,14 +49,6 @@ struct common_adapter_lora_info {
|
||||
|
||||
using llama_tokens = std::vector<llama_token>;
|
||||
|
||||
// build info
|
||||
extern int LLAMA_BUILD_NUMBER;
|
||||
extern const char * LLAMA_COMMIT;
|
||||
extern const char * LLAMA_COMPILER;
|
||||
extern const char * LLAMA_BUILD_TARGET;
|
||||
|
||||
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
|
||||
|
||||
struct common_control_vector_load_info;
|
||||
|
||||
//
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "arg.h"
|
||||
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "download.h"
|
||||
@@ -303,7 +304,7 @@ static int common_download_file_single_online(const std::string & url,
|
||||
headers.emplace(h.first, h.second);
|
||||
}
|
||||
if (headers.find("User-Agent") == headers.end()) {
|
||||
headers.emplace("User-Agent", "llama-cpp/" + build_info);
|
||||
headers.emplace("User-Agent", "llama-cpp/" + std::string(llama_build_info()));
|
||||
}
|
||||
if (!opts.bearer_token.empty()) {
|
||||
headers.emplace("Authorization", "Bearer " + opts.bearer_token);
|
||||
@@ -441,7 +442,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
|
||||
headers.emplace(h.first, h.second);
|
||||
}
|
||||
if (headers.find("User-Agent") == headers.end()) {
|
||||
headers.emplace("User-Agent", "llama-cpp/" + build_info);
|
||||
headers.emplace("User-Agent", "llama-cpp/" + std::string(llama_build_info()));
|
||||
}
|
||||
|
||||
if (params.timeout > 0) {
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "hf-cache.h"
|
||||
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "http.h"
|
||||
@@ -200,7 +201,7 @@ static nl::json api_get(const std::string & url,
|
||||
auto [cli, parts] = common_http_client(url);
|
||||
|
||||
httplib::Headers headers = {
|
||||
{"User-Agent", "llama-cpp/" + build_info},
|
||||
{"User-Agent", "llama-cpp/" + std::string(llama_build_info())},
|
||||
{"Accept", "application/json"}
|
||||
};
|
||||
|
||||
|
||||
@@ -23,6 +23,10 @@
|
||||
|
||||
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
|
||||
|
||||
int common_log_get_verbosity_thold(void) {
|
||||
return common_log_verbosity_thold;
|
||||
}
|
||||
|
||||
void common_log_set_verbosity_thold(int verbosity) {
|
||||
common_log_verbosity_thold = verbosity;
|
||||
}
|
||||
|
||||
@@ -38,7 +38,7 @@ enum log_colors {
|
||||
|
||||
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
|
||||
// set via common_log_set_verbosity()
|
||||
extern int common_log_verbosity_thold;
|
||||
int common_log_get_verbosity_thold(void);
|
||||
|
||||
void common_log_set_verbosity_thold(int verbosity); // not thread-safe
|
||||
|
||||
@@ -98,7 +98,7 @@ void common_log_flush (struct common_log * log); // f
|
||||
|
||||
#define LOG_TMPL(level, verbosity, ...) \
|
||||
do { \
|
||||
if ((verbosity) <= common_log_verbosity_thold) { \
|
||||
if ((verbosity) <= common_log_get_verbosity_thold()) { \
|
||||
common_log_add(common_log_main(), (level), __VA_ARGS__); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@@ -10893,7 +10893,64 @@ class NemotronHModel(GraniteHybridModel):
|
||||
self.gguf_writer.add_moe_latent_size(latent_size)
|
||||
|
||||
def set_vocab(self):
|
||||
super().set_vocab()
|
||||
# The NemotronH config uses pattern characters (e.g. '-') that may not
|
||||
# be supported by the installed transformers version. AutoTokenizer
|
||||
# internally calls AutoConfig which triggers this parsing failure.
|
||||
# Using trust_remote_code=True to load the model's own config class.
|
||||
tokens: list[str] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
|
||||
|
||||
# Pad vocab size (from Mamba2Model/GraniteHybridModel)
|
||||
self.hparams["pad_vocab_size_multiple"] = 8 # Setting this here since GraniteHybridModel.set_vocab() isn't being invoked now.
|
||||
# From Mamba2Model.set_vocab():
|
||||
vocab_size = self.hparams["vocab_size"]
|
||||
pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16)
|
||||
# ref: https://stackoverflow.com/a/17511341/22827863
|
||||
vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
|
||||
self.hparams["vocab_size"] = vocab_size
|
||||
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
tokpre = self.get_vocab_base_pre(tokenizer)
|
||||
|
||||
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
|
||||
added_vocab = tokenizer.get_added_vocab()
|
||||
|
||||
added_tokens_decoder = tokenizer.added_tokens_decoder
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.UNUSED)
|
||||
else:
|
||||
token: str = reverse_vocab[i]
|
||||
if token in added_vocab:
|
||||
if not added_tokens_decoder[i].normalized:
|
||||
previous_token = token
|
||||
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
|
||||
if previous_token != token:
|
||||
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
|
||||
|
||||
if added_tokens_decoder[i].special or self.does_token_look_special(token):
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
tokens.append(token)
|
||||
|
||||
# From TextModel.set_vocab_gpt2():
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
# The tokenizer _does_ add a BOS token (via post_processor type
|
||||
# TemplateProcessing) but does not set add_bos_token to true in the
|
||||
|
||||
@@ -689,6 +689,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. (1.) |
| GGML_SYCL_GRAPH | OFF *(default)* \|ON *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
| GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. |
| GGML_SYCL_HOST_MEM_FALLBACK | ON *(default)* \|OFF *(Optional)* | Allow host memory fallback when device memory is full during quantized weight reorder. Enables inference to continue at reduced speed (reading over PCIe) instead of failing. Requires Linux kernel 6.8+. |
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
@@ -281,6 +281,12 @@ Use `GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F` environment variable to force use FP16

The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.

### Peer Access

The environment variable `GGML_CUDA_P2P` can be set to enable peer-to-peer access between multiple GPUs, allowing them to transfer data directly rather than to go through system memory.
Requires driver support (usually restricted to workstation/datacenter GPUs).
May cause crashes or corrupted outputs for some motherboards and BIOS settings (e.g. IOMMU).
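As a minimal sketch, enabling it is just a matter of setting the variable (here to `1`) before a multi-GPU run; the model path and offload flag below are only illustrative:

```sh
# Enable direct GPU-to-GPU transfers for this run (requires driver support)
GGML_CUDA_P2P=1 ./llama-cli -m model.gguf -ngl 99
```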
### Performance Tuning

The following compilation options are also available to tweak performance:
@@ -456,7 +462,8 @@ pacman -S git \
mingw-w64-ucrt-x86_64-gcc \
mingw-w64-ucrt-x86_64-cmake \
mingw-w64-ucrt-x86_64-vulkan-devel \
mingw-w64-ucrt-x86_64-shaderc
mingw-w64-ucrt-x86_64-shaderc \
mingw-w64-ucrt-x86_64-spirv-headers
```

Switch into the `llama.cpp` directory and build using CMake.

@@ -490,9 +497,11 @@ First, follow the official LunarG instructions for the installation and setup of

On Debian / Ubuntu, you can install the required dependencies using:
```sh
sudo apt-get install libvulkan-dev glslc
sudo apt-get install libvulkan-dev glslc spirv-headers
```

SPIRV-Headers (`spirv/unified1/spirv.hpp`) are required for the Vulkan backend and are **not** always pulled in by the Vulkan loader dev package alone. Other distros use names such as `spirv-headers` (Ubuntu / Debian / Arch), or `spirv-headers-devel` (Fedora / openSUSE). On Windows, the LunarG Vulkan SDK’s `Include` directory already contains these headers.
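If a build later fails to find `spirv.hpp`, a quick sanity check is to look for the header file directly; this is only a sketch, and the install path can vary by distro:

```sh
# The spirv-headers package typically installs here on Debian/Ubuntu
ls /usr/include/spirv/unified1/spirv.hpp
```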
#### Common steps

Second, after verifying that you have followed all of the SDK installation/setup steps, use this command to make sure before proceeding:
@@ -130,6 +130,23 @@ Note:
- Adding a model-specific API or CLI is an anti-pattern in `libmtmd`. The goal of `libmtmd` is to provide an easy-to-use, model-agnostic library for multimodal pipeline.
- In most cases, `llama-mtmd-cli` should not be modified. If a model requires a specific prompt, either let the user provide it or bake it into the Jinja chat template.

## Tips and tricks

### Working with ggml_rope_ext

PyTorch implementations usually prefer explicitly calculating `freq_cis`/`sin`/`cos` components. However, in llama.cpp, most RoPE operations can be handled via `ggml_rope_ext`, which does not require a sin/cos matrix. This saves memory while allowing the GGML RoPE kernel to be fused with other ops.

However, since `ggml_rope_ext` only provides a subset of the RoPE implementations that models use, converting models from PyTorch to llama.cpp may require some creative adaptations.
For more information about `ggml_rope_ext`, please refer to the in-code documentation in `ggml.h`. A minimal call sketch is shown after the examples below.

Examples:
- `libmtmd` implements 2D RoPE with `GGML_ROPE_TYPE_NORMAL` ordering by splitting the input tensor in half, applying `ggml_rope_ext` separately to each half, then joining them back together using `ggml_concat`.
- The [Kimi-K2.5](https://github.com/ggml-org/llama.cpp/pull/19170) vision encoder uses vision RoPE with interleaved frequencies. The weights must be permuted during conversion in order to reuse the `build_rope_2d()` function.
- [Gemma 4](https://github.com/ggml-org/llama.cpp/pull/21309) uses "proportional" RoPE. We employ a trick where `rope_freqs` is set to a very large value in the last dimensions to prevent those dimensions from being rotated. See the `Gemma4Model` class in `convert_hf_to_gguf.py`.
- Some models require scaling the input position. For example, `[0, 1, 2, ...]` becomes `[0, 0.5, 1, ...]`. In this case, you can provide the scaling via `freq_scale = 0.5f`.
- Some models use learned RoPE frequencies instead of relying on `powf(freq_base, -2.0 * i / n_dims)`. In this case, you can provide the learned frequencies via the `rope_freqs` tensor (corresponding to the `c` argument in `ggml_rope_ext`), then set `freq_base = 1.0f`. An important note is that `rope_freqs` in GGML is the **inverse** (`theta = pos[i] / rope_freqs`), so you may need to invert `rope_freqs` during conversion.
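As a rough illustration of the last two points, a single call with scaled positions and learned inverse frequencies could look like the sketch below; the names are placeholders and the argument order follows the `ggml_rope_ext` declaration in `ggml.h`:

```c
// cur:      activations to rotate, shape [n_embd_head, n_head, n_tokens]
// pos:      I32 tensor holding one position per token
// inv_freq: learned inverse frequencies (the `c` argument), or NULL if unused
struct ggml_tensor * rotated = ggml_rope_ext(
        ctx, cur, pos, inv_freq,
        n_dims, GGML_ROPE_TYPE_NORMAL, n_ctx_orig,
        1.0f,   // freq_base: set to 1.0f when learned frequencies are provided
        0.5f,   // freq_scale: scales positions to 0, 0.5, 1, ...
        0.0f,   // ext_factor
        1.0f,   // attn_factor
        32.0f,  // beta_fast
        1.0f);  // beta_slow
```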
## GGUF specification

https://github.com/ggml-org/ggml/blob/master/docs/gguf.md
docs/ops.md: 16 changed lines
@@ -22,13 +22,13 @@ Legend:
|
||||
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| CEIL | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| CLAMP | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| CONT | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
|
||||
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
|
||||
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| COS | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
@@ -46,7 +46,7 @@ Legend:
|
||||
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
|
||||
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
|
||||
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
@@ -84,10 +84,10 @@ Legend:
|
||||
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ROLL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ROPE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
@@ -116,6 +116,6 @@ Legend:
|
||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
| XIELU | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
|
||||
docs/ops/Metal.csv: 3778 changed lines (file diff suppressed because it is too large)
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-batched)
|
||||
add_executable(${TARGET} batched.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-convert-llama2c-to-ggml)
|
||||
add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-debug)
|
||||
add_executable(${TARGET} debug.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-diffusion-cli)
|
||||
add_executable(${TARGET} diffusion-cli.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama llama-common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -602,8 +602,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
int n_input = input_tokens.size();
|
||||
|
||||
if (n_input >= params.n_ctx) {
|
||||
LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx);
|
||||
if (static_cast<uint32_t>(n_input) >= llama_n_ctx(ctx)) {
|
||||
LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, llama_n_ctx(ctx));
|
||||
llama_free(ctx);
|
||||
llama_model_free(model);
|
||||
return 1;
|
||||
|
||||
@@ -1,5 +1,5 @@
set(TARGET llama-embedding)
add_executable(${TARGET} embedding.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -1,7 +1,7 @@
set(TARGET llama-eval-callback)
add_executable(${TARGET} eval-callback.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

if(LLAMA_BUILD_TESTS)

@@ -1,5 +1,5 @@
set(TARGET llama-gen-docs)
add_executable(${TARGET} gen-docs.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -1,5 +1,5 @@
set(TARGET llama-idle)
add_executable(${TARGET} idle.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama llama-common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

@@ -1,5 +1,5 @@
set(TARGET llama-lookahead)
add_executable(${TARGET} lookahead.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -1,23 +1,23 @@
set(TARGET llama-lookup)
add_executable(${TARGET} lookup.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

set(TARGET llama-lookup-create)
add_executable(${TARGET} lookup-create.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

set(TARGET llama-lookup-merge)
add_executable(${TARGET} lookup-merge.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

set(TARGET llama-lookup-stats)
add_executable(${TARGET} lookup-stats.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -1,5 +1,5 @@
set(TARGET llama-parallel)
add_executable(${TARGET} parallel.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -1,5 +1,5 @@
set(TARGET llama-passkey)
add_executable(${TARGET} passkey.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -1,5 +1,5 @@
set(TARGET llama-retrieval)
add_executable(${TARGET} retrieval.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -1,5 +1,5 @@
set(TARGET llama-save-load-state)
add_executable(${TARGET} save-load-state.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -1,5 +1,5 @@
set(TARGET llama-speculative-simple)
add_executable(${TARGET} speculative-simple.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -1,5 +1,5 @@
set(TARGET llama-speculative)
add_executable(${TARGET} speculative.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -5,5 +5,5 @@
set(TARGET llama-ls-sycl-device)
add_executable(${TARGET} ls-sycl-device.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -1,5 +1,5 @@
set(TARGET llama-finetune)
add_executable(${TARGET} finetune.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

@@ -254,6 +254,7 @@ option(GGML_RPC "ggml: use RPC"
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
option(GGML_SYCL_HOST_MEM_FALLBACK "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON)
option(GGML_SYCL_DNN "ggml: enable oneDNN in the SYCL backend" ON)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
     "ggml: sycl target device")

@@ -202,8 +202,11 @@ extern "C" {

// Common functions that may be obtained using ggml_backend_reg_get_proc_address

// AllReduce operation for tensor parallelism (meta backend)
typedef bool (*ggml_backend_allreduce_tensor_t)(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
// Context management and operations for faster communication between backends, used for tensor parallelism (meta backend)
typedef void * (*ggml_backend_comm_init_t)(ggml_backend_t * backends, size_t n_backends);
typedef void (*ggml_backend_comm_free_t)(void * comm_ctx);
typedef bool (*ggml_backend_comm_allreduce_tensor_t)(void * comm_ctx, struct ggml_tensor ** tensors);

// Split buffer type for tensor parallelism (old)
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
// Set the number of threads for the backend
@@ -348,6 +351,53 @@ extern "C" {
// Set a callback to be called for each resulting node during graph compute
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);

//
// Meta backend
//

#define GGML_BACKEND_META_MAX_DEVICES 16

enum ggml_backend_meta_split_axis {
    // tensor split by tensor dimensions:
    GGML_BACKEND_SPLIT_AXIS_0 = 0,
    GGML_BACKEND_SPLIT_AXIS_1 = 1,
    GGML_BACKEND_SPLIT_AXIS_2 = 2,
    GGML_BACKEND_SPLIT_AXIS_3 = 3,

    GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
    GGML_BACKEND_SPLIT_AXIS_PARTIAL  = 11, // each backend has a partial sum

    // for internal bookkeeping only:
    GGML_BACKEND_SPLIT_AXIS_NONE    = 98,
    GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99,
};
GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);

struct ggml_backend_meta_split_state {
    enum ggml_backend_meta_split_axis axis;

    // for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
    // - each device has a slice of the tensor along the split axis
    // - most tensors have n_segments == 1 and a contiguous slice of the tensor data
    // - some tensors have an inhomogenenous data layout along the split axis,
    //   those tensors are divided into segments which are each individually split across devices
    // - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
    //   the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
    // - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
    //   that each need to be split individually across devices so that each device gets a slice of Q, K, and V
    int64_t  ne[16*GGML_BACKEND_META_MAX_DEVICES];
    uint32_t n_segments;
};

// function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);

// create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
// TODO: this looks a bit strange - a backend API creates a device. I think we should try
//       express this as a backend registry functionality instead
GGML_API ggml_backend_dev_t ggml_backend_meta_device(
    ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);

//
// Utils
//

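The split-state callback is what the application hands to ggml_backend_meta_device; a minimal sketch of one (illustrative only, not part of this diff — the "mirror vectors, split matrices along axis 1" policy and the userdata layout are assumptions) could look like:

    // sketch: mirror 1-D tensors, split everything else evenly along axis 1
    static struct ggml_backend_meta_split_state example_get_split_state(
            const struct ggml_tensor * tensor, void * userdata) {
        const size_t n_devs = *(const size_t *) userdata; // assumed to carry the device count

        struct ggml_backend_meta_split_state state = {0};
        state.n_segments = 1;

        if (ggml_n_dims(tensor) < 2) {
            state.axis = GGML_BACKEND_SPLIT_AXIS_MIRRORED; // full copy on every device
            return state;
        }

        state.axis = GGML_BACKEND_SPLIT_AXIS_1;
        // one ne entry per device for the single segment; the slices must add up to tensor->ne[1]
        for (size_t d = 0; d < n_devs; d++) {
            state.ne[d] = tensor->ne[1] / n_devs + (d < (size_t)(tensor->ne[1] % n_devs) ? 1 : 0);
        }
        return state;
    }
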
@@ -6,9 +6,9 @@
extern "C" {
#endif

#define RPC_PROTO_MAJOR_VERSION    3
#define RPC_PROTO_MINOR_VERSION    6
#define RPC_PROTO_PATCH_VERSION    1
#define RPC_PROTO_MAJOR_VERSION    4
#define RPC_PROTO_MINOR_VERSION    0
#define RPC_PROTO_PATCH_VERSION    0

#ifdef __cplusplus
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");

@@ -1773,8 +1773,32 @@ extern "C" {
    int n_dims,
    int mode);

// custom RoPE
// RoPE operations with extended options
// a is the input tensor to apply RoPE to, shape [n_embd, n_head, n_token]
// b is an int32 vector with size n_token
// c is freq factors (e.g. phi3-128k), (optional)
// mode can be GGML_ROPE_TYPE_NORMAL or NEOX; for MROPE and VISION mode, use ggml_rope_multi
//
// pseudo-code for computing theta:
//   for i in [0, n_dims/2):
//     theta[i] = b[i] * powf(freq_base, -2.0 * i / n_dims);
//     theta[i] = theta[i] / c[i]; # if c is provided, divide theta by c
//     theta[i] = rope_yarn(theta[i], ...); # note: theta = theta * freq_scale is applied here
//
// other params are used by YaRN RoPE scaling, these default values will disable YaRN:
//   freq_scale  = 1.0f
//   ext_factor  = 0.0f
//   attn_factor = 1.0f
//   beta_fast   = 0.0f
//   beta_slow   = 0.0f
//
// example:
//   (marking: c = cos, s = sin, 0 = unrotated)
//   given a single head with size = 8 --> [00000000]
//   GGML_ROPE_TYPE_NORMAL n_dims = 4 --> [cscs0000]
//   GGML_ROPE_TYPE_NORMAL n_dims = 8 --> [cscscscs]
//   GGML_ROPE_TYPE_NEOX   n_dims = 4 --> [ccss0000]
//   GGML_ROPE_TYPE_NEOX   n_dims = 8 --> [ccccssss]
GGML_API struct ggml_tensor * ggml_rope_ext(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
@@ -1790,6 +1814,36 @@ extern "C" {
    float beta_fast,
    float beta_slow);

// multi-dimensional RoPE, for Qwen-VL and similar vision models
// mode can be either VISION, MROPE, IMROPE, cannot be combined with NORMAL or NEOX
// sections specify how many dimensions to rotate in each section:
//   section length is equivalent to number of cos/sin pairs, NOT the number of dims
//   (i.e. sum of 4 sections are expected to be n_dims/2)
//   last sections can be 0, means ignored
//   all other options are identical to ggml_rope_ext
//
// important note:
// - NEOX ordering is automatically applied and cannot be disabled for MROPE and VISION
//   if you need normal ordering, there are 2 methods:
//     (1) split the tensor manually using ggml_view
//     (2) permute the weight upon conversion
// - for VISION, n_dims must be head_size/2
//
// example M-RoPE:
//   given sections = [t=4, y=2, x=2, 0]
//   given a single head with size = 18 --> [000000000000000000]
//   GGML_ROPE_TYPE_MROPE  n_dims = 16 --> [ttttyyxxttttyyxx00] (cos/sin are applied in NEOX ordering)
//   GGML_ROPE_TYPE_IMROPE n_dims = 16 --> [ttyxttyxttyxttyx00] (interleaved M-RoPE, still NEOX ordering)
//   note: the theta for each dim is computed the same way as ggml_rope_ext, no matter the section
//   in other words, idx used for theta: [0123456789... until n_dims/2], not reset for each section
//
// example vision RoPE:
//   given sections = [y=4, x=4, 0, 0] (last 2 sections are ignored)
//   given a single head with size = 8 --> [00000000]
//   GGML_ROPE_TYPE_VISION n_dims = 4 --> [yyyyxxxx]
//   other values of n_dims are untested and is undefined behavior
//   note: unlike MROPE, the theta for each dim is computed differently for each section
//   in other words, idx used for theta: [0123] for y section, then [0123] for x section
GGML_API struct ggml_tensor * ggml_rope_multi(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,

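Spelling the theta pseudo-code above out as plain C (a sketch of the math only, not the actual kernel; rope_yarn and the remaining YaRN parameters are elided, and the position id p plays the role of b[t]):

    #include <math.h>

    // compute the n_dims/2 rotation angles for one token at position p
    static void rope_thetas(float * theta, int n_dims, float p, const float * freq_factors,
                            float freq_base, float freq_scale) {
        for (int i = 0; i < n_dims/2; i++) {
            float t = p * powf(freq_base, -2.0f*i/n_dims);
            if (freq_factors) {
                t /= freq_factors[i];          // the optional c tensor
            }
            theta[i] = t * freq_scale;         // rope_yarn() reduces to this when ext_factor == 0
            // the pair (cosf(theta[i]), sinf(theta[i])) rotates one 2-element slice of the head
        }
    }
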
@@ -2,6 +2,7 @@
#include "ggml-backend-impl.h"
#include "ggml.h"
#include "ggml-impl.h"

#include <assert.h>
#include <limits.h>
#include <stdarg.h>

@@ -5,9 +5,6 @@
#include "ggml-alloc.h"
#include "ggml-cpp.h"

// TODO: tmp
#include "ggml-ext.h"

#include <algorithm>
#include <cassert>
#include <cmath>

@@ -1422,22 +1419,48 @@ struct ggml_backend_meta_context {
    size_t max_tmp_size = 0;
    size_t max_subgraphs = 0;

    void * comm_ctx = nullptr;
    ggml_backend_comm_allreduce_tensor_t comm_allreduce = nullptr;

    ggml_backend_meta_context(ggml_backend_dev_t meta_dev, const char * params) {
        const size_t n_devs = ggml_backend_meta_dev_n_devs(meta_dev);
        name = "Meta(";
        std::vector<ggml_backend_t> simple_backends;
        backend_configs.reserve(n_devs);
        simple_backends.reserve(n_devs);
        for (size_t i = 0; i < n_devs; i++) {
            ggml_backend_dev_t simple_dev = ggml_backend_meta_dev_simple_dev(meta_dev, i);
            if (i > 0) {
                name += ",";
            }
            name += ggml_backend_dev_name(simple_dev);
            backend_configs.emplace_back(ggml_backend_dev_init(simple_dev, params));
            simple_backends.push_back(ggml_backend_dev_init(simple_dev, params));
            backend_configs.emplace_back(simple_backends.back());
        }
        name += ")";

        if (n_devs > 1) {
            ggml_backend_comm_init_t comm_init = (ggml_backend_comm_init_t) ggml_backend_reg_get_proc_address(
                ggml_backend_dev_backend_reg(ggml_backend_get_device(simple_backends[0])), "ggml_backend_comm_init");
            if (comm_init != nullptr) {
                comm_ctx = comm_init(simple_backends.data(), simple_backends.size());
            }
        }
        if (comm_ctx != nullptr) {
            comm_allreduce = (ggml_backend_comm_allreduce_tensor_t)
                ggml_backend_reg_get_proc_address(ggml_backend_dev_backend_reg(
                    ggml_backend_get_device(simple_backends[0])), "ggml_backend_comm_allreduce_tensor");
            GGML_ASSERT(comm_allreduce != nullptr);
        }
    }

    ~ggml_backend_meta_context() {
        if (comm_ctx != nullptr) {
            ggml_backend_comm_free_t comm_free = (ggml_backend_comm_free_t) ggml_backend_reg_get_proc_address(
                ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_configs[0].backend)), "ggml_backend_comm_free");
            GGML_ASSERT(comm_free != nullptr);
            comm_free(comm_ctx);
        }
        for (auto & bc : backend_configs) {
            ggml_backend_free(bc.backend);
        }

@@ -1848,20 +1871,15 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,

    if (n_backends > 1 && i < n_subgraphs - 1) {
        bool backend_allreduce_success = false;
        ggml_backend_allreduce_tensor_t allreduce_tensor = (ggml_backend_allreduce_tensor_t) ggml_backend_reg_get_proc_address(
            ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_ctx->backend_configs[0].backend)), "ggml_backend_allreduce_tensor");
        if (allreduce_tensor) {
            std::vector<ggml_backend_t> backends;
            backends.reserve(n_backends);
        if (backend_ctx->comm_ctx) {
            std::vector<ggml_tensor *> nodes;
            nodes.reserve(n_backends);
            for (size_t j = 0; j < n_backends; j++) {
                auto & bcj = backend_ctx->backend_configs[j];
                backends.push_back(bcj.backend);
                ggml_cgraph * cgraph_ij = bcj.cgraphs[i].cgraph_main;
                nodes.push_back(cgraph_ij->nodes[cgraph_ij->n_nodes-1]);
            }
            backend_allreduce_success = allreduce_tensor(backends.data(), nodes.data(), n_backends);
            backend_allreduce_success = backend_ctx->comm_allreduce(backend_ctx->comm_ctx, nodes.data());
        }

        if (!backend_allreduce_success) {

@@ -1030,6 +1030,8 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
|
||||
GGML_ABORT("%s: failed to initialize context\n", __func__);
|
||||
}
|
||||
|
||||
graph->uid = ggml_graph_next_uid();
|
||||
|
||||
// pass 1: assign backends to ops with pre-allocated inputs
|
||||
for (int i = 0; i < graph->n_leafs; i++) {
|
||||
struct ggml_tensor * leaf = graph->leafs[i];
|
||||
@@ -1477,6 +1479,11 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
|
||||
assert(graph_copy->size > graph_copy->n_leafs);
|
||||
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
|
||||
}
|
||||
|
||||
// set ids for all splits
|
||||
for (int i = 0; i < sched->n_splits; ++i) {
|
||||
sched->splits[i].graph.uid = ggml_graph_next_uid();
|
||||
}
|
||||
}
|
||||
|
||||
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
||||
|
||||
@@ -783,6 +783,7 @@ void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
const int8x16_t q4_lo_1 = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits_1, m4b));
|
||||
const int8x16_t q4_hi_1 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_1, 4));
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
const int8x16_t q8_0a = vld1q_s8(y[2*ib].qs);
|
||||
const int8x16_t q8_0b = vld1q_s8(y[2*ib].qs + 16);
|
||||
const int8x16_t q8_lo_0 = vcombine_s8(vget_low_s8(q8_0a), vget_low_s8(q8_0b));
|
||||
@@ -794,15 +795,40 @@ void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
const int8x16_t q8_hi_1 = vcombine_s8(vget_high_s8(q8_1a), vget_high_s8(q8_1b));
|
||||
|
||||
const int32x4_t p0 = vaddq_s32(
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_0, q8_lo_0),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_0, q8_hi_0));
|
||||
vdotq_s32(vdupq_n_s32(0), q4_lo_0, q8_lo_0),
|
||||
vdotq_s32(vdupq_n_s32(0), q4_hi_0, q8_hi_0));
|
||||
const int32x4_t p1 = vaddq_s32(
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_1, q8_lo_1),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_1, q8_hi_1));
|
||||
vdotq_s32(vdupq_n_s32(0), q4_lo_1, q8_lo_1),
|
||||
vdotq_s32(vdupq_n_s32(0), q4_hi_1, q8_hi_1));
|
||||
|
||||
const int32x4_t sums = vpaddq_s32(p0, p1);
|
||||
const int32x4_t sumi = vpaddq_s32(p0, p1);
|
||||
#else
|
||||
const int8x8_t q4_0_lo = vget_low_s8(q4_lo_0);
|
||||
const int8x8_t q4_0_hi = vget_low_s8(q4_hi_0);
|
||||
const int8x8_t q4_1_lo = vget_high_s8(q4_lo_0);
|
||||
const int8x8_t q4_1_hi = vget_high_s8(q4_hi_0);
|
||||
const int8x8_t q4_2_lo = vget_low_s8(q4_lo_1);
|
||||
const int8x8_t q4_2_hi = vget_low_s8(q4_hi_1);
|
||||
const int8x8_t q4_3_lo = vget_high_s8(q4_lo_1);
|
||||
const int8x8_t q4_3_hi = vget_high_s8(q4_hi_1);
|
||||
|
||||
const int8x8_t q8_0_lo = vld1_s8(y[2*ib].qs);
|
||||
const int8x8_t q8_0_hi = vld1_s8(y[2*ib].qs + 8);
|
||||
const int8x8_t q8_1_lo = vld1_s8(y[2*ib].qs + 16);
|
||||
const int8x8_t q8_1_hi = vld1_s8(y[2*ib].qs + 24);
|
||||
const int8x8_t q8_2_lo = vld1_s8(y[2*ib+1].qs);
|
||||
const int8x8_t q8_2_hi = vld1_s8(y[2*ib+1].qs + 8);
|
||||
const int8x8_t q8_3_lo = vld1_s8(y[2*ib+1].qs + 16);
|
||||
const int8x8_t q8_3_hi = vld1_s8(y[2*ib+1].qs + 24);
|
||||
|
||||
const int32x4_t sumi = (int32x4_t){
|
||||
vaddvq_s32(ggml_nvfp4_dot8(q4_0_lo, q8_0_lo, q4_0_hi, q8_0_hi)),
|
||||
vaddvq_s32(ggml_nvfp4_dot8(q4_1_lo, q8_1_lo, q4_1_hi, q8_1_hi)),
|
||||
vaddvq_s32(ggml_nvfp4_dot8(q4_2_lo, q8_2_lo, q4_2_hi, q8_2_hi)),
|
||||
vaddvq_s32(ggml_nvfp4_dot8(q4_3_lo, q8_3_lo, q4_3_hi, q8_3_hi)),
|
||||
};
|
||||
#endif
|
||||
|
||||
// Decode 4 UE4M3 scales to f32 and multiply with q8 scales
|
||||
const float dy0 = GGML_CPU_FP16_TO_FP32(y[2*ib].d);
|
||||
const float dy1 = GGML_CPU_FP16_TO_FP32(y[2*ib+1].d);
|
||||
const float32x4_t nvsc = {
|
||||
@@ -813,7 +839,7 @@ void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
};
|
||||
const float32x4_t scales = vmulq_f32(nvsc, (float32x4_t){dy0, dy0, dy1, dy1});
|
||||
|
||||
acc = vfmaq_f32(acc, vcvtq_f32_s32(sums), scales);
|
||||
acc = vfmaq_f32(acc, vcvtq_f32_s32(sumi), scales);
|
||||
}
|
||||
sumf = vaddvq_f32(acc);
|
||||
#else
|
||||
|
||||
File diff suppressed because it is too large
@@ -306,6 +306,7 @@ inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {

#if !defined(__ARM_FEATURE_DOTPROD)

// NOTE: this fallback produces the same total sum as native vdotq_s32 but with different per-lane grouping — do not use when individual lane values matter.
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
@@ -319,6 +320,15 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)

#endif // !defined(__ARM_FEATURE_DOTPROD)

static inline int32x4_t ggml_nvfp4_dot8(const int8x8_t q4_lo, const int8x8_t q8_lo,
                                        const int8x8_t q4_hi, const int8x8_t q8_hi) {
    const int16x8_t p_lo = vmull_s8(q4_lo, q8_lo);
    const int16x8_t p_hi = vmull_s8(q4_hi, q8_hi);
    const int32x4_t sum_lo = vpaddlq_s16(p_lo);
    const int32x4_t sum_hi = vpaddlq_s16(p_hi);
    return vaddq_s32(sum_lo, sum_hi);
}

#endif // defined(__ARM_NEON)

#ifdef __wasm_simd128__

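For clarity, this is the scalar computation both ggml_vdotq_s32 and ggml_nvfp4_dot8 are vectorizing — only the total sum is guaranteed to match, the per-lane grouping differs between the two helpers (reference sketch, not part of the diff):

    #include <stdint.h>

    // reference: sum of n signed 8-bit products, accumulated in 32 bits
    static int32_t dot_s8_ref(const int8_t * a, const int8_t * b, int n) {
        int32_t sum = 0;
        for (int i = 0; i < n; i++) {
            sum += (int32_t) a[i] * (int32_t) b[i];
        }
        return sum; // equals the lane sum of the NEON helpers for n == 16
    }
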
@@ -109,6 +109,96 @@ static void simd_gemm(
        C += N;
    }
}
#elif defined(GGML_SIMD) && defined(__riscv_v_intrinsic)
// RM accumulators + 1 B vector = RM + 1 <= 8 => RM <= 7
// Microkernel: C[RM x vl] += A[RM x K] * B[K x N]
template <int RM>
static inline void rvv_simd_gemm_ukernel(
    float       * GGML_RESTRICT C,
    const float * GGML_RESTRICT A,
    const float * GGML_RESTRICT B,
    int K, int N, size_t vl)
{
    static_assert(RM >= 1 && RM <= 7, "RM must be 1..7 for LMUL=4");

    vfloat32m4_t acc_0 = __riscv_vle32_v_f32m4(C + 0 * N, vl);
    vfloat32m4_t acc_1, acc_2, acc_3, acc_4, acc_5, acc_6;
    if constexpr (RM > 1) acc_1 = __riscv_vle32_v_f32m4(C + 1 * N, vl);
    if constexpr (RM > 2) acc_2 = __riscv_vle32_v_f32m4(C + 2 * N, vl);
    if constexpr (RM > 3) acc_3 = __riscv_vle32_v_f32m4(C + 3 * N, vl);
    if constexpr (RM > 4) acc_4 = __riscv_vle32_v_f32m4(C + 4 * N, vl);
    if constexpr (RM > 5) acc_5 = __riscv_vle32_v_f32m4(C + 5 * N, vl);
    if constexpr (RM > 6) acc_6 = __riscv_vle32_v_f32m4(C + 6 * N, vl);

    for (int kk = 0; kk < K; kk++) {
        vfloat32m4_t b_0 = __riscv_vle32_v_f32m4(B + kk * N, vl);

        acc_0 = __riscv_vfmacc_vf_f32m4(acc_0, A[0 * K + kk], b_0, vl);
        if constexpr (RM > 1) acc_1 = __riscv_vfmacc_vf_f32m4(acc_1, A[1 * K + kk], b_0, vl);
        if constexpr (RM > 2) acc_2 = __riscv_vfmacc_vf_f32m4(acc_2, A[2 * K + kk], b_0, vl);
        if constexpr (RM > 3) acc_3 = __riscv_vfmacc_vf_f32m4(acc_3, A[3 * K + kk], b_0, vl);
        if constexpr (RM > 4) acc_4 = __riscv_vfmacc_vf_f32m4(acc_4, A[4 * K + kk], b_0, vl);
        if constexpr (RM > 5) acc_5 = __riscv_vfmacc_vf_f32m4(acc_5, A[5 * K + kk], b_0, vl);
        if constexpr (RM > 6) acc_6 = __riscv_vfmacc_vf_f32m4(acc_6, A[6 * K + kk], b_0, vl);
    }

    __riscv_vse32_v_f32m4(C + 0 * N, acc_0, vl);
    if constexpr (RM > 1) __riscv_vse32_v_f32m4(C + 1 * N, acc_1, vl);
    if constexpr (RM > 2) __riscv_vse32_v_f32m4(C + 2 * N, acc_2, vl);
    if constexpr (RM > 3) __riscv_vse32_v_f32m4(C + 3 * N, acc_3, vl);
    if constexpr (RM > 4) __riscv_vse32_v_f32m4(C + 4 * N, acc_4, vl);
    if constexpr (RM > 5) __riscv_vse32_v_f32m4(C + 5 * N, acc_5, vl);
    if constexpr (RM > 6) __riscv_vse32_v_f32m4(C + 6 * N, acc_6, vl);
}

template <int RM>
static inline void rvv_simd_gemm_dispatch_tail(
    float       * GGML_RESTRICT C,
    const float * GGML_RESTRICT A,
    const float * GGML_RESTRICT B,
    int K, int N, int KN, int remaining_rows)
{
    if constexpr (RM > 0) {
        if (remaining_rows == RM) {
            int64_t jj = 0;
            for (; jj + KN <= N; jj += KN) {
                rvv_simd_gemm_ukernel<RM>(C + jj, A, B + jj, K, N, KN);
            }
            if (jj < N) {
                rvv_simd_gemm_ukernel<RM>(C + jj, A, B + jj, K, N, N - jj);
            }
        } else {
            rvv_simd_gemm_dispatch_tail<RM - 1>(C, A, B, K, N, KN, remaining_rows);
        }
    }
}

static constexpr int GEMM_RM = 7;

// C[M x N] += A[M x K] * B[K x N]
static void simd_gemm(
    float       * GGML_RESTRICT C,
    const float * GGML_RESTRICT A,
    const float * GGML_RESTRICT B,
    int M, int K, int N)
{
    const int KN = (int)__riscv_vlenb();
    int64_t ii = 0;
    for (; ii + GEMM_RM <= M; ii += GEMM_RM) {
        int64_t jj = 0;
        for (; jj + KN <= N; jj += KN) {
            rvv_simd_gemm_ukernel<GEMM_RM>(C + jj, A, B + jj, K, N, KN);
        }
        if (jj < N) {
            rvv_simd_gemm_ukernel<GEMM_RM>(C + jj, A, B + jj, K, N, N - jj);
        }
        A += GEMM_RM * K;
        C += GEMM_RM * N;
    }

    int remaining_rows = M - ii;
    rvv_simd_gemm_dispatch_tail<GEMM_RM - 1>(C, A, B, K, N, KN, remaining_rows);
}

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop

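A host-side usage sketch of this RVV path (illustrative only; it assumes the caller owns the accumulator C, which simd_gemm adds into rather than overwrites). Note the design choice in the kernel: with LMUL=4 and f32 elements, one vector register group holds __riscv_vlenb() elements, which is why KN is set to vlenb and why at most 7 row accumulators plus one B vector fit in the 8 available m4 groups.

    // C[M x N] += A[M x K] * B[K x N], all row-major fp32
    float A[4*8]  = { /* ... */ };
    float B[8*16] = { /* ... */ };
    float C[4*16] = {0};          // start from zero to get a plain product
    simd_gemm(C, A, B, /*M=*/4, /*K=*/8, /*N=*/16);
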
@@ -924,6 +924,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_F16> {
|
||||
static constexpr int qr = 1;
|
||||
};
|
||||
|
||||
template<>
|
||||
struct ggml_cuda_type_traits<GGML_TYPE_Q1_0> {
|
||||
static constexpr int qk = QK1_0;
|
||||
static constexpr int qr = QR1_0;
|
||||
static constexpr int qi = QI1_0;
|
||||
};
|
||||
|
||||
template<>
|
||||
struct ggml_cuda_type_traits<GGML_TYPE_Q4_0> {
|
||||
static constexpr int qk = QK4_0;
|
||||
@@ -1092,10 +1099,6 @@ struct ggml_cuda_device_info {
|
||||
cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};
|
||||
|
||||
std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
|
||||
|
||||
#ifdef GGML_USE_NCCL
|
||||
ncclComm_t comms[GGML_CUDA_MAX_DEVICES];
|
||||
#endif // GGML_USE_NCCL
|
||||
};
|
||||
|
||||
const ggml_cuda_device_info & ggml_cuda_info();
|
||||
@@ -1183,6 +1186,8 @@ struct ggml_cuda_graph {
|
||||
std::vector<cudaGraphNode_t> nodes;
|
||||
bool disable_due_to_gpu_arch = false;
|
||||
bool warmup_complete = false;
|
||||
uint64_t uid = 0;
|
||||
int64_t last_used_time = 0;
|
||||
struct node_properties {
|
||||
ggml_tensor node;
|
||||
void * node_src_data_ptrs[GGML_MAX_SRC];
|
||||
@@ -1364,12 +1369,28 @@ struct ggml_backend_cuda_context {
|
||||
// when the computation is split across CPU/GPU (e.g., with --n-cpu-moe)
|
||||
std::unordered_map<const void *, std::unique_ptr<ggml_cuda_graph>> cuda_graphs;
|
||||
|
||||
int64_t last_graph_eviction_sweep = 0;
|
||||
|
||||
ggml_cuda_graph * cuda_graph(const void * first_node_ptr) {
|
||||
const int64_t time_now = ggml_time_us();
|
||||
|
||||
// sweep every 5s, evicting cuda graphs unused for >=10s
|
||||
if (time_now - last_graph_eviction_sweep >= 5'000'000) {
|
||||
last_graph_eviction_sweep = time_now;
|
||||
for (auto it = cuda_graphs.begin(); it != cuda_graphs.end(); ) {
|
||||
if (time_now - it->second->last_used_time >= 10'000'000) {
|
||||
it = cuda_graphs.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto it = cuda_graphs.find(first_node_ptr);
|
||||
if (it == cuda_graphs.end()) {
|
||||
cuda_graphs[first_node_ptr] = std::make_unique<ggml_cuda_graph>();
|
||||
return cuda_graphs[first_node_ptr].get();
|
||||
it = cuda_graphs.emplace(first_node_ptr, std::make_unique<ggml_cuda_graph>()).first;
|
||||
}
|
||||
it->second->last_used_time = time_now;
|
||||
return it->second.get();
|
||||
}
|
||||
|
||||
|
||||
@@ -711,6 +711,8 @@ to_bf16_cuda_t ggml_get_to_bf16_cuda(ggml_type type) {
|
||||
|
||||
to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
return dequantize_block_cont_cuda<QK1_0, QR1_0, dequantize_q1_0>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_row_q4_0_cuda;
|
||||
case GGML_TYPE_Q4_1:
|
||||
@@ -767,6 +769,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||
|
||||
to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
return dequantize_block_cont_cuda<QK1_0, QR1_0, dequantize_q1_0>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_row_q4_0_cuda;
|
||||
case GGML_TYPE_Q4_1:
|
||||
@@ -822,6 +826,8 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float>;
|
||||
case GGML_TYPE_Q1_0:
|
||||
return dequantize_block_cuda<QK1_0, QR1_0, dequantize_q1_0>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
case GGML_TYPE_Q4_1:
|
||||
@@ -843,6 +849,8 @@ to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float, nv_bfloat16>;
|
||||
case GGML_TYPE_Q1_0:
|
||||
return dequantize_block_cuda<QK1_0, QR1_0, dequantize_q1_0>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
case GGML_TYPE_Q4_1:
|
||||
@@ -864,6 +872,8 @@ to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cuda<half, float>;
|
||||
case GGML_TYPE_Q1_0:
|
||||
return dequantize_block_cuda<QK1_0, QR1_0, dequantize_q1_0>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
case GGML_TYPE_Q4_1:
|
||||
|
||||
@@ -1,5 +1,27 @@
#include "common.cuh"

static __device__ __forceinline__ void dequantize_q1_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
    const block_q1_0 * x = (const block_q1_0 *) vx;

    const float d = x[ib].d;

    const int bit_index_0 = iqs;
    const int bit_index_1 = iqs + 1;

    const int byte_index_0 = bit_index_0 / 8;
    const int bit_offset_0 = bit_index_0 % 8;

    const int byte_index_1 = bit_index_1 / 8;
    const int bit_offset_1 = bit_index_1 % 8;

    // Extract bits: 1 = +d, 0 = -d (branchless)
    const int bit_0 = (x[ib].qs[byte_index_0] >> bit_offset_0) & 1;
    const int bit_1 = (x[ib].qs[byte_index_1] >> bit_offset_1) & 1;

    v.x = (2*bit_0 - 1) * d;
    v.y = (2*bit_1 - 1) * d;
}

static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
    const block_q4_0 * x = (const block_q4_0 *) vx;

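A host-side reference of the same Q1_0 decode (a sketch; the block layout — one scale plus a bit array where bit i carries the sign of element i — is inferred from the kernel above, and the block size of 128 elements is the QK1_0 value used elsewhere in this diff):

    #include <stdint.h>

    // bit i of qs selects the sign of element i: 1 -> +d, 0 -> -d
    static void dequantize_q1_0_ref(float d, const uint8_t * qs, float * out, int n /* = 128 */) {
        for (int i = 0; i < n; i++) {
            const int bit = (qs[i / 8] >> (i % 8)) & 1;
            out[i] = (2*bit - 1) * d;
        }
    }
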
@@ -179,6 +179,10 @@ static void ggml_cuda_get_rows_switch_src0_type(
|
||||
get_rows_cuda_float((const nv_bfloat16 *) src0_d, src1_d, dst_d,
|
||||
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q1_0:
|
||||
get_rows_cuda_q<QK1_0, QR1_0, dequantize_q1_0>(src0_d, src1_d, dst_d,
|
||||
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
get_rows_cuda_q<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_d, dst_d,
|
||||
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
|
||||
|
||||
@@ -324,28 +324,22 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
// configure logging to stdout
|
||||
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
|
||||
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
ggml_cuda_set_device(id);
|
||||
for (int id_other = 0; id_other < info.device_count; ++id_other) {
|
||||
if (id == id_other) {
|
||||
continue;
|
||||
}
|
||||
int can_access_peer;
|
||||
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
|
||||
if (can_access_peer) {
|
||||
CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
|
||||
if (getenv("GGML_CUDA_P2P") != nullptr) {
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
ggml_cuda_set_device(id);
|
||||
for (int id_other = 0; id_other < info.device_count; ++id_other) {
|
||||
if (id == id_other) {
|
||||
continue;
|
||||
}
|
||||
int can_access_peer;
|
||||
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
|
||||
if (can_access_peer) {
|
||||
CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_NCCL
|
||||
int dev_ids[GGML_CUDA_MAX_DEVICES];
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
dev_ids[id] = id;
|
||||
}
|
||||
NCCL_CHECK(ncclCommInitAll(info.comms, info.device_count, dev_ids));
|
||||
#endif // GGML_USE_NCCL
|
||||
|
||||
return info;
|
||||
}
|
||||
|
||||
@@ -1125,66 +1119,51 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
|
||||
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
||||
};
|
||||
|
||||
bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends) {
|
||||
#ifdef GGML_USE_NCCL
|
||||
const int64_t ne = ggml_nelements(tensors[0]);
|
||||
// FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0
|
||||
// This then causes a crash in this function
|
||||
if (ne == 0) {
|
||||
return true;
|
||||
}
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
GGML_ASSERT(tensors[i] != nullptr);
|
||||
GGML_ASSERT(ggml_nelements(tensors[i]) == ne);
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(tensors[i]));
|
||||
}
|
||||
struct ggml_backend_cuda_comm_context {
|
||||
std::vector<ggml_backend_t> backends;
|
||||
std::vector<ncclComm_t> comms;
|
||||
|
||||
const ggml_cuda_device_info info = ggml_cuda_info();
|
||||
|
||||
// For small tensors, simply reduce them as FP32.
|
||||
// The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
|
||||
if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
|
||||
NCCL_CHECK(ncclGroupStart());
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
|
||||
NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, info.comms[cuda_ctx->device], cuda_ctx->stream()));
|
||||
~ggml_backend_cuda_comm_context() {
|
||||
for (ncclComm_t comm : comms) {
|
||||
NCCL_CHECK(ncclCommDestroy(comm));
|
||||
}
|
||||
NCCL_CHECK(ncclGroupEnd());
|
||||
|
||||
return true;
|
||||
}
|
||||
};
|
||||
#endif // GGML_USE_NCCL
|
||||
|
||||
// For large tensors it's faster to compress them to BF16 for the reduction:
|
||||
to_bf16_cuda_t to_bf16 = ggml_get_to_bf16_cuda(GGML_TYPE_F32);
|
||||
to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
|
||||
static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
|
||||
#ifdef GGML_USE_NCCL
|
||||
if (comm_ctx_v == nullptr) {
|
||||
return;
|
||||
}
|
||||
ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
|
||||
delete comm_ctx;
|
||||
#else
|
||||
GGML_UNUSED(comm_ctx_v);
|
||||
#endif // GGML_USE_NCCL
|
||||
}
|
||||
|
||||
ggml_cuda_pool_alloc<nv_bfloat16> tmp[GGML_CUDA_MAX_DEVICES];
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
|
||||
#ifdef GGML_USE_NCCL
|
||||
for (size_t i = 0; i < n_backends; i++) {
|
||||
if (!ggml_backend_is_cuda(backends[i])) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
ggml_backend_cuda_comm_context * ret = new ggml_backend_cuda_comm_context;
|
||||
std::vector<int> dev_ids;
|
||||
ret->backends.reserve(n_backends);
|
||||
dev_ids.reserve(n_backends);
|
||||
for (size_t i = 0; i < n_backends; i++) {
|
||||
ret->backends.push_back(backends[i]);
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
|
||||
tmp[i].pool = &cuda_ctx->pool();
|
||||
tmp[i].alloc(ne);
|
||||
|
||||
ggml_cuda_set_device(i);
|
||||
to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
dev_ids.push_back(cuda_ctx->device);
|
||||
}
|
||||
|
||||
NCCL_CHECK(ncclGroupStart());
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
|
||||
NCCL_CHECK(ncclAllReduce(tmp[i].get(), tmp[i].get(), ne, ncclBfloat16, ncclSum, info.comms[cuda_ctx->device], cuda_ctx->stream()));
|
||||
}
|
||||
NCCL_CHECK(ncclGroupEnd());
|
||||
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
|
||||
|
||||
ggml_cuda_set_device(i);
|
||||
to_fp32(tmp[i].get(), (float *) tensors[i]->data, ne, cuda_ctx->stream());
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
return true;
|
||||
ret->comms.resize(n_backends);
|
||||
NCCL_CHECK(ncclCommInitAll(ret->comms.data(), n_backends, dev_ids.data()));
|
||||
return ret;
|
||||
#else
|
||||
// If NCCL is installed it is used by default for optimal performance.
|
||||
// However, NVIDIA does not distribute NCCL with CUDA so users may be unwittingly missing this package.
|
||||
@@ -1197,7 +1176,76 @@ bool ggml_backend_cuda_allreduce_tensor(ggml_backend_t * backends, struct ggml_t
|
||||
warning_printed = true;
|
||||
}
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
GGML_UNUSED_VARS(backends, tensors, n_backends);
|
||||
GGML_UNUSED_VARS(backends, n_backends);
|
||||
return nullptr;
|
||||
#endif // GGML_USE_NCCL
|
||||
}
|
||||
|
||||
static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
#ifdef GGML_USE_NCCL
    const int64_t ne = ggml_nelements(tensors[0]);
    // FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0
    // This then causes a crash in this function
    if (ne == 0) {
        return true;
    }

    GGML_ASSERT(comm_ctx_v != nullptr);
    ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
    const size_t n_backends = comm_ctx->backends.size();

    for (size_t i = 0; i < n_backends; ++i) {
        GGML_ASSERT(tensors[i] != nullptr);
        GGML_ASSERT(ggml_nelements(tensors[i]) == ne);
        GGML_ASSERT(ggml_is_contiguously_allocated(tensors[i]));
    }

    // For small tensors, simply reduce them as FP32.
    // The following heuristic for how "small" a tensor should be is based on RTX 4090s connected via 16x PCIe 4.0.
    if ((n_backends <= 2 && ne < 32768) || (n_backends == 3 && ne < 131072) || (n_backends >= 4 && ne < 262144)) {
        NCCL_CHECK(ncclGroupStart());
        for (size_t i = 0; i < n_backends; ++i) {
            ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
            NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, comm_ctx->comms[i], cuda_ctx->stream()));
        }
        NCCL_CHECK(ncclGroupEnd());

        return true;
    }

    // For large tensors it's faster to compress them to BF16 for the reduction:
    to_bf16_cuda_t to_bf16 = ggml_get_to_bf16_cuda(GGML_TYPE_F32);
    to_fp32_cuda_t to_fp32 = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);

    ggml_cuda_pool_alloc<nv_bfloat16> tmp[GGML_CUDA_MAX_DEVICES];
    for (size_t i = 0; i < n_backends; ++i) {
        ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
        tmp[i].pool = &cuda_ctx->pool();
        tmp[i].alloc(ne);

        ggml_cuda_set_device(cuda_ctx->device);
        to_bf16(tensors[i]->data, tmp[i].get(), ne, cuda_ctx->stream());
        CUDA_CHECK(cudaGetLastError());
    }

    NCCL_CHECK(ncclGroupStart());
    for (size_t i = 0; i < n_backends; ++i) {
        ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;
        NCCL_CHECK(ncclAllReduce(tmp[i].get(), tmp[i].get(), ne, ncclBfloat16, ncclSum, comm_ctx->comms[i], cuda_ctx->stream()));
    }
    NCCL_CHECK(ncclGroupEnd());

    for (size_t i = 0; i < n_backends; ++i) {
        ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) comm_ctx->backends[i]->context;

        ggml_cuda_set_device(cuda_ctx->device);
        to_fp32(tmp[i].get(), (float *) tensors[i]->data, ne, cuda_ctx->stream());
        CUDA_CHECK(cudaGetLastError());
    }

    return true;
#else
    GGML_UNUSED_VARS(comm_ctx_v, tensors);
    return false;
#endif // GGML_USE_NCCL
}

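The FP32-vs-BF16 decision above can be restated as a standalone predicate (for readability only; the thresholds are the tuned values from the code, measured on RTX 4090s over 16x PCIe 4.0):

    #include <cstddef>
    #include <cstdint>

    // true  -> the tensor is small enough to allreduce directly in FP32
    // false -> convert to BF16 first, reduce, then convert back (halves the traffic)
    static bool allreduce_use_fp32(size_t n_backends, int64_t ne) {
        return (n_backends <= 2 && ne <  32768) ||
               (n_backends == 3 && ne < 131072) ||
               (n_backends >= 4 && ne < 262144);
    }
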
@@ -3060,6 +3108,15 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx
|
||||
const void * graph_key = ggml_cuda_graph_get_key(cgraph);
|
||||
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
|
||||
|
||||
if (cgraph->uid != 0 &&
|
||||
cgraph->uid == graph->uid) {
|
||||
GGML_LOG_DEBUG("CUDA Graph id %zu reused\n", cgraph->uid);
|
||||
GGML_ASSERT((int)graph->node_props.size() == cgraph->n_nodes);
|
||||
return false;
|
||||
}
|
||||
|
||||
graph->uid = cgraph->uid;
|
||||
|
||||
// Check if the graph size has changed
|
||||
if ((int)graph->node_props.size() != cgraph->n_nodes) {
|
||||
res = true;
|
||||
@@ -4783,6 +4840,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
switch (a->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
@@ -4820,6 +4878,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_BF16:
|
||||
case GGML_TYPE_I32:
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
@@ -5220,8 +5279,14 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
|
||||
|
||||
static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
||||
GGML_UNUSED(reg);
|
||||
if (strcmp(name, "ggml_backend_allreduce_tensor") == 0) {
|
||||
return (void *)ggml_backend_cuda_allreduce_tensor;
|
||||
if (strcmp(name, "ggml_backend_comm_init") == 0) {
|
||||
return (void *)ggml_backend_cuda_comm_init;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_comm_free") == 0) {
|
||||
return (void *)ggml_backend_cuda_comm_free;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_comm_allreduce_tensor") == 0) {
|
||||
return (void *)ggml_backend_cuda_comm_allreduce_tensor;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
|
||||
return (void *)ggml_backend_cuda_split_buffer_type;
|
||||
|
||||
@@ -5,6 +5,9 @@
|
||||
|
||||
static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
|
||||
switch (args.type_x) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
mul_mat_q_case<GGML_TYPE_Q1_0>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
|
||||
break;
|
||||
@@ -270,6 +273,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
|
||||
bool mmq_supported;
|
||||
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
|
||||
@@ -57,6 +57,8 @@ static_assert(sizeof(block_fp4_mmq) == sizeof(block_q8_1_mmq), "Unexpected b
|
||||
|
||||
static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
|
||||
switch (type_x) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
return MMQ_Q8_1_DS_LAYOUT_D4;
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
return MMQ_Q8_1_DS_LAYOUT_DS4;
|
||||
@@ -185,6 +187,7 @@ static constexpr __device__ int get_mmq_y_device() {
|
||||
|
||||
static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml_type type, int mmq_y) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0: return MMQ_DP4A_TXS_Q8_0;
|
||||
case GGML_TYPE_Q4_0: return MMQ_DP4A_TXS_Q4_0;
|
||||
case GGML_TYPE_Q4_1: return MMQ_DP4A_TXS_Q4_1;
|
||||
case GGML_TYPE_Q5_0: return MMQ_DP4A_TXS_Q8_0;
|
||||
@@ -229,6 +232,7 @@ static_assert(MMQ_MMA_TILE_X_K_NVFP4 % 8 == 4, "Wrong padding.");
|
||||
|
||||
static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
case GGML_TYPE_Q4_0: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
case GGML_TYPE_Q4_1: return MMQ_MMA_TILE_X_K_Q8_1;
|
||||
case GGML_TYPE_Q5_0: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
@@ -302,6 +306,87 @@ static constexpr __device__ int mmq_get_nwarps_device() {
|
||||
|
||||
// ------------------------------------------------------------
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q1_0(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + 2*MMQ_TILE_NE_K);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q8_0, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
|
||||
constexpr int blocks_per_iter = MMQ_ITER_K / QK1_0;
|
||||
constexpr int threads_per_row = blocks_per_iter * QI1_0;
|
||||
constexpr int nrows = warp_size / threads_per_row;
|
||||
constexpr int scale_entries_per_block = QK1_0 / QK8_1;
|
||||
constexpr int scale_entries_per_row = blocks_per_iter * scale_entries_per_block;
|
||||
|
||||
const int txi = threadIdx.x % threads_per_row;
|
||||
const int kbx = txi / QI1_0;
|
||||
const int kqsx = txi % QI1_0;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) {
|
||||
int i = i0 + threadIdx.y*nrows + threadIdx.x/threads_per_row;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const block_q1_0 * bxi = (const block_q1_0 *) x + kbx0 + i*stride + kbx;
|
||||
const int qs_offset = 4*kqsx;
|
||||
const int qs0 = bxi->qs[qs_offset + 0] | (bxi->qs[qs_offset + 1] << 8) |
|
||||
(bxi->qs[qs_offset + 2] << 16) | (bxi->qs[qs_offset + 3] << 24);
|
||||
|
||||
int unpacked_bytes[8];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
const int shift = j * 4;
|
||||
const int bits4 = (qs0 >> shift) & 0x0F;
|
||||
const int b0 = (bits4 & 0x01) ? 1 : -1;
|
||||
const int b1 = (bits4 & 0x02) ? 1 : -1;
|
||||
const int b2 = (bits4 & 0x04) ? 1 : -1;
|
||||
const int b3 = (bits4 & 0x08) ? 1 : -1;
|
||||
unpacked_bytes[j] = (b0 & 0xFF) | ((b1 & 0xFF) << 8) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 24);
|
||||
}
|
||||
|
||||
const int dst_offset = kbx*(scale_entries_per_block*QI8_0) + kqsx*QI8_0;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + dst_offset + j] = unpacked_bytes[j];
|
||||
#else
|
||||
x_qs[i*(2*MMQ_TILE_NE_K + 1) + dst_offset + j] = unpacked_bytes[j];
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
const int ksx = threadIdx.x % scale_entries_per_row;
|
||||
const int scale_block = ksx / scale_entries_per_block;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
||||
int i = i0 + threadIdx.y;
|
||||
|
||||
if (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const block_q1_0 * bxi = (const block_q1_0 *) x + kbx0 + i*stride + scale_block;
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + ksx] = bxi->d;
|
||||
#else
|
||||
x_df[i*(2*MMQ_TILE_NE_K/QI8_0) + i/(QI8_0/2) + ksx] = bxi->d;
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
|
||||
template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
|
||||
const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
@@ -3290,6 +3375,14 @@ static __device__ __forceinline__ void mmq_write_back_mma(
|
||||
template <int mmq_x, int mmq_y, bool need_check, ggml_type type>
|
||||
struct mmq_type_traits;
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q1_0> {
|
||||
static constexpr int vdr = VDR_Q1_0_Q8_1_MMQ;
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q1_0<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q4_0> {
|
||||
static constexpr int vdr = VDR_Q4_0_Q8_1_MMQ;
|
||||
|
||||
@@ -9,6 +9,7 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
|
||||
|
||||
static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0: return vec_dot_q1_0_q8_1;
|
||||
case GGML_TYPE_Q4_0: return vec_dot_q4_0_q8_1;
|
||||
case GGML_TYPE_Q4_1: return vec_dot_q4_1_q8_1;
|
||||
case GGML_TYPE_Q5_0: return vec_dot_q5_0_q8_1;
|
||||
@@ -36,6 +37,7 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
|
||||
|
||||
static constexpr __host__ __device__ int get_vdr_mmvq(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0: return VDR_Q1_0_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q4_0: return VDR_Q4_0_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q4_1: return VDR_Q4_1_Q8_1_MMVQ;
|
||||
case GGML_TYPE_Q5_0: return VDR_Q5_0_Q8_1_MMVQ;
|
||||
@@ -886,6 +888,12 @@ static void mul_mat_vec_q_switch_type(
|
||||
const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
|
||||
const int ids_stride, cudaStream_t stream) {
|
||||
switch (type_x) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q1_0>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_0>
|
||||
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
|
||||
|
||||
@@ -32,6 +32,7 @@ SOURCE_FATTN_MMA_START = """// This file has been autogenerated by generate_cu_f
|
||||
SOURCE_FATTN_MMA_CASE = "DECL_FATTN_MMA_F16_CASE({head_size_kq}, {head_size_v}, {ncols1}, {ncols2});\n"
|
||||
|
||||
TYPES_MMQ = [
|
||||
"GGML_TYPE_Q1_0",
|
||||
"GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
|
||||
"GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
|
||||
"GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmq.cuh"
|
||||
|
||||
DECL_MMQ_CASE(GGML_TYPE_Q1_0);
|
||||
@@ -106,6 +106,9 @@ static __device__ __forceinline__ uint32_t unpack_ksigns(const uint8_t v) {
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q

#define VDR_Q1_0_Q8_1_MMVQ 1  // Process one 32-element chunk at a time for parallelism
#define VDR_Q1_0_Q8_1_MMQ  4  // Q1_0 has 128 bits (4 ints) per block

#define VDR_Q4_0_Q8_1_MMVQ 2
#define VDR_Q4_0_Q8_1_MMQ  4

@@ -669,6 +672,51 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
|
||||
return d6 * sumf_d;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_q1_0_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
|
||||
|
||||
const block_q1_0 * bq1_0 = (const block_q1_0 *) vbq + kbx;
|
||||
|
||||
// Q1_0: 128 elements with ONE scale
|
||||
// Q8_1: 32 elements per block with individual scales
|
||||
// iqs selects which of the 4 chunks of 32 elements to process (0-3)
|
||||
|
||||
const float d1 = bq1_0->d;
|
||||
|
||||
// Process only the chunk specified by iqs
|
||||
const block_q8_1 * bq8_1_chunk = bq8_1 + iqs;
|
||||
|
||||
// Load 32 bits (4 bytes) for this chunk from Q1_0
|
||||
const int offset = iqs * 4;
|
||||
const int v = bq1_0->qs[offset + 0] | (bq1_0->qs[offset + 1] << 8) |
|
||||
(bq1_0->qs[offset + 2] << 16) | (bq1_0->qs[offset + 3] << 24);
|
||||
|
||||
// Unpack 32 bits into 32 signed values (-1 or +1)
|
||||
int vi_bytes[8];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
const int shift = j * 4;
|
||||
const int bits4 = (v >> shift) & 0x0F;
|
||||
const int b0 = (bits4 & 0x01) ? 1 : -1;
|
||||
const int b1 = (bits4 & 0x02) ? 1 : -1;
|
||||
const int b2 = (bits4 & 0x04) ? 1 : -1;
|
||||
const int b3 = (bits4 & 0x08) ? 1 : -1;
|
||||
vi_bytes[j] = (b0 & 0xFF) | ((b1 & 0xFF) << 8) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 24);
|
||||
}
|
||||
|
||||
// Compute dot product for this 32-element chunk
|
||||
int sumi = 0;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
const int u = get_int_b4(bq8_1_chunk->qs, j);
|
||||
sumi = ggml_cuda_dp4a(vi_bytes[j], u, sumi);
|
||||
}
|
||||
|
||||
// Apply Q1_0's single scale and this chunk's Q8_1 scale
|
||||
const float d8 = __low2float(bq8_1_chunk->ds);
|
||||
return d1 * d8 * sumi;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
|
||||
|
||||
|
||||
@@ -1,56 +0,0 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

// This is a "staging" header for new ggml API
// It is not publicly available and it should not be used by 3rd party projects
//
// When the API matures enough, it will be moved to the official public API

//
// Meta backend
//

#define GGML_BACKEND_META_MAX_DEVICES 16

enum ggml_backend_meta_split_axis {
    // tensor split by tensor dimensions:
    GGML_BACKEND_SPLIT_AXIS_0 = 0,
    GGML_BACKEND_SPLIT_AXIS_1 = 1,
    GGML_BACKEND_SPLIT_AXIS_2 = 2,
    GGML_BACKEND_SPLIT_AXIS_3 = 3,

    GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
    GGML_BACKEND_SPLIT_AXIS_PARTIAL  = 11, // each backend has a partial sum

    // for internal bookkeeping only:
    GGML_BACKEND_SPLIT_AXIS_NONE    = 98,
    GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99,
};

GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);

struct ggml_backend_meta_split_state {
    enum ggml_backend_meta_split_axis axis;

    // for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
    // - each device has a slice of the tensor along the split axis
    // - most tensors have n_segments == 1 and a contiguous slice of the tensor data
    // - some tensors have an inhomogeneous data layout along the split axis,
    //   those tensors are divided into segments which are each individually split across devices
    // - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
    //   the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1]
    // - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
    //   that each need to be split individually across devices so that each device gets a slice of Q, K, and V
    int64_t  ne[16*GGML_BACKEND_META_MAX_DEVICES];
    uint32_t n_segments;
};

// function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
typedef struct ggml_backend_meta_split_state (*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);

// create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
// TODO: this looks a bit strange - a backend API creates a device. I think we should try
//       express this as a backend registry functionality instead
GGML_API ggml_backend_dev_t ggml_backend_meta_device(
    ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);

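For orientation, here is a minimal sketch of what a get_split_state callback could look like against the (now removed) staging header above. The heuristic (split 2D weights along axis 1, mirror everything else) and the userdata layout are illustrative assumptions, not the scheme the backend actually uses.

// Illustrative only: userdata is assumed to point at an int holding the device count.
static struct ggml_backend_meta_split_state example_get_split_state(
        const struct ggml_tensor * tensor, void * userdata) {
    const int n_devs = *(const int *) userdata;

    struct ggml_backend_meta_split_state st = {0};
    st.n_segments = 1;

    if (tensor->ne[1] >= n_devs && tensor->ne[1] % n_devs == 0) {
        // one contiguous slice of rows per device, summing to ne[1]
        st.axis = GGML_BACKEND_SPLIT_AXIS_1;
        for (int d = 0; d < n_devs; ++d) {
            st.ne[d] = tensor->ne[1] / n_devs;
        }
    } else {
        // small or oddly-shaped tensors: keep a full copy everywhere
        st.axis  = GGML_BACKEND_SPLIT_AXIS_MIRRORED;
        st.ne[0] = tensor->ne[1]; // ne is not meaningful for mirrored tensors here
    }
    return st;
}

A hypothetical call site would then be ggml_backend_meta_device(devs, n_devs, example_get_split_state, &n_devs_int), after which buffers and backends are derived from the returned meta device.
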
@@ -47,6 +47,7 @@ list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)

if (_hmx_idx GREATER_EQUAL 0)
    target_sources(${HTP_LIB} PRIVATE
        hmx-queue.c
        hmx-matmul-ops.c
    )

@@ -31,6 +31,14 @@ static inline uint64_t hex_get_pktcnt() {
    return pktcnt;
}

static inline uint32_t hex_ceil_pow2(uint32_t x) {
    if (x <= 1) { return 1; }
    int p = 2;
    x--;
    while (x >>= 1) { p <<= 1; }
    return p;
}

static inline size_t hmx_ceil_div(size_t num, size_t den) {
    return (num + den - 1) / den;
}

@@ -73,8 +81,7 @@ static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride,
#define HEX_L2_LINE_SIZE  64
#define HEX_L2_FLUSH_SIZE (128 * 1024)

static inline void hex_l2flush(void * addr, size_t size)
{
static inline void hex_l2flush(void * addr, size_t size) {
    if (size > HEX_L2_FLUSH_SIZE) {
        qurt_mem_cache_clean((qurt_addr_t) 0, 0, QURT_MEM_CACHE_FLUSH_INVALIDATE_ALL, QURT_MEM_DCACHE);
    } else {
@@ -89,4 +96,8 @@ static inline void hex_l2flush(void * addr, size_t size)
    }
}

static inline void hex_pause() {
    asm volatile(" pause(#255)\n");
}

#endif /* HEX_UTILS_H */

@@ -16,14 +16,16 @@
#include "ggml-common.h"

#include "hex-dma.h"
#include "worker-pool.h"

#include "hvx-utils.h"
#include "hvx-dump.h"
#include "worker-pool.h"
#include "htp-ctx.h"
#include "htp-ops.h"

#include "hmx-utils.h"
#include "hmx-ops.h"
#include "hmx-utils.h"
#include "hmx-queue.h"
#include "hmx-profile.h"

static const __fp16 q4_0_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
@@ -47,7 +49,8 @@ static const __fp16 iq4_nl_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
static const int32_t weight_transpose_scatter_offsets[32] __attribute__((aligned(VLEN))) = {
     0*128,  1*128,  2*128,  3*128,  4*128,  5*128,  6*128,  7*128,
     8*128,  9*128, 10*128, 11*128, 12*128, 13*128, 14*128, 15*128,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    16*128, 17*128, 18*128, 19*128, 20*128, 21*128, 22*128, 23*128,
    24*128, 25*128, 26*128, 27*128, 28*128, 29*128, 30*128, 31*128
};

// Scales per x4x2 logical block: 8 × sizeof(__fp16) = 16 bytes
@@ -109,36 +112,45 @@ static inline bool hmx_add_overflow(size_t a, size_t b, size_t *out) {
    return false;
}

// Search for optimal (mc, nc) chunk sizes that maximize mc * nc within VTCM budget.
// Search for optimal (mc, nc) chunk sizes within VTCM budget.
//
// Cost model: total = nc * per_n_cost + mc * per_m_cost + mc * nc * per_mn_cost + overhead
//   per_n_cost:  bytes per nc column (weight + scratch buffers)
//   per_m_cost:  bytes per mc row (activation)
//   per_mn_cost: bytes per mc*nc element (output)
//   overhead:    fixed bytes (scales 256B, eye_tile 2048B, etc.)
// VTCM model: nc * per_n_cost + mc * per_m_cost + mc * nc * per_mn_cost + overhead
//
// Minimize ceil(m/mc) * m_block_cost + ceil(n/nc) * n_block_cost.
// All matmul paths repeat weight processing per M-block and activation loading
// per N-block, so discrete block counts drive total overhead.
// Tie-break: when cost is equal, prefer larger mc * nc.
//
// Caller-provided coefficients:
//   m_block_cost: penalty per extra M-block (weight redundancy, scales with n).
//   n_block_cost: penalty per extra N-block (activation redundancy, scales with m).
//
// Algorithm: nc sweeps from n_max down by 32, analytically solving for mc_max.
// Returns 0 on success, -1 if VTCM is insufficient.
static int hmx_compute_chunks(
    size_t vtcm_total, size_t overhead,
    size_t per_n_cost, size_t per_m_cost, size_t per_mn_cost,
    int m, int n,
    size_t *m_chunk_out, size_t *n_chunk_out,
    size_t *total_out)
{
static int hmx_compute_chunks(size_t   vtcm_total,
                              size_t   overhead,
                              size_t   per_n_cost,
                              size_t   per_m_cost,
                              size_t   per_mn_cost,
                              int      m,
                              int      n,
                              size_t   m_block_cost,
                              size_t   n_block_cost,
                              size_t * m_chunk_out,
                              size_t * n_chunk_out,
                              size_t * total_out) {
    if (m <= 0 || n <= 0) return -1;
    if (vtcm_total <= overhead) return -1;
    if (per_n_cost == 0 || per_m_cost == 0 || per_mn_cost == 0) return -1;

    const size_t usable = vtcm_total - overhead;
    size_t best_mn = 0, best_m = 0, best_n = 0;

    size_t best_cost = SIZE_MAX;
    size_t best_mn = 0;
    size_t best_m = 0, best_n = 0;

    const size_t n_max = hex_align_down((size_t)n, HMX_FP16_TILE_N_COLS);
    for (size_t nc = n_max; nc >= HMX_FP16_TILE_N_COLS; nc -= HMX_FP16_TILE_N_COLS) {
        // Early exit: if nc * m_max cannot beat best, smaller nc won't either
        if (nc * hex_align_down((size_t)m, HMX_FP16_TILE_N_ROWS) <= best_mn)
            break;

        size_t n_fixed = 0, ncmn = 0, mc_denom = 0;
        if (hmx_mul_overflow(nc, per_n_cost, &n_fixed)) continue;
        if (n_fixed >= usable) goto next_nc;
@@ -152,10 +164,19 @@ static int hmx_compute_chunks(
        mc = hex_align_down(mc, HMX_FP16_TILE_N_ROWS);
        mc = hex_smin(mc, (size_t)m);

        if (mc > 0 && mc * nc > best_mn) {
            best_mn = mc * nc;
            best_m = mc;
            best_n = nc;
        if (mc == 0) {
            goto next_nc;
        }

        size_t mblocks = ((size_t) m + mc - 1) / mc;
        size_t nblocks = ((size_t) n + nc - 1) / nc;
        size_t cost    = mblocks * m_block_cost + nblocks * n_block_cost;
        size_t mn      = mc * nc;
        if (cost < best_cost || (cost == best_cost && mn > best_mn)) {
            best_cost = cost;
            best_mn   = mn;
            best_m    = mc;
            best_n    = nc;
        }
    }

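To make the block-count objective concrete, the following host-side C sketch evaluates the cost expression from the comment above for two candidate chunk shapes. The shapes and weightings are illustrative (the n*3 / m*2 costs mirror the quantized-weight call site further down); it is not part of the source.

#include <stdio.h>

// cost = ceil(m/mc) * m_block_cost + ceil(n/nc) * n_block_cost
static size_t chunk_cost(size_t m, size_t n, size_t mc, size_t nc,
                         size_t m_block_cost, size_t n_block_cost) {
    const size_t mblocks = (m + mc - 1) / mc;
    const size_t nblocks = (n + nc - 1) / nc;
    return mblocks * m_block_cost + nblocks * n_block_cost;
}

int main(void) {
    const size_t m = 512, n = 4096;
    // Re-dequantizing weights (per extra M-block) is pricier than re-loading activations.
    const size_t m_block_cost = n * 3, n_block_cost = m * 2;

    // Both candidates have the same mc * nc product, but very different block counts.
    printf("A (mc=64,  nc=2048): cost=%zu\n",
           chunk_cost(m, n, 64, 2048, m_block_cost, n_block_cost));  // many M-blocks
    printf("B (mc=256, nc=512):  cost=%zu\n",
           chunk_cost(m, n, 256, 512, m_block_cost, n_block_cost));  // few M-blocks, wins
    return 0;
}
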
@@ -233,7 +254,7 @@ static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx(
    const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
    HVX_Vector v_scales = hvx_vec_splat_f16(*scale);
    // q4x4x2 stores two int4 values per byte. Keep only the selected nibble.
    HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq;
    HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
    v_quants = Q6_V_vand_VV(v_quants, mask_h4);
    // Shuffle before LUT
    v_quants = Q6_Vb_vshuff_Vb(v_quants);
@@ -257,7 +278,7 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(
    // Load all 128 packed bytes (4 contiguous 32-byte groups)
    HVX_Vector vq = hvx_vmemu(packed_128);
    const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
    HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq;
    HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
    v_quants = Q6_V_vand_VV(v_quants, mask_h4);

    // Shuffle before LUT
@@ -277,10 +298,8 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(
    v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));

    // Extract individual groups: scatter uses q_mask64 so only first 64 bytes matter
    out[0] = v_lo;                    // group0 already in [0:63]
    out[1] = Q6_V_vror_VR(v_lo, 64);  // group1 rotated to [0:63]
    out[2] = v_hi;                    // group2 already in [0:63]
    out[3] = Q6_V_vror_VR(v_hi, 64);  // group3 rotated to [0:63]
    out[0] = v_lo; // group0 already in [0:63]
    out[1] = v_hi; // group2 already in [0:63]
}

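The branchless nibble select above replaces a vector predicate with a shift by 4 * upper_nibbles. A scalar C model of the same identity, illustrative only, can be exhaustively checked on the host:

#include <assert.h>
#include <stdint.h>

// Selecting the low or high nibble of a packed q4 byte, with and without a branch.
static uint8_t select_nibble_branchy(uint8_t b, int upper)    { return (upper ? (uint8_t)(b >> 4) : b) & 0x0F; }
static uint8_t select_nibble_branchless(uint8_t b, int upper) { return (uint8_t)(b >> (4 * upper)) & 0x0F; }

int main(void) {
    for (int b = 0; b < 256; ++b) {
        for (int upper = 0; upper <= 1; ++upper) {
            assert(select_nibble_branchy((uint8_t) b, upper) ==
                   select_nibble_branchless((uint8_t) b, upper));
        }
    }
    return 0; // both forms agree for every byte and nibble selector
}
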
// Dequantize one x4x2 Q8_0 group (32 int8 quants) -> 32 FP16 in first 64 bytes.
@@ -384,8 +403,9 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
    size_t row_stride, int weight_type,
    int start_tile, int end_tile) {

    const int n_k_tiles = k_block / HMX_FP16_TILE_N_COLS;
    const int qrow_size = (weight_type == HTP_TYPE_Q8_0) ? k_block : (k_block / 2);
    const int n_k_tiles = (unsigned)k_block / HMX_FP16_TILE_N_COLS;
    const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL);
    const int qrow_size = is_q4 ? ((unsigned)k_block / 2) : k_block;

    const HVX_Vector vlut_cvt = (weight_type == HTP_TYPE_IQ4_NL) ? hvx_vmem(iq4_nl_to_fp16_lut) :
                                (weight_type == HTP_TYPE_MXFP4)  ? hvx_vmem(mxfp4_to_fp16_lut)  :
@@ -398,47 +418,46 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
    const HVX_Vector v_scat_step = Q6_V_vsplat_R(4);  // 4 bytes = 1 column step
    const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64); // first 16 words (64 bytes)

    for (int t = start_tile; t < end_tile; ) {
        int ct = t / n_k_tiles; // column tile index
        int kt = t % n_k_tiles; // K tile index
    unsigned ct = (unsigned)start_tile / n_k_tiles; // column tile index
    unsigned kt = (unsigned)start_tile % n_k_tiles; // K tile index
    for (unsigned t = start_tile; t < end_tile; ) {
        if (kt >= n_k_tiles) { kt = 0; ct++; }

        // --- Batch-4 fast path for Q4_0/IQ4_NL: process 4 contiguous K-tiles with one vlut16 per row ---
        if ((weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL) && (kt % 4 == 0) && (t + 4 <= end_tile) &&
            ((t + 3) / n_k_tiles == ct)) {
            int blk_idx = (kt * 32) / QK_Q4_0x4x2;
            int sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32; // 0 or 4
            bool upper = (sub_blk_base >= 4);
            int packed_off = blk_idx * (QK_Q4_0x4x2 / 2); // 128 contiguous packed bytes
            int scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE
                          + sub_blk_base * (int)sizeof(__fp16); // 4 consecutive scales
        // --- Batch-4 fast path for Q4: process 4 contiguous K-tiles with one vlut16 per row ---
        if (is_q4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) {
            unsigned blk_idx = (kt * 32) / QK_Q4_0x4x2;
            unsigned sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32; // 0 or 4
            bool upper = (sub_blk_base >= 4);
            unsigned packed_off = blk_idx * (QK_Q4_0x4x2 / 2); // 128 contiguous packed bytes
            unsigned scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE
                               + sub_blk_base * (int)sizeof(__fp16); // 4 consecutive scales

            __fp16 *tile_bases[4];
            for (int g = 0; g < 4; g++) { tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; }
            for (unsigned g = 0; g < 4; g++) { tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; }

            HVX_Vector v_off = v_scat_base;
            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
                int row0 = ct * HMX_FP16_TILE_N_COLS + r;
                int row1 = row0 + 1;
                const uint8_t *r0 = vtcm_src + row0 * row_stride;
                const uint8_t *r1 = vtcm_src + row1 * row_stride;

                HVX_Vector v0[4], v1[4];
            unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
            unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;

            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
                HVX_Vector v0[2];
                const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
                dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0);
                if (row1 < n_cols) {
                    dequantize_x4x2_q4_0_x4groups_hvx(r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt, v1);
                } else {
                    v1[0] = v1[1] = v1[2] = v1[3] = Q6_V_vzero();
                }

                for (int g = 0; g < 4; g++) { Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v0[g]); }
                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]);
                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]);
                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
                for (int g = 0; g < 4; g++) { Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v1[g]); }

                r0 = vtcm_src + row_offset; row_offset += row_stride;
                dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0);
                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]);
                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]);
                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
            }

            for (int g = 0; g < 4; g++) { (void) *(volatile HVX_Vector *)(tile_bases[g]); }

            t += 4;
            t += 4; kt += 4;
            continue;
        }

@@ -495,20 +514,19 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
        // --- Single-tile fallback ---
        __fp16 *tile_base = vtcm_dst + t * HMX_FP16_TILE_N_ELMS;

        if (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL) {
            int blk_idx = (kt * 32) / QK_Q4_0x4x2;
            int sub_blk = ((kt * 32) % QK_Q4_0x4x2) / 32;
            bool upper = (sub_blk >= 4);
            int byte_off = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;
            int scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16);
        if (is_q4) {
            unsigned blk_idx = (kt * 32) / QK_Q4_0x4x2;
            unsigned sub_blk = ((kt * 32) % QK_Q4_0x4x2) / 32;
            bool upper = (sub_blk >= 4);
            unsigned byte_off = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;
            unsigned scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16);

            HVX_Vector v_off = v_scat_base; // reset to column 0
            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
                int row0 = ct * HMX_FP16_TILE_N_COLS + r;
                int row1 = row0 + 1;

                const uint8_t *r0 = vtcm_src + row0 * row_stride;
                const uint8_t *r1 = vtcm_src + row1 * row_stride;
            unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
            unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
                const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
                const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;

                HVX_Vector v0 = dequantize_x4x2_q4_0_group_hvx(
                    r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
@@ -585,7 +603,7 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
            }
            (void) *(volatile HVX_Vector *)(tile_base);
        }
        ++t;
        ++t; ++kt;
    }

    // Drain HVX scatter write buffer: a vmem load on the same HW thread retires
@@ -630,9 +648,9 @@ static void dequantize_x4x2_weight_chunk_to_fp16_tiles(
    assert(n_cols % HMX_FP16_TILE_N_COLS == 0);
    assert(k_block % HMX_FP16_TILE_N_COLS == 0);

    int n_col_tiles = n_cols / HMX_FP16_TILE_N_COLS;
    int n_k_tiles   = k_block / HMX_FP16_TILE_N_COLS;
    int n_tot_tiles = n_col_tiles * n_k_tiles;
    size_t n_col_tiles = n_cols / HMX_FP16_TILE_N_COLS;
    size_t n_k_tiles   = k_block / HMX_FP16_TILE_N_COLS;
    size_t n_tot_tiles = n_col_tiles * n_k_tiles;

    size_t n_tiles_per_task = hmx_ceil_div(n_tot_tiles, ctx->n_threads);

@@ -653,49 +671,91 @@ static void dequantize_x4x2_weight_chunk_to_fp16_tiles(
// --- End x4x2 dequantizers ---

// requires external HMX lock
static void core_dot_chunk_fp16(__fp16 *output, const __fp16 *activation, const __fp16 *weight, const __fp16 *scales,
static void core_dot_chunk_fp16(__fp16 *restrict output, const __fp16 *restrict activation, const __fp16 *restrict weight, const __fp16 *restrict scales,
                                int n_row_tiles, int n_col_tiles, int n_dot_tiles) {
    hmx_set_output_scales(scales);
    __builtin_assume(n_row_tiles > 0);
    __builtin_assume(n_col_tiles > 0);
    __builtin_assume(n_dot_tiles > 0);

    Q6_bias_mxmem2_A((void *)scales);
    for (int r = 0; r < n_row_tiles; ++r) {
        for (int c = 0; c < n_col_tiles; ++c) {
        for (size_t c = 0; c < n_col_tiles; ++c) {
            Q6_mxclracc_hf();

            const __fp16 *row_tiles = activation + r * n_dot_tiles * HMX_FP16_TILE_N_ELMS;
            const __fp16 *col_tiles = weight + c * n_dot_tiles * HMX_FP16_TILE_N_ELMS;

            for (int k = 0; k < n_dot_tiles; ++k) {
                int offset = k * HMX_FP16_TILE_N_ELMS;
                hmx_load_tile_pair_fp16(row_tiles + offset, col_tiles + offset);
                Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, 2047);
                Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, 2047);
                row_tiles += HMX_FP16_TILE_N_ELMS;
                col_tiles += HMX_FP16_TILE_N_ELMS;
            }

            __fp16 *out_tile = output + (r * n_col_tiles + c) * HMX_FP16_TILE_N_ELMS;
            hmx_consume_accumulator_fp16(out_tile);
            Q6_mxmem_AR_after_hf(out_tile, 0);
        }
    }
}

// --- Async HMX matmul job (for pipeline overlap) ---

typedef struct {
    __fp16 *       output;
    const __fp16 * activation;
    const __fp16 * weight;
    const __fp16 * scales;
    uint32_t       n_row_tiles;
    uint32_t       n_col_tiles;
    uint32_t       n_dot_tiles;
} hmx_matmul_job_t;

static void hmx_matmul_worker_fn(void * data) {
    hmx_matmul_job_t * job = (hmx_matmul_job_t *) data;
    FARF(HIGH, "hmx-mm-job: n_row_tiles %u n_col_tiles %u n_dot_tiles %u", job->n_row_tiles, job->n_col_tiles, job->n_dot_tiles);
    core_dot_chunk_fp16(job->output, job->activation, job->weight, job->scales, job->n_row_tiles, job->n_col_tiles, job->n_dot_tiles);
}

static inline void hmx_matmul_job_init(hmx_matmul_job_t * job,
                                       __fp16 *           output,
                                       const __fp16 *     activation,
                                       const __fp16 *     weight,
                                       const __fp16 *     scales,
                                       int                n_row_tiles,
                                       int                n_col_tiles,
                                       int                n_dot_tiles) {
    job->output      = output;
    job->activation  = activation;
    job->weight      = weight;
    job->scales      = scales;
    job->n_row_tiles = n_row_tiles;
    job->n_col_tiles = n_col_tiles;
    job->n_dot_tiles = n_dot_tiles;
}

// --- End async HMX matmul job ---

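A hedged usage sketch of the job struct above, showing the intended overlap pattern. It assumes ctx->hmx_queue has already been created with hmx_queue_create() and that the VTCM pointers passed in are valid tile buffers; the tile counts are placeholders.

// Illustrative only: overlap one HMX matmul with HVX work on the main thread.
static void example_async_matmul(struct htp_context *ctx,
                                 __fp16 *out, const __fp16 *act,
                                 const __fp16 *wt, const __fp16 *scales) {
    hmx_matmul_job_t job;
    hmx_matmul_job_init(&job, out, act, wt, scales,
                        /*n_row_tiles=*/4, /*n_col_tiles=*/8, /*n_dot_tiles=*/16);

    // Non-blocking submit: the hmx-queue worker thread takes the HMX lock
    // and runs core_dot_chunk_fp16() on its own hardware thread.
    hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job));

    // ... HVX work here (dequantize the next weight chunk, store the
    //     previous output chunk) runs in parallel with the HMX job ...

    // Blocking wait: returns once the descriptor is marked done.
    hmx_queue_pop(ctx->hmx_queue);
}

The job descriptor must stay alive until the matching pop; the pipeline below keeps two persistent job_slots for exactly that reason.
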
static void transfer_output_chunk_fp16_to_fp32(float *restrict dst, const __fp16 *restrict vtcm_src, int n_rows, int n_cols, int n) {
    assert(n_cols % HMX_FP16_TILE_N_COLS == 0);
    const int n_col_tiles = n_cols / HMX_FP16_TILE_N_COLS;
    const size_t tile_row_stride = (n_cols / HMX_FP16_TILE_N_COLS) * HMX_FP16_TILE_N_ELMS;

    const HVX_Vector one = hvx_vec_splat_f16(1.0);

    for (int r = 0; r < n_rows; r += 2) {
        int r0 = r / HMX_FP16_TILE_N_ROWS;
        int r1 = r % HMX_FP16_TILE_N_ROWS;
    for (size_t r = 0; r < n_rows; r += 2) {
        const size_t r0 = r / HMX_FP16_TILE_N_ROWS;
        const size_t r1 = (r % HMX_FP16_TILE_N_ROWS) / 2; // index of the row pair within the tile
        const __fp16 *row_base = vtcm_src + r0 * tile_row_stride;
        float *output_row_base = dst + r * n; // global memory row base for row r (and r+1)

#pragma unroll(4)
        for (int c = 0; c < n_cols; c += HMX_FP16_TILE_N_COLS) {
            int c0 = c / HMX_FP16_TILE_N_COLS;

            const __fp16 *tile = vtcm_src + (r0 * n_col_tiles + c0) * HMX_FP16_TILE_N_ELMS;

            HVX_Vector v = ((const HVX_Vector *) tile)[r1 / 2];
        for (size_t c = 0; c < n_cols; c += HMX_FP16_TILE_N_COLS) {
            const size_t c0 = c / HMX_FP16_TILE_N_COLS;
            const __fp16 *tile = row_base + c0 * HMX_FP16_TILE_N_ELMS;
            HVX_Vector v = ((const HVX_Vector *) tile)[r1];
            HVX_VectorPair vp = Q6_Wqf32_vmpy_VhfVhf(v, one);

            volatile HVX_Vector *pv_out0 = (volatile HVX_Vector *) (dst + (r * n + c + 0));
            volatile HVX_Vector *pv_out1 = (volatile HVX_Vector *) (dst + (r * n + c + n)); // next row in global memory
            volatile HVX_Vector *pv_out0 = (volatile HVX_Vector *) (output_row_base + c + 0);
            volatile HVX_Vector *pv_out1 = (volatile HVX_Vector *) (output_row_base + c + n); // next row in global memory

            *pv_out0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(vp));
            if (r + 1 < n_rows) {
@@ -733,7 +793,7 @@ static void transfer_output_chunk_threaded(struct htp_context *ctx, float *dst,
    assert(n_cols % HMX_FP16_TILE_N_COLS == 0);

    size_t n_tot_chunks = n_rows;
    size_t n_chunks_per_task = 32; // must be multiple of HMX_FP16_TILE_N_ROWS (32)
    size_t n_chunks_per_task = HMX_FP16_TILE_N_ROWS; // must be multiple of HMX_FP16_TILE_N_ROWS (32)

    output_transfer_task_state_t state;
    state.n_tasks = (n_tot_chunks + n_chunks_per_task - 1) / n_chunks_per_task;
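The indexing in the store loop above is easier to see in scalar form. The sketch below models only the two index expressions that appear in the source (the tile-grid base and the row-pair vector index); the 32x32 tile geometry is an assumption mirroring the constants used there.

#include <stdio.h>
#include <stddef.h>

enum { TILE_ROWS = 32, TILE_COLS = 32, TILE_ELMS = TILE_ROWS * TILE_COLS };

// Matches (r0 * n_col_tiles + c0) * HMX_FP16_TILE_N_ELMS in the source:
// the fp16 offset of the tile holding row r, column c of the output chunk.
static size_t tile_base_elems(size_t r, size_t c, size_t n_col_tiles) {
    return ((r / TILE_ROWS) * n_col_tiles + c / TILE_COLS) * TILE_ELMS;
}

// Matches r1 = (r % HMX_FP16_TILE_N_ROWS) / 2: which 128-byte HVX vector of the
// tile holds the pair of rows r and r+1.
static size_t row_pair_vector(size_t r) {
    return (r % TILE_ROWS) / 2;
}

int main(void) {
    printf("tile base of (r=33, c=40) with 4 col tiles: %zu elems\n", tile_base_elems(33, 40, 4));
    printf("row 33 lives in vector %zu of its tile\n", row_pair_vector(33));
    return 0;
}
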
@@ -832,12 +892,13 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
    const size_t f32_scratch_per_m = use_dma_activation ? (size_t) params->k * sizeof(float) : 0;

    size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
    // FP16 weight: interleave and activation load have similar per-element cost.
    if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256,
                           /*per_n=*/3 * vec_dot_size,
                           /*per_m=*/group_size * vec_dot_size + f32_scratch_per_m,
                           /*per_mn=*/sizeof(__fp16),
                           params->m, params->n,
                           &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
                           /*per_n=*/3 * vec_dot_size,
                           /*per_m=*/group_size * vec_dot_size + f32_scratch_per_m,
                           /*per_mn=*/sizeof(__fp16), params->m, params->n,
                           /*m_block_cost=*/(size_t) params->n,
                           /*n_block_cost=*/(size_t) params->m, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
        FARF(HIGH, "%s: grouped path does not fit VTCM, falling back to legacy batched loop", __func__);
        return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params);
    }
@@ -864,7 +925,7 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
        return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params);
    }

    hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // fp16: 1.0
    hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // scale: 1.0, bias: 0.0 in FP16

    FARF(MEDIUM, "%s: grouped path m=%d k=%d n=%d group=%d streams=%d mc=%zu nc=%zu vtcm=%zu/%zu",
         __func__, params->m, params->k, params->n, group_size, params->ne13,
@@ -882,12 +943,15 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
    const size_t fp16_row_bytes = (size_t) params->k * sizeof(__fp16);
    const size_t weight_row_bytes = (size_t) params->weight_stride * sizeof(__fp16);

    HAP_compute_res_hmx_lock(ctx->vtcm_rctx);

    for (int b3 = 0; b3 < params->ne13; ++b3) {
        for (int b2_base = 0; b2_base < params->ne12; b2_base += group_size) {
            const __fp16 *weight_group = hmx_matmul_weight_batch_ptr(params, b2_base, b3);

            for (size_t mr = 0; mr < (size_t) params->m; mr += m_chunk_n_rows) {
                const size_t n_rows = hex_smin((size_t) params->m - mr, m_chunk_n_rows);
                const size_t n_row_tiles = hmx_ceil_div((int) n_rows, HMX_FP16_TILE_N_ROWS);

                // Pre-load activations for all heads in the group (once per m_chunk).
                // When the source is strided (permuted Q), use 2D DMA to gather
@@ -925,10 +989,9 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
                                   fp16_row_bytes, weight_row_bytes, fp16_row_bytes, n_cols_first);
                }

                HAP_compute_res_hmx_lock(ctx->vtcm_rctx);

                for (size_t nc = 0; nc < (size_t) params->n; nc += n_chunk_n_cols) {
                    const size_t n_cols = hex_smin((size_t) params->n - nc, n_chunk_n_cols);
                    const size_t n_col_tiles = hmx_ceil_div((int) n_cols, HMX_FP16_TILE_N_COLS);

                    TIMER_START(weight_load);
                    {
@@ -952,11 +1015,9 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
                    for (int g = 0; g < group_size; ++g) {
                        TIMER_START(hmx_core);
                        {
                            const __fp16 *vtcm_act_g = vtcm_activation + (size_t) g * act_head_stride;
                            const int n_row_tiles = hmx_ceil_div((int) n_rows, HMX_FP16_TILE_N_ROWS);
                            const int n_col_tiles = hmx_ceil_div((int) n_cols, HMX_FP16_TILE_N_COLS);
                            core_dot_chunk_fp16(vtcm_output, vtcm_act_g, vtcm_weight, vtcm_scales,
                                                n_row_tiles, n_col_tiles, params->k / 32);
                            const __fp16 * vtcm_act_g = vtcm_activation + (size_t) g * act_head_stride;
                            core_dot_chunk_fp16(vtcm_output, vtcm_act_g, vtcm_weight, vtcm_scales, n_row_tiles, n_col_tiles,
                                                params->k / 32);
                        }
                        TIMER_STOP(hmx_core);

@@ -968,12 +1029,12 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
                        TIMER_STOP(output_store);
                    }
                }

                HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
            }
        }
    }

    HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);

    TIMER_STOP(total);

#if defined(ENABLE_PROFILE_TIMERS)
@@ -1006,13 +1067,15 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
    const size_t f32_scratch_per_m = use_dma_activation ? (size_t) k * sizeof(float) : 0;

    size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
    // FP16 weight: interleave and activation load have similar per-element cost.
    if (hmx_compute_chunks(vtcm_budget,
                           /*overhead=*/ 256,
                           /*per_n=*/ 3 * vec_dot_size,                 // W + S0 + S1
                           /*per_m=*/ vec_dot_size + f32_scratch_per_m, // A + optional F32 scratch
                           /*per_mn=*/ sizeof(__fp16),                  // O
                           m, n,
                           &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
                           /*overhead=*/256,
                           /*per_n=*/3 * vec_dot_size,                  // W + S0 + S1
                           /*per_m=*/vec_dot_size + f32_scratch_per_m,  // A + optional F32 scratch
                           /*per_mn=*/sizeof(__fp16),                   // O
                           m, n,
                           /*m_block_cost=*/(size_t) n,
                           /*n_block_cost=*/(size_t) m, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
        FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
        return -1;
    }
@@ -1039,7 +1102,7 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
        return -1;
    }

    hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // fp16: 1.0
    hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // scale: 1.0, bias: 0.0 in FP16

    FARF(MEDIUM, "%s: m=%d k=%d n=%d mc=%zu nc=%zu vtcm=%zu/%zu",
         __func__, m, k, n, m_chunk_n_rows, n_chunk_n_cols,
@@ -1057,7 +1120,8 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co

    for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
        // transfer activation matrix chunk into VTCM
        size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
        const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
        const size_t n_row_tiles = hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS);

        TIMER_START(activation_load);
        {
@@ -1095,7 +1159,8 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
        }

        for (size_t nc = 0; nc < n; nc += n_chunk_n_cols) {
            size_t n_cols = hex_smin(n - nc, n_chunk_n_cols);
            const size_t n_cols = hex_smin(n - nc, n_chunk_n_cols);
            const size_t n_col_tiles = hmx_ceil_div(n_cols, HMX_FP16_TILE_N_COLS);

            TIMER_START(weight_load);
            {
@@ -1120,8 +1185,6 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co

            TIMER_START(hmx_core);
            {
                const int n_row_tiles = hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS);
                const int n_col_tiles = hmx_ceil_div(n_cols, HMX_FP16_TILE_N_COLS);
                core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_weight, vtcm_scales, n_row_tiles, n_col_tiles, k / 32);
            }
            TIMER_STOP(hmx_core);
@@ -1157,6 +1220,8 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict out, const float *restrict x, const uint8_t *restrict w, int m,
                                       int k, int n, int w_type);

#define FALLBACK_TO_STANDARD 1

int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict dst, const float *restrict activation,
                                     const uint8_t *restrict permuted_weight, int m, int k, int n,
                                     int weight_type) {
@@ -1169,9 +1234,12 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds

    // for large m, k (e.g. prefill FFN Down), use out-stationary version
    if (m >= 128 && k > n && n > 1024) {
        FARF(MEDIUM, "hmx_matmul_qk: OUT-STATIONARY path m=%d k=%d n=%d type=%d (K_BLOCK=512, %d K-iters with fp16 intermediate)",
             m, k, n, weight_type, (k + 511) / 512);
        return mat_mul_qk_0_d16a32_out_stationary(ctx, dst, activation, permuted_weight, m, k, n, weight_type);
        int rc = mat_mul_qk_0_d16a32_out_stationary(ctx, dst, activation, permuted_weight, m, k, n, weight_type);
        if (rc != FALLBACK_TO_STANDARD) {
            return rc; // 0 success, -1 error
        }
        FARF(MEDIUM, "hmx_matmul_qk: out-stationary fallback to standard m=%d k=%d n=%d", m, k, n);
        // fall through to standard path
    }

    size_t row_stride = get_x4x2_row_stride(weight_type, k);
@@ -1197,9 +1265,10 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
    }

    size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
    if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256,
                           per_n_cost, /*per_m=*/vec_dot_size, per_mn_cost,
                           m, n, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
    // Quantized weight: dequant ~1.5x more expensive per element than activation load.
    if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, per_n_cost, /*per_m=*/vec_dot_size, per_mn_cost, m, n,
                           /*m_block_cost=*/(size_t) n * 3,
                           /*n_block_cost=*/(size_t) m * 2, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
        FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d pipe=%d budget=%zu)",
             __func__, m, k, n, use_pipeline, vtcm_budget);
        return -1;
@@ -1237,7 +1306,7 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
        return -1;
    }

    hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // fp16: 1.0
    hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // scale: 1.0, bias: 0.0 in FP16

    FARF(MEDIUM, "%s: m=%d k=%d n=%d wtype=%d pipe=%d mc=%zu nc=%zu vtcm=%zu/%zu",
         __func__, m, k, n, weight_type, use_pipeline,
@@ -1256,12 +1325,12 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
         use_pipeline ? "PIPELINE" : "SEQUENTIAL", m_chunk_n_rows, n_chunk_n_cols,
         (size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);

    HAP_compute_res_hmx_lock(ctx->vtcm_rctx);

    if (!use_pipeline) {
        HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
        for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
            // transfer activation matrix chunk into VTCM
            size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
            const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
            const size_t n_row_tiles = hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS);

            TIMER_START(activation_load);
            {
@@ -1279,7 +1348,8 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
            }

            for (size_t nc = 0; nc < n; nc += n_chunk_n_cols) {
                size_t n_cols = hex_smin(n - nc, n_chunk_n_cols);
                const size_t n_cols = hex_smin(n - nc, n_chunk_n_cols);
                const size_t n_col_tiles = hmx_ceil_div(n_cols, HMX_FP16_TILE_N_COLS);

                TIMER_START(weight_load);
                {
@@ -1304,8 +1374,6 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds

                TIMER_START(hmx_core);
                {
                    const int n_row_tiles = hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS);
                    const int n_col_tiles = hmx_ceil_div(n_cols, HMX_FP16_TILE_N_COLS);
                    core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_weight, vtcm_scales, n_row_tiles, n_col_tiles, k / 32);
                }
                TIMER_STOP(hmx_core);
@@ -1318,20 +1386,22 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
                TIMER_STOP(output_store);
            }
        }
        HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
    } else {
        // 4-stage pipeline: DMA load (A), dequantize (B), HMX matmul (C), store (D)
        // stage B and D (dequantize and store) are expected to be on the critical path
        // HMX compute (C) runs on dedicated worker thread, overlapping with HVX stages (B, D).

        // A --> B: vtcm_qweight, 1 buffer
        // B --> C: vtcm_weight0/vtcm_weight1, 2 buffers
        // C --> D: vtcm_output0/vtcm_output1, 2 buffers

        //
        //  LD   ||A3|   |   B3  ||
        //  MM   ||     C2        ||
        //  ST   ||  D1  |        ||
        // Async timeline (C overlaps B+D):
        //   main+HVX:  [A0][Act][B0][A1][sub C0][B1‖C0][A2][wait,sub C1][D0+B2‖C1][wait,sub C2][D1‖C2][wait][D2]
        //   HMX queue:             [████ C0 ████████][████ C1 ████████████][████ C2 ████████]

        int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols);
        hmx_matmul_job_t job_slots[2]; // persistent double-buffered job descriptors

        for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
            const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);

@@ -1352,31 +1422,34 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
                transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k);
            }

            // prologue: B0, A1, C0, B1
            // prologue: B0, A1, submit C0 (async), B1 (overlaps C0)
            {
                // B0
                // B0: wait for DMA, dequant weight chunk 0
                dma_queue_pop(ctx->dma[0]);
                dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type);

                // A1
                // A1: issue DMA for weight chunk 1
                const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols);
                if (1 < n_chunk_cnt) {
                    const uint8_t *qweight_chunk_A1 = permuted_weight + n_chunk_n_cols * row_stride;
                    dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A1), row_stride, row_stride, row_stride, n_cols_A1);
                }

                // C0
                core_dot_chunk_fp16((__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[0], vtcm_scales,
                                    hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
                // submit C0 (non-blocking — HMX worker executes in parallel)
                hmx_matmul_job_init(&job_slots[0], (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation,
                                    (__fp16 *) vtcm_weight_bufs[0], vtcm_scales,
                                    hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
                                    hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
                hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[0]));

                // B1
                // B1: DMA pop + dequant (runs in parallel with C0 on HMX worker)
                if (1 < n_chunk_cnt) {
                    dma_queue_pop(ctx->dma[0]);
                    dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type);
                }
            }

            // main loop
            // main loop: wait C_i → submit C_{i+1} → D_i + B_{i+2} (parallel with C_{i+1})
            for (int i = 0; i < n_chunk_cnt; ++i) {
                const size_t nc = i * n_chunk_n_cols;
                const size_t nc_p1 = nc + 1 * n_chunk_n_cols;
@@ -1386,36 +1459,41 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
                const size_t n_cols_p1 = hex_smin(n - nc_p1, n_chunk_n_cols);
                const size_t n_cols_p2 = hex_smin(n - nc_p2, n_chunk_n_cols);

                // issue A_{i+2}
                // issue A_{i+2}: DMA push (non-blocking)
                if (i + 2 < n_chunk_cnt) {
                    const uint8_t *qweight_chunk_p2 = permuted_weight + nc_p2 * row_stride;
                    dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_p2), row_stride, row_stride, row_stride, n_cols_p2);
                }

                // wait for HMX (C_{i}) -- C_{i} is done
                // wait C_i: block until prologue/previous C completes
                hmx_queue_pop(ctx->hmx_queue);

                // result of B_{i+1} (input of C_{i+1}) should be ready now

                // issue C_{i+1}
                // submit C_{i+1} (non-blocking, overlaps with D_i + B_{i+2} below)
                // job_slots[(i+1)%2] is safe: C_i just completed, freeing slot i%2's
                // counterpart — and (i+1)%2 was last used by C_{i-1} which completed
                // before C_i was submitted.
                if (i + 1 < n_chunk_cnt) {
                    core_dot_chunk_fp16((__fp16 *) vtcm_output_bufs[(i + 1) % 2], (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2], vtcm_scales,
                                        hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
                    hmx_matmul_job_init(&job_slots[(i + 1) % 2], (__fp16 *) vtcm_output_bufs[(i + 1) % 2],
                                        (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2],
                                        vtcm_scales, hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
                                        hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
                    hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[(i + 1) % 2]));
                }

                // compute D_{i}
                // D_i: store output (multi-thread HVX, parallel with C_{i+1})
                float *output_chunk = dst + (mr * n + nc);
                transfer_output_chunk_threaded(ctx, output_chunk, vtcm_output_bufs[i % 2], n_rows, n_cols, n);

                // wait for DMA (A_{i+2}), compute B_{i+2}
                // B_{i+2}: DMA pop + dequant (multi-thread HVX, parallel with C_{i+1})
                if (i + 2 < n_chunk_cnt) {
                    dma_queue_pop(ctx->dma[0]);
                    dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type);
                }
            }
        }
    }

    HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
    hmx_queue_suspend(ctx->hmx_queue);
    }

    TIMER_STOP(total);

@@ -1434,29 +1512,36 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
}

// C += AB
void core_mma_chunk_fp16(__fp16 *c, const __fp16 *a, const __fp16 *b, const __fp16 *col_scales, const __fp16 *eye_tile,
void core_mma_chunk_fp16(__fp16 *restrict c, const __fp16 *restrict a, const __fp16 *restrict b, const __fp16 *restrict col_scales, const __fp16 *restrict eye_tile,
                         int n_row_tiles, int n_col_tiles, int n_dot_tiles, bool zero_init) {
    __builtin_assume(n_row_tiles > 0);
    __builtin_assume(n_col_tiles > 0);
    __builtin_assume(n_dot_tiles > 0);

    hmx_set_output_scales(col_scales);
    Q6_bias_mxmem2_A((void *)col_scales);

    for (int i = 0; i < n_row_tiles; ++i) {
        for (int j = 0; j < n_col_tiles; ++j) {
    const size_t dot_tile_stride = n_dot_tiles * HMX_FP16_TILE_N_ELMS;
    for (size_t i = 0; i < n_row_tiles; ++i) {
        const __fp16 *row_base = a + i * dot_tile_stride;
        __fp16 *res_base = c + i * n_col_tiles * HMX_FP16_TILE_N_ELMS;
        for (size_t j = 0; j < n_col_tiles; ++j) {
            Q6_mxclracc_hf();

            const __fp16 *row_tiles = a + i * n_dot_tiles * HMX_FP16_TILE_N_ELMS;
            const __fp16 *col_tiles = b + j * n_dot_tiles * HMX_FP16_TILE_N_ELMS;

            __fp16 *accum_tile = c + (i * n_col_tiles + j) * HMX_FP16_TILE_N_ELMS;
            const __fp16 *col_tiles = b + j * dot_tile_stride;
            const __fp16 *row_tiles = row_base;
            __fp16 *accum_tile = res_base + j * HMX_FP16_TILE_N_ELMS;
            if (!zero_init) {
                hmx_load_tile_pair_fp16(accum_tile, eye_tile);
                Q6_activation_hf_mxmem_RR((unsigned int)accum_tile, 2047);
                Q6_weight_hf_mxmem_RR((unsigned int)eye_tile, 2047);
            }

            for (int k = 0; k < n_dot_tiles; ++k) {
                int offset = k * HMX_FP16_TILE_N_ELMS;
                hmx_load_tile_pair_fp16(row_tiles + offset, col_tiles + offset);
                Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, 2047);
                Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, 2047);
                row_tiles += HMX_FP16_TILE_N_ELMS;
                col_tiles += HMX_FP16_TILE_N_ELMS;
            }

            hmx_consume_accumulator_fp16(accum_tile);
            Q6_mxmem_AR_after_hf(accum_tile, 0);
        }
    }
}

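The zero_init == false path above re-seeds the HMX accumulator with the previous partial result by feeding the stored output tile as the activation and a 32x32 identity tile (eye_tile) as the weight, since prev * I = prev. A small scalar sketch (illustrative, 4x4 instead of 32x32) of why that works:

#include <assert.h>
#include <stdio.h>

enum { N = 4 };

static void matmul_acc(float acc[N][N], const float a[N][N], const float b[N][N]) {
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j)
            for (int k = 0; k < N; ++k)
                acc[i][j] += a[i][k] * b[k][j];
}

int main(void) {
    float prev[N][N], eye[N][N] = {{0}}, acc[N][N] = {{0}};
    for (int i = 0; i < N; ++i) {
        eye[i][i] = 1.0f;
        for (int j = 0; j < N; ++j) prev[i][j] = (float) (i * N + j);
    }

    matmul_acc(acc, prev, eye); // what loading (accum_tile, eye_tile) does in hardware
    for (int i = 0; i < N; ++i)
        for (int j = 0; j < N; ++j)
            assert(acc[i][j] == prev[i][j]);

    printf("acc == prev: identity re-seed preserves the partial sum\n");
    return 0;
}
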
@@ -1540,12 +1625,41 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict

    const size_t vtcm_budget = ctx->vtcm_size;

    const size_t M_BLOCK_SIZE = 512;
    const size_t N_BLOCK_SIZE = 512;
    const size_t K_BLOCK_SIZE = 512;
    const size_t K_BLOCK_SIZE = 1024;

    // Compute precise buffer sizes
    // Fallback: if k doesn't need K-blocking, out-stationary has no advantage
    const size_t k_iters_check = (k + K_BLOCK_SIZE - 1) / K_BLOCK_SIZE;
    if (k_iters_check <= 1) {
        FARF(MEDIUM, "%s: K_BLK=%zu >= k=%d, fallback to standard path", __func__, K_BLOCK_SIZE, k);
        return FALLBACK_TO_STANDARD;
    }

    // Dynamic M,N search via hmx_compute_chunks
    const size_t sub_row_stride_alloc = get_x4x2_row_stride(weight_type, K_BLOCK_SIZE);
    const size_t per_m = K_BLOCK_SIZE * sizeof(float)    // scratch1: M×K×4 (act DMA staging F32)
                       + K_BLOCK_SIZE * sizeof(__fp16);  // activation: M×K×2 (F16 tiles)
    const size_t per_n = sub_row_stride_alloc            // scratch0: N×sub_row(K) (packed quant)
                       + K_BLOCK_SIZE * sizeof(__fp16);  // weight: N×K×2 (F16 tiles)
    const size_t per_mn = sizeof(__fp16);                // output: M×N×2 (out-stationary)
    // Alignment margin: hex_align_up can add up to 2047 bytes per buffer;
    // scratch1 (mc×6144) is naturally 2048-aligned, remaining 4 buffers need margin
    const size_t align_margin = 4 * HMX_FP16_TILE_SIZE;
    const size_t overhead = HMX_FP16_TILE_SIZE + 256 + align_margin; // eye_tile + scales + alignment

    size_t M_BLOCK_SIZE, N_BLOCK_SIZE, vtcm_used;
    // Cost-based search: minimize ceil(m/mc)*m_block_cost + ceil(n/nc)*n_block_cost.
    // From profiling: wt_dequant per element ≈ 1.5× activation load per element.
    // m_block_cost = n*3: each extra M-block re-dequants all N×K weight (expensive).
    // n_block_cost = m*2: each extra N-block re-loads all M×K activation (cheaper).
    const size_t m_block_cost = (size_t) n * 3;
    const size_t n_block_cost = (size_t) m * 2;
    if (hmx_compute_chunks(vtcm_budget, overhead, per_n, per_m, per_mn, m, n, m_block_cost, n_block_cost, &M_BLOCK_SIZE,
                           &N_BLOCK_SIZE, &vtcm_used) != 0) {
        FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
        return -1;
    }

    // Compute precise buffer sizes from searched M,N and fixed K
    const size_t weight_size = hex_align_up(N_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
    const size_t act_size    = hex_align_up(M_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
    const size_t out_size    = hex_align_up(M_BLOCK_SIZE * N_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
@@ -1554,7 +1668,8 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict

    const size_t total_vtcm = weight_size + act_size + out_size + scratch0_sz + scratch1_sz + HMX_FP16_TILE_SIZE + 256;
    if (total_vtcm > vtcm_budget) {
        FARF(HIGH, "%s: VTCM too small: need %zu have %zu (m=%d k=%d n=%d)", __func__, total_vtcm, vtcm_budget, m, k, n);
        FARF(HIGH, "%s: VTCM overflow after search: need %zu have %zu (M=%zu N=%zu K=%zu)", __func__, total_vtcm,
             vtcm_budget, M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE);
        return -1;
    }

@@ -1568,8 +1683,8 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
    __fp16 *vtcm_scales = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256);
    assert((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) <= vtcm_budget);

    FARF(MEDIUM, "%s: m=%d k=%d n=%d wtype=%d vtcm=%zu/%zu", __func__, m, k, n, weight_type,
         (size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);
    FARF(HIGH, "%s: m=%d k=%d n=%d wtype=%d block M=%zu N=%zu K=%zu vtcm=%zu/%zu", __func__, m, k, n, weight_type,
         M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE, (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget);

    // initialize eye tile (32x32 identity matrix)
    {
@@ -1583,7 +1698,7 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
            v = Q6_V_vror_VR(v, VLEN - 8);
        }
    }
    hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // fp16: 1.0
    hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // scale: 1.0, bias: 0.0 in FP16

    TIMER_DEFINE(fetch);
    TIMER_DEFINE(act_load);
@@ -1601,7 +1716,7 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
    const int n_col_tiles = hmx_ceil_div(n_blk_sz, HMX_FP16_TILE_N_COLS);

    for (size_t kk = 0; kk < k; kk += K_BLOCK_SIZE) {
        size_t k_blk_sz = hex_smin(k - kk, K_BLOCK_SIZE);
        const size_t k_blk_sz = hex_smin(k - kk, K_BLOCK_SIZE);

        TIMER_START(fetch);
        // fetch activation block into VTCM
@@ -1617,13 +1732,13 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
        }

        // fetch weight block into VTCM (x4x2 sub-block: quants + scales)
        const size_t sub_row_stride = get_x4x2_row_stride(weight_type, k_blk_sz);
        {
            qweight_fetch_task_state_t s;

            const int blk_start = kk / QK_Q4_0x4x2;
            const int nb_sub = (k_blk_sz + QK_Q4_0x4x2 - 1) / QK_Q4_0x4x2;
            const int full_qrow = (weight_type == HTP_TYPE_Q8_0) ? k : (k / 2);
            const size_t sub_row_stride = get_x4x2_row_stride(weight_type, k_blk_sz);
            const int scale_blk_size =
                (weight_type == HTP_TYPE_MXFP4) ? HMX_X4X2_MXFP4_EBLK_SIZE : HMX_X4X2_DBLK_SIZE;

@@ -1663,7 +1778,6 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
            dma_queue_pop(ctx->dma[0]);
            // vtcm_scratch0 is used to store the qweight chunk
            // worker_pool_run_func already returned, so fetch is done
            const size_t sub_row_stride = get_x4x2_row_stride(weight_type, k_blk_sz);
            dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight, vtcm_scratch0,
                                                       n_blk_sz, k_blk_sz, sub_row_stride, weight_type);
        }

ggml/src/ggml-hexagon/htp/hmx-queue.c (new file, 158 lines)
@@ -0,0 +1,158 @@
#pragma clang diagnostic ignored "-Wunused-function"

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include <qurt_thread.h>
#include <qurt_futex.h>

#include <HAP_compute_res.h>

#include "hmx-queue.h"

#define QURT_LOWEST_PRIO (254)

static inline void hmx_lock(struct hmx_queue *q)
{
    if (!q->hmx_locked) {
        HAP_compute_res_hmx_lock(q->hap_rctx);
        q->hmx_locked = true;
    }
}

static inline void hmx_unlock(struct hmx_queue *q)
{
    if (q->hmx_locked) {
        HAP_compute_res_hmx_unlock(q->hap_rctx);
        q->hmx_locked = false;
    }
}

static inline void hmx_queue_process(struct hmx_queue *q, bool* killed) {
    unsigned int ir = atomic_load(&q->idx_read);

    while (ir != atomic_load(&q->idx_write)) {
        struct hmx_queue_desc *d = &q->desc[ir];
        if (!d->done) {
            FARF(HIGH, "hmx-queue-process: ir %u func %p data %p", ir, d->func, d->data);

            enum hmx_queue_signal sig = (enum hmx_queue_signal) (unsigned int) d->func;
            switch (sig) {
                case HMX_QUEUE_NOOP: /* noop */; break;
                case HMX_QUEUE_KILL: *killed = true; break;
                case HMX_QUEUE_SUSPEND: hmx_unlock(q); break;
                default:
                    hmx_lock(q);
                    d->func(d->data);
                    break;
            }

            atomic_fetch_add(&d->done, 1);
        }

        ir = (ir + 1) & q->idx_mask;
        atomic_store(&q->idx_read, ir);
    }
}

static void hmx_queue_thread(void * arg) {
    struct hmx_queue * q = (struct hmx_queue *) arg;

    FARF(HIGH, "hmx-queue-thread: started");

    bool killed = false;

    unsigned int poll_cnt = HMX_QUEUE_POLL_COUNT;
    unsigned int prev_seqn = 0;
    while (!killed) {
        unsigned int seqn = atomic_load(&q->seqn);
        if (seqn == prev_seqn) {
            if (--poll_cnt) { hex_pause(); continue; }
            FARF(HIGH, "hmx-queue-thread: sleeping");
            qurt_futex_wait(&q->seqn, prev_seqn);
            continue;
        }
        prev_seqn = seqn;
        poll_cnt = HMX_QUEUE_POLL_COUNT;

        FARF(HIGH, "hmx-queue-thread: new work");

        hmx_queue_process(q, &killed);
    }

    FARF(HIGH, "hmx-queue-thread: stopped");
}

struct hmx_queue * hmx_queue_create(size_t capacity, uint32_t hap_rctx) {
    capacity = hex_ceil_pow2(capacity);

    struct hmx_queue * q = (struct hmx_queue *) memalign(32, sizeof(struct hmx_queue));
    if (q == NULL) {
        FARF(ERROR, "%s: failed to allocate HMX queue\n", __FUNCTION__);
        return NULL;
    }
    memset(q, 0, sizeof(struct hmx_queue));
    q->capacity = capacity;
    q->idx_mask = capacity - 1;
    q->hap_rctx = hap_rctx;

    q->desc = (struct hmx_queue_desc *) memalign(64, capacity * sizeof(struct hmx_queue_desc));
    if (!q->desc) {
        FARF(ERROR, "hmx-queue: failed to allocate HMX queue descriptors\n");
        return NULL;
    }
    memset(q->desc, 0, capacity * sizeof(struct hmx_queue_desc));

    const size_t stack_size = HMX_QUEUE_THREAD_STACK_SIZE;
    q->stack = (unsigned char *) memalign(64, stack_size);
    if (!q->stack) {
        FARF(ERROR, "hmx-queue: thread stack allocation failed (%zu bytes)", stack_size);
        return NULL;
    }
    memset(q->stack, 0, stack_size);

    // Match caller thread priority (same pattern as worker-pool.c).
    int prio = qurt_thread_get_priority(qurt_thread_get_id());
    if (prio < 1) {
        prio = 1;
    }
    if (prio > QURT_LOWEST_PRIO) {
        prio = QURT_LOWEST_PRIO;
    }

    qurt_thread_attr_t attr;
    qurt_thread_attr_init(&attr);
    qurt_thread_attr_set_stack_addr(&attr, q->stack);
    qurt_thread_attr_set_stack_size(&attr, stack_size);
    qurt_thread_attr_set_priority(&attr, prio);
    qurt_thread_attr_set_name(&attr, "hmx-queue");

    int err = qurt_thread_create(&q->thread, &attr, hmx_queue_thread, q);
    if (err) {
        FARF(ERROR, "hmx-worker: thread create failed (%d)", err);
        return NULL;
    }

    FARF(HIGH, "hmx-queue: capacity %zu\n", capacity);

    return q;
}

void hmx_queue_delete(struct hmx_queue * q) {
    if (!q) {
        return;
    }

    // Tell the worker to exit.
    hmx_queue_flush(q);
    hmx_queue_signal(q, HMX_QUEUE_KILL);
    hmx_queue_flush(q);

    int status;
    qurt_thread_join(q->thread, &status);

    free(q->desc);
    free(q->stack);
    free(q);
}
ggml/src/ggml-hexagon/htp/hmx-queue.h (new file, 134 lines)
@@ -0,0 +1,134 @@
#ifndef HMX_QUEUE_H
|
||||
#define HMX_QUEUE_H
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <stdatomic.h>
|
||||
|
||||
#include <hexagon_types.h>
|
||||
#include <qurt_thread.h>
|
||||
#include <qurt_futex.h>
|
||||
#include <HAP_farf.h>
|
||||
|
||||
#include "hex-utils.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define HMX_QUEUE_THREAD_STACK_SIZE (16 * 1024)
|
||||
#define HMX_QUEUE_POLL_COUNT 2000
|
||||
|
||||
typedef void (*hmx_queue_func)(void *);
|
||||
|
||||
// Dummy funcs used as signals
|
||||
enum hmx_queue_signal {
|
||||
HMX_QUEUE_NOOP = 0, // aka NULL
|
||||
HMX_QUEUE_SUSPEND,
|
||||
HMX_QUEUE_KILL
|
||||
};
|
||||
|
||||
struct hmx_queue_desc {
|
||||
hmx_queue_func func;
|
||||
void * data;
|
||||
atomic_uint done;
|
||||
};
|
||||
|
||||
struct hmx_queue {
|
||||
struct hmx_queue_desc * desc;
|
||||
atomic_uint idx_write; // updated by producer (push)
|
||||
atomic_uint idx_read; // updated by consumer (process)
|
||||
unsigned int idx_pop; // updated by producer (pop)
|
||||
uint32_t idx_mask;
|
||||
uint32_t capacity;
|
||||
|
||||
atomic_uint seqn; // incremented for all pushes, used with futex
|
||||
qurt_thread_t thread;
|
||||
void * stack;
|
||||
uint32_t hap_rctx;
|
||||
bool hmx_locked;
|
||||
};
|
||||
|
||||
struct hmx_queue * hmx_queue_create(size_t capacity, uint32_t hap_rctx);
|
||||
void hmx_queue_delete(struct hmx_queue * q);
|
||||
|
||||
static inline struct hmx_queue_desc hmx_queue_make_desc(hmx_queue_func func, void * data) {
|
||||
struct hmx_queue_desc d = { func, data };
|
||||
return d;
|
||||
}
|
||||
|
||||
static inline bool hmx_queue_push(struct hmx_queue * q, struct hmx_queue_desc d) {
|
||||
unsigned int ir = atomic_load(&q->idx_read);
|
||||
unsigned int iw = q->idx_write;
|
||||
|
||||
if (((iw + 1) & q->idx_mask) == ir) {
|
||||
FARF(HIGH, "hmx-queue-push: queue is full\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
atomic_store(&d.done, 0);
|
||||
|
||||
FARF(HIGH, "hmx-queue-push: iw %u func %p data %p\n", iw, d.func, d.data);
|
||||
|
||||
q->desc[iw] = d;
|
||||
atomic_store(&q->idx_write, (iw + 1) & q->idx_mask);
|
||||
// wake up our thread
|
||||
atomic_fetch_add(&q->seqn, 1);
|
||||
qurt_futex_wake(&q->seqn, 1);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool hmx_queue_signal(struct hmx_queue *q, enum hmx_queue_signal sig) {
|
||||
return hmx_queue_push(q, hmx_queue_make_desc((hmx_queue_func) sig, NULL));
|
||||
}
|
||||
|
||||
static inline bool hmx_queue_empty(struct hmx_queue * q) {
|
||||
return q->idx_pop == q->idx_write;
|
||||
}
|
||||
|
||||
static inline uint32_t hmx_queue_depth(struct hmx_queue * q) {
|
||||
return (q->idx_read - q->idx_read) & q->idx_mask;
|
||||
}
|
||||
|
||||
static inline uint32_t hmx_queue_capacity(struct hmx_queue * q) {
|
||||
return q->capacity;
|
||||
}
|
||||
|
||||
static inline struct hmx_queue_desc hmx_queue_pop(struct hmx_queue * q) {
|
||||
unsigned int ip = q->idx_pop;
|
||||
unsigned int iw = q->idx_write;
|
||||
|
||||
struct hmx_queue_desc rd = { NULL, NULL };
|
||||
if (ip == iw) {
|
||||
return rd;
|
||||
}
|
||||
|
||||
// Wait for desc to complete
|
||||
struct hmx_queue_desc * d = &q->desc[ip];
|
||||
while (!atomic_load(&d->done)) {
|
||||
FARF(HIGH, "hmx-queue-pop: waiting for HMX queue : %u\n", ip);
|
||||
hex_pause();
|
||||
}
|
||||
|
||||
rd = *d;
|
||||
q->idx_pop = (ip + 1) & q->idx_mask;
|
||||
|
||||
FARF(HIGH, "hmx-queue-pop: ip %u func %p data %p\n", ip, rd.func, rd.data);
|
||||
return rd;
|
||||
}
|
||||
|
||||
static inline void hmx_queue_flush(struct hmx_queue * q) {
|
||||
while (hmx_queue_pop(q).func != NULL) ;
|
||||
}
|
||||
|
||||
static inline void hmx_queue_suspend(struct hmx_queue *q) {
|
||||
hmx_queue_signal(q, HMX_QUEUE_SUSPEND);
|
||||
hmx_queue_flush(q);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif /* HMX_QUEUE_H */
|
||||
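For orientation, here is a minimal producer-side sketch of how this queue is intended to be used. The work function and payload (hmx_do_tile, job) are hypothetical placeholders, and it is assumed that the capacity passed to hmx_queue_create is a power of two so that idx_mask can act as a ring mask:

    // Sketch only: hmx_do_tile and job are placeholders, not part of this header.
    static void hmx_do_tile(void * job);   // hypothetical HMX work item

    static void example_submit(struct hmx_queue * q, void * job) {
        struct hmx_queue_desc d = hmx_queue_make_desc(hmx_do_tile, job);
        while (!hmx_queue_push(q, d)) {
            // ring full: drain descriptors the worker has already completed, then retry
            hmx_queue_flush(q);
        }
    }

    // Before suspending or tearing down, callers signal the worker and drain:
    //   hmx_queue_suspend(q);   // SUSPEND + flush
    //   hmx_queue_delete(q);    // KILL + flush + join + free (see hmx-queue.c above)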
@@ -14,10 +14,6 @@
|
||||
|
||||
#define HMX_INLINE_ALWAYS inline __attribute__((unused, always_inline))
|
||||
|
||||
static HMX_INLINE_ALWAYS void hmx_set_output_scales(const void *scales) {
|
||||
asm volatile("bias = mxmem2(%0)" :: "r"(scales));
|
||||
}
|
||||
|
||||
// Initialise aligned 256-byte area with scale vector + zero padding.
|
||||
static HMX_INLINE_ALWAYS void hmx_init_column_scales(void *out_scales, HVX_Vector v_scale) {
|
||||
HVX_Vector *pv = (HVX_Vector *)out_scales;
|
||||
@@ -25,58 +21,6 @@ static HMX_INLINE_ALWAYS void hmx_init_column_scales(void *out_scales, HVX_Vecto
|
||||
*pv = Q6_V_vzero();
|
||||
}
|
||||

// Load multiple contiguous tiles with :deep streaming.
// Rt = total region size - 1; the hardware streams through [Rs, Rs + Rt].
// IMPORTANT: the tile region [Rs, Rs + Rt] must NOT cross a VTCM 4 MB bank
// boundary, otherwise the mxmem instruction will raise a precise bus error.
// Callers must ensure their VTCM layout satisfies this constraint.
static HMX_INLINE_ALWAYS void hmx_load_tiles_fp16(const __fp16 *row_tiles,
                                                  const __fp16 *col_tiles,
                                                  size_t n_tiles) {
    size_t limit = n_tiles * HMX_FP16_TILE_SIZE - 1;
    asm volatile(
        "{ activation.hf = mxmem(%0, %1):deep\n"
        "weight.hf = mxmem(%2, %3) }\n"
        :: "r"(row_tiles), "r"(limit), "r"(col_tiles), "r"(limit)
        : "memory");
}

// Load a single activation+weight tile pair (no :deep streaming).
// Rt defines the accessible region [Rs, Rs+Rt]. Following the reference formula
// (limit = n_tiles * HMX_FP16_TILE_SIZE - 1), for a single tile Rt = 2047.
// The original code used Rt=0x7FFF (32 KB region); when dynamic VTCM allocation
// places a tile near a 4 MB bank boundary, the oversized region crosses it and
// triggers a precise bus error (0x2601). Rt=2047 confines accesses to exactly
// one 2048-byte tile while covering all 16 HVX vectors (offsets 0..2047).
static HMX_INLINE_ALWAYS void hmx_load_tile_pair_fp16(const __fp16 *act_tile,
                                                      const __fp16 *wt_tile) {
    asm volatile(
        "{ activation.hf = mxmem(%0, %1)\n"
        "weight.hf = mxmem(%2, %3) }\n"
        :: "r"(act_tile), "r"(2047),
           "r"(wt_tile), "r"(2047)
        : "memory");
}

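Given the bank-boundary constraint called out above, a small helper of the kind callers could use to validate their VTCM layout; this is a sketch, not part of this change, and assumes the 4 MB bank size stated in the comment:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    // Returns true if [base, base + size) stays inside a single 4 MB VTCM bank,
    // i.e. an mxmem region with Rt = size - 1 will not cross a bank boundary.
    static inline bool vtcm_region_in_one_bank(const void * base, size_t size) {
        const uintptr_t bank_size = 4u * 1024u * 1024u;
        uintptr_t first = (uintptr_t) base;
        uintptr_t last  = first + size - 1;   // inclusive end, matching the Rt semantics above
        return (first / bank_size) == (last / bank_size);
    }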
static HMX_INLINE_ALWAYS void hmx_consume_accumulator_fp16(__fp16 *out) {
|
||||
// Use the combined convert-and-store instruction (matches the reference
|
||||
// Q6_mxmem_AR_after_hf intrinsic). The previous two-instruction sequence
|
||||
// "cvt.hf = acc(2); mxmem = cvt" used an undocumented Rs=2 parameter.
|
||||
asm volatile(
|
||||
"mxmem(%0, %1):after.hf = acc\n"
|
||||
:: "r"(out), "r"(0)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
// Compute inner product of two vectors of tiles and store result.
|
||||
static HMX_INLINE_ALWAYS void hmx_dot_fp16(__fp16 *out,
|
||||
const __fp16 *row_tiles,
|
||||
const __fp16 *col_tiles,
|
||||
size_t n_tiles) {
|
||||
hmx_load_tiles_fp16(row_tiles, col_tiles, n_tiles);
|
||||
hmx_consume_accumulator_fp16(out);
|
||||
}
|
||||
|
||||
// --- VTCM sequential allocator (from htp-ops-lib/include/dsp/vtcm_mgr.h) ---
|
||||
|
||||
static inline uint8_t *vtcm_seq_alloc(uint8_t **vtcm_ptr, size_t size) {
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#define HTP_CTX_H
|
||||
|
||||
#include "hex-dma.h"
|
||||
#include "hmx-queue.h"
|
||||
#include "htp-ops.h"
|
||||
#include "worker-pool.h"
|
||||
|
||||
@@ -30,6 +31,8 @@ struct htp_spad {
|
||||
uint32_t size_per_thread; // size per thread
|
||||
};
|
||||
|
||||
struct htp_context;
|
||||
|
||||
// Context while processing an Op
|
||||
// TODO: fold this into the main context
|
||||
struct htp_ops_context {
|
||||
@@ -72,6 +75,10 @@ struct htp_context {
|
||||
atomic_bool vtcm_needs_release;
|
||||
|
||||
struct htp_ops_context octx;
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
struct hmx_queue * hmx_queue; // Async HMX queue for pipeline overlap
|
||||
#endif
|
||||
};
|
||||
|
||||
int op_matmul(struct htp_ops_context * octx);
|
||||
|
||||
@@ -91,7 +91,14 @@ enum htp_op_code {
|
||||
#define HTP_OP_MAX_BUFS 8
|
||||
#define HTP_OP_MAX_REQS 256
|
||||
#define HTP_OP_MAX_TENSORS (HTP_OP_MAX_REQS * HTP_OP_MAX_INPUTS + HTP_OP_MAX_REQS)
|
||||
|
||||
#if __HVX_ARCH__ < 75
|
||||
#define HTP_OP_MAX_VMEM (3167538380u)
|
||||
#else
|
||||
#define HTP_OP_MAX_VMEM (3221225472u)
|
||||
#endif
|
||||
|
||||
#define HTP_MMAP_MAX_VMEM (2147483648u)
|
||||
|
||||
enum htp_tensor_flags {
|
||||
HTP_TENSOR_COMPUTE = (1U << 0), // Tensor buffer temporal compute data (not weights)
|
||||
|
||||
@@ -116,9 +116,14 @@ static inline HVX_VectorPred hvx_vec_is_nan_f16(HVX_Vector v) {
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_f32_to_f16_shuff(HVX_Vector v0, HVX_Vector v1) {
|
||||
#if __HVX_ARCH__ >= 81
|
||||
HVX_Vector q0 = Q6_Vqf32_equals_Vsf(v0);
|
||||
HVX_Vector q1 = Q6_Vqf32_equals_Vsf(v1);
|
||||
#else
|
||||
const HVX_Vector zero = Q6_V_vzero();
|
||||
HVX_Vector q0 = Q6_Vqf32_vadd_VsfVsf(v0, zero);
|
||||
HVX_Vector q1 = Q6_Vqf32_vadd_VsfVsf(v1, zero);
|
||||
#endif
|
||||
return Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(q1, q0));
|
||||
}
|
||||
|
||||
|
||||
@@ -18,8 +18,9 @@
|
||||
#include <remote.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "hex-dma.h"
|
||||
#include "hex-utils.h"
|
||||
#include "hex-dma.h"
|
||||
#include "hmx-queue.h"
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
@@ -117,7 +118,11 @@ AEEResult htp_iface_close(remote_handle64 handle) {
|
||||
// release the mmaps (if any)
|
||||
for (uint32_t i=0; i<HTP_MAX_MMAPS; i++) {
|
||||
if (ctx->mmap[i].size) {
|
||||
#if __HVX_ARCH__ > 73
|
||||
HAP_munmap2((void *) ctx->mmap[i].base, ctx->mmap[i].size);
|
||||
#else
|
||||
HAP_munmap((void *) ctx->mmap[i].base, ctx->mmap[i].size);
|
||||
#endif
|
||||
ctx->mmap[i].size = 0;
|
||||
ctx->mmap[i].base = NULL;
|
||||
ctx->mmap[i].fd = -1;
|
||||
@@ -172,8 +177,16 @@ AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t
|
||||
struct htp_mmap *m = &ctx->mmap[i];
|
||||
if (!m->size) {
|
||||
FARF(HIGH, "mmap : fd %u size %u pinned %u", fd, size, pinned);
|
||||
|
||||
#if __HVX_ARCH__ > 73
|
||||
void *va = HAP_mmap2(NULL, size, HAP_PROT_READ | HAP_PROT_WRITE, 0, fd, 0);
|
||||
#else
|
||||
if (size > HTP_MMAP_MAX_VMEM) { // HAP_mmap has a size limit of 2GB
|
||||
FARF(ERROR, "mmap failed : size %u exceeds 2GB limit for HAP_mmap", (uint32_t) size);
|
||||
abort(); // can't do much else at this point
|
||||
}
|
||||
|
||||
void *va = HAP_mmap(NULL, size, HAP_PROT_READ | HAP_PROT_WRITE, 0, fd, 0);
|
||||
#endif
|
||||
if (va == (void*)-1) {
|
||||
FARF(ERROR, "mmap failed : va %p fd %u size %u", va, fd, (uint32_t) size);
|
||||
return AEE_EFAILED;
|
||||
@@ -201,7 +214,11 @@ AEEResult htp_iface_munmap(remote_handle64 handle, int fd) {
|
||||
struct htp_mmap *m = &ctx->mmap[i];
|
||||
if (fd < 0 || m->fd == fd) {
|
||||
FARF(HIGH, "unmmap : base %p fd %u size %u", (void*) m->base, m->fd, (uint32_t) m->size);
|
||||
#if __HVX_ARCH__ > 73
|
||||
HAP_munmap2((void *) m->base, m->size);
|
||||
#else
|
||||
HAP_munmap((void *) m->base, m->size);
|
||||
#endif
|
||||
m->size = 0;
|
||||
m->base = NULL;
|
||||
m->fd = -1;
|
||||
@@ -324,6 +341,14 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
ctx->hmx_enabled = use_hmx;
|
||||
ctx->hmx_queue = NULL;
|
||||
if (use_hmx) {
|
||||
ctx->hmx_queue = hmx_queue_create(16, ctx->vtcm_rctx);
|
||||
if (!ctx->hmx_queue) {
|
||||
FARF(ERROR, "hmx-queue-create failed");
|
||||
ctx->hmx_enabled = false;
|
||||
}
|
||||
}
|
||||
FARF(HIGH, "HMX %s (use_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", use_hmx);
|
||||
#endif
|
||||
|
||||
@@ -389,7 +414,11 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
|
||||
}
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
ctx->hmx_enabled = 0;
|
||||
if (ctx->hmx_queue) {
|
||||
hmx_queue_delete(ctx->hmx_queue);
|
||||
ctx->hmx_queue = NULL;
|
||||
}
|
||||
ctx->hmx_enabled = false;
|
||||
#endif
|
||||
|
||||
vtcm_free(ctx);
|
||||
@@ -513,7 +542,11 @@ static inline bool reuse_buf(struct htp_context *ctx, uint32_t *m_reuse, struct
|
||||
static inline void drop_mmap(struct htp_context *ctx, struct htp_mmap *m) {
|
||||
if (m->size && !m->pinned) {
|
||||
FARF(HIGH, "unmap : fd %u base %p size %u pinned %u", m->fd, (void*) m->base, (uint32_t) m->size, m->pinned);
|
||||
#if __HVX_ARCH__ > 73
|
||||
HAP_munmap2((void *) m->base, m->size);
|
||||
#else
|
||||
HAP_munmap((void *) m->base, m->size);
|
||||
#endif
|
||||
m->size = 0;
|
||||
m->base = 0;
|
||||
m->fd = -1;
|
||||
@@ -527,7 +560,16 @@ static inline void mmap_buf(struct htp_context *ctx, struct htp_buf_desc *b) {
|
||||
for (uint32_t i=0; i < HTP_MAX_MMAPS; i++) {
|
||||
struct htp_mmap *m = &ctx->mmap[i];
|
||||
if (!m->size) {
|
||||
#if __HVX_ARCH__ > 73
|
||||
void *va = HAP_mmap2(NULL, b->size, HAP_PROT_READ | HAP_PROT_WRITE, 0, b->fd, 0);
|
||||
#else
|
||||
if (b->size > HTP_MMAP_MAX_VMEM) { // HAP_mmap has a size limit of 2GB
|
||||
FARF(ERROR, "mmap failed : size %u exceeds 2GB limit for HAP_mmap", (uint32_t) b->size);
|
||||
abort(); // can't do much else at this point
|
||||
}
|
||||
|
||||
void *va = HAP_mmap(NULL, b->size, HAP_PROT_READ | HAP_PROT_WRITE, 0, b->fd, 0);
|
||||
#endif
|
||||
if (va == (void*)-1) {
|
||||
FARF(ERROR, "mmap failed : va %p fd %u size %u", va, b->fd, (uint32_t) b->size);
|
||||
abort(); // can't do much else at this point
|
||||
|
||||
@@ -30,6 +30,8 @@ extern "C" {
|
||||
|
||||
void ggml_print_backtrace(void);
|
||||
|
||||
uint64_t ggml_graph_next_uid(void);
|
||||
|
||||
#ifndef MIN
|
||||
# define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#endif
|
||||
@@ -338,6 +340,10 @@ struct ggml_cgraph {
|
||||
struct ggml_hash_set visited_hash_set;
|
||||
|
||||
enum ggml_cgraph_eval_order order;
|
||||
|
||||
// an optional identifier used to recognize identical graphs: two graphs are considered the same if both uids are non-zero and equal
// a value of 0 means it is not set and should be ignored
|
||||
uint64_t uid;
|
||||
};
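ggml_graph_next_uid() itself is not shown in this hunk; a minimal sketch of what such a generator could look like, assuming it is simply a process-wide atomic counter that never returns the reserved value 0:

    #include <stdatomic.h>
    #include <stdint.h>

    // Hypothetical implementation sketch (the real definition lives elsewhere in ggml).
    uint64_t ggml_graph_next_uid(void) {
        static atomic_uint_fast64_t next = 1;            // 0 stays reserved for "unset"
        return (uint64_t) atomic_fetch_add(&next, 1);
    }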
|
||||
|
||||
// returns a slice of cgraph with nodes [i0, i1)
|
||||
|
||||
@@ -250,6 +250,7 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_unary(ggml_metal
|
||||
case GGML_UNARY_OP_CEIL: op_num = OP_UNARY_NUM_CEIL; break;
|
||||
case GGML_UNARY_OP_ROUND: op_num = OP_UNARY_NUM_ROUND; break;
|
||||
case GGML_UNARY_OP_TRUNC: op_num = OP_UNARY_NUM_TRUNC; break;
|
||||
case GGML_UNARY_OP_XIELU: op_num = OP_UNARY_NUM_XIELU; break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
} break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
@@ -1818,6 +1819,23 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_upscale(ggml_met
|
||||
return res;
|
||||
}
|
||||
|
||||
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_roll(ggml_metal_library_t lib, const ggml_tensor * op) {
|
||||
assert(op->op == GGML_OP_ROLL);
|
||||
|
||||
char base[256];
|
||||
char name[256];
|
||||
|
||||
snprintf(base, 256, "kernel_roll_%s", ggml_type_name(op->src[0]->type));
|
||||
snprintf(name, 256, "%s", base);
|
||||
|
||||
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
|
||||
if (!res.pipeline) {
|
||||
res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad(ggml_metal_library_t lib, const ggml_tensor * op) {
|
||||
assert(op->op == GGML_OP_PAD);
|
||||
|
||||
|
||||
@@ -152,6 +152,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_3d
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_upscale (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad_reflect_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_roll (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_arange (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_timestep_embedding(ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_adamw (ggml_metal_library_t lib, const struct ggml_tensor * op);
|
||||
|
||||
@@ -1043,6 +1043,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
case GGML_UNARY_OP_CEIL:
|
||||
case GGML_UNARY_OP_ROUND:
|
||||
case GGML_UNARY_OP_TRUNC:
|
||||
case GGML_UNARY_OP_XIELU:
|
||||
return ggml_is_contiguous_rows(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
|
||||
default:
|
||||
return false;
|
||||
@@ -1137,6 +1138,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_TOP_K:
|
||||
case GGML_OP_ARANGE:
|
||||
case GGML_OP_ROLL:
|
||||
return true;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
// for new head sizes, add checks here
|
||||
@@ -1159,6 +1161,23 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
if (op->src[1]->type != op->src[2]->type) {
|
||||
return false;
|
||||
}
|
||||
switch (op->src[1]->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
break;
|
||||
case GGML_TYPE_BF16:
|
||||
if (!has_bfloat) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
return has_simdgroup_mm; // TODO: over-restricted for vec-kernels
|
||||
case GGML_OP_SSM_CONV:
|
||||
case GGML_OP_SSM_SCAN:
|
||||
|
||||
@@ -127,6 +127,7 @@
|
||||
#define OP_UNARY_NUM_CEIL 118
|
||||
#define OP_UNARY_NUM_ROUND 119
|
||||
#define OP_UNARY_NUM_TRUNC 120
|
||||
#define OP_UNARY_NUM_XIELU 121
|
||||
|
||||
#define OP_SUM_ROWS_NUM_SUM_ROWS 10
|
||||
#define OP_SUM_ROWS_NUM_MEAN 11
|
||||
@@ -1016,6 +1017,29 @@ typedef struct {
|
||||
int32_t p1;
|
||||
} ggml_metal_kargs_pad_reflect_1d;
|
||||
|
||||
typedef struct {
|
||||
int64_t ne00;
|
||||
int64_t ne01;
|
||||
int64_t ne02;
|
||||
int64_t ne03;
|
||||
uint64_t nb00;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
uint64_t nb03;
|
||||
int64_t ne0;
|
||||
int64_t ne1;
|
||||
int64_t ne2;
|
||||
int64_t ne3;
|
||||
uint64_t nb0;
|
||||
uint64_t nb1;
|
||||
uint64_t nb2;
|
||||
uint64_t nb3;
|
||||
int32_t s0;
|
||||
int32_t s1;
|
||||
int32_t s2;
|
||||
int32_t s3;
|
||||
} ggml_metal_kargs_roll;
|
||||
|
||||
typedef struct {
|
||||
uint64_t nb1;
|
||||
int dim;
|
||||
|
||||
@@ -410,6 +410,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
|
||||
{
|
||||
n_fuse = ggml_metal_op_pad_reflect_1d(ctx, idx);
|
||||
} break;
|
||||
case GGML_OP_ROLL:
|
||||
{
|
||||
n_fuse = ggml_metal_op_roll(ctx, idx);
|
||||
} break;
|
||||
case GGML_OP_ARANGE:
|
||||
{
|
||||
n_fuse = ggml_metal_op_arange(ctx, idx);
|
||||
@@ -787,6 +791,13 @@ int ggml_metal_op_unary(ggml_metal_op_t ctx, int idx) {
|
||||
args.max = ggml_get_op_params_f32(op, 1);
|
||||
}
|
||||
|
||||
if (op->op == GGML_OP_UNARY && ggml_get_unary_op(op) == GGML_UNARY_OP_XIELU) {
|
||||
args.slope = ggml_get_op_params_f32(op, 1); // alpha_n
|
||||
args.scale = ggml_get_op_params_f32(op, 2); // alpha_p
|
||||
args.bias = ggml_get_op_params_f32(op, 3); // beta
|
||||
args.val = ggml_get_op_params_f32(op, 4); // eps
|
||||
}
|
||||
|
||||
auto pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
|
||||
|
||||
if (pipeline.c4) {
|
||||
@@ -3938,6 +3949,59 @@ int ggml_metal_op_upscale(ggml_metal_op_t ctx, int idx) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
int ggml_metal_op_roll(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_tensor * op = ctx->node(idx);
|
||||
|
||||
ggml_metal_library_t lib = ctx->lib;
|
||||
ggml_metal_encoder_t enc = ctx->enc;
|
||||
|
||||
GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
|
||||
GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
|
||||
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
|
||||
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
|
||||
|
||||
const int32_t s0 = ggml_get_op_params_i32(op, 0);
|
||||
const int32_t s1 = ggml_get_op_params_i32(op, 1);
|
||||
const int32_t s2 = ggml_get_op_params_i32(op, 2);
|
||||
const int32_t s3 = ggml_get_op_params_i32(op, 3);
|
||||
|
||||
ggml_metal_kargs_roll args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.ne03 =*/ ne03,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.ne2 =*/ ne2,
|
||||
/*.ne3 =*/ ne3,
|
||||
/*.nb0 =*/ nb0,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
/*.nb3 =*/ nb3,
|
||||
/*.s0 =*/ s0,
|
||||
/*.s1 =*/ s1,
|
||||
/*.s2 =*/ s2,
|
||||
/*.s3 =*/ s3
|
||||
};
|
||||
|
||||
auto pipeline = ggml_metal_library_get_pipeline_roll(lib, op);
|
||||
|
||||
const int nth = std::min(1024, ne0);
|
||||
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
||||
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
int ggml_metal_op_pad(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_tensor * op = ctx->node(idx);
|
||||
|
||||
|
||||
@@ -81,6 +81,7 @@ int ggml_metal_op_conv_transpose_2d (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_upscale (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_pad (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_pad_reflect_1d (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_roll (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_arange (ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_timestep_embedding(ggml_metal_op_t ctx, int idx);
|
||||
int ggml_metal_op_argmax (ggml_metal_op_t ctx, int idx);
|
||||
|
||||
@@ -1177,6 +1177,15 @@ kernel void kernel_unary_impl(
|
||||
if (FC_OP == OP_UNARY_NUM_TRUNC) {
|
||||
dst_ptr[i0] = (T) trunc(x);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_XIELU) {
|
||||
const TC xi = x;
|
||||
const TC gate = TC(xi > TC(0.0f));
|
||||
const TC clamped = fmin(xi, TC(args.val));
|
||||
const TC y_pos = TC(args.scale) * xi * xi + TC(args.bias) * xi;
|
||||
const TC y_neg = (exp(clamped) - TC(1.0f) - xi) * TC(args.slope) + TC(args.bias) * xi;
|
||||
dst_ptr[i0] = (T) (gate * y_pos + (TC(1.0f) - gate) * y_neg);
|
||||
}
|
||||
}
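For reference, the XIELU branch above computes the following per element. A scalar sketch in C, assuming the parameter mapping noted earlier in this diff (slope = alpha_n, scale = alpha_p, bias = beta, val = eps):

    #include <math.h>

    // Scalar reference for the fused XIELU path above.
    static float xielu_ref(float x, float alpha_n, float alpha_p, float beta, float eps) {
        if (x > 0.0f) {
            return alpha_p * x * x + beta * x;                        // y_pos
        }
        const float clamped = fminf(x, eps);
        return (expf(clamped) - 1.0f - x) * alpha_n + beta * x;       // y_neg
    }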
|
||||
|
||||
#undef FC_OP
|
||||
@@ -5238,6 +5247,40 @@ kernel void kernel_upscale_bicubic_f32(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_roll_f32(
|
||||
constant ggml_metal_kargs_roll & args,
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
const int64_t i3 = tgpig.z;
|
||||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i1 = tgpig.x;
|
||||
|
||||
device const float * src0_ptr = (device const float *) src0;
|
||||
device float * dst_ptr = (device float *) dst;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
// apply shifts and wrap around
|
||||
int64_t i00 = i0 - args.s0;
|
||||
int64_t i01 = i1 - args.s1;
|
||||
int64_t i02 = i2 - args.s2;
|
||||
int64_t i03 = i3 - args.s3;
|
||||
|
||||
if (i00 < 0) { i00 += args.ne00; } else if (i00 >= args.ne00) { i00 -= args.ne00; }
|
||||
if (i01 < 0) { i01 += args.ne01; } else if (i01 >= args.ne01) { i01 -= args.ne01; }
|
||||
if (i02 < 0) { i02 += args.ne02; } else if (i02 >= args.ne02) { i02 -= args.ne02; }
|
||||
if (i03 < 0) { i03 += args.ne03; } else if (i03 >= args.ne03) { i03 -= args.ne03; }
|
||||
|
||||
int64_t src_idx = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00 + i00;
|
||||
int64_t dst_idx = i3 *args.ne2 *args.ne1 *args.ne0 + i2 *args.ne1 *args.ne0 + i1 *args.ne0 + i0;
|
||||
|
||||
dst_ptr[dst_idx] = src0_ptr[src_idx];
|
||||
}
|
||||
}
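The wrap logic in this kernel reduces, per dimension, to a shift with a single wrap-around correction. A host-side scalar sketch, assuming the shift magnitude does not exceed the dimension size (which is what the single correction step above relies on):

    #include <stdint.h>

    // Scalar equivalent of the per-dimension index mapping in kernel_roll_f32.
    static inline int64_t roll_src_index(int64_t i, int64_t shift, int64_t n) {
        int64_t j = i - shift;
        if (j < 0)       { j += n; }
        else if (j >= n) { j -= n; }
        return j;
    }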
|
||||
|
||||
kernel void kernel_pad_f32(
|
||||
constant ggml_metal_kargs_pad & args,
|
||||
device const char * src0,
|
||||
|
||||
@@ -121,6 +121,8 @@ set(GGML_OPENCL_KERNELS
|
||||
gemm_noshuffle_q4_k_f32
|
||||
gemv_noshuffle_q6_k_f32
|
||||
gemm_noshuffle_q6_k_f32
|
||||
gemv_noshuffle_q5_k_f32
|
||||
gemm_noshuffle_q5_k_f32
|
||||
mul
|
||||
neg
|
||||
norm
|
||||
|
||||
@@ -542,6 +542,8 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_restore_block_q4_K_noshuffle;
|
||||
cl_kernel kernel_convert_block_q4_K, kernel_restore_block_q4_K;
|
||||
cl_kernel kernel_convert_block_q5_K, kernel_restore_block_q5_K;
|
||||
cl_kernel kernel_convert_block_q5_K_noshuffle;
|
||||
cl_kernel kernel_restore_block_q5_K_noshuffle;
|
||||
cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K;
|
||||
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
|
||||
cl_kernel kernel_mul_mv_q4_1_f32;
|
||||
@@ -730,6 +732,8 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_gemm_noshuffle_q4_k_f32;
|
||||
cl_kernel kernel_gemv_noshuffle_q6_K_f32;
|
||||
cl_kernel kernel_gemm_noshuffle_q6_K_f32;
|
||||
cl_kernel kernel_gemv_noshuffle_q5_k_f32;
|
||||
cl_kernel kernel_gemm_noshuffle_q5_k_f32;
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
void free() {
|
||||
@@ -944,6 +948,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q4_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_K_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_K", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q5_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_K", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_K_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q5_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_K_noshuffle", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q6_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K_noshuffle", &err), err));
|
||||
@@ -2794,6 +2800,45 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q6_K_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q6_K_f32", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemv_noshuffle_q5_k_f32
|
||||
{
|
||||
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
" -cl-mad-enable ";
|
||||
if (backend_ctx->has_vector_subgroup_broadcast) {
|
||||
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST ";
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemv_noshuffle_q5_k_f32.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemv_noshuffle_q5_k_f32.cl");
|
||||
#endif
|
||||
|
||||
cl_program prog = build_program_from_source(
|
||||
backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q5_k_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q5_k_f32", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemm_noshuffle_q5_k_f32
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemm_noshuffle_q5_k_f32.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemm_noshuffle_q5_k_f32.cl");
|
||||
#endif
|
||||
cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q5_k_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q5_k_f32", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
GGML_LOG_CONT("\n");
|
||||
}
|
||||
@@ -5071,115 +5116,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
GGML_ASSERT(tensor->ne[2] == 1);
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
|
||||
// Transpose weights
|
||||
size_t q_size_bytes = K * M / 4 * sizeof(float);
|
||||
cl_buffer_region region;
|
||||
region.origin = 0;
|
||||
region.size = q_size_bytes;
|
||||
cl_mem qT_d = clCreateSubBuffer(
|
||||
backend_ctx->prealloc_quant_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_mem q_d_image1D;
|
||||
cl_mem qT_d_image1D;
|
||||
|
||||
cl_image_format img_fmt_1d;
|
||||
cl_image_desc img_desc_1d;
|
||||
|
||||
img_fmt_1d = { CL_RGBA, CL_FLOAT };
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 4 / 4;
|
||||
img_desc_1d.buffer = extra->q;
|
||||
q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
img_fmt_1d = { CL_RGBA, CL_FLOAT };
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 4 / 4;
|
||||
img_desc_1d.buffer = qT_d;
|
||||
qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
int height_q = M / 4;
|
||||
int width_q = K / 4 / 4;
|
||||
kernel = backend_ctx->kernel_transpose_32;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_q));
|
||||
|
||||
size_t local_size_q[3] = {4, 16, 1};
|
||||
size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1};
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
|
||||
// Transpose scales
|
||||
size_t d_size_bytes = M * (K / 32) * 2;
|
||||
region.origin = 0;
|
||||
region.size = d_size_bytes;
|
||||
cl_mem dT_d = clCreateSubBuffer(
|
||||
backend_ctx->prealloc_scales_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_mem d_d_image1D;
|
||||
cl_mem dT_d_image1D;
|
||||
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_fmt_1d = { CL_R, CL_HALF_FLOAT };
|
||||
img_desc_1d.image_width = M * K / 32;
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.buffer = extra->d;
|
||||
d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 32 / 4;
|
||||
img_desc_1d.buffer = dT_d;
|
||||
dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
int height_s = M / 4;
|
||||
int width_s = K / 32;
|
||||
|
||||
kernel = backend_ctx->kernel_transpose_16_4x1;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s));
|
||||
|
||||
size_t local_size_s[3] = {4, 16, 1};
|
||||
size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1};
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
|
||||
// copy transposed buffer contents to original buffers
|
||||
CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
|
||||
CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
|
||||
CL_CHECK(clReleaseMemObject(qT_d));
|
||||
CL_CHECK(clReleaseMemObject(dT_d));
|
||||
|
||||
CL_CHECK(clReleaseMemObject(q_d_image1D));
|
||||
CL_CHECK(clReleaseMemObject(d_d_image1D));
|
||||
CL_CHECK(clReleaseMemObject(qT_d_image1D));
|
||||
CL_CHECK(clReleaseMemObject(dT_d_image1D));
|
||||
transpose_2d_as_32b(backend_ctx, extra->q, extra->q, size_q, K/4, M);
|
||||
transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M);
|
||||
} // end transpose
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
@@ -5354,7 +5292,17 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
CL_CHECK((extra->qh = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
CL_CHECK(err);
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q5_K;
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
kernel = backend_ctx->kernel_convert_block_q5_K_noshuffle;
|
||||
}
|
||||
#else
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q5_K;
|
||||
#endif
|
||||
|
||||
cl_uchar mask_0F = 0x0F;
|
||||
cl_uchar mask_F0 = 0xF0;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
|
||||
@@ -5362,6 +5310,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extra->dm));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_0F));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_F0));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
@@ -5378,6 +5328,21 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
extra->size_dm = size_dm;
|
||||
|
||||
tensor->extra = extra;
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
|
||||
int M = tensor->ne[1];
|
||||
int K = tensor->ne[0];
|
||||
|
||||
GGML_ASSERT(K % 32 == 0);
|
||||
|
||||
// Transpose q, d, dm as ushort, qh as uchar
|
||||
transpose_2d_as_16b(backend_ctx, extra->q, extra->q, size_q, K/4, M);
|
||||
transpose_2d_as_8b (backend_ctx, extra->qh, extra->qh, size_qh, K/8, M);
|
||||
transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/256, M);
|
||||
transpose_2d_as_16b(backend_ctx, extra->dm, extra->dm, size_dm, K/256, M);
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
return;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_Q6_K) {
|
||||
@@ -5894,6 +5859,57 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_nbytes(tensor), NULL, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
cl_uchar mask_0F = 0x0F;
|
||||
cl_uchar mask_F0 = 0xF0;
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_kernels(backend_ctx, tensor)) {
|
||||
int M = tensor->ne[1];
|
||||
int K = tensor->ne[0];
|
||||
|
||||
size_t size_q = extra->size_q;
|
||||
size_t size_qh = extra->size_qh;
|
||||
size_t size_d = extra->size_d;
|
||||
size_t size_dm = extra->size_dm;
|
||||
|
||||
static ggml_cl_buffer buf_trans_q;
|
||||
static ggml_cl_buffer buf_trans_qh;
|
||||
static ggml_cl_buffer buf_trans_d;
|
||||
static ggml_cl_buffer buf_trans_dm;
|
||||
|
||||
buf_trans_q.allocate(backend_ctx->context, size_q);
|
||||
buf_trans_qh.allocate(backend_ctx->context, size_qh);
|
||||
buf_trans_d.allocate(backend_ctx->context, size_d);
|
||||
buf_trans_dm.allocate(backend_ctx->context, size_dm);
|
||||
|
||||
// Reverse transpose q, qh, d, dm
|
||||
transpose_2d_as_16b(backend_ctx, extra->q, buf_trans_q.buffer, size_q, M, K/4);
|
||||
transpose_2d_as_8b (backend_ctx, extra->qh, buf_trans_qh.buffer, size_qh, M, K/8);
|
||||
transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/256);
|
||||
transpose_2d_as_16b(backend_ctx, extra->dm, buf_trans_dm.buffer, size_dm, M, K/256);
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_q5_K_noshuffle;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_qh.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_trans_d.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_trans_dm.buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_0F));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_F0));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {1, 1, 1};
|
||||
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
||||
global_work_size, local_work_size, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, data_device, CL_TRUE, offset,
|
||||
size, data, 0, NULL, NULL));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_restore_block_q5_K;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh));
|
||||
@@ -5901,6 +5917,8 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->dm));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_0F));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_F0));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
||||
size_t local_work_size[] = {1, 1, 1};
|
||||
@@ -9831,19 +9849,18 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
const enum ggml_type src0t = src0->type;
|
||||
const enum ggml_type src1t = src1->type;
|
||||
|
||||
GGML_ASSERT(src0t == GGML_TYPE_Q8_0);
|
||||
GGML_ASSERT(src1t == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_Q8_0);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
|
||||
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
GGML_ASSERT(src1->view_offs == 0);
|
||||
GGML_ASSERT(dst->view_offs == 0);
|
||||
|
||||
@@ -9864,148 +9881,112 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
|
||||
cl_context context = backend_ctx->context;
|
||||
cl_kernel kernel;
|
||||
|
||||
// init CL objects
|
||||
cl_int status;
|
||||
cl_image_format img_fmt_1d;
|
||||
cl_image_desc img_desc_1d;
|
||||
cl_int err;
|
||||
cl_image_format img_fmt;
|
||||
cl_image_desc img_desc;
|
||||
cl_buffer_region region;
|
||||
cl_mem A_image1d;
|
||||
cl_mem B_image1d;
|
||||
cl_mem B_sub_buffer;
|
||||
cl_mem S_image1d;
|
||||
// for B transpose
|
||||
cl_mem B_image1d_trans = nullptr;
|
||||
cl_mem B_d = nullptr;
|
||||
|
||||
cl_mem D_image1d;
|
||||
cl_mem D_sub_buffer;
|
||||
|
||||
int M = ne01;
|
||||
int N = ne1;
|
||||
int K = ne00;
|
||||
|
||||
// create an image for A
|
||||
img_fmt_1d = { CL_R, CL_FLOAT};
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 4; // Divide by 4 for char -> float
|
||||
img_desc_1d.buffer = extra0_q8_0->q;
|
||||
A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
if (ne1 == 1) {
|
||||
cl_mem q_img = nullptr;
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
|
||||
// create an image for Scale
|
||||
img_fmt_1d = { CL_R, CL_HALF_FLOAT};
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * K / 32; // Block size is 32
|
||||
img_desc_1d.buffer = extra0_q8_0->d;
|
||||
S_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
// image for q
|
||||
img_fmt = { CL_R, CL_UNSIGNED_INT32};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = M * K / 4;
|
||||
img_desc.buffer = extra0_q8_0->q;
|
||||
CL_CHECK((q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// create a sub_buffer for B
|
||||
region.origin = (extra1->offset); // + src1->view_offs);
|
||||
region.size = K * N * sizeof(float);
|
||||
B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
// create a sub_buffer for B
|
||||
region.origin = offset1;
|
||||
region.size = K * N * sizeof(float);
|
||||
CL_CHECK((b_sub_buf = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// create an image for B from sub_buffer: RGBA (OCL)
|
||||
img_fmt_1d = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = K * N / 4;
|
||||
img_desc_1d.buffer = B_sub_buffer;
|
||||
B_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
// image for activations
|
||||
img_fmt = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * N / 4;
|
||||
img_desc.buffer = b_sub_buf;
|
||||
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// Create subbuffer and image1d_buffer for dst
|
||||
region.origin = (extrad->offset); // + dst->view_offs;
|
||||
region.size = M * N * sizeof(float);
|
||||
D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
img_fmt_1d = {CL_R, CL_FLOAT};
|
||||
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
||||
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc_1d.image_width = M * N;
|
||||
img_desc_1d.buffer = D_sub_buffer;
|
||||
D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
size_t local_work_size[3] = {1, 1, 1};
|
||||
size_t global_work_size[3] = {1, 1, 1};
|
||||
|
||||
if (N == 1) {
|
||||
kernel = backend_ctx->CL_mul_mat_vec_q8_0_f32;
|
||||
|
||||
int r2 = 1;
|
||||
int r3 = 1;
|
||||
cl_uint k_arg = 0;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q8_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extra1->offset));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extrad->offset));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &b_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &extra1->offset));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &extrad->offset));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
|
||||
|
||||
size_t wavesize = backend_ctx->adreno_wave_size;
|
||||
local_work_size[0] = wavesize;
|
||||
local_work_size[1] = 4; // reduce factor
|
||||
local_work_size[2] = 1;
|
||||
size_t local_work_size[] = { wavesize, 4, 1 };
|
||||
size_t global_work_size[] = { CEIL_DIV(M, wavesize)*wavesize, 4, 1 };
|
||||
|
||||
global_work_size[0] = ((M + wavesize - 1) / wavesize) * wavesize;
|
||||
global_work_size[1] = 4; // reduce factor
|
||||
global_work_size[2] = 1;
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
CL_CHECK(clReleaseMemObject(q_img));
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
} else {
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
int padding;
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem b_sub_buf_trans = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
cl_mem b_img_trans = nullptr;
|
||||
|
||||
//how many extra elements beyond multiple of 8
|
||||
// subbuffer for activations
|
||||
region.origin = offset1;
|
||||
region.size = K * N * sizeof(float);
|
||||
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for activations
|
||||
img_fmt = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * N / 4;
|
||||
img_desc.buffer = b_sub_buf;
|
||||
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// pad N to multiple of 8
|
||||
int extra_elements = N % 8;
|
||||
|
||||
//how much padding to add
|
||||
padding = 0;
|
||||
int padding = 0;
|
||||
if (extra_elements > 0){
|
||||
padding = 8 - extra_elements;
|
||||
}
|
||||
|
||||
// Specify the starting offset (in bytes)
|
||||
// subbuffer for transposed activations
|
||||
region.origin = 0;
|
||||
// Specify the size of the sub-buffer (divide by 2 for FP16)
|
||||
region.size = K * (N + padding) * sizeof(float)/2;
|
||||
backend_ctx->prealloc_act_trans.allocate(context, region.size);
|
||||
B_d = clCreateSubBuffer(
|
||||
backend_ctx->prealloc_act_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&status);
|
||||
CL_CHECK(status);
|
||||
CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; //(CL_HALF_FLOAT for FP16)
|
||||
cl_image_desc image_desc_B_d_output = {
|
||||
CL_MEM_OBJECT_IMAGE1D_BUFFER,
|
||||
static_cast<size_t>(K * (N + padding)/4),
|
||||
0, 0, 0, 0, 0, 0, 0, { B_d }
|
||||
};
|
||||
B_image1d_trans = clCreateImage(
|
||||
context,
|
||||
0,
|
||||
&image_format_B_d_output,
|
||||
&image_desc_B_d_output,
|
||||
NULL,
|
||||
&status);
|
||||
CL_CHECK(status);
|
||||
// image for transposed activations
|
||||
img_fmt = {CL_RGBA, CL_HALF_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * (N + padding) / 4;
|
||||
img_desc.buffer = b_sub_buf_trans;
|
||||
CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// transpose activations
|
||||
int height_B = N/4;
|
||||
if (height_B == 0) {
|
||||
height_B = 1;
|
||||
@@ -10014,58 +9995,39 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
|
||||
int padded_height_B = (N + padding)/4;
|
||||
|
||||
kernel = backend_ctx->kernel_transpose_32_16;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_image1d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
|
||||
|
||||
size_t local_size_t[2] = { 1, 16 };
|
||||
size_t global_size_t[2] = {
|
||||
static_cast<size_t>(width_B),
|
||||
static_cast<size_t>(padded_height_B)
|
||||
};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
|
||||
size_t local_work_size_t[2] = { 1, 16 };
|
||||
size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B };
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst);
|
||||
|
||||
// gemm
|
||||
kernel = backend_ctx->kernel_mul_mm_q8_0_f32_8x4;
|
||||
|
||||
int N_with_padding = N + padding;
|
||||
int padded_N = N + padding;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &B_image1d_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &b_img_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &K));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &M));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &N_with_padding));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &padded_N));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &N));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd));
|
||||
|
||||
global_work_size[0] = (size_t)(N + 7) / 8;
|
||||
global_work_size[1] = (size_t)(M + 3) / 4;
|
||||
global_work_size[2] = 1;
|
||||
size_t global_work_size[] = { (size_t)CEIL_DIV(N, 8), (size_t)CEIL_DIV(M, 4), 1 };
|
||||
size_t local_work_size[] = { 2, 128, 1 };
|
||||
|
||||
local_work_size[0] = 2;
|
||||
local_work_size[1] = 128;
|
||||
local_work_size[2] = 1;
|
||||
}
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
// enqueue kernel with profiling
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
// deallocate sub buffers and images
|
||||
CL_CHECK(clReleaseMemObject(A_image1d));
|
||||
CL_CHECK(clReleaseMemObject(B_sub_buffer));
|
||||
CL_CHECK(clReleaseMemObject(B_image1d));
|
||||
CL_CHECK(clReleaseMemObject(S_image1d));
|
||||
CL_CHECK(clReleaseMemObject(D_sub_buffer));
|
||||
CL_CHECK(clReleaseMemObject(D_image1d));
|
||||
if (B_image1d_trans) {
|
||||
CL_CHECK(clReleaseMemObject(B_image1d_trans));
|
||||
}
|
||||
if (B_d) {
|
||||
CL_CHECK(clReleaseMemObject(B_d));
|
||||
CL_CHECK(clReleaseMemObject(b_img_trans));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf_trans));
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(backend);
|
||||
@@ -10451,6 +10413,201 @@ static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, const ggml_t
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_q5_K_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
ggml_tensor_extra_cl_q5_K * extra0_q5_k = (ggml_tensor_extra_cl_q5_K *)src0->extra;
|
||||
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne1 = dst->ne[1];
|
||||
|
||||
GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
|
||||
|
||||
cl_context context = backend_ctx->context;
|
||||
cl_kernel kernel;
|
||||
|
||||
cl_int err;
|
||||
cl_image_format img_fmt;
|
||||
cl_image_desc img_desc;
|
||||
cl_buffer_region region;
|
||||
|
||||
int M = ne01;
|
||||
int N = ne1;
|
||||
int K = ne00;
|
||||
|
||||
cl_uchar mask_d6 = 0x3F;
|
||||
cl_uchar mask_d4 = 0x0F;
|
||||
cl_uchar mask_hi2 = 0xC0;
|
||||
|
||||
if (ne1 == 1) {
|
||||
cl_mem q_img = nullptr;
|
||||
cl_mem qh_img = nullptr;
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
|
||||
// image for q (CL_R, CL_UNSIGNED_INT32): width = M*K/2/4
|
||||
img_fmt = {CL_R, CL_UNSIGNED_INT32};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = M * K / 2 / 4;
|
||||
img_desc.buffer = extra0_q5_k->q;
|
||||
CL_CHECK((q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// image for qh (CL_R, CL_HALF_FLOAT): width = M*K/16
|
||||
img_fmt = {CL_R, CL_HALF_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = M * K / 16;
|
||||
img_desc.buffer = extra0_q5_k->qh;
|
||||
CL_CHECK((qh_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
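The image widths above follow from the Q5_K storage layout (4-bit low nibbles plus one high bit per weight, in blocks of 256). A quick check of the arithmetic, for illustration only:

    // Worked arithmetic for the texel counts above:
    //   low-nibble data : M*K/2 bytes -> M*K/2/4 texels of CL_UNSIGNED_INT32 (4 bytes each)
    //   high-bit data   : M*K/8 bytes -> M*K/16  texels of CL_HALF_FLOAT    (2 bytes each)
    size_t q_texels  = (size_t) M * K / 2 / 4;
    size_t qh_texels = (size_t) M * K / 16;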
|
||||
|
||||
// subbuffer for activations
|
||||
region.origin = offset1;
|
||||
region.size = K * N * sizeof(float);
|
||||
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for activations (CL_RGBA, CL_FLOAT): width = K*N/4
|
||||
img_fmt = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * N / 4;
|
||||
img_desc.buffer = b_sub_buf;
|
||||
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
kernel = backend_ctx->kernel_gemv_noshuffle_q5_k_f32;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qh_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_k->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q5_k->dm));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra0_q5_k->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &b_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_uchar), &mask_d6));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_uchar), &mask_d4));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_uchar), &mask_hi2));
|
||||
|
||||
size_t local_work_size[3] = {64, 4, 1};
|
||||
size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
CL_CHECK(clReleaseMemObject(q_img));
|
||||
CL_CHECK(clReleaseMemObject(qh_img));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
} else {
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem b_sub_buf_trans = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
cl_mem b_img_trans = nullptr;
|
||||
|
||||
// subbuffer for activations
|
||||
region.origin = offset1;
|
||||
region.size = K * N * sizeof(float);
|
||||
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for activations
|
||||
img_fmt = {CL_RGBA, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * N / 4;
|
||||
img_desc.buffer = b_sub_buf;
|
||||
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// pad N to multiple of 8
|
||||
int extra_elements = N % 8;
|
||||
int padding = 0;
|
||||
if (extra_elements > 0) {
|
||||
padding = 8 - extra_elements;
|
||||
}
|
||||
|
||||
// subbuffer for transposed activations
|
||||
region.origin = 0;
|
||||
region.size = K * (N + padding) * sizeof(float) / 2;
|
||||
backend_ctx->prealloc_act_trans.allocate(context, region.size);
|
||||
CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// image for transposed activations
|
||||
img_fmt = {CL_RGBA, CL_HALF_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K * (N + padding) / 4;
|
||||
img_desc.buffer = b_sub_buf_trans;
|
||||
CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// transpose activations
|
||||
int height_B = N / 4;
|
||||
if (height_B == 0) height_B = 1;
|
||||
int width_B = K / 4;
|
||||
int padded_height_B = (N + padding) / 4;
|
||||
|
||||
kernel = backend_ctx->kernel_transpose_32_16;
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
|
||||
|
||||
size_t local_work_size_t[2] = {1, 16};
|
||||
size_t global_work_size_t[2] = {(size_t)width_B, (size_t)padded_height_B};
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst);
|
||||
|
||||
// gemm
|
||||
kernel = backend_ctx->kernel_gemm_noshuffle_q5_k_f32;
|
||||
int padded_N = N + padding;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q5_k->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_k->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_k->s));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q5_k->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra0_q5_k->dm));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &b_img_trans));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_int), &padded_N));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_uchar), &mask_d6));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_uchar), &mask_d4));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_uchar), &mask_hi2));
|
||||
|
||||
size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1};
|
||||
size_t local_work_size[3] = {1, 128, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf_trans));
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
CL_CHECK(clReleaseMemObject(b_img_trans));
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(backend);
|
||||
GGML_UNUSED(src0);
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(dst);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -10600,6 +10757,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
return;
|
||||
}
|
||||
|
||||
// q5_K x fp32
|
||||
if (src0t == GGML_TYPE_Q5_K && src1t == GGML_TYPE_F32) {
|
||||
ggml_cl_mul_mat_q5_K_f32_adreno(backend, src0, src1, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
// q4_0 x fp32
|
||||
if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) {
|
||||
// TODO: remove duplicate definitions of image description + format -- move to top
|
||||
|
||||
@@ -568,7 +568,9 @@ kernel void kernel_convert_block_q5_K(
|
||||
global uchar * dst_qh,
|
||||
global uchar * dst_s,
|
||||
global half * dst_d,
|
||||
global half * dst_dm
|
||||
global half * dst_dm,
|
||||
uchar mask_0F,
|
||||
uchar mask_F0
|
||||
) {
|
||||
global struct block_q5_K * b = (global struct block_q5_K *) src0 + get_global_id(0);
|
||||
global uchar * q = (global uchar *) dst_q + QK_K/2*get_global_id(0);
|
||||
@@ -599,7 +601,9 @@ kernel void kernel_restore_block_q5_K(
|
||||
global uchar * src_s,
|
||||
global half * src_d,
|
||||
global half * src_dm,
|
||||
global struct block_q5_K * dst
|
||||
global struct block_q5_K * dst,
|
||||
uchar mask_0F,
|
||||
uchar mask_F0
|
||||
) {
|
||||
global struct block_q5_K * b = (global struct block_q5_K *) dst + get_global_id(0);
|
||||
global uchar * q = (global uchar *) src_q + QK_K/2*get_global_id(0);
|
||||
@@ -622,6 +626,92 @@ kernel void kernel_restore_block_q5_K(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_convert_block_q5_K_noshuffle(
|
||||
global struct block_q5_K * src0,
|
||||
global uchar * dst_q,
|
||||
global uchar * dst_qh,
|
||||
global uchar * dst_s,
|
||||
global half * dst_d,
|
||||
global half * dst_dm,
|
||||
uchar mask_0F,
|
||||
uchar mask_F0
|
||||
) {
|
||||
global struct block_q5_K * b = (global struct block_q5_K *) src0 + get_global_id(0);
|
||||
global uchar * q = (global uchar *) dst_q + QK_K/2 * get_global_id(0);
|
||||
global uchar * qh = (global uchar *) dst_qh + QK_K/8 * get_global_id(0);
|
||||
global uchar * s = (global uchar *) dst_s + K_SCALE_SIZE * get_global_id(0);
|
||||
global half * d = (global half *) dst_d + get_global_id(0);
|
||||
global half * dm = (global half *) dst_dm + get_global_id(0);
|
||||
|
||||
*d = b->d;
|
||||
*dm = b->dm;
|
||||
|
||||
for (int i = 0; i < QK_K / 64; ++i) {
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
uchar x0 = b->qs[i*32 + 2*j];
|
||||
uchar x1 = b->qs[i*32 + 2*j + 1];
|
||||
q[i*32 + j] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4);
|
||||
q[i*32 + j + 16] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0);
|
||||
}
|
||||
}
|
||||
|
||||
for (int l = 0; l < QK_K/8; ++l) {
|
||||
uchar x0 = 0;
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
x0 |= ((b->qh[(l%4)*8+i] >> (l/4)) & 0x01) << i;
|
||||
}
|
||||
qh[l] = x0;
|
||||
}
|
||||
|
||||
for (int i = 0; i < K_SCALE_SIZE; ++i) {
|
||||
s[i] = b->s[i];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_restore_block_q5_K_noshuffle(
|
||||
global uchar * src_q,
|
||||
global uchar * src_qh,
|
||||
global uchar * src_s,
|
||||
global half * src_d,
|
||||
global half * src_dm,
|
||||
global struct block_q5_K * dst,
|
||||
uchar mask_0F,
|
||||
uchar mask_F0
|
||||
) {
|
||||
global struct block_q5_K * b = (global struct block_q5_K *) dst + get_global_id(0);
|
||||
global uchar * q = (global uchar *) src_q + QK_K/2 * get_global_id(0);
|
||||
global uchar * qh = (global uchar *) src_qh + QK_K/8 * get_global_id(0);
|
||||
global uchar * s = (global uchar *) src_s + K_SCALE_SIZE * get_global_id(0);
|
||||
global half * d = (global half *) src_d + get_global_id(0);
|
||||
global half * dm = (global half *) src_dm + get_global_id(0);
|
||||
|
||||
b->d = *d;
|
||||
b->dm = *dm;
|
||||
|
||||
for (int i = 0; i < QK_K / 64; ++i) {
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
uchar lo = q[i*32 + j];
|
||||
uchar hi = q[i*32 + j + 16];
|
||||
b->qs[i*32 + 2*j] = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4));
|
||||
b->qs[i*32 + 2*j + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0));
|
||||
}
|
||||
}
|
||||
|
||||
for (int g = 0; g < 4; ++g) {
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
uchar x0 = 0;
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
x0 |= ((qh[4*k+g] >> i) & 0x01) << k;
|
||||
}
|
||||
b->qh[g*8+i] = x0;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < K_SCALE_SIZE; ++i) {
|
||||
b->s[i] = s[i];
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// kernel_convert_block_q6_K
|
||||
// Convert the block_q6_K format to 3 separate arrays (AOS -> SOA).
|
||||
|
||||
176
ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl
Normal file
176
ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_k_f32.cl
Normal file
@@ -0,0 +1,176 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_qcom_reqd_sub_group_size
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
#define QK_K 256
|
||||
#define K_SCALE_SIZE 12
|
||||
|
||||
inline void get_scale_min_k4(
|
||||
int j,
|
||||
global const uchar * q,
|
||||
uchar * d,
|
||||
uchar * m,
|
||||
uchar mask_d6,
|
||||
uchar mask_d4,
|
||||
uchar mask_hi2
|
||||
) {
|
||||
if (j < 4) {
|
||||
*d = q[j] & mask_d6;
|
||||
*m = q[j+4] & mask_d6;
|
||||
} else {
|
||||
*d = (q[j+4] & mask_d4) | ((q[j-4] & mask_hi2) >> 2);
|
||||
*m = ((q[j+4] >> 4) & mask_d4) | ((q[j] & mask_hi2) >> 2);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_128
|
||||
#endif
|
||||
kernel void kernel_gemm_noshuffle_q5_k_f32(
|
||||
global const ushort * src0_q,
|
||||
global const uchar * src0_qh,
|
||||
global const uchar * src0_s,
|
||||
global const half * src0_d,
|
||||
global const half * src0_dm,
|
||||
read_only image1d_buffer_t src1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int m,
|
||||
int n,
|
||||
int k,
|
||||
int n_no_padding,
|
||||
uchar mask_d6,
|
||||
uchar mask_d4,
|
||||
uchar mask_hi2
|
||||
) {
|
||||
dst = (global float *)((global char *)dst + offsetd);
|
||||
int n_4 = n >> 2;
|
||||
int gy = get_global_id(0);
|
||||
int gx = get_global_id(1);
|
||||
int gx_2 = gx << 2;
|
||||
|
||||
half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0;
|
||||
half8 B;
|
||||
half4 dequantized_weights;
|
||||
|
||||
int num_blocks_K = k / QK_K;
|
||||
|
||||
global const ushort * weight_ptr = src0_q + gx_2;
|
||||
global const uchar * qh_ptr = src0_qh + gx_2;
|
||||
global const half * d_ptr = src0_d + gx_2;
|
||||
global const half * dm_ptr = src0_dm + gx_2;
|
||||
|
||||
for (int i = 0; i < k; i += 32) {
|
||||
int sb_idx = i / QK_K;
|
||||
int sub_idx = (i / 32) % 8;
|
||||
|
||||
half4 d = vload4(0, d_ptr + sb_idx * m);
|
||||
half4 dm = vload4(0, dm_ptr + sb_idx * m);
|
||||
|
||||
global const uchar * sc0 = src0_s + (gx_2+0) * num_blocks_K * K_SCALE_SIZE + sb_idx * K_SCALE_SIZE;
|
||||
global const uchar * sc1 = src0_s + (gx_2+1) * num_blocks_K * K_SCALE_SIZE + sb_idx * K_SCALE_SIZE;
|
||||
global const uchar * sc2 = src0_s + (gx_2+2) * num_blocks_K * K_SCALE_SIZE + sb_idx * K_SCALE_SIZE;
|
||||
global const uchar * sc3 = src0_s + (gx_2+3) * num_blocks_K * K_SCALE_SIZE + sb_idx * K_SCALE_SIZE;
|
||||
|
||||
uchar sv0, mn0, sv1, mn1, sv2, mn2, sv3, mn3;
|
||||
get_scale_min_k4(sub_idx, sc0, &sv0, &mn0, mask_d6, mask_d4, mask_hi2);
|
||||
get_scale_min_k4(sub_idx, sc1, &sv1, &mn1, mask_d6, mask_d4, mask_hi2);
|
||||
get_scale_min_k4(sub_idx, sc2, &sv2, &mn2, mask_d6, mask_d4, mask_hi2);
|
||||
get_scale_min_k4(sub_idx, sc3, &sv3, &mn3, mask_d6, mask_d4, mask_hi2);
|
||||
|
||||
half4 scale = convert_half4(convert_float4(d) * convert_float4((uchar4)(sv0, sv1, sv2, sv3)));
|
||||
half4 mval = convert_half4(convert_float4(dm) * convert_float4((uchar4)(mn0, mn1, mn2, mn3)));
|
||||
|
||||
for (int l = 0; l < 32; l += 4) {
|
||||
int ki = i + l;
|
||||
ushort4 bits4 = vload4(0, weight_ptr + (ki/4) * m);
|
||||
uchar4 qh_bits = vload4(0, qh_ptr + (ki/8) * m);
|
||||
int qh_shift = ki % 8;
|
||||
|
||||
// j=0
|
||||
B.s0123 = read_imageh(src1, gy*2 + (ki+0) * n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2+1 + (ki+0) * n_4);
|
||||
dequantized_weights.s0 = ((bits4.s0 & 0x000F) | (((qh_bits.s0 >> (qh_shift+0)) & 1) << 4)) * scale.s0 - mval.s0;
|
||||
dequantized_weights.s1 = ((bits4.s1 & 0x000F) | (((qh_bits.s1 >> (qh_shift+0)) & 1) << 4)) * scale.s1 - mval.s1;
|
||||
dequantized_weights.s2 = ((bits4.s2 & 0x000F) | (((qh_bits.s2 >> (qh_shift+0)) & 1) << 4)) * scale.s2 - mval.s2;
|
||||
dequantized_weights.s3 = ((bits4.s3 & 0x000F) | (((qh_bits.s3 >> (qh_shift+0)) & 1) << 4)) * scale.s3 - mval.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=1
|
||||
B.s0123 = read_imageh(src1, gy*2 + (ki+1) * n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2+1 + (ki+1) * n_4);
|
||||
dequantized_weights.s0 = (((bits4.s0 & 0x00F0) >> 4) | (((qh_bits.s0 >> (qh_shift+1)) & 1) << 4)) * scale.s0 - mval.s0;
|
||||
dequantized_weights.s1 = (((bits4.s1 & 0x00F0) >> 4) | (((qh_bits.s1 >> (qh_shift+1)) & 1) << 4)) * scale.s1 - mval.s1;
|
||||
dequantized_weights.s2 = (((bits4.s2 & 0x00F0) >> 4) | (((qh_bits.s2 >> (qh_shift+1)) & 1) << 4)) * scale.s2 - mval.s2;
|
||||
dequantized_weights.s3 = (((bits4.s3 & 0x00F0) >> 4) | (((qh_bits.s3 >> (qh_shift+1)) & 1) << 4)) * scale.s3 - mval.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=2
|
||||
B.s0123 = read_imageh(src1, gy*2 + (ki+2) * n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2+1 + (ki+2) * n_4);
|
||||
dequantized_weights.s0 = (((bits4.s0 & 0x0F00) >> 8) | (((qh_bits.s0 >> (qh_shift+2)) & 1) << 4)) * scale.s0 - mval.s0;
|
||||
dequantized_weights.s1 = (((bits4.s1 & 0x0F00) >> 8) | (((qh_bits.s1 >> (qh_shift+2)) & 1) << 4)) * scale.s1 - mval.s1;
|
||||
dequantized_weights.s2 = (((bits4.s2 & 0x0F00) >> 8) | (((qh_bits.s2 >> (qh_shift+2)) & 1) << 4)) * scale.s2 - mval.s2;
|
||||
dequantized_weights.s3 = (((bits4.s3 & 0x0F00) >> 8) | (((qh_bits.s3 >> (qh_shift+2)) & 1) << 4)) * scale.s3 - mval.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
|
||||
// j=3
|
||||
B.s0123 = read_imageh(src1, gy*2 + (ki+3) * n_4);
|
||||
B.s4567 = read_imageh(src1, gy*2+1 + (ki+3) * n_4);
|
||||
dequantized_weights.s0 = (((bits4.s0 & 0xF000) >> 12) | (((qh_bits.s0 >> (qh_shift+3)) & 1) << 4)) * scale.s0 - mval.s0;
|
||||
dequantized_weights.s1 = (((bits4.s1 & 0xF000) >> 12) | (((qh_bits.s1 >> (qh_shift+3)) & 1) << 4)) * scale.s1 - mval.s1;
|
||||
dequantized_weights.s2 = (((bits4.s2 & 0xF000) >> 12) | (((qh_bits.s2 >> (qh_shift+3)) & 1) << 4)) * scale.s2 - mval.s2;
|
||||
dequantized_weights.s3 = (((bits4.s3 & 0xF000) >> 12) | (((qh_bits.s3 >> (qh_shift+3)) & 1) << 4)) * scale.s3 - mval.s3;
|
||||
c0 += B * dequantized_weights.s0;
|
||||
c1 += B * dequantized_weights.s1;
|
||||
c2 += B * dequantized_weights.s2;
|
||||
c3 += B * dequantized_weights.s3;
|
||||
}
|
||||
}
|
||||
|
||||
int idx = (gy<<3)*m + (gx<<2);
|
||||
|
||||
if (idx+3 < m*n_no_padding) {
|
||||
vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if (idx+3 < m*n_no_padding) {
|
||||
vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if (idx+3 < m*n_no_padding) {
|
||||
vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if (idx+3 < m*n_no_padding) {
|
||||
vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if (idx+3 < m*n_no_padding) {
|
||||
vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if (idx+3 < m*n_no_padding) {
|
||||
vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if (idx+3 < m*n_no_padding) {
|
||||
vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
|
||||
idx += m;
|
||||
}
|
||||
if (idx+3 < m*n_no_padding) {
|
||||
vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
|
||||
}
|
||||
}
|
||||
326
ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl
Normal file
326
ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_k_f32.cl
Normal file
@@ -0,0 +1,326 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
|
||||
#ifdef cl_qcom_reqd_sub_group_size
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#endif
|
||||
|
||||
#define QK_K 256
|
||||
#define NSUBGROUPS 4
|
||||
#define SUBGROUP_SIZE 64
|
||||
|
||||
inline void get_scale_min_k4(
|
||||
int j,
|
||||
global const uchar * q,
|
||||
uchar * d,
|
||||
uchar * m,
|
||||
uchar mask_d6,
|
||||
uchar mask_d4,
|
||||
uchar mask_hi2
|
||||
) {
|
||||
if (j < 4) {
|
||||
*d = q[j] & mask_d6;
|
||||
*m = q[j+4] & mask_d6;
|
||||
} else {
|
||||
*d = (q[j+4] & mask_d4) | ((q[j-4] & mask_hi2) >> 2);
|
||||
*m = ((q[j+4] >> 4) & mask_d4) | ((q[j] & mask_hi2) >> 2);
|
||||
}
|
||||
}
|
||||
|
||||
#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, bits1, scale, minv, y) \
|
||||
float shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s0, 0); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | ((bits1.s0 & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | ((bits1.s1 & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 0); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s0 >> 1) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s1 >> 1) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 0); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s0 >> 2) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s1 >> 2) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 0); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 0); \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s0 >> 4) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s1 >> 4) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 0); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s0 >> 5) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s1 >> 5) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 0); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s0 >> 6) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s1 >> 6) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 0); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s0, 1); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | ((bits1.s2 & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | ((bits1.s3 & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 1); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s2 >> 1) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s3 >> 1) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 1); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s2 >> 2) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s3 >> 2) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 1); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 1); \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s2 >> 4) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s3 >> 4) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 1); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s2 >> 5) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s3 >> 5) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 1); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s2 >> 6) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s3 >> 6) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 1); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
|
||||
|
||||
#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, bits1, scale, minv, y) \
|
||||
shared_y = sub_group_broadcast(y.s0, 2); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | ((bits1.s4 & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | ((bits1.s5 & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 2); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s4 >> 1) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s5 >> 1) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 2); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s4 >> 2) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s5 >> 2) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 2); \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 2); \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s4 >> 4) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s5 >> 4) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 2); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s4 >> 5) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s5 >> 5) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 2); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s4 >> 6) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s5 >> 6) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 2); \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s0, 3); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | ((bits1.s6 & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | ((bits1.s7 & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s1, 3); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s6 >> 1) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s7 >> 1) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s2, 3); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s6 >> 2) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s7 >> 2) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s3, 3); \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s4, 3); \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s6 >> 4) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s7 >> 4) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s5, 3); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s6 >> 5) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s7 >> 5) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s6, 3); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s6 >> 6) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s7 >> 6) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
shared_y = sub_group_broadcast(y.s7, 3); \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y; \
|
||||
|
||||
|
||||
#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, bits1, scale, minv, y) \
|
||||
float8 shared_y; \
|
||||
shared_y = sub_group_broadcast(y, 0); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | ((bits1.s0 & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s0 >> 1) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s0 >> 2) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s0 >> 4) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s0 >> 5) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s0 >> 6) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | ((bits1.s1 & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s1 >> 1) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s1 >> 2) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s1 >> 4) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s1 >> 5) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s1 >> 6) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s7; \
|
||||
shared_y = sub_group_broadcast(y, 1); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | ((bits1.s2 & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s2 >> 1) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s2 >> 2) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s2 >> 4) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s2 >> 5) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s2 >> 6) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | ((bits1.s3 & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s3 >> 1) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s3 >> 2) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s3 >> 4) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s3 >> 5) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s3 >> 6) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s7; \
|
||||
|
||||
|
||||
#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, bits1, scale, minv, y) \
|
||||
shared_y = sub_group_broadcast(y, 2); \
|
||||
total_sums.s0 += (((bits4.s0 & 0x000F) | ((bits1.s4 & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s4 >> 1) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s4 >> 2) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s4 >> 4) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s4 >> 5) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s4 >> 6) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s1 & 0x000F) | ((bits1.s5 & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s5 >> 1) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s5 >> 2) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s5 >> 4) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s5 >> 5) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s5 >> 6) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s7; \
|
||||
shared_y = sub_group_broadcast(y, 3); \
|
||||
total_sums.s0 += (((bits4.s4 & 0x000F) | ((bits1.s6 & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s0; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s6 >> 1) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s1; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s6 >> 2) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s2; \
|
||||
total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s3; \
|
||||
total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s6 >> 4) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s4; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s6 >> 5) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s5; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s6 >> 6) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s6; \
|
||||
total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) * scale.s0 - minv.s0) * shared_y.s7; \
|
||||
total_sums.s1 += (((bits4.s5 & 0x000F) | ((bits1.s7 & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s0; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s7 >> 1) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s1; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s7 >> 2) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s2; \
|
||||
total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s3; \
|
||||
total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s7 >> 4) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s4; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s7 >> 5) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s5; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s7 >> 6) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s6; \
|
||||
total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) * scale.s1 - minv.s1) * shared_y.s7; \
|
||||
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_gemv_noshuffle_q5_k_f32(
|
||||
read_only image1d_buffer_t src0_q,
|
||||
read_only image1d_buffer_t src0_qh,
|
||||
global half2 * src0_d,
|
||||
global half2 * src0_m,
|
||||
global uchar * src0_s,
|
||||
read_only image1d_buffer_t src1,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
uchar mask_d6,
|
||||
uchar mask_d4,
|
||||
uchar mask_hi2)
|
||||
{
|
||||
uint groupId = get_local_id(1);
|
||||
uint gid = get_global_id(0);
|
||||
ushort slid = get_sub_group_local_id();
|
||||
|
||||
uint K = ne00;
|
||||
uint M = ne01;
|
||||
|
||||
uint LINE_STRIDE_A = M / 2;
|
||||
uint BLOCK_STRIDE_A = NSUBGROUPS * M;
|
||||
|
||||
uint LINE_STRIDE_A_QH = M / 2;
|
||||
uint BLOCK_STRIDE_A_QH = NSUBGROUPS * M / 2;
|
||||
uint scales_per_row = (K / QK_K) * 12;
|
||||
|
||||
private uint4 regA;
|
||||
private ushort4 regH;
|
||||
private half2 regS;
|
||||
private half2 regM;
|
||||
private float8 regB;
|
||||
|
||||
private float2 totalSum = (float2)(0.0f);
|
||||
|
||||
for (uint k = groupId; k < (K / 32); k += NSUBGROUPS) {
|
||||
uint sb = k / 8;
|
||||
uint j = k % 8;
|
||||
|
||||
half2 d = src0_d[gid + sb * LINE_STRIDE_A];
|
||||
half2 dm = src0_m[gid + sb * LINE_STRIDE_A];
|
||||
|
||||
global const uchar * sc0 = src0_s + 2 * gid * scales_per_row + sb * 12;
|
||||
global const uchar * sc1 = src0_s + (2 * gid + 1) * scales_per_row + sb * 12;
|
||||
|
||||
uchar sv0, mn0, sv1, mn1;
|
||||
get_scale_min_k4(j, sc0, &sv0, &mn0, mask_d6, mask_d4, mask_hi2);
|
||||
get_scale_min_k4(j, sc1, &sv1, &mn1, mask_d6, mask_d4, mask_hi2);
|
||||
|
||||
regS = convert_half2(convert_float2(d) * convert_float2((uchar2)(sv0, sv1)));
|
||||
regM = convert_half2(convert_float2(dm) * convert_float2((uchar2)(mn0, mn1)));
|
||||
|
||||
if (slid < 4) {
|
||||
regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
|
||||
regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
|
||||
}
|
||||
|
||||
regH.s0 = as_ushort(read_imageh(src0_qh, (gid + k * BLOCK_STRIDE_A_QH + LINE_STRIDE_A_QH * 0)).x);
|
||||
regH.s1 = as_ushort(read_imageh(src0_qh, (gid + k * BLOCK_STRIDE_A_QH + LINE_STRIDE_A_QH * 1)).x);
|
||||
regH.s2 = as_ushort(read_imageh(src0_qh, (gid + k * BLOCK_STRIDE_A_QH + LINE_STRIDE_A_QH * 2)).x);
|
||||
regH.s3 = as_ushort(read_imageh(src0_qh, (gid + k * BLOCK_STRIDE_A_QH + LINE_STRIDE_A_QH * 3)).x);
|
||||
|
||||
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
|
||||
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
|
||||
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
|
||||
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
|
||||
#ifdef VECTOR_SUB_GROUP_BROADCAST
|
||||
dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), as_uchar8(regH), regS, regM, regB);
|
||||
#else
|
||||
dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), as_uchar8(regH), regS, regM, regB);
|
||||
#endif // VECTOR_SUB_GROUP_BROADCAST
|
||||
|
||||
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
|
||||
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
|
||||
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
|
||||
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
|
||||
#ifdef VECTOR_SUB_GROUP_BROADCAST
|
||||
dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), as_uchar8(regH), regS, regM, regB);
|
||||
#else
|
||||
dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), as_uchar8(regH), regS, regM, regB);
|
||||
#endif // VECTOR_SUB_GROUP_BROADCAST
|
||||
}
|
||||
|
||||
// reduction in local memory, assumes #wave=4
|
||||
local float2 reduceLM[SUBGROUP_SIZE * 3];
|
||||
if (groupId == 1) {
|
||||
reduceLM[SUBGROUP_SIZE * 0 + slid] = totalSum;
|
||||
}
|
||||
if (groupId == 2) {
|
||||
reduceLM[SUBGROUP_SIZE * 1 + slid] = totalSum;
|
||||
}
|
||||
if (groupId == 3) {
|
||||
reduceLM[SUBGROUP_SIZE * 2 + slid] = totalSum;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (groupId == 0) {
|
||||
totalSum += reduceLM[SUBGROUP_SIZE * 0 + slid];
|
||||
}
|
||||
if (groupId == 0) {
|
||||
totalSum += reduceLM[SUBGROUP_SIZE * 1 + slid];
|
||||
}
|
||||
if (groupId == 0) {
|
||||
totalSum += reduceLM[SUBGROUP_SIZE * 2 + slid];
|
||||
}
|
||||
|
||||
// 2 outputs per fiber in wave 0
|
||||
if (groupId == 0) {
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
vstore2(totalSum, 0, &(dst[gid * 2]));
|
||||
}
|
||||
}
|
||||
@@ -7,3 +7,26 @@ ggml_add_backend_library(ggml-rpc
|
||||
if (WIN32)
|
||||
target_link_libraries(ggml-rpc PRIVATE ws2_32)
|
||||
endif()
|
||||
|
||||
# RDMA auto-detection (Linux only, requires libibverbs)
|
||||
if (NOT WIN32 AND NOT APPLE)
|
||||
find_library(IBVERBS_LIB ibverbs)
|
||||
if (IBVERBS_LIB)
|
||||
option(GGML_RPC_RDMA "ggml: enable RDMA transport for RPC" ON)
|
||||
else()
|
||||
option(GGML_RPC_RDMA "ggml: enable RDMA transport for RPC" OFF)
|
||||
endif()
|
||||
else()
|
||||
set(GGML_RPC_RDMA OFF CACHE BOOL "RDMA not available on this platform" FORCE)
|
||||
endif()
|
||||
|
||||
if (GGML_RPC_RDMA)
|
||||
if (NOT IBVERBS_LIB)
|
||||
find_library(IBVERBS_LIB ibverbs REQUIRED)
|
||||
endif()
|
||||
target_compile_definitions(ggml-rpc PRIVATE GGML_RPC_RDMA)
|
||||
target_link_libraries(ggml-rpc PRIVATE ${IBVERBS_LIB})
|
||||
message(STATUS " RDMA transport enabled (auto-detected)")
|
||||
else()
|
||||
message(STATUS " RDMA transport disabled")
|
||||
endif()
|
||||
|
||||
@@ -3,7 +3,9 @@
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
#include <array>
|
||||
#include <cinttypes>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
@@ -31,6 +33,14 @@
|
||||
#include <filesystem>
|
||||
#include <algorithm>
|
||||
|
||||
#ifdef GGML_RPC_RDMA
|
||||
# include <infiniband/verbs.h>
|
||||
# include <time.h>
|
||||
# ifndef _WIN32
|
||||
# include <poll.h>
|
||||
# endif
|
||||
#endif // GGML_RPC_RDMA
|
||||
|
||||
static const char * RPC_DEBUG = std::getenv("GGML_RPC_DEBUG");
|
||||
|
||||
#define LOG_DBG(...) \
|
||||
@@ -49,17 +59,116 @@ typedef int sockfd_t;
|
||||
#endif
|
||||
|
||||
// cross-platform socket
|
||||
|
||||
#ifdef GGML_RPC_RDMA
|
||||
static constexpr size_t RDMA_CHUNK = 256 * 1024; // 256 KiB per send/recv (fits default 8 MiB memlock)
|
||||
static constexpr int RDMA_RX_DEPTH = 24; // pre-posted recv ring: 24 × 256 KiB = 6 MiB
|
||||
static constexpr size_t RDMA_GID_SIZE = 16; // RoCE GID / IB GID is always 16 bytes
|
||||
using rdma_gid_t = std::array<uint8_t, RDMA_GID_SIZE>;
|
||||
|
||||
struct rdma_conn {
|
||||
struct ibv_context * ctx = nullptr;
|
||||
struct ibv_pd * pd = nullptr;
|
||||
struct ibv_cq * scq = nullptr; // send completions
|
||||
struct ibv_cq * rcq = nullptr; // recv completions
|
||||
struct ibv_qp * qp = nullptr;
|
||||
|
||||
void * tx_buf = nullptr;
|
||||
struct ibv_mr * tx_mr = nullptr;
|
||||
|
||||
void * rx_buf = nullptr; // RDMA_RX_DEPTH × RDMA_CHUNK contiguous
|
||||
struct ibv_mr * rx_mr = nullptr;
|
||||
int rx_head = 0;
|
||||
|
||||
uint32_t max_inline = 0;
|
||||
|
||||
uint8_t * rx_slot(int i) const {
|
||||
return static_cast<uint8_t *>(rx_buf) + static_cast<size_t>(i) * RDMA_CHUNK;
|
||||
}
|
||||
|
||||
bool post_rx(int i) {
|
||||
struct ibv_sge sge = {};
|
||||
sge.addr = (uintptr_t)rx_slot(i);
|
||||
sge.length = RDMA_CHUNK;
|
||||
sge.lkey = rx_mr->lkey;
|
||||
struct ibv_recv_wr wr = {}, * bad = nullptr;
|
||||
wr.wr_id = (uint64_t)i;
|
||||
wr.sg_list = &sge;
|
||||
wr.num_sge = 1;
|
||||
return ibv_post_recv(qp, &wr, &bad) == 0;
|
||||
}
|
||||
|
||||
~rdma_conn() {
|
||||
if (tx_mr) ibv_dereg_mr(tx_mr);
|
||||
if (rx_mr) ibv_dereg_mr(rx_mr);
|
||||
free(tx_buf);
|
||||
free(rx_buf);
|
||||
if (qp) ibv_destroy_qp(qp);
|
||||
if (scq) ibv_destroy_cq(scq);
|
||||
if (rcq) ibv_destroy_cq(rcq);
|
||||
if (pd) ibv_dealloc_pd(pd);
|
||||
if (ctx) ibv_close_device(ctx);
|
||||
}
|
||||
};
|
||||
|
||||
// Local RDMA parameters captured during the probe phase and later consumed
|
||||
// by rdma_activate() after the remote side's caps arrive via HELLO.
|
||||
struct rdma_local_info {
|
||||
uint32_t qpn = 0;
|
||||
uint32_t psn = 0;
|
||||
uint8_t gid[RDMA_GID_SIZE] = {};
|
||||
uint8_t ib_port = 0;
|
||||
int gid_idx = 0;
|
||||
enum ibv_mtu path_mtu = IBV_MTU_1024;
|
||||
};
|
||||
#endif // GGML_RPC_RDMA
|
||||
|
||||
// conn_caps size for transport-agnostic capability exchange
|
||||
static constexpr size_t RPC_CONN_CAPS_SIZE = 24;
|
||||
|
||||
// conn_caps RDMA layout helper
|
||||
#ifdef GGML_RPC_RDMA
|
||||
struct rdma_caps {
|
||||
uint32_t qpn;
|
||||
uint32_t psn;
|
||||
uint8_t gid[RDMA_GID_SIZE];
|
||||
};
|
||||
static_assert(sizeof(rdma_caps) == RPC_CONN_CAPS_SIZE, "rdma_caps must match conn_caps size");
|
||||
#endif // GGML_RPC_RDMA
|
||||
|
||||
// Forward declarations for transport function pointers
|
||||
struct socket_t;
|
||||
static bool tcp_send_impl(socket_t * sock, const void * data, size_t size);
|
||||
static bool tcp_recv_impl(socket_t * sock, void * data, size_t size);
|
||||
|
||||
struct socket_t {
|
||||
sockfd_t fd;
|
||||
bool (*fn_send)(socket_t *, const void *, size_t) = tcp_send_impl;
|
||||
bool (*fn_recv)(socket_t *, void *, size_t) = tcp_recv_impl;
|
||||
#ifdef GGML_RPC_RDMA
|
||||
std::unique_ptr<rdma_conn> rdma;
|
||||
rdma_local_info rdma_local = {};
|
||||
#endif // GGML_RPC_RDMA
|
||||
socket_t(sockfd_t fd) : fd(fd) {}
|
||||
~socket_t() {
|
||||
#ifdef GGML_RPC_RDMA
|
||||
rdma.reset();
|
||||
#endif // GGML_RPC_RDMA
|
||||
LOG_DBG("[%s] closing socket %d\n", __func__, this->fd);
|
||||
#ifdef _WIN32
|
||||
closesocket(this->fd);
|
||||
if (fd != INVALID_SOCKET) closesocket(this->fd);
|
||||
#else
|
||||
close(this->fd);
|
||||
if (fd >= 0) close(this->fd);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Advertise local transport capabilities into conn_caps.
|
||||
// May probe RDMA and store the probe on this socket for update_caps.
|
||||
void get_caps(uint8_t * caps);
|
||||
|
||||
// Activate transport upgrade based on remote conn_caps using the probe
|
||||
// previously stored by get_caps.
|
||||
void update_caps(const uint8_t * remote_caps);
|
||||
};
|
||||
|
||||
// macro for nicer error messages on server crash
|
||||
@@ -115,10 +224,16 @@ static_assert(RPC_CMD_HELLO == 14, "RPC_CMD_HELLO must be always 14");
|
||||
// Try RPC_CMD_SET_TENSOR_HASH first when data size is larger than this threshold
|
||||
const size_t HASH_THRESHOLD = 10 * 1024 * 1024;
|
||||
|
||||
struct rpc_msg_hello_req {
|
||||
uint8_t conn_caps[RPC_CONN_CAPS_SIZE];
|
||||
};
|
||||
|
||||
struct rpc_msg_hello_rsp {
|
||||
uint8_t major;
|
||||
uint8_t minor;
|
||||
uint8_t patch;
|
||||
uint8_t padding;
|
||||
uint8_t conn_caps[RPC_CONN_CAPS_SIZE];
|
||||
};
|
||||
|
||||
struct rpc_msg_device_count_rsp {
|
||||
@@ -414,27 +529,414 @@ static bool recv_data(sockfd_t sockfd, void * data, size_t size) {
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool send_msg(sockfd_t sockfd, const void * msg, size_t msg_size) {
|
||||
if (!send_data(sockfd, &msg_size, sizeof(msg_size))) {
|
||||
return false;
|
||||
}
|
||||
return send_data(sockfd, msg, msg_size);
|
||||
// TCP transport implementations (for function-pointer dispatch)
|
||||
|
||||
static bool tcp_send_impl(socket_t * sock, const void * data, size_t size) {
|
||||
return send_data(sock->fd, data, size);
|
||||
}
|
||||
|
||||
static bool recv_msg(sockfd_t sockfd, void * msg, size_t msg_size) {
|
||||
static bool tcp_recv_impl(socket_t * sock, void * data, size_t size) {
|
||||
return recv_data(sock->fd, data, size);
|
||||
}
|
||||
|
||||
// RDMA transport (performance-optimized, auto-negotiated)
|
||||
|
||||
#ifdef GGML_RPC_RDMA
|
||||
|
||||
static bool rdma_send_impl(socket_t * sock, const void * data, size_t size);
|
||||
static bool rdma_recv_impl(socket_t * sock, void * data, size_t size);
|
||||
|
||||
static inline bool tcp_peer_closed(int fd) {
|
||||
if (fd < 0) return false;
|
||||
#ifndef _WIN32
|
||||
struct pollfd pfd = { fd, POLLIN | POLLRDHUP, 0 };
|
||||
int r = poll(&pfd, 1, 0);
|
||||
return r > 0 && (pfd.revents & (POLLHUP | POLLERR | POLLRDHUP));
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool rdma_poll(struct ibv_cq * cq, struct ibv_wc * wc, int tcp_fd) {
|
||||
for (uint64_t s = 0; ; s++) {
|
||||
int n = ibv_poll_cq(cq, 1, wc);
|
||||
if (n > 0) {
|
||||
if (wc->status != IBV_WC_SUCCESS) {
|
||||
GGML_LOG_ERROR("RDMA CQ wc error: status=%d (%s) vendor_err=0x%x\n",
|
||||
wc->status, ibv_wc_status_str(wc->status), wc->vendor_err);
|
||||
}
|
||||
return wc->status == IBV_WC_SUCCESS;
|
||||
}
|
||||
if (n < 0) return false;
|
||||
if ((s & 0xFFFFF) == 0 && s > 0) {
|
||||
if (tcp_peer_closed(tcp_fd)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static bool rdma_send(rdma_conn * c, const void * data, size_t size, int tcp_fd) {
|
||||
const uint8_t * src = (const uint8_t *)data;
|
||||
size_t rem = size;
|
||||
while (rem > 0) {
|
||||
size_t chunk = std::min(rem, RDMA_CHUNK);
|
||||
|
||||
struct ibv_sge sge = {};
|
||||
struct ibv_send_wr wr = {}, * bad = nullptr;
|
||||
wr.opcode = IBV_WR_SEND;
|
||||
wr.sg_list = &sge;
|
||||
wr.num_sge = 1;
|
||||
|
||||
if (chunk <= c->max_inline) {
|
||||
sge.addr = (uintptr_t)src;
|
||||
sge.length = chunk;
|
||||
wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
|
||||
} else {
|
||||
memcpy(c->tx_buf, src, chunk);
|
||||
sge.addr = (uintptr_t)c->tx_buf;
|
||||
sge.length = chunk;
|
||||
sge.lkey = c->tx_mr->lkey;
|
||||
wr.send_flags = IBV_SEND_SIGNALED;
|
||||
}
|
||||
|
||||
if (ibv_post_send(c->qp, &wr, &bad) != 0) return false;
|
||||
struct ibv_wc wc;
|
||||
if (!rdma_poll(c->scq, &wc, tcp_fd)) return false;
|
||||
|
||||
src += chunk;
|
||||
rem -= chunk;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
static bool rdma_recv(rdma_conn * c, void * data, size_t size, int tcp_fd) {
|
||||
uint8_t * dst = (uint8_t *)data;
|
||||
size_t rem = size;
|
||||
while (rem > 0) {
|
||||
struct ibv_wc wc;
|
||||
if (!rdma_poll(c->rcq, &wc, tcp_fd)) return false;
|
||||
|
||||
int slot = (int)wc.wr_id;
|
||||
size_t got = wc.byte_len;
|
||||
memcpy(dst, c->rx_slot(slot), got);
|
||||
|
||||
if (!c->post_rx(slot)) return false;
|
||||
|
||||
dst += got;
|
||||
rem -= got;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool rdma_send_impl(socket_t * sock, const void * data, size_t size) {
|
||||
return rdma_send(sock->rdma.get(), data, size, sock->fd);
|
||||
}
|
||||
|
||||
static bool rdma_recv_impl(socket_t * sock, void * data, size_t size) {
|
||||
return rdma_recv(sock->rdma.get(), data, size, sock->fd);
|
||||
}
|
||||
|
||||
// Build a RoCE GID-shaped 16-byte target from a TCP socket's local address.
|
||||
// Used to match the socket's local IP against the kernel's GID table so that
|
||||
// a single memcmp handles IPv4, IPv4-mapped IPv6, and native IPv6 uniformly:
|
||||
// AF_INET -> ::ffff:a.b.c.d (bytes 10-11 = 0xff, last 4 = IPv4)
|
||||
// AF_INET6 (IPv4-mapped) -> ::ffff:a.b.c.d (already in GID shape)
|
||||
// AF_INET6 (native v6) -> the 16-byte IPv6 address as-is
|
||||
// Returns std::nullopt on unsupported family or getsockname failure.
|
||||
static std::optional<rdma_gid_t> rdma_build_target_gid(sockfd_t tcp_fd) {
|
||||
sockaddr_storage addr = {};
|
||||
socklen_t addr_len = sizeof(addr);
|
||||
if (getsockname(tcp_fd, reinterpret_cast<sockaddr *>(&addr), &addr_len) != 0) {
|
||||
return std::nullopt;
|
||||
}
|
||||
rdma_gid_t target = {};
|
||||
if (addr.ss_family == AF_INET) {
|
||||
const auto * a = reinterpret_cast<const sockaddr_in *>(&addr);
|
||||
target[10] = 0xff;
|
||||
target[11] = 0xff;
|
||||
memcpy(&target[12], &a->sin_addr, 4);
|
||||
return target;
|
||||
}
|
||||
if (addr.ss_family == AF_INET6) {
|
||||
const auto * a = reinterpret_cast<const sockaddr_in6 *>(&addr);
|
||||
memcpy(target.data(), &a->sin6_addr, RDMA_GID_SIZE);
|
||||
return target;
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
static rdma_conn * rdma_probe(sockfd_t tcp_fd, rdma_local_info * out) {
|
||||
const char * dev_env = std::getenv("GGML_RDMA_DEV");
|
||||
const char * gid_env = std::getenv("GGML_RDMA_GID");
|
||||
|
||||
auto target_gid = rdma_build_target_gid(tcp_fd);
|
||||
if (!target_gid) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
const uint8_t ib_port = 1;
|
||||
int num_devs = 0;
|
||||
ibv_device ** devs = ibv_get_device_list(&num_devs);
|
||||
if (!devs || num_devs == 0) return nullptr;
|
||||
|
||||
ibv_context * ibctx = nullptr;
|
||||
const char * matched_dev = nullptr;
|
||||
int gid_idx = gid_env ? atoi(gid_env) : -1;
|
||||
int gid_version = IBV_GID_TYPE_IB; // 0 = unknown/IB
|
||||
|
||||
for (int d = 0; d < num_devs; d++) {
|
||||
const char * dn = ibv_get_device_name(devs[d]);
|
||||
if (dev_env && strcmp(dev_env, dn) != 0) continue;
|
||||
|
||||
ibv_context * ctx = ibv_open_device(devs[d]);
|
||||
if (!ctx) continue;
|
||||
|
||||
ibv_port_attr pa;
|
||||
if (ibv_query_port(ctx, ib_port, &pa) != 0) { ibv_close_device(ctx); continue; }
|
||||
|
||||
int found_gid = gid_idx;
|
||||
int found_version = IBV_GID_TYPE_IB;
|
||||
if (found_gid < 0) {
|
||||
// Find a GID on this port whose bytes equal the local TCP address
|
||||
// (IPv4 or IPv6). Prefer RoCE v2 (UDP/IP, L3-routable) over v1
|
||||
// (raw Ethernet, same-L2 only) so silent hangs on L3-routed paths
|
||||
// are avoided. ibv_query_gid_ex returns gid+type in one call.
|
||||
int v2_idx = -1;
|
||||
int v1_idx = -1;
|
||||
for (int i = 0; i < pa.gid_tbl_len; i++) {
|
||||
ibv_gid_entry entry = {};
|
||||
if (ibv_query_gid_ex(ctx, ib_port, i, &entry, 0) != 0) continue;
|
||||
if (memcmp(entry.gid.raw, target_gid->data(), RDMA_GID_SIZE) != 0) continue;
|
||||
if (entry.gid_type == IBV_GID_TYPE_ROCE_V2 && v2_idx < 0) {
|
||||
v2_idx = i;
|
||||
} else if (entry.gid_type == IBV_GID_TYPE_ROCE_V1 && v1_idx < 0) {
|
||||
v1_idx = i;
|
||||
}
|
||||
}
|
||||
if (v2_idx >= 0) {
|
||||
found_gid = v2_idx;
|
||||
found_version = IBV_GID_TYPE_ROCE_V2;
|
||||
} else if (v1_idx >= 0) {
|
||||
found_gid = v1_idx;
|
||||
found_version = IBV_GID_TYPE_ROCE_V1;
|
||||
}
|
||||
} else {
|
||||
// Explicit GID index from GGML_RDMA_GID — fetch its type for logging.
|
||||
ibv_gid_entry entry = {};
|
||||
if (ibv_query_gid_ex(ctx, ib_port, found_gid, &entry, 0) == 0) {
|
||||
found_version = entry.gid_type;
|
||||
}
|
||||
}
|
||||
if (found_gid >= 0) {
|
||||
ibctx = ctx;
|
||||
gid_idx = found_gid;
|
||||
gid_version = found_version;
|
||||
matched_dev = dn;
|
||||
out->path_mtu = pa.active_mtu;
|
||||
break;
|
||||
}
|
||||
ibv_close_device(ctx);
|
||||
}
|
||||
ibv_free_device_list(devs);
|
||||
if (!ibctx) return nullptr;
|
||||
|
||||
out->ib_port = ib_port;
|
||||
out->gid_idx = gid_idx;
|
||||
|
||||
// unique_ptr owns ibctx and every subsequent resource via ~rdma_conn(),
|
||||
// so each failure path is a plain `return nullptr;`.
|
||||
auto c = std::make_unique<rdma_conn>();
|
||||
c->ctx = ibctx;
|
||||
|
||||
c->pd = ibv_alloc_pd(ibctx);
|
||||
if (!c->pd) return nullptr;
|
||||
|
||||
c->scq = ibv_create_cq(ibctx, 16, nullptr, nullptr, 0);
|
||||
c->rcq = ibv_create_cq(ibctx, RDMA_RX_DEPTH + 4, nullptr, nullptr, 0);
|
||||
if (!c->scq || !c->rcq) return nullptr;
|
||||
|
||||
ibv_qp_init_attr qia = {};
|
||||
qia.send_cq = c->scq;
|
||||
qia.recv_cq = c->rcq;
|
||||
qia.qp_type = IBV_QPT_RC;
|
||||
qia.cap.max_send_wr = 4;
|
||||
qia.cap.max_recv_wr = RDMA_RX_DEPTH + 4;
|
||||
qia.cap.max_send_sge = 1;
|
||||
qia.cap.max_recv_sge = 1;
|
||||
qia.cap.max_inline_data = 256;
|
||||
|
||||
c->qp = ibv_create_qp(c->pd, &qia);
|
||||
if (!c->qp) return nullptr;
|
||||
c->max_inline = qia.cap.max_inline_data;
|
||||
|
||||
c->tx_buf = aligned_alloc(4096, RDMA_CHUNK);
|
||||
c->rx_buf = aligned_alloc(4096, static_cast<size_t>(RDMA_RX_DEPTH) * RDMA_CHUNK);
|
||||
if (!c->tx_buf || !c->rx_buf) return nullptr;
|
||||
|
||||
c->tx_mr = ibv_reg_mr(c->pd, c->tx_buf, RDMA_CHUNK, IBV_ACCESS_LOCAL_WRITE);
|
||||
c->rx_mr = ibv_reg_mr(c->pd, c->rx_buf, static_cast<size_t>(RDMA_RX_DEPTH) * RDMA_CHUNK,
|
||||
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
|
||||
if (!c->tx_mr || !c->rx_mr) return nullptr;
|
||||
|
||||
ibv_gid local_gid;
|
||||
if (ibv_query_gid(ibctx, ib_port, gid_idx, &local_gid) != 0) return nullptr;
|
||||
|
||||
out->qpn = c->qp->qp_num;
|
||||
out->psn = c->qp->qp_num & 0xffffff;
|
||||
memcpy(out->gid, &local_gid, RDMA_GID_SIZE);
|
||||
|
||||
const char * ver_str = "";
|
||||
if (gid_version == IBV_GID_TYPE_ROCE_V2) {
|
||||
ver_str = " RoCEv2";
|
||||
} else if (gid_version == IBV_GID_TYPE_ROCE_V1) {
|
||||
ver_str = " RoCEv1";
|
||||
}
|
||||
GGML_LOG_INFO("RDMA probed: dev=%s gid=%d%s qpn=%u inline=%u\n",
|
||||
matched_dev, gid_idx, ver_str, out->qpn, c->max_inline);
|
||||
return c.release();
|
||||
}

// Phase 2: Given remote QPN/PSN/GID, transition QP: RESET->INIT->pre-post->RTR->RTS.
// On success, the connection is live and ready for rdma_send/rdma_recv.
static bool rdma_activate(rdma_conn * c, const rdma_local_info * local,
                          uint32_t remote_qpn, uint32_t remote_psn, const uint8_t * remote_gid) {
    // RESET -> INIT
    {
        struct ibv_qp_attr a = {};
        a.qp_state = IBV_QPS_INIT;
        a.port_num = local->ib_port;
        a.pkey_index = 0;
        a.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE;
        if (ibv_modify_qp(c->qp, &a,
                          IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS) != 0) {
            return false;
        }
    }

    for (int i = 0; i < RDMA_RX_DEPTH; i++) {
        if (!c->post_rx(i)) return false;
    }

    // INIT -> RTR
    {
        struct ibv_qp_attr a = {};
        a.qp_state = IBV_QPS_RTR;
        a.path_mtu = local->path_mtu;
        a.dest_qp_num = remote_qpn;
        a.rq_psn = remote_psn;
        a.max_dest_rd_atomic = 1;
        a.min_rnr_timer = 1;
        a.ah_attr.is_global = 1;
        memcpy(&a.ah_attr.grh.dgid, remote_gid, RDMA_GID_SIZE);
        a.ah_attr.grh.hop_limit = 1;
        a.ah_attr.grh.sgid_index = local->gid_idx;
        a.ah_attr.dlid = 0;
        a.ah_attr.port_num = local->ib_port;
        if (ibv_modify_qp(c->qp, &a,
                          IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN |
                          IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER) != 0) {
            return false;
        }
    }

    // RTR -> RTS
    {
        struct ibv_qp_attr a = {};
        a.qp_state = IBV_QPS_RTS;
        a.timeout = 14;
        a.retry_cnt = 7;
        a.rnr_retry = 7;
        a.sq_psn = local->psn;
        a.max_rd_atomic = 1;
        if (ibv_modify_qp(c->qp, &a,
                          IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY |
                          IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC) != 0) {
            return false;
        }
    }

    GGML_LOG_INFO("RDMA activated: qpn=%u->%u mtu=%d rx_depth=%d\n",
                  local->qpn, remote_qpn, 128 << local->path_mtu, RDMA_RX_DEPTH);
    return true;
}

#endif // GGML_RPC_RDMA
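
// --- editor's sketch (not part of the patch) ---------------------------------
// The two helpers above form a two-phase setup: rdma_probe() picks a device/GID
// and builds the QP, the peers swap QPN/PSN/GID out of band, and rdma_activate()
// drives the QP to RTS. A minimal usage sketch, assuming the rdma_probe(fd, &local)
// call shape used by socket_t::get_caps() below; the remote values are plain
// arguments here because in ggml-rpc they arrive through the HELLO conn_caps.
static std::unique_ptr<rdma_conn> rdma_connect_sketch(sockfd_t fd, uint32_t remote_qpn,
                                                      uint32_t remote_psn, const uint8_t * remote_gid) {
    rdma_local_info local = {};
    std::unique_ptr<rdma_conn> conn(rdma_probe(fd, &local));   // phase 1: device/GID pick, QP + MR setup
    if (!conn) {
        return nullptr;                                        // no usable RDMA path; caller stays on TCP
    }
    // local.qpn / local.psn / local.gid must reach the peer (and the peer's values
    // must come back) before phase 2; ggml-rpc piggybacks this on RPC_CMD_HELLO.
    if (!rdma_activate(conn.get(), &local, remote_qpn, remote_psn, remote_gid)) {
        return nullptr;                                        // activation failed; fall back to TCP
    }
    return conn;                                               // QP is in RTS, ready for rdma_send/rdma_recv
}
// --- end of sketch ------------------------------------------------------------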

// ---------------------------------------------------------------------------
// socket_t transport capability methods
// ---------------------------------------------------------------------------

void socket_t::get_caps(uint8_t * caps) {
    memset(caps, 0, RPC_CONN_CAPS_SIZE);
#ifdef GGML_RPC_RDMA
    rdma_local = {};
    rdma.reset(rdma_probe(fd, &rdma_local));
    if (rdma) {
        rdma_caps rc = {};
        rc.qpn = rdma_local.qpn;
        rc.psn = rdma_local.psn;
        memcpy(rc.gid, rdma_local.gid, RDMA_GID_SIZE);
        memcpy(caps, &rc, sizeof(rc));
    }
#endif // GGML_RPC_RDMA
}

void socket_t::update_caps(const uint8_t * remote_caps) {
#ifdef GGML_RPC_RDMA
    if (!rdma) {
        return;
    }
    rdma_caps rc = {};
    memcpy(&rc, remote_caps, sizeof(rc));
    if (rc.qpn == 0) {
        rdma.reset();
        return;
    }
    if (rdma_activate(rdma.get(), &rdma_local, rc.qpn, rc.psn, rc.gid)) {
        fn_send = rdma_send_impl;
        fn_recv = rdma_recv_impl;
    } else {
        GGML_LOG_ERROR("RDMA activate failed, staying on TCP\n");
        rdma.reset();
    }
#else
    (void)remote_caps;
#endif // GGML_RPC_RDMA
}

// unified transport dispatch (via function pointers)

static bool send_data(socket_t * sock, const void * data, size_t size) {
    return sock->fn_send(sock, data, size);
}

static bool recv_data(socket_t * sock, void * data, size_t size) {
    return sock->fn_recv(sock, data, size);
}
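
// --- editor's sketch (not part of the patch) ---------------------------------
// The fn_send/fn_recv members are what make the upgrade transparent: every socket
// starts with the TCP implementations and update_caps() swaps in rdma_send_impl /
// rdma_recv_impl only after rdma_activate() succeeds. A self-contained illustration
// of the same dispatch pattern, with purely illustrative names:
#include <cstddef>
#include <cstdio>

struct conn_t;
using send_fn = bool (*)(conn_t *, const void *, size_t);

struct conn_t {
    send_fn fn_send;   // starts as the TCP implementation, swapped after negotiation
};

static bool tcp_send_stub(conn_t *, const void *, size_t n)  { std::printf("tcp  %zu bytes\n", n); return true; }
static bool rdma_send_stub(conn_t *, const void *, size_t n) { std::printf("rdma %zu bytes\n", n); return true; }

static bool send_data_stub(conn_t * c, const void * p, size_t n) { return c->fn_send(c, p, n); }

int main() {
    conn_t c { tcp_send_stub };
    send_data_stub(&c, "x", 1);    // dispatched to the TCP stub
    c.fn_send = rdma_send_stub;    // what update_caps() does once activation succeeds
    send_data_stub(&c, "x", 1);    // same call site, upgraded transport
}
// --- end of sketch ------------------------------------------------------------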

static bool send_msg(socket_t * sock, const void * msg, size_t msg_size) {
    if (!send_data(sock, &msg_size, sizeof(msg_size))) {
        return false;
    }
    return send_data(sock, msg, msg_size);
}

static bool recv_msg(socket_t * sock, void * msg, size_t msg_size) {
    uint64_t size;
    if (!recv_data(sock, &size, sizeof(size))) {
        return false;
    }
    if (size != msg_size) {
        return false;
    }
    return recv_data(sock, msg, msg_size);
}

static bool recv_msg(socket_t * sock, std::vector<uint8_t> & input) {
    uint64_t size;
    if (!recv_data(sock, &size, sizeof(size))) {
        return false;
    }
    try {
@@ -443,7 +945,7 @@ static bool recv_msg(sockfd_t sockfd, std::vector<uint8_t> & input) {
        GGML_LOG_ERROR("Failed to allocate input buffer of size %" PRIu64 "\n", size);
        return false;
    }
    return recv_data(sock, input.data(), size);
}

static bool parse_endpoint(const std::string & endpoint, std::string & host, int & port) {
@@ -452,7 +954,11 @@ static bool parse_endpoint(const std::string & endpoint, std::string & host, int
        return false;
    }
    host = endpoint.substr(0, pos);
    try {
        port = std::stoi(endpoint.substr(pos + 1));
    } catch (...) {
        return false;
    }
    return true;
}
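
// --- editor's sketch (not part of the patch) ---------------------------------
// The try/catch added above turns a malformed port into a clean failure instead of
// an uncaught std::stoi exception. Usage of parse_endpoint(), with placeholder
// addresses:
std::string host;
int port = 0;
bool ok1 = parse_endpoint("192.0.2.10:50052", host, port);   // true:  host = "192.0.2.10", port = 50052
bool ok2 = parse_endpoint("192.0.2.10:fifty", host, port);   // false: std::stoi throws, caught above
bool ok3 = parse_endpoint("192.0.2.10",       host, port);   // false: no ':' separator (checked in the elided lines)
// --- end of sketch ------------------------------------------------------------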

@@ -460,13 +966,13 @@ static bool parse_endpoint(const std::string & endpoint, std::string & host, int
// No response
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size) {
    uint8_t cmd_byte = cmd;
    if (!send_data(sock.get(), &cmd_byte, sizeof(cmd_byte))) {
        return false;
    }
    if (!send_data(sock.get(), &input_size, sizeof(input_size))) {
        return false;
    }
    if (!send_data(sock.get(), input, input_size)) {
        return false;
    }
    return true;
@@ -478,16 +984,14 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
    if (!send_rpc_cmd(sock, cmd, input, input_size)) {
        return false;
    }
    // TODO: currently the output_size is always known, do we need support for commands with variable output size?
    // even if we do, we can skip sending output_size from the server for commands with known output size
    uint64_t out_size;
    if (!recv_data(sock.get(), &out_size, sizeof(out_size))) {
        return false;
    }
    if (out_size != output_size) {
        return false;
    }
    if (!recv_data(sock.get(), output, output_size)) {
        return false;
    }
    return true;

@@ -495,17 +999,25 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm

// RPC client-side implementation

// Performs HELLO handshake with transport auto-negotiation.
// Advertises local capabilities via conn_caps; if the server responds with
// matching capabilities, the socket is upgraded transparently.
static bool negotiate_hello(const std::shared_ptr<socket_t> & sock) {
    rpc_msg_hello_req request = {};
    rpc_msg_hello_rsp response = {};

    sock->get_caps(request.conn_caps);

    bool status = send_rpc_cmd(sock, RPC_CMD_HELLO, &request, sizeof(request), &response, sizeof(response));
    RPC_STATUS_ASSERT(status);

    if (response.major != RPC_PROTO_MAJOR_VERSION || response.minor > RPC_PROTO_MINOR_VERSION) {
        GGML_LOG_ERROR("RPC server version mismatch: %d.%d.%d\n",
                       response.major, response.minor, response.patch);
        return false;
    }
    if (response.minor != RPC_PROTO_MINOR_VERSION || response.patch != RPC_PROTO_PATCH_VERSION) {
        GGML_LOG_INFO("WARNING: RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
    }

    sock->update_caps(response.conn_caps);
    return true;
}
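
// --- editor's sketch (not part of the patch) ---------------------------------
// Stand-alone restatement of the acceptance rule above: reject on a major mismatch
// or when the server's minor is newer than the client's; merely warn on an older
// minor or a patch difference. The 3.2 constants are hypothetical stand-ins for
// RPC_PROTO_MAJOR_VERSION / RPC_PROTO_MINOR_VERSION.
#include <cassert>

static bool server_accepted(int server_major, int server_minor) {
    const int CLIENT_MAJOR = 3;
    const int CLIENT_MINOR = 2;
    return server_major == CLIENT_MAJOR && server_minor <= CLIENT_MINOR;
}

int main() {
    assert( server_accepted(3, 2));   // exact match: silent
    assert( server_accepted(3, 1));   // older server minor: accepted, warning logged above
    assert(!server_accepted(3, 3));   // server minor newer than the client understands: rejected
    assert(!server_accepted(4, 0));   // major mismatch: rejected
}
// --- end of sketch ------------------------------------------------------------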

@@ -527,6 +1039,7 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
        GGML_LOG_ERROR("Failed to parse endpoint: %s\n", endpoint.c_str());
        return nullptr;
    }

#ifdef _WIN32
    if (!initialized) {
        WSADATA wsaData;
@@ -543,10 +1056,10 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
    if (sock == nullptr) {
        return nullptr;
    }
    if (!negotiate_hello(sock)) {
        return nullptr;
    }
    LOG_DBG("[%s] connected to %s\n", __func__, endpoint.c_str());
    sockets[endpoint] = sock;
    return sock;
}

@@ -1597,25 +2110,46 @@ rpc_server::~rpc_server() {
}

static void rpc_serve_client(const std::vector<ggml_backend_t> & backends, const char * cache_dir,
                             socket_t * sockfd) {
    rpc_server server(backends, cache_dir);
    uint8_t cmd;
    if (!recv_data(sockfd, &cmd, 1)) {
        return;
    }
    // the first command sent by the client must be HELLO
    if (cmd != RPC_CMD_HELLO) {
        GGML_LOG_ERROR("Expected HELLO command, update client\n");
        return;
    }

    // Read input_size and validate protocol version
    uint64_t hello_input_size;
    if (!recv_data(sockfd, &hello_input_size, sizeof(hello_input_size))) {
        return;
    }

    if (hello_input_size != sizeof(rpc_msg_hello_req)) {
        GGML_LOG_ERROR("HELLO request size mismatch (%zu vs %zu) — client needs upgrade to protocol v%d.x\n",
                       (size_t)hello_input_size, sizeof(rpc_msg_hello_req), RPC_PROTO_MAJOR_VERSION);
        return;
    }

    rpc_msg_hello_req req = {};
    if (!recv_data(sockfd, &req, sizeof(req))) {
        return;
    }

    rpc_msg_hello_rsp rsp = {};
    server.hello(rsp);

    // Advertise server transport capabilities in the HELLO response
    sockfd->get_caps(rsp.conn_caps);

    if (!send_msg(sockfd, &rsp, sizeof(rsp))) {
        return;
    }

    // Activate transport upgrade using the client's advertised caps
    sockfd->update_caps(req.conn_caps);
    while (true) {
        if (!recv_data(sockfd, &cmd, 1)) {
            break;
@@ -1884,6 +2418,12 @@ void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir
    if (!parse_endpoint(endpoint, host, port)) {
        return;
    }

#ifdef GGML_RPC_RDMA
    printf(" transport : TCP (RDMA auto-negotiate enabled)\n");
#else
    printf(" transport : TCP\n");
#endif // GGML_RPC_RDMA
#ifdef _WIN32
    {
        WSADATA wsaData;
@@ -1907,7 +2447,7 @@ void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir
    }
    printf("Accepted client connection\n");
    fflush(stdout);
    rpc_serve_client(backends, cache_dir, client_socket.get());
    printf("Client connection closed\n");
    fflush(stdout);
}

@@ -154,6 +154,11 @@ if (GGML_SYCL_GRAPH)
    target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH)
endif()

if (GGML_SYCL_HOST_MEM_FALLBACK)
    message(STATUS "find GGML_SYCL_HOST_MEM_FALLBACK")
    target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_HOST_MEM_FALLBACK)
endif()

if (GGML_SYCL_DEVICE_ARCH)
    target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
    target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
@@ -151,6 +151,25 @@ static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int

}

template <typename dst_t>
static void dequantize_row_q8_0_sycl_reorder(const void *vx, dst_t *y, const int64_t k,
                                             dpct::queue_ptr stream) {

    dpct::has_capability_or_fail(stream->get_device(),
                                 {sycl::aspect::fp16});

    int constexpr WARP_K = WARP_SIZE * QK8_0;
    const int n_warp = (k + WARP_K - 1) / WARP_K;
    GGML_ASSERT(k % QK8_0 == 0);
    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) *
                                               sycl::range<3>(1, 1, WARP_SIZE),
                                           sycl::range<3>(1, 1, WARP_SIZE)),
                         [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                             dequantize_block_q8_0_reorder(vx, y, k, item_ct1);
                         });

}
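
// --- editor's sketch (not part of the patch) ---------------------------------
// Launch geometry of the kernel above: each work-group is one sub-group of
// WARP_SIZE work-items and covers WARP_K = WARP_SIZE * QK8_0 elements (presumably
// one Q8_0 block per work-item), so a row of k elements needs ceil(k / WARP_K)
// work-groups. The constants below are the usual ggml values but are assumptions
// here; verify them in the headers.
#include <cstdio>

int main() {
    const int WARP_SIZE = 32;                              // work-items per sub-group (assumed)
    const int QK8_0     = 32;                              // elements per Q8_0 block (assumed)
    const int WARP_K    = WARP_SIZE * QK8_0;               // 1024 elements per work-group

    const long long k      = 4096;                         // example row length
    const long long n_warp = (k + WARP_K - 1) / WARP_K;    // ceil(4096 / 1024) = 4
    std::printf("k=%lld -> %lld work-groups x %d work-items\n", k, n_warp, WARP_SIZE);
    return 0;
}
// --- end of sketch ------------------------------------------------------------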

template <typename dst_t>
static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
                                     dpct::queue_ptr stream) {
@@ -614,7 +633,12 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
    case GGML_TYPE_Q5_1:
        return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>;
    case GGML_TYPE_Q8_0:
        if (dst->src[0]->extra &&
            ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
            return dequantize_row_q8_0_sycl_reorder;
        } else {
            return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
        }
    case GGML_TYPE_Q2_K:
        return dequantize_row_q2_K_sycl;
    case GGML_TYPE_Q3_K:
@@ -683,7 +707,12 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
    case GGML_TYPE_Q5_1:
        return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>;
    case GGML_TYPE_Q8_0:
        if (dst->src[0]->extra &&
            ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
            return dequantize_row_q8_0_sycl_reorder;
        } else {
            return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
        }
    case GGML_TYPE_Q2_K:
        return dequantize_row_q2_K_sycl;
    case GGML_TYPE_Q3_K:

Some files were not shown because too many files have changed in this diff.