mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-05-28 17:27:26 +03:00
Compare commits
72 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7fb1e70b59 | ||
|
|
d374e71e55 | ||
|
|
30af6e2b98 | ||
|
|
d7be46189f | ||
|
|
bc81d47aba | ||
|
|
0b246862b9 | ||
|
|
a919001134 | ||
|
|
48e7078ee0 | ||
|
|
bb771cbd2b | ||
|
|
7c48fb81ce | ||
|
|
91eb8f4fa0 | ||
|
|
d205df6812 | ||
|
|
e8d2567429 | ||
|
|
09e7b76c93 | ||
|
|
48e7eae41c | ||
|
|
c5229087a5 | ||
|
|
e31cdaa0eb | ||
|
|
491c4d7d2e | ||
|
|
939a7dd648 | ||
|
|
8ad8aef447 | ||
|
|
f12cc6d0fa | ||
|
|
aa50b2c2ae | ||
|
|
c40006a62e | ||
|
|
c6e4088376 | ||
|
|
b36eefc1b3 | ||
|
|
837bb6b447 | ||
|
|
ba4dd0bc67 | ||
|
|
617255d437 | ||
|
|
87b0a60cdd | ||
|
|
fda8528aa8 | ||
|
|
2d0656fbdd | ||
|
|
6b4e4bd582 | ||
|
|
9f0e4b14d2 | ||
|
|
b3a739c9b6 | ||
|
|
4d8cc0c56f | ||
|
|
0d227ec358 | ||
|
|
1d971bba36 | ||
|
|
9777256c31 | ||
|
|
7085492c6f | ||
|
|
b4c0549a49 | ||
|
|
0d18aaa9d1 | ||
|
|
08bc21b459 | ||
|
|
35a74c8fb9 | ||
|
|
5190c2ea8d | ||
|
|
7799d31e68 | ||
|
|
3a3ed153d9 | ||
|
|
ef66bfab68 | ||
|
|
678d43d720 | ||
|
|
ef41a69179 | ||
|
|
3dc7684f39 | ||
|
|
dbe9c0c8ce | ||
|
|
6fe90deffa | ||
|
|
581d020b12 | ||
|
|
7623de11d9 | ||
|
|
c9d98295a3 | ||
|
|
1506d39e76 | ||
|
|
54121f7325 | ||
|
|
192d8ae8b8 | ||
|
|
35c9b1f39e | ||
|
|
4bead4e30d | ||
|
|
302e2c2652 | ||
|
|
328874d054 | ||
|
|
c1f1e28d29 | ||
|
|
5a4126adc1 | ||
|
|
a4d2d4ae41 | ||
|
|
d161ea7071 | ||
|
|
45158f460e | ||
|
|
22307b3e8b | ||
|
|
ce5890b5f7 | ||
|
|
b251f74f49 | ||
|
|
fa97041524 | ||
|
|
ae251b5ff2 |
101
.devops/zendnn.Dockerfile
Normal file
101
.devops/zendnn.Dockerfile
Normal file
@@ -0,0 +1,101 @@
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y gcc-13 g++-13 build-essential git cmake libssl-dev libomp-dev libnuma-dev python3 ca-certificates
|
||||
|
||||
ENV CC=gcc-13 CXX=g++-13
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_ZENDNN=ON && \
|
||||
cmake --build build -j $(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build -name "*.so*" -exec cp -P {} /app/lib \;
|
||||
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
&& cp .devops/tools.sh /app/full/tools.sh
|
||||
|
||||
## Base image
|
||||
FROM ubuntu:$UBUNTU_VERSION AS base
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 libnuma1 curl \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
COPY --from=build /app/lib/ /app
|
||||
|
||||
### Full
|
||||
FROM base AS full
|
||||
|
||||
COPY --from=build /app/full /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
python3-wheel \
|
||||
&& pip install --break-system-packages --upgrade setuptools \
|
||||
&& pip install --break-system-packages -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
ENTRYPOINT ["/app/tools.sh"]
|
||||
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENTRYPOINT [ "/app/llama-cli" ]
|
||||
|
||||
### Server, Server only
|
||||
FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
ENTRYPOINT [ "/app/llama-server" ]
|
||||
31
.github/actions/windows-setup-cuda/action.yml
vendored
31
.github/actions/windows-setup-cuda/action.yml
vendored
@@ -96,3 +96,34 @@ runs:
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V13_1=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
- name: Install Cuda Toolkit 13.3
|
||||
if: ${{ inputs.cuda_version == '13.3' }}
|
||||
shell: pwsh
|
||||
run: |
|
||||
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3"
|
||||
choco install unzip -y
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_crt/windows-x86_64/cuda_crt-windows-x86_64-13.3.33-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-13.3.29-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-13.3.33-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-13.3.33-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-13.5.1.27-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libnvvm/windows-x86_64/libnvvm-windows-x86_64-13.3.33-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-13.3.29-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-13.3.27-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-13.3.27-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cccl/windows-x86_64/cccl-windows-x86_64-13.3.3.3.1-archive.zip"
|
||||
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3"
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_crt-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_cudart-windows-x86_64-13.3.29-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvcc-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvrtc-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\libcublas-windows-x86_64-13.5.1.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\libnvvm-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvtx-windows-x86_64-13.3.29-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_profiler_api-windows-x86_64-13.3.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\visual_studio_integration-windows-x86_64-13.3.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cccl-windows-x86_64-13.3.3.3.1-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V13_3=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
6
.github/workflows/build-3rd-party.yml
vendored
6
.github/workflows/build-3rd-party.yml
vendored
@@ -22,9 +22,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-llguidance:
|
||||
|
||||
@@ -31,7 +31,7 @@ jobs:
|
||||
android-ndk-snapdragon:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.6'
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.7'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -61,7 +61,7 @@ jobs:
|
||||
linux-iot-snapdragon:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.6'
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.7'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
66
.github/workflows/build-android.yml
vendored
66
.github/workflows/build-android.yml
vendored
@@ -27,12 +27,12 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
android:
|
||||
default:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
@@ -58,7 +58,7 @@ jobs:
|
||||
cd examples/llama.android
|
||||
./gradlew build --no-daemon
|
||||
|
||||
android-ndk:
|
||||
ndk:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
|
||||
@@ -91,3 +91,59 @@ jobs:
|
||||
with:
|
||||
name: llama-cpp-android-arm64-cpu
|
||||
path: pkg-adb/llama.cpp
|
||||
|
||||
arm64:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
NDK_VERSION: "29.0.14206865"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# note : disabled to spare some cache space (https://github.com/ggml-org/llama.cpp/pull/23789)
|
||||
# for some reason, the ccache does not improve the build time in this case
|
||||
# example:
|
||||
# cache off: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78160400831
|
||||
# cache on: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78224189394
|
||||
#
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: android-ubuntu-arm64
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Set up JDK
|
||||
uses: actions/setup-java@v5
|
||||
with:
|
||||
java-version: 17
|
||||
distribution: temurin
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
|
||||
with:
|
||||
log-accepted-android-sdk-licenses: false
|
||||
|
||||
- name: Install NDK
|
||||
run: |
|
||||
sdkmanager "ndk;${{ env.NDK_VERSION }}"
|
||||
echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
|
||||
-DANDROID_ABI=arm64-v8a \
|
||||
-DANDROID_PLATFORM=android-28 \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_BACKEND_DL=ON \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
103
.github/workflows/build-apple.yml
vendored
103
.github/workflows/build-apple.yml
vendored
@@ -32,12 +32,12 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
macOS-latest-ios:
|
||||
macos-latest-arm64:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
@@ -48,7 +48,80 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-ios
|
||||
key: apple-arm64
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=OFF \
|
||||
-DGGML_METAL_SHADER_DEBUG=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main -E "test-llama-archs" --verbose --timeout 900
|
||||
|
||||
macos-latest-x64:
|
||||
runs-on: macos-15-intel
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: apple-x64
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_METAL=OFF \
|
||||
-DGGML_RPC=ON \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
|
||||
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
macos-latest-ios:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# TODO: this likely does not do anything - if yes, remove it
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: apple-ios
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
@@ -117,7 +190,7 @@ jobs:
|
||||
xcodebuild -downloadPlatform iOS
|
||||
xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
|
||||
|
||||
macOS-latest-tvos:
|
||||
macos-latest-tvos:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
@@ -125,10 +198,11 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# TODO: this likely does not do anything - if yes, remove it
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-tvos
|
||||
key: apple-tvos
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
@@ -150,7 +224,7 @@ jobs:
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
macOS-latest-visionos:
|
||||
macos-latest-visionos:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
@@ -158,6 +232,14 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# TODO: this likely does not do anything - if yes, remove it
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: apple-visionos
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
@@ -176,7 +258,7 @@ jobs:
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
macOS-latest-swift:
|
||||
macos-latest-swift:
|
||||
runs-on: macos-latest
|
||||
needs: macos-latest-ios-xcode
|
||||
|
||||
@@ -189,10 +271,11 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# TODO: this likely does not do anything - if yes, remove it
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-swift
|
||||
key: apple-swift
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
|
||||
8
.github/workflows/build-cache.yml
vendored
8
.github/workflows/build-cache.yml
vendored
@@ -28,7 +28,7 @@ jobs:
|
||||
id: cache-sdk
|
||||
with:
|
||||
path: ./vulkan_sdk
|
||||
key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||
key: cache-gha-vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup Vulkan SDK
|
||||
if: steps.cache-sdk.outputs.cache-hit != 'true'
|
||||
@@ -54,7 +54,7 @@ jobs:
|
||||
# id: cache-toolchain
|
||||
# with:
|
||||
# path: ./spacemit_toolchain
|
||||
# key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
# key: cache-gha-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
|
||||
# - name: Setup SpacemiT Toolchain
|
||||
# if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
||||
@@ -81,7 +81,7 @@ jobs:
|
||||
id: cache-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
|
||||
- name: Setup OpenVINO Toolkit
|
||||
if: steps.cache-openvino.outputs.cache-hit != 'true'
|
||||
@@ -108,7 +108,7 @@ jobs:
|
||||
id: cache-rocm
|
||||
with:
|
||||
path: C:\Program Files\AMD\ROCm
|
||||
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup ROCm
|
||||
if: steps.cache-rocm.outputs.cache-hit != 'true'
|
||||
|
||||
140
.github/workflows/build-cann.yml
vendored
140
.github/workflows/build-cann.yml
vendored
@@ -29,74 +29,76 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
openEuler-latest-cann:
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -el {0}
|
||||
strategy:
|
||||
matrix:
|
||||
arch: [x86, aarch64]
|
||||
chip_type: ['910b', '310p']
|
||||
build: ['Release']
|
||||
use_acl_graph: ['on', 'off']
|
||||
exclude:
|
||||
# 310P does not support USE_ACL_GRAPH=on
|
||||
- chip_type: '310p'
|
||||
use_acl_graph: 'on'
|
||||
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Free up disk space
|
||||
uses: ggml-org/free-disk-space@v1.3.1
|
||||
with:
|
||||
tool-cache: true
|
||||
|
||||
- name: Set container image
|
||||
id: cann-image
|
||||
run: |
|
||||
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Pull container image
|
||||
run: docker pull "${{ steps.cann-image.outputs.image }}"
|
||||
|
||||
- name: Build
|
||||
env:
|
||||
BUILD_TYPE: ${{ matrix.build }}
|
||||
SOC_TYPE: ascend${{ matrix.chip_type }}
|
||||
USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
|
||||
run: |
|
||||
HOST_UID=$(id -u)
|
||||
HOST_GID=$(id -g)
|
||||
|
||||
docker run --rm \
|
||||
-v "${PWD}:/workspace" \
|
||||
-w /workspace \
|
||||
-e SOC_TYPE=${SOC_TYPE} \
|
||||
-e BUILD_TYPE=${BUILD_TYPE} \
|
||||
-e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
|
||||
"${{ steps.cann-image.outputs.image }}" \
|
||||
bash -lc '
|
||||
set -e
|
||||
yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
|
||||
yum clean all && rm -rf /var/cache/yum
|
||||
git config --global --add safe.directory "/workspace"
|
||||
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
-DGGML_CANN=on \
|
||||
-DSOC_TYPE=${SOC_TYPE} \
|
||||
-DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
|
||||
cmake --build build -j $(nproc)
|
||||
|
||||
chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
|
||||
'
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# openEuler-latest-cann:
|
||||
# defaults:
|
||||
# run:
|
||||
# shell: bash -el {0}
|
||||
# strategy:
|
||||
# matrix:
|
||||
# arch: [x86, aarch64]
|
||||
# chip_type: ['910b', '310p']
|
||||
# build: ['Release']
|
||||
# use_acl_graph: ['on', 'off']
|
||||
# exclude:
|
||||
# # 310P does not support USE_ACL_GRAPH=on
|
||||
# - chip_type: '310p'
|
||||
# use_acl_graph: 'on'
|
||||
# runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
# steps:
|
||||
# - name: Checkout
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# fetch-depth: 0
|
||||
#
|
||||
# - name: Free up disk space
|
||||
# uses: ggml-org/free-disk-space@v1.3.1
|
||||
# with:
|
||||
# tool-cache: true
|
||||
#
|
||||
# - name: Set container image
|
||||
# id: cann-image
|
||||
# run: |
|
||||
# image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
# echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
#
|
||||
# - name: Pull container image
|
||||
# run: docker pull "${{ steps.cann-image.outputs.image }}"
|
||||
#
|
||||
# - name: Build
|
||||
# env:
|
||||
# BUILD_TYPE: ${{ matrix.build }}
|
||||
# SOC_TYPE: ascend${{ matrix.chip_type }}
|
||||
# USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
|
||||
# run: |
|
||||
# HOST_UID=$(id -u)
|
||||
# HOST_GID=$(id -g)
|
||||
#
|
||||
# docker run --rm \
|
||||
# -v "${PWD}:/workspace" \
|
||||
# -w /workspace \
|
||||
# -e SOC_TYPE=${SOC_TYPE} \
|
||||
# -e BUILD_TYPE=${BUILD_TYPE} \
|
||||
# -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
|
||||
# "${{ steps.cann-image.outputs.image }}" \
|
||||
# bash -lc '
|
||||
# set -e
|
||||
# yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
|
||||
# yum clean all && rm -rf /var/cache/yum
|
||||
# git config --global --add safe.directory "/workspace"
|
||||
# export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||
# cmake -S . -B build \
|
||||
# -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
# -DGGML_CANN=on \
|
||||
# -DSOC_TYPE=${SOC_TYPE} \
|
||||
# -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
|
||||
# cmake --build build -j $(nproc)
|
||||
#
|
||||
# chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
|
||||
# '
|
||||
|
||||
231
.github/workflows/build-cpu.yml
vendored
Normal file
231
.github/workflows/build-cpu.yml
vendored
Normal file
@@ -0,0 +1,231 @@
|
||||
name: CI (cpu)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-cpu.yml',
|
||||
'.github/workflows/build-cmake-pkg.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.cuh',
|
||||
'**/*.swift',
|
||||
'**/*.m',
|
||||
'**/*.metal',
|
||||
'**/*.comp',
|
||||
'**/*.glsl',
|
||||
'**/*.wgsl'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-cpu.yml',
|
||||
'.github/workflows/build-cmake-pkg.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.cuh',
|
||||
'**/*.swift',
|
||||
'**/*.m',
|
||||
'**/*.metal',
|
||||
'**/*.comp',
|
||||
'**/*.glsl',
|
||||
'**/*.wgsl'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
build-cmake-pkg:
|
||||
uses: ./.github/workflows/build-cmake-pkg.yml
|
||||
|
||||
ubuntu:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'x64'
|
||||
os: ubuntu-22.04
|
||||
- build: 'arm64'
|
||||
os: ubuntu-24.04-arm
|
||||
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: cpu-${{ matrix.os }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build Dependencies
|
||||
id: build_depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
python3 python3-pip python3-dev python3-wheel \
|
||||
libjpeg-dev build-essential libssl-dev \
|
||||
git-lfs
|
||||
|
||||
- name: Toolchain workaround (GCC 14)
|
||||
if: ${{ contains(matrix.os, 'ubuntu-24.04') }}
|
||||
run: |
|
||||
sudo apt-get install -y gcc-14 g++-14
|
||||
echo "CC=gcc-14" >> "$GITHUB_ENV"
|
||||
echo "CXX=g++-14" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Python Dependencies
|
||||
id: python_depends
|
||||
run: |
|
||||
export PIP_BREAK_SYSTEM_PACKAGES="1"
|
||||
python3 -m pip install --upgrade pip setuptools
|
||||
pip3 install ./gguf-py
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
- name: Test llama2c conversion
|
||||
id: llama2c_test
|
||||
run: |
|
||||
cd build
|
||||
echo "Fetch tokenizer"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
|
||||
echo "Fetch llama2c model"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
|
||||
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
windows:
|
||||
runs-on: windows-2025
|
||||
|
||||
env:
|
||||
OPENBLAS_VERSION: 0.3.23
|
||||
SDE_VERSION: 9.33.0-2024-01-07
|
||||
VULKAN_VERSION: 1.4.313.2
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'x64-cpu-static'
|
||||
arch: 'x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF'
|
||||
- build: 'x64-openblas'
|
||||
arch: 'x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'x64-vulkan'
|
||||
arch: 'x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
|
||||
- build: 'arm64'
|
||||
arch: 'arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: cpu-windows-2025-${{ matrix.build }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Download OpenBLAS
|
||||
id: get_openblas
|
||||
if: ${{ matrix.build == 'x64-openblas' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
|
||||
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
|
||||
mkdir $env:RUNNER_TEMP/openblas
|
||||
tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
|
||||
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
|
||||
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
|
||||
$lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
|
||||
& $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'x64-vulkan' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
|
||||
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
||||
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
|
||||
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -S . -B build ${{ matrix.defines }} `
|
||||
-DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
- name: Add libopenblas.dll
|
||||
id: add_libopenblas_dll
|
||||
if: ${{ matrix.build == 'x64-openblas' }}
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
|
||||
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
if: ${{ matrix.arch == 'x64' }}
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main -C Release --verbose --timeout 900
|
||||
|
||||
# TODO: disabled for now, consider adding tests for all CPU variants instead
|
||||
# - name: Test (Intel SDE)
|
||||
# id: cmake_test_sde
|
||||
# if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
|
||||
# run: |
|
||||
# curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
|
||||
# # for some weird reason windows tar doesn't like sde tar.xz
|
||||
# 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
|
||||
# 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
|
||||
# $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
|
||||
# cd build
|
||||
# $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
|
||||
# & $sde -future -- ctest -L main -C Release --verbose --timeout 900
|
||||
2
.github/workflows/build-cross.yml
vendored
2
.github/workflows/build-cross.yml
vendored
@@ -287,7 +287,7 @@ jobs:
|
||||
# id: cache-toolchain
|
||||
# with:
|
||||
# path: ./spacemit_toolchain
|
||||
# key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
# key: cache-gha-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup SpacemiT Toolchain
|
||||
#if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
||||
|
||||
134
.github/workflows/build-cuda-ubuntu.yml
vendored
Normal file
134
.github/workflows/build-cuda-ubuntu.yml
vendored
Normal file
@@ -0,0 +1,134 @@
|
||||
name: CI (CUDA, ubuntu)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-cuda-ubuntu.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.cuh'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-cuda-ubuntu.yml',
|
||||
'ggml/src/ggml-cuda/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
cuda:
|
||||
runs-on: ubuntu-24.04
|
||||
container: nvidia/cuda:12.6.2-devel-ubuntu24.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Install dependencies
|
||||
env:
|
||||
DEBIAN_FRONTEND: noninteractive
|
||||
run: |
|
||||
apt update
|
||||
apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: cuda-ubuntu-24.04-cuda
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build with CMake
|
||||
# TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
|
||||
run: |
|
||||
cmake -S . -B build -G Ninja \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_CUDA_ARCHITECTURES=89-real \
|
||||
-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CUDA=ON \
|
||||
-DGGML_CUDA_CUB_3DOT2=ON
|
||||
cmake --build build
|
||||
|
||||
hip:
|
||||
runs-on: ubuntu-22.04
|
||||
container: rocm/dev-ubuntu-22.04:6.1.2
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: cuda-ubuntu-22.04-hip
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build with native CMake HIP support
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -S . \
|
||||
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON \
|
||||
-DGPU_TARGETS="gfx1030" \
|
||||
-DGGML_HIP=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
musa:
|
||||
runs-on: ubuntu-22.04
|
||||
container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
apt-get update
|
||||
apt-get install -y build-essential git cmake libssl-dev
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: cuda-ubuntu-22.04-musa
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build with native CMake MUSA support
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -S . \
|
||||
-DGGML_MUSA=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
146
.github/workflows/build-cuda-windows.yml
vendored
Normal file
146
.github/workflows/build-cuda-windows.yml
vendored
Normal file
@@ -0,0 +1,146 @@
|
||||
name: CI (CUDA, windows)
|
||||
|
||||
# TODO: this workflow is only triggered manually because it is very heavy on the CI
|
||||
# when we provision dedicated windows runners, we can enable it for pushes too
|
||||
# note: running this workflow manually will populate the ccache for the release builds
|
||||
# this can be used before merging a PR to speed up the release workflow
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
|
||||
# note: this will run in queue with the release workflow
|
||||
concurrency:
|
||||
group: release
|
||||
queue: max
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
cuda:
|
||||
runs-on: windows-2022
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.4', '13.3']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Install Cuda Toolkit
|
||||
uses: ./.github/actions/windows-setup-cuda
|
||||
with:
|
||||
cuda_version: ${{ matrix.cuda }}
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
# TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
|
||||
run: |
|
||||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
|
||||
cmake -S . -B build -G "Ninja Multi-Config" ^
|
||||
-DLLAMA_BUILD_SERVER=ON ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON ^
|
||||
-DGGML_NATIVE=OFF ^
|
||||
-DGGML_BACKEND_DL=ON ^
|
||||
-DGGML_CPU_ALL_VARIANTS=ON ^
|
||||
-DGGML_CUDA=ON ^
|
||||
-DGGML_RPC=ON ^
|
||||
-DGGML_CUDA_CUB_3DOT2=ON
|
||||
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
|
||||
cmake --build build --config Release
|
||||
|
||||
hip:
|
||||
runs-on: windows-2022
|
||||
|
||||
env:
|
||||
# Make sure this is in sync with build-cache.yml
|
||||
HIPSDK_INSTALLER_VERSION: "26.Q1"
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
# sync with release.yml
|
||||
- name: "radeon"
|
||||
gpu_targets: "gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Grab rocWMMA package
|
||||
id: grab_rocwmma
|
||||
run: |
|
||||
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
|
||||
7z x rocwmma.deb
|
||||
7z x data.tar
|
||||
|
||||
- name: Use ROCm Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-rocm
|
||||
with:
|
||||
path: C:\Program Files\AMD\ROCm
|
||||
key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup ROCm
|
||||
if: steps.cache-rocm.outputs.cache-hit != 'true'
|
||||
uses: ./.github/actions/windows-setup-rocm
|
||||
with:
|
||||
version: ${{ env.HIPSDK_INSTALLER_VERSION }}
|
||||
|
||||
- name: Verify ROCm
|
||||
id: verify
|
||||
run: |
|
||||
# Find and test ROCm installation
|
||||
$clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
|
||||
if (-not $clangPath) {
|
||||
Write-Error "ROCm installation not found"
|
||||
exit 1
|
||||
}
|
||||
& $clangPath.FullName --version
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
# TODO: this build does not match the build in release.yml, so we use a different cache key
|
||||
# ideally, the builds should match, similar to the CUDA build above so that we would be able
|
||||
# to populate the ccache for the release with manual runs of this workflow
|
||||
#key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
|
||||
key: cuda-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
||||
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
||||
cmake -G "Unix Makefiles" -B build -S . `
|
||||
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
|
||||
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
|
||||
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/" `
|
||||
-DCMAKE_BUILD_TYPE=Release `
|
||||
-DLLAMA_BUILD_BORINGSSL=ON `
|
||||
-DROCM_DIR="${env:HIP_PATH}" `
|
||||
-DGGML_HIP=ON `
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON `
|
||||
-DGPU_TARGETS="gfx1100" `
|
||||
-DGGML_RPC=ON
|
||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||
150
.github/workflows/build-ibm.yml
vendored
Normal file
150
.github/workflows/build-ibm.yml
vendored
Normal file
@@ -0,0 +1,150 @@
|
||||
name: CI (ibm)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-ibm.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-ibm.yml',
|
||||
'ggml/src/ggml-cpu/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
|
||||
ubuntu-24-s390x:
|
||||
runs-on: ubuntu-24.04-s390x
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Build Dependencies
|
||||
id: build_depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
python3 python3-pip python3-dev python3-wheel \
|
||||
libjpeg-dev build-essential libssl-dev \
|
||||
git-lfs
|
||||
|
||||
- name: Toolchain workaround (GCC 14)
|
||||
run: |
|
||||
sudo apt-get install -y gcc-14 g++-14
|
||||
echo "CC=gcc-14" >> "$GITHUB_ENV"
|
||||
echo "CXX=g++-14" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Python Dependencies
|
||||
id: python_depends
|
||||
run: |
|
||||
export PIP_BREAK_SYSTEM_PACKAGES="1"
|
||||
python3 -m pip install --upgrade pip setuptools
|
||||
pip3 install ./gguf-py
|
||||
|
||||
- name: Swap Endianness
|
||||
id: endianness
|
||||
run: |
|
||||
for f in models/*.gguf; do
|
||||
echo YES | python3 gguf-py/gguf/scripts/gguf_convert_endian.py $f big
|
||||
done
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
- name: Test llama2c (s390x)
|
||||
id: llama2c_test_s390x
|
||||
run: |
|
||||
cd build
|
||||
echo "Fetch llama2c big-endian model"
|
||||
wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
|
||||
./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
ubuntu-24-ppc64le:
|
||||
runs-on: ubuntu-24.04-ppc64le
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Build Dependencies
|
||||
id: build_depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
python3 python3-pip python3-dev python3-wheel \
|
||||
libjpeg-dev build-essential libssl-dev \
|
||||
git-lfs
|
||||
|
||||
- name: Toolchain workaround (GCC 14)
|
||||
run: |
|
||||
sudo apt-get install -y gcc-14 g++-14
|
||||
echo "CC=gcc-14" >> "$GITHUB_ENV"
|
||||
echo "CXX=g++-14" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Python Dependencies
|
||||
id: python_depends
|
||||
run: |
|
||||
export PIP_BREAK_SYSTEM_PACKAGES="1"
|
||||
python3 -m pip install --upgrade pip setuptools
|
||||
pip3 install ./gguf-py
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
- name: Test llama2c conversion
|
||||
id: llama2c_test
|
||||
run: |
|
||||
cd build
|
||||
echo "Fetch tokenizer"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
|
||||
echo "Fetch llama2c model"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
|
||||
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
8
.github/workflows/build-msys.yml
vendored
8
.github/workflows/build-msys.yml
vendored
@@ -15,9 +15,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
windows-msys2:
|
||||
@@ -37,7 +37,7 @@ jobs:
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.16
|
||||
# with:
|
||||
# key: windows-msys2
|
||||
# key: msys-windows-2025-x64
|
||||
# variant: ccache
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
82
.github/workflows/build-opencl.yml
vendored
Normal file
82
.github/workflows/build-opencl.yml
vendored
Normal file
@@ -0,0 +1,82 @@
|
||||
name: CI (opencl)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-opencl.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cl'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-opencl.yml',
|
||||
'ggml/src/ggml-opencl/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
windows-2025-opencl-adreno:
|
||||
runs-on: windows-2025
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: opencl-windows-2025-x64
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: Install OpenCL Headers and Libs
|
||||
id: install_opencl
|
||||
run: |
|
||||
git clone https://github.com/KhronosGroup/OpenCL-Headers
|
||||
cd OpenCL-Headers
|
||||
cmake -B build `
|
||||
-DBUILD_TESTING=OFF `
|
||||
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
|
||||
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
|
||||
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
|
||||
cmake --build build --target install
|
||||
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
|
||||
cd OpenCL-ICD-Loader
|
||||
cmake -B build-arm64-release `
|
||||
-A arm64 `
|
||||
-DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
|
||||
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
|
||||
cmake --build build-arm64-release --target install --config release
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -S . -B build -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON -DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
10
.github/workflows/build-openvino.yml
vendored
10
.github/workflows/build-openvino.yml
vendored
@@ -29,9 +29,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-openvino:
|
||||
@@ -67,7 +67,7 @@ jobs:
|
||||
if: runner.environment == 'github-hosted'
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
|
||||
key: openvino-ubuntu-24.04-${{ matrix.variant }}-no-preset-v1
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
@@ -84,7 +84,7 @@ jobs:
|
||||
id: cache-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
|
||||
- name: Setup OpenVINO Toolkit
|
||||
if: steps.cache-openvino.outputs.cache-hit != 'true'
|
||||
|
||||
92
.github/workflows/build-riscv.yml
vendored
92
.github/workflows/build-riscv.yml
vendored
@@ -29,11 +29,84 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-cpu-riscv64-native:
|
||||
runs-on: ubuntu-24.04-riscv
|
||||
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
# Install necessary packages
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y libssl-dev
|
||||
|
||||
# Set gcc-14 and g++-14 as the default compilers
|
||||
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
|
||||
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
|
||||
|
||||
git lfs install
|
||||
|
||||
- name: Check environment
|
||||
run: |
|
||||
uname -a
|
||||
gcc --version
|
||||
g++ --version
|
||||
ldd --version
|
||||
cmake --version
|
||||
rustc --version
|
||||
env
|
||||
echo "nproc=$(nproc)"
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# note: sparing some ccache since these jobs run on dedicated runners that are not part of the organitzation
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
|
||||
# with:
|
||||
# key: riscv-ubuntu-native
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=ON \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||
-DGGML_RPC=ON \
|
||||
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
|
||||
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
- name: Test llama2c conversion
|
||||
id: llama2c_test
|
||||
run: |
|
||||
cd build
|
||||
echo "Fetch tokenizer"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
|
||||
echo "Fetch llama2c model"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
|
||||
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
ubuntu-riscv64-native-sanitizer:
|
||||
runs-on: ubuntu-24.04-riscv
|
||||
|
||||
@@ -62,12 +135,13 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
|
||||
with:
|
||||
key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
# note: sparing some ccache since these jobs run on dedicated runners that are not part of the organitzation
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
|
||||
# with:
|
||||
# key: riscv-ubuntu-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
|
||||
66
.github/workflows/build-rpc.yml
vendored
Normal file
66
.github/workflows/build-rpc.yml
vendored
Normal file
@@ -0,0 +1,66 @@
|
||||
name: CI (rpc)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-rpc.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-rpc.yml',
|
||||
'ggml/src/ggml-rpc/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-latest-rpc:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libssl-dev ninja-build
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose
|
||||
51
.github/workflows/build-sanitize.yml
vendored
51
.github/workflows/build-sanitize.yml
vendored
@@ -22,66 +22,65 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-latest-sanitizer:
|
||||
runs-on: ubuntu-latest
|
||||
ctest:
|
||||
runs-on: [self-hosted, X64, CPU, Linux]
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
build_type: [Debug]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
# with UNDEFINED sanitizer, we have to build in Debug to avoid GCC 13 false-positive warnings
|
||||
- name: Build (undefined)
|
||||
id: cmake_build_undefined
|
||||
if: ${{ matrix.sanitizer == 'UNDEFINED' }}
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libssl-dev
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_TYPE=Debug \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
|
||||
cmake --build build --config Debug -j $(nproc)
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
if: ${{ matrix.sanitizer != 'THREAD' }}
|
||||
if: ${{ matrix.sanitizer == 'ADDRESS' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
cmake --build build --config RelWithDebInfo -j $(nproc)
|
||||
|
||||
- name: Build (no OpenMP)
|
||||
id: cmake_build_no_openmp
|
||||
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DGGML_OPENMP=OFF
|
||||
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
cmake --build build --config RelWithDebInfo -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
# skip run in Debug - very slow
|
||||
if: ${{ matrix.sanitizer != 'UNDEFINED' }}
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
ctest -L main -E tokenizer --verbose --timeout 900
|
||||
|
||||
150
.github/workflows/build-self-hosted.yml
vendored
150
.github/workflows/build-self-hosted.yml
vendored
@@ -50,12 +50,12 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ggml-ci-nvidia-cuda:
|
||||
gpu-cuda:
|
||||
runs-on: [self-hosted, Linux, NVIDIA]
|
||||
|
||||
steps:
|
||||
@@ -69,7 +69,7 @@ jobs:
|
||||
nvidia-smi
|
||||
GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-nvidia-vulkan-cm:
|
||||
gpu-vulkan-nvidia-cm:
|
||||
runs-on: [self-hosted, Linux, NVIDIA]
|
||||
|
||||
steps:
|
||||
@@ -83,7 +83,7 @@ jobs:
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-nvidia-vulkan-cm2:
|
||||
gpu-vulkan-nvidia-cm2:
|
||||
runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]
|
||||
|
||||
steps:
|
||||
@@ -97,7 +97,7 @@ jobs:
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-nvidia-webgpu:
|
||||
gpu-webgpu-nvidia:
|
||||
runs-on: [self-hosted, Linux, NVIDIA, X64]
|
||||
|
||||
steps:
|
||||
@@ -127,7 +127,7 @@ jobs:
|
||||
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMX-compatible machine
|
||||
#ggml-ci-cpu-amx:
|
||||
#cpu-amx:
|
||||
# runs-on: [self-hosted, Linux, CPU, AMX]
|
||||
|
||||
# steps:
|
||||
@@ -141,7 +141,7 @@ jobs:
|
||||
# bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMD GPU machine
|
||||
# ggml-ci-amd-vulkan:
|
||||
# amd-vulkan:
|
||||
# runs-on: [self-hosted, Linux, AMD]
|
||||
|
||||
# steps:
|
||||
@@ -156,7 +156,7 @@ jobs:
|
||||
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMD GPU machine
|
||||
# ggml-ci-amd-rocm:
|
||||
# amd-rocm:
|
||||
# runs-on: [self-hosted, Linux, AMD]
|
||||
|
||||
# steps:
|
||||
@@ -170,7 +170,7 @@ jobs:
|
||||
# amd-smi static
|
||||
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-metal:
|
||||
gpu-metal:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
@@ -183,7 +183,7 @@ jobs:
|
||||
run: |
|
||||
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-webgpu:
|
||||
gpu-webgpu-apple:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
@@ -210,7 +210,7 @@ jobs:
|
||||
GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-vulkan:
|
||||
gpu-vulkan:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
@@ -224,7 +224,7 @@ jobs:
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-linux-intel-vulkan:
|
||||
gpu-vulkan-intel-linux:
|
||||
runs-on: [self-hosted, Linux, Intel]
|
||||
|
||||
steps:
|
||||
@@ -240,7 +240,7 @@ jobs:
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-win-intel-vulkan:
|
||||
gpu-vulkan-intel-windows:
|
||||
runs-on: [self-hosted, Windows, X64, Intel]
|
||||
|
||||
steps:
|
||||
@@ -261,7 +261,7 @@ jobs:
|
||||
# a valid python environment for testing
|
||||
LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp
|
||||
|
||||
ggml-ci-intel-openvino-gpu-low-perf:
|
||||
cpu-openvino-low-perf:
|
||||
runs-on: [self-hosted, Linux, Intel, OpenVINO]
|
||||
|
||||
concurrency:
|
||||
@@ -297,8 +297,8 @@ jobs:
|
||||
source ./openvino_toolkit/setupvars.sh
|
||||
GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-arm64-cpu-low-perf:
|
||||
runs-on: [self-hosted, Linux, ARM64, CPU]
|
||||
cpu-any-low-perf:
|
||||
runs-on: [self-hosted, CPU]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -310,8 +310,8 @@ jobs:
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-arm64-cpu-high-perf:
|
||||
runs-on: [self-hosted, Linux, ARM64, CPU]
|
||||
cpu-any-high-perf:
|
||||
runs-on: [self-hosted, CPU]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -323,34 +323,82 @@ jobs:
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
# TODO: not sure how to detect ARM flags on DGX Spark. currently get this error during cmake:
|
||||
# CMake Warning at ggml/src/ggml-cpu/CMakeLists.txt:147 (message):
|
||||
# ARM -march/-mcpu not found, -mcpu=native will be used
|
||||
#
|
||||
# if we resolve this, we should be able to offload these jobs to the self-hosted runners
|
||||
#
|
||||
# ggml-ci-arm64-cpu-high-perf-sve:
|
||||
# runs-on: [self-hosted, Linux, ARM64, CPU]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
#
|
||||
# ggml-ci-arm64-cpu-kleidiai:
|
||||
# runs-on: [self-hosted, Linux, ARM64, CPU]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
cpu-arm64-graviton4:
|
||||
runs-on: ah-ubuntu_22_04-c8g_8x
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
set -euxo pipefail
|
||||
sudo apt-get update
|
||||
sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
|
||||
apt-get install -y \
|
||||
build-essential \
|
||||
python3-venv \
|
||||
gpg \
|
||||
wget \
|
||||
time \
|
||||
git-lfs
|
||||
|
||||
git lfs install
|
||||
|
||||
# install the latest cmake
|
||||
sudo install -d /usr/share/keyrings
|
||||
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
|
||||
| gpg --dearmor \
|
||||
| sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
|
||||
echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
|
||||
| sudo tee /etc/apt/sources.list.d/kitware.list
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y cmake
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
cpu-arm64-graviton4-kleidiai:
|
||||
runs-on: ah-ubuntu_22_04-c8g_8x
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
set -euxo pipefail
|
||||
sudo apt-get update
|
||||
sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
|
||||
apt-get install -y \
|
||||
build-essential \
|
||||
python3-venv \
|
||||
gpg \
|
||||
wget \
|
||||
time \
|
||||
git-lfs
|
||||
|
||||
git lfs install
|
||||
|
||||
# install the latest cmake
|
||||
sudo install -d /usr/share/keyrings
|
||||
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
|
||||
| gpg --dearmor \
|
||||
| sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
|
||||
echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
|
||||
| sudo tee /etc/apt/sources.list.d/kitware.list
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y cmake
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
GG_BUILD_KLEIDIAI=1 \
|
||||
GG_BUILD_EXTRA_TESTS_0=1 \
|
||||
bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
252
.github/workflows/build-sycl.yml
vendored
252
.github/workflows/build-sycl.yml
vendored
@@ -29,132 +29,134 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
|
||||
ubuntu-24-sycl:
|
||||
strategy:
|
||||
matrix:
|
||||
build: [fp32, fp16]
|
||||
include:
|
||||
- build: fp32
|
||||
fp16: OFF
|
||||
- build: fp16
|
||||
fp16: ON
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# ubuntu-24-sycl:
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [fp32]
|
||||
# include:
|
||||
# - build: fp32
|
||||
# fp16: OFF
|
||||
#
|
||||
# runs-on: ubuntu-24.04
|
||||
#
|
||||
# env:
|
||||
# ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
# LEVEL_ZERO_VERSION: "1.28.2"
|
||||
# LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
#
|
||||
# continue-on-error: true
|
||||
#
|
||||
# steps:
|
||||
# - uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
# sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: bash
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
# sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
#
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: sycl-ubuntu-24-${{ matrix.build }}
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: |
|
||||
# source /opt/intel/oneapi/setvars.sh
|
||||
# cmake -B build \
|
||||
# -G "Ninja" \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_SYCL=ON \
|
||||
# -DCMAKE_C_COMPILER=icx \
|
||||
# -DCMAKE_CXX_COMPILER=icpx \
|
||||
# -DLLAMA_OPENSSL=OFF \
|
||||
# -DGGML_NATIVE=OFF \
|
||||
# -DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
# time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
env:
|
||||
ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
LEVEL_ZERO_VERSION: "1.28.2"
|
||||
LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-sycl-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
windows-latest-sycl:
|
||||
runs-on: windows-2022
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: pwsh
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
"LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: windows-latest-sycl
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: examples/sycl/win-build-sycl.bat
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# windows-latest-sycl:
|
||||
# runs-on: windows-2022
|
||||
#
|
||||
# defaults:
|
||||
# run:
|
||||
# shell: bash
|
||||
#
|
||||
# env:
|
||||
# WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
# WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
# LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
# ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: pwsh
|
||||
# run: |
|
||||
# Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
# Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
# "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: sycl-windows-latest
|
||||
# variant: ccache
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#
|
||||
# # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: examples/sycl/win-build-sycl.bat
|
||||
|
||||
59
.github/workflows/build-vulkan.yml
vendored
59
.github/workflows/build-vulkan.yml
vendored
@@ -31,12 +31,59 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-vulkan-llvmpipe:
|
||||
ubuntu:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'x64'
|
||||
os: ubuntu-24.04
|
||||
- build: 'arm64'
|
||||
os: ubuntu-24.04-arm
|
||||
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: vulkan-${{ matrix.os }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev spirv-headers libssl-dev ninja-build
|
||||
echo "CC=gcc-14" >> "$GITHUB_ENV"
|
||||
echo "CXX=g++-14" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Configure
|
||||
id: cmake_configure
|
||||
run: |
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
|
||||
-DGGML_BACKEND_DL=ON \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
-DGGML_VULKAN=ON
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
time cmake --build build -j $(nproc)
|
||||
|
||||
ubuntu-llvmpipe:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
@@ -47,7 +94,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-vulkan-llvmpipe
|
||||
key: vulkan-ubuntu-24.04-llvmpipe
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
@@ -68,7 +115,7 @@ jobs:
|
||||
id: cache-sdk
|
||||
with:
|
||||
path: ./vulkan_sdk
|
||||
key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||
key: cache-gha-vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup Vulkan SDK
|
||||
if: steps.cache-sdk.outputs.cache-hit != 'true'
|
||||
|
||||
181
.github/workflows/build-webgpu.yml
vendored
Normal file
181
.github/workflows/build-webgpu.yml
vendored
Normal file
@@ -0,0 +1,181 @@
|
||||
name: CI (webgpu)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-webgpu.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.wgsl'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-webgpu.yml',
|
||||
'ggml/src/ggml-webgpu/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
macos:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: webgpu-macos-latest
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dawn Dependency
|
||||
id: dawn-depends
|
||||
run: |
|
||||
DAWN_VERSION="v20260317.182325"
|
||||
DAWN_OWNER="google"
|
||||
DAWN_REPO="dawn"
|
||||
DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
|
||||
echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
curl -L -o artifact.tar.gz \
|
||||
"https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
mkdir dawn
|
||||
tar -xvf artifact.tar.gz -C dawn --strip-components=1
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
export CMAKE_PREFIX_PATH=dawn
|
||||
cmake -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
|
||||
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
ubuntu:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: webgpu-ubuntu-24.04
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo add-apt-repository -y ppa:kisak/kisak-mesa
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y build-essential mesa-vulkan-drivers \
|
||||
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
|
||||
|
||||
- name: Dawn Dependency
|
||||
id: dawn-depends
|
||||
run: |
|
||||
sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
|
||||
DAWN_VERSION="v20260317.182325"
|
||||
DAWN_OWNER="google"
|
||||
DAWN_REPO="dawn"
|
||||
DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
|
||||
echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
curl -L -o artifact.tar.gz \
|
||||
"https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
mkdir dawn
|
||||
tar -xvf artifact.tar.gz -C dawn --strip-components=1
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
export Dawn_DIR=dawn/lib64/cmake/Dawn
|
||||
cmake -B build \
|
||||
-DGGML_WEBGPU=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
# This is using llvmpipe and runs slower than other backends
|
||||
# test-backend-ops is too slow on llvmpipe, skip it
|
||||
ctest -L main -E test-backend-ops --verbose --timeout 900
|
||||
|
||||
ubuntu-wasm:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'x64'
|
||||
os: ubuntu-24.04
|
||||
- build: 'arm64'
|
||||
os: ubuntu-24.04-arm
|
||||
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: webgpu-${{ matrix.os }}-wasm
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Install Emscripten
|
||||
run: |
|
||||
git clone https://github.com/emscripten-core/emsdk.git
|
||||
cd emsdk
|
||||
./emsdk install latest
|
||||
./emsdk activate latest
|
||||
|
||||
- name: Fetch emdawnwebgpu
|
||||
run: |
|
||||
DAWN_TAG="v20260317.182325"
|
||||
EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
|
||||
echo "Downloading ${EMDAWN_PKG}"
|
||||
curl -L -o emdawn.zip \
|
||||
"https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
|
||||
unzip emdawn.zip
|
||||
|
||||
- name: Build WASM WebGPU
|
||||
run: |
|
||||
source emsdk/emsdk_env.sh
|
||||
emcmake cmake -B build-wasm \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_WEBGPU=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
|
||||
|
||||
time cmake --build build-wasm --config Release --target test-backend-ops -j $(nproc)
|
||||
1112
.github/workflows/build.yml
vendored
1112
.github/workflows/build.yml
vendored
File diff suppressed because it is too large
Load Diff
8
.github/workflows/hip-quality-check.yml
vendored
8
.github/workflows/hip-quality-check.yml
vendored
@@ -28,9 +28,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-22-hip-quality-check:
|
||||
@@ -50,7 +50,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-22-hip-quality-check
|
||||
key: hip-quality-check-ubuntu-22.04
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
|
||||
745
.github/workflows/release.yml
vendored
745
.github/workflows/release.yml
vendored
@@ -27,18 +27,40 @@ on:
|
||||
'**/*.glsl'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"
|
||||
|
||||
# note: run this workflow one at a time for better cache reuse
|
||||
concurrency:
|
||||
group: release
|
||||
queue: max
|
||||
|
||||
jobs:
|
||||
check_release:
|
||||
runs-on: ubuntu-slim
|
||||
|
||||
macOS-cpu:
|
||||
outputs:
|
||||
should_release: ${{ steps.check.outputs.should_release }}
|
||||
|
||||
steps:
|
||||
- id: check
|
||||
run: |
|
||||
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
|
||||
echo "should_release=true" >> $GITHUB_OUTPUT
|
||||
elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/master" ]]; then
|
||||
if echo "${{ github.event.head_commit.message }}" | grep -q '\[no release\]'; then
|
||||
echo "should_release=false" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "should_release=true" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
else
|
||||
echo "should_release=false" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
macos-cpu:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
@@ -46,10 +68,12 @@ jobs:
|
||||
arch: 'arm64'
|
||||
os: macos-14
|
||||
defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON"
|
||||
- build: 'arm64-kleidiai'
|
||||
arch: 'arm64'
|
||||
os: macos-14
|
||||
defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON"
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23780)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
#- build: 'arm64-kleidiai'
|
||||
# arch: 'arm64'
|
||||
# os: macos-14
|
||||
# defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON"
|
||||
- build: 'x64'
|
||||
arch: 'x64'
|
||||
os: macos-15-intel
|
||||
@@ -76,8 +100,8 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-${{ matrix.arch }}
|
||||
evict-old-files: 1d
|
||||
key: release-${{ matrix.os }}-${{ matrix.arch }}
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
@@ -109,7 +133,8 @@ jobs:
|
||||
name: llama-bin-macos-${{ matrix.build }}.tar.gz
|
||||
|
||||
ubuntu-cpu:
|
||||
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
@@ -140,8 +165,8 @@ jobs:
|
||||
if: ${{ matrix.build != 's390x' }}
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-cpu-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
key: release-${{ matrix.os }}-cpu
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -186,6 +211,8 @@ jobs:
|
||||
name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
|
||||
|
||||
ubuntu-vulkan:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -214,8 +241,8 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-vulkan-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
key: release-${{ matrix.os }}-vulkan
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -262,6 +289,8 @@ jobs:
|
||||
name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
|
||||
|
||||
android-arm64:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -282,11 +311,17 @@ jobs:
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: android-arm64
|
||||
evict-old-files: 1d
|
||||
# note : disabled to spare some cache space (https://github.com/ggml-org/llama.cpp/pull/23789)
|
||||
# for some reason, the ccache does not improve the build time in this case
|
||||
# example:
|
||||
# cache off: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78160400831
|
||||
# cache on: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78224189394
|
||||
#
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: release-android-arm64
|
||||
# append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Set up JDK
|
||||
uses: actions/setup-java@v5
|
||||
@@ -339,6 +374,8 @@ jobs:
|
||||
name: llama-bin-android-arm64.tar.gz
|
||||
|
||||
ubuntu-24-openvino:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
@@ -371,8 +408,8 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-openvino-release-no-preset-v1
|
||||
evict-old-files: 1d
|
||||
key: release-ubuntu-24.04-openvino-release-no-preset-v1
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
@@ -385,7 +422,7 @@ jobs:
|
||||
id: cache-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
|
||||
- name: Setup OpenVINO Toolkit
|
||||
if: steps.cache-openvino.outputs.cache-hit != 'true'
|
||||
@@ -427,6 +464,8 @@ jobs:
|
||||
name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
|
||||
|
||||
windows-cpu:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: windows-2025
|
||||
|
||||
@@ -452,9 +491,8 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: windows-latest-cpu-${{ matrix.arch }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
key: release-windows-2025-${{ matrix.arch }}-cpu
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Install Ninja
|
||||
run: |
|
||||
@@ -487,6 +525,8 @@ jobs:
|
||||
name: llama-bin-win-cpu-${{ matrix.arch }}.zip
|
||||
|
||||
windows:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: windows-2025
|
||||
|
||||
@@ -521,9 +561,8 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: windows-latest-${{ matrix.backend }}-${{ matrix.arch }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
@@ -577,12 +616,14 @@ jobs:
|
||||
name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
|
||||
|
||||
windows-cuda:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: windows-2022
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.4', '13.1']
|
||||
cuda: ['12.4', '13.3']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -596,12 +637,11 @@ jobs:
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: Install ccache
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: windows-cuda-${{ matrix.cuda }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Install Cuda Toolkit
|
||||
uses: ./.github/actions/windows-setup-cuda
|
||||
@@ -655,214 +695,216 @@ jobs:
|
||||
path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
|
||||
name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
|
||||
|
||||
windows-sycl:
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# windows-sycl:
|
||||
#
|
||||
# runs-on: windows-2022
|
||||
#
|
||||
# defaults:
|
||||
# run:
|
||||
# shell: bash
|
||||
#
|
||||
# env:
|
||||
# WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
# WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
# LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
# ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: pwsh
|
||||
# run: |
|
||||
# Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
# Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
# "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
#
|
||||
# - name: Setup Node.js
|
||||
# uses: actions/setup-node@v6
|
||||
# with:
|
||||
# node-version: "24"
|
||||
# cache: "npm"
|
||||
# cache-dependency-path: "tools/ui/package-lock.json"
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: release-windows-2022-x64-sycl
|
||||
# append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# shell: cmd
|
||||
# run: |
|
||||
# call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
# cmake -G "Ninja" -B build ^
|
||||
# -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
|
||||
# -DCMAKE_BUILD_TYPE=Release ^
|
||||
# -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
|
||||
# -DGGML_CPU=OFF -DGGML_SYCL=ON ^
|
||||
# -DLLAMA_BUILD_BORINGSSL=ON
|
||||
# cmake --build build --target ggml-sycl -j
|
||||
#
|
||||
# - name: Build the release package
|
||||
# id: pack_artifacts
|
||||
# run: |
|
||||
# echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||
# ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
|
||||
# if [ -n "$ZE_LOADER_DLL" ]; then
|
||||
# echo "Using Level Zero loader: $ZE_LOADER_DLL"
|
||||
# cp "$ZE_LOADER_DLL" ./build/bin
|
||||
# else
|
||||
# echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
|
||||
# fi
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
|
||||
#
|
||||
# echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
# 7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
|
||||
#
|
||||
# - name: Upload the release package
|
||||
# uses: actions/upload-artifact@v6
|
||||
# with:
|
||||
# path: llama-bin-win-sycl-x64.zip
|
||||
# name: llama-bin-win-sycl-x64.zip
|
||||
|
||||
runs-on: windows-2022
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: pwsh
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
"LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: "24"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: windows-latest-sycl
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
run: |
|
||||
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
cmake -G "Ninja" -B build ^
|
||||
-DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
|
||||
-DCMAKE_BUILD_TYPE=Release ^
|
||||
-DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
|
||||
-DGGML_CPU=OFF -DGGML_SYCL=ON ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --target ggml-sycl -j
|
||||
|
||||
- name: Build the release package
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||
ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
|
||||
if [ -n "$ZE_LOADER_DLL" ]; then
|
||||
echo "Using Level Zero loader: $ZE_LOADER_DLL"
|
||||
cp "$ZE_LOADER_DLL" ./build/bin
|
||||
else
|
||||
echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
|
||||
fi
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
|
||||
|
||||
echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload the release package
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-bin-win-sycl-x64.zip
|
||||
name: llama-bin-win-sycl-x64.zip
|
||||
|
||||
ubuntu-24-sycl:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
build: [fp32, fp16]
|
||||
include:
|
||||
- build: fp32
|
||||
fp16: OFF
|
||||
- build: fp16
|
||||
fp16: ON
|
||||
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
env:
|
||||
ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
LEVEL_ZERO_VERSION: "1.28.2"
|
||||
LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: "24"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-sycl-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# ubuntu-24-sycl:
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [fp32]
|
||||
# include:
|
||||
# - build: fp32
|
||||
# fp16: OFF
|
||||
#
|
||||
# runs-on: ubuntu-24.04
|
||||
#
|
||||
# env:
|
||||
# ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
# LEVEL_ZERO_VERSION: "1.28.2"
|
||||
# LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# fetch-depth: 0
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
# sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: bash
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
# sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
#
|
||||
# - name: Setup Node.js
|
||||
# uses: actions/setup-node@v6
|
||||
# with:
|
||||
# node-version: "24"
|
||||
# cache: "npm"
|
||||
# cache-dependency-path: "tools/ui/package-lock.json"
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: release-ubuntu-24.04-sycl
|
||||
# append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: |
|
||||
# source /opt/intel/oneapi/setvars.sh
|
||||
# cmake -B build \
|
||||
# -G "Ninja" \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_SYCL=ON \
|
||||
# -DCMAKE_C_COMPILER=icx \
|
||||
# -DCMAKE_CXX_COMPILER=icpx \
|
||||
# -DLLAMA_OPENSSL=OFF \
|
||||
# -DGGML_NATIVE=OFF \
|
||||
# -DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
# time cmake --build build --config Release -j $(nproc)
|
||||
#
|
||||
# - name: Determine tag name
|
||||
# id: tag
|
||||
# uses: ./.github/actions/get-tag-name
|
||||
#
|
||||
# - name: Pack artifacts
|
||||
# id: pack_artifacts
|
||||
# run: |
|
||||
# cp LICENSE ./build/bin/
|
||||
# tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
|
||||
#
|
||||
# - name: Upload artifacts
|
||||
# uses: actions/upload-artifact@v6
|
||||
# with:
|
||||
# path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
# name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
|
||||
ubuntu-22-rocm:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
@@ -895,8 +937,8 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
key: release-ubuntu-22.04-rocm-${{ matrix.ROCM_VERSION }}
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -974,6 +1016,8 @@ jobs:
|
||||
name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
|
||||
|
||||
windows-hip:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: windows-2022
|
||||
|
||||
@@ -1010,13 +1054,13 @@ jobs:
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: C:\Program Files\AMD\ROCm
|
||||
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: windows-latest-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
|
||||
evict-old-files: 1d
|
||||
key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Install ROCm
|
||||
if: steps.cache-rocm.outputs.cache-hit != 'true'
|
||||
@@ -1088,6 +1132,8 @@ jobs:
|
||||
name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
|
||||
|
||||
ios-xcode-build:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
runs-on: macos-15
|
||||
|
||||
steps:
|
||||
@@ -1143,98 +1189,101 @@ jobs:
|
||||
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
|
||||
name: llama-${{ steps.tag.outputs.name }}-xcframework.zip
|
||||
|
||||
|
||||
openEuler-cann:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
# 910b with aclgraph (both architectures)
|
||||
- arch: x86
|
||||
chip_type: '910b'
|
||||
build: 'Release'
|
||||
use_acl_graph: 'on'
|
||||
- arch: aarch64
|
||||
chip_type: '910b'
|
||||
build: 'Release'
|
||||
use_acl_graph: 'on'
|
||||
# 310p without aclgraph (both architectures)
|
||||
- arch: x86
|
||||
chip_type: '310p'
|
||||
build: 'Release'
|
||||
use_acl_graph: 'off'
|
||||
- arch: aarch64
|
||||
chip_type: '310p'
|
||||
build: 'Release'
|
||||
use_acl_graph: 'off'
|
||||
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Free up disk space
|
||||
uses: ggml-org/free-disk-space@v1.3.1
|
||||
with:
|
||||
tool-cache: true
|
||||
|
||||
- name: Set container image
|
||||
id: cann-image
|
||||
run: |
|
||||
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Pull container image
|
||||
run: docker pull "${{ steps.cann-image.outputs.image }}"
|
||||
|
||||
- name: Build
|
||||
env:
|
||||
BUILD_TYPE: ${{ matrix.build }}
|
||||
SOC_TYPE: ascend${{ matrix.chip_type }}
|
||||
USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
|
||||
run: |
|
||||
HOST_UID=$(id -u)
|
||||
HOST_GID=$(id -g)
|
||||
|
||||
docker run --rm \
|
||||
-v "${PWD}:/workspace" \
|
||||
-w /workspace \
|
||||
-e SOC_TYPE=${SOC_TYPE} \
|
||||
-e BUILD_TYPE=${BUILD_TYPE} \
|
||||
-e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
|
||||
"${{ steps.cann-image.outputs.image }}" \
|
||||
bash -lc '
|
||||
set -e
|
||||
yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
|
||||
yum clean all && rm -rf /var/cache/yum
|
||||
git config --global --add safe.directory "/workspace"
|
||||
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
-DGGML_CANN=on \
|
||||
-DSOC_TYPE=${SOC_TYPE} \
|
||||
-DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
|
||||
cmake --build build -j $(nproc)
|
||||
|
||||
chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
|
||||
'
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# openEuler-cann:
|
||||
# strategy:
|
||||
# matrix:
|
||||
# include:
|
||||
# # 910b with aclgraph (both architectures)
|
||||
# - arch: x86
|
||||
# chip_type: '910b'
|
||||
# build: 'Release'
|
||||
# use_acl_graph: 'on'
|
||||
# - arch: aarch64
|
||||
# chip_type: '910b'
|
||||
# build: 'Release'
|
||||
# use_acl_graph: 'on'
|
||||
# # 310p without aclgraph (both architectures)
|
||||
# - arch: x86
|
||||
# chip_type: '310p'
|
||||
# build: 'Release'
|
||||
# use_acl_graph: 'off'
|
||||
# - arch: aarch64
|
||||
# chip_type: '310p'
|
||||
# build: 'Release'
|
||||
# use_acl_graph: 'off'
|
||||
# runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
# steps:
|
||||
# - name: Checkout
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# fetch-depth: 0
|
||||
#
|
||||
# - name: Free up disk space
|
||||
# uses: ggml-org/free-disk-space@v1.3.1
|
||||
# with:
|
||||
# tool-cache: true
|
||||
#
|
||||
# - name: Set container image
|
||||
# id: cann-image
|
||||
# run: |
|
||||
# image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
# echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
#
|
||||
# - name: Pull container image
|
||||
# run: docker pull "${{ steps.cann-image.outputs.image }}"
|
||||
#
|
||||
# - name: Build
|
||||
# env:
|
||||
# BUILD_TYPE: ${{ matrix.build }}
|
||||
# SOC_TYPE: ascend${{ matrix.chip_type }}
|
||||
# USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
|
||||
# run: |
|
||||
# HOST_UID=$(id -u)
|
||||
# HOST_GID=$(id -g)
|
||||
#
|
||||
# docker run --rm \
|
||||
# -v "${PWD}:/workspace" \
|
||||
# -w /workspace \
|
||||
# -e SOC_TYPE=${SOC_TYPE} \
|
||||
# -e BUILD_TYPE=${BUILD_TYPE} \
|
||||
# -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
|
||||
# "${{ steps.cann-image.outputs.image }}" \
|
||||
# bash -lc '
|
||||
# set -e
|
||||
# yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
|
||||
# yum clean all && rm -rf /var/cache/yum
|
||||
# git config --global --add safe.directory "/workspace"
|
||||
# export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||
# cmake -S . -B build \
|
||||
# -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
# -DGGML_CANN=on \
|
||||
# -DSOC_TYPE=${SOC_TYPE} \
|
||||
# -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
|
||||
# cmake --build build -j $(nproc)
|
||||
#
|
||||
# chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
|
||||
# '
|
||||
#
|
||||
# - name: Determine tag name
|
||||
# id: tag
|
||||
# uses: ./.github/actions/get-tag-name
|
||||
#
|
||||
# - name: Pack artifacts
|
||||
# run: |
|
||||
# cp LICENSE ./build/bin/
|
||||
# tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
|
||||
#
|
||||
# - name: Upload artifacts
|
||||
# uses: actions/upload-artifact@v6
|
||||
# with:
|
||||
# path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
# name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
|
||||
ui-build:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
uses: ./.github/workflows/ui-build.yml
|
||||
|
||||
release:
|
||||
@@ -1251,17 +1300,17 @@ jobs:
|
||||
- windows
|
||||
- windows-cpu
|
||||
- windows-cuda
|
||||
- windows-sycl
|
||||
#- windows-sycl
|
||||
- windows-hip
|
||||
- ubuntu-22-rocm
|
||||
- ubuntu-cpu
|
||||
- ubuntu-vulkan
|
||||
- ubuntu-24-openvino
|
||||
- ubuntu-24-sycl
|
||||
#- ubuntu-24-sycl
|
||||
- android-arm64
|
||||
- macOS-cpu
|
||||
- macos-cpu
|
||||
- ios-xcode-build
|
||||
- openEuler-cann
|
||||
#- openEuler-cann
|
||||
- ui-build
|
||||
|
||||
outputs:
|
||||
@@ -1350,7 +1399,7 @@ jobs:
|
||||
|
||||
**macOS/iOS:**
|
||||
- [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
|
||||
- [macOS Apple Silicon (arm64, KleidiAI enabled)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64-kleidiai.tar.gz)
|
||||
- macOS Apple Silicon (arm64, KleidiAI enabled) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23780)
|
||||
- [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
|
||||
- [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.zip)
|
||||
|
||||
@@ -1362,8 +1411,7 @@ jobs:
|
||||
- [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
|
||||
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
|
||||
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
|
||||
- [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
|
||||
- [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)
|
||||
- Ubuntu x64 (SYCL FP32) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
|
||||
**Android:**
|
||||
- [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
|
||||
@@ -1372,16 +1420,17 @@ jobs:
|
||||
- [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
|
||||
- [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
|
||||
- [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
|
||||
- [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip) - [CUDA 13.1 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.1-x64.zip)
|
||||
- [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip)
|
||||
- [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
|
||||
- [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
|
||||
- Windows x64 (SYCL) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
- [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
|
||||
|
||||
**openEuler:**
|
||||
- [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz)
|
||||
- [openEuler x86 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86-aclgraph.tar.gz)
|
||||
- [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
|
||||
- [openEuler aarch64 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64-aclgraph.tar.gz)
|
||||
- [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
- openEuler x86 (310p)
|
||||
- openEuler x86 (910b, ACL Graph)
|
||||
- openEuler aarch64 (310p)
|
||||
- openEuler aarch64 (910b, ACL Graph)
|
||||
|
||||
**UI:**
|
||||
- [UI](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-ui.tar.gz)
|
||||
|
||||
36
.github/workflows/server-sanitize.yml
vendored
36
.github/workflows/server-sanitize.yml
vendored
@@ -26,10 +26,10 @@ on:
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
@@ -37,7 +37,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
server:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: [self-hosted, CPU, Linux, llama-server]
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -46,19 +46,19 @@ jobs:
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get -y install \
|
||||
build-essential \
|
||||
xxd \
|
||||
git \
|
||||
cmake \
|
||||
curl \
|
||||
wget \
|
||||
language-pack-en \
|
||||
libssl-dev
|
||||
#- name: Dependencies
|
||||
# id: depends
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get -y install \
|
||||
# build-essential \
|
||||
# xxd \
|
||||
# git \
|
||||
# cmake \
|
||||
# curl \
|
||||
# wget \
|
||||
# language-pack-en \
|
||||
# libssl-dev
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
|
||||
8
.github/workflows/server-self-hosted.yml
vendored
8
.github/workflows/server-self-hosted.yml
vendored
@@ -29,10 +29,10 @@ on:
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
|
||||
51
.github/workflows/server.yml
vendored
51
.github/workflows/server.yml
vendored
@@ -44,25 +44,20 @@ on:
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
ui-build:
|
||||
name: Build Web UI
|
||||
uses: ./.github/workflows/ui-build.yml
|
||||
ubuntu:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
server:
|
||||
runs-on: ubuntu-latest
|
||||
needs: ui-build
|
||||
|
||||
name: server (${{ matrix.wf_name }})
|
||||
name: ubuntu (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
build_type: [Release]
|
||||
@@ -98,17 +93,17 @@ jobs:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Download built UI
|
||||
uses: actions/download-artifact@v7
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
name: ui-build
|
||||
path: tools/ui/dist
|
||||
key: server-ubuntu-24.04-x64
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
|
||||
@@ -135,8 +130,8 @@ jobs:
|
||||
export ${{ matrix.extra_args }}
|
||||
SLOW_TESTS=1 pytest -v -x
|
||||
|
||||
server-windows:
|
||||
runs-on: windows-2022
|
||||
windows:
|
||||
runs-on: windows-2025
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -146,16 +141,24 @@ jobs:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
node-version: "24"
|
||||
key: server-windows-2025-x64
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
run: |
|
||||
cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
|
||||
cmake -B build -G "Ninja Multi-Config" ^
|
||||
-DCMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake ^
|
||||
-DCMAKE_BUILD_TYPE=Release ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON ^
|
||||
-DGGML_SCHED_NO_REALLOC=ON
|
||||
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||
cmake --build build --config Release -j %NINJA_JOBS% --target llama-server
|
||||
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
|
||||
8
.github/workflows/ui-self-hosted.yml
vendored
8
.github/workflows/ui-self-hosted.yml
vendored
@@ -30,10 +30,10 @@ on:
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
|
||||
8
.github/workflows/ui.yml
vendored
8
.github/workflows/ui.yml
vendored
@@ -26,10 +26,10 @@ on:
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
|
||||
@@ -63,6 +63,7 @@ After submitting your PR:
|
||||
- Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
|
||||
- Let other maintainers merge their own PRs
|
||||
- When merging a PR, make sure you have a good understanding of the changes
|
||||
- If a PR does not warrant a new release, add `[no release]` in the squashed commit to spare CI resources
|
||||
- Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)
|
||||
|
||||
Maintainers reserve the right to decline review or close pull requests for any reason, without any questions, particularly under any of the following conditions:
|
||||
|
||||
13
ci/run.sh
13
ci/run.sh
@@ -66,6 +66,8 @@ fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
||||
else
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||
@@ -114,10 +116,7 @@ fi
|
||||
if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
|
||||
|
||||
# if on Mac, disable METAL
|
||||
if [[ "$OSTYPE" == "darwin"* ]]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
|
||||
|
||||
MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION="/usr/local/lib/cmake/vulkan"
|
||||
MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION="${MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION}/SPIRV-Headers/SPIRV-HeadersConfig.cmake"
|
||||
if [[ -f "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" || -h "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" ]]; then
|
||||
@@ -133,7 +132,7 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_WEBGPU} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1 -DGGML_METAL=OFF -DGGML_BLAS=OFF"
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"
|
||||
|
||||
if [ ! -z "${GG_BUILD_WEBGPU_DAWN_PREFIX}" ]; then
|
||||
if [ -z "${CMAKE_PREFIX_PATH}" ]; then
|
||||
@@ -167,6 +166,8 @@ fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_BLAS} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=${GG_BUILD_BLAS_VENDOR:-OpenBLAS}"
|
||||
else
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=OFF"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_OPENVINO} ]; then
|
||||
@@ -700,8 +701,8 @@ function gg_sum_test_backend_ops_cpu {
|
||||
|
||||
## main
|
||||
|
||||
export LLAMA_LOG_PREFIX=1
|
||||
export LLAMA_LOG_TIMESTAMPS=1
|
||||
export LLAMA_ARG_LOG_PREFIX=1
|
||||
export LLAMA_ARG_LOG_TIMESTAMPS=1
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models
|
||||
|
||||
@@ -2998,7 +2998,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
key_file.close();
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_KEY_FILE"));
|
||||
add_opt(common_arg(
|
||||
{"--ssl-key-file"}, "FNAME",
|
||||
"path to file a PEM-encoded SSL private key",
|
||||
@@ -3026,7 +3026,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.default_template_kwargs[item.key()] = item.value().dump();
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CHAT_TEMPLATE_KWARGS"));
|
||||
add_opt(common_arg(
|
||||
{"-to", "--timeout"}, "N",
|
||||
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
|
||||
@@ -3327,7 +3327,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params &, const std::string & value) {
|
||||
common_log_set_file(common_log_main(), value.c_str());
|
||||
}
|
||||
).set_env("LLAMA_LOG_FILE"));
|
||||
).set_env("LLAMA_ARG_LOG_FILE"));
|
||||
add_opt(common_arg(
|
||||
{"--log-colors"}, "[on|off|auto]",
|
||||
"Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
|
||||
@@ -3344,7 +3344,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
|
||||
}
|
||||
}
|
||||
).set_env("LLAMA_LOG_COLORS"));
|
||||
).set_env("LLAMA_ARG_LOG_COLORS"));
|
||||
add_opt(common_arg(
|
||||
{"-v", "--verbose", "--log-verbose"},
|
||||
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
|
||||
@@ -3359,7 +3359,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.offline = true;
|
||||
}
|
||||
).set_env("LLAMA_OFFLINE"));
|
||||
).set_env("LLAMA_ARG_OFFLINE"));
|
||||
add_opt(common_arg(
|
||||
{"-lv", "--verbosity", "--log-verbosity"}, "N",
|
||||
string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
|
||||
@@ -3374,7 +3374,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.verbosity = value;
|
||||
common_log_set_verbosity_thold(value);
|
||||
}
|
||||
).set_env("LLAMA_LOG_VERBOSITY"));
|
||||
).set_env("LLAMA_ARG_LOG_VERBOSITY"));
|
||||
add_opt(common_arg(
|
||||
{"--log-prefix"},
|
||||
{"--no-log-prefix"},
|
||||
|
||||
@@ -74,6 +74,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"Gemma3nForCausalLM": "gemma",
|
||||
"Gemma3nForConditionalGeneration": "gemma",
|
||||
"Gemma4ForConditionalGeneration": "gemma",
|
||||
"Gemma4ForCausalLM": "gemma",
|
||||
"GemmaForCausalLM": "gemma",
|
||||
"Glm4ForCausalLM": "glm",
|
||||
"Glm4MoeForCausalLM": "glm",
|
||||
@@ -215,6 +216,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"T5EncoderModel": "t5",
|
||||
"T5ForConditionalGeneration": "t5",
|
||||
"T5WithLMHeadModel": "t5",
|
||||
"TalkieForCausalLM": "talkie",
|
||||
"UMT5ForConditionalGeneration": "t5",
|
||||
"UMT5Model": "t5",
|
||||
"UltravoxModel": "ultravox",
|
||||
|
||||
@@ -119,7 +119,8 @@ class ModelBase:
|
||||
small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
|
||||
disable_mistral_community_chat_template: bool = False,
|
||||
sentence_transformers_dense_modules: bool = False,
|
||||
fuse_gate_up_exps: bool = False):
|
||||
fuse_gate_up_exps: bool = False,
|
||||
fp8_as_q8: bool = False):
|
||||
if type(self) is ModelBase or \
|
||||
type(self) is TextModel or \
|
||||
type(self) is MmprojModel:
|
||||
@@ -148,6 +149,8 @@ class ModelBase:
|
||||
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
|
||||
self._is_nvfp4 = False
|
||||
self._is_mxfp4 = False
|
||||
self._fp8_as_q8 = fp8_as_q8
|
||||
self._fp8_dequantized: set[str] = set()
|
||||
|
||||
# Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
|
||||
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
|
||||
@@ -429,6 +432,8 @@ class ModelBase:
|
||||
s = self.model_tensors[name]
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
|
||||
tensors_to_remove.append(name)
|
||||
if self._fp8_as_q8:
|
||||
self._fp8_dequantized.add(weight_name)
|
||||
if name.endswith(".activation_scale"): # unused
|
||||
tensors_to_remove.append(name)
|
||||
if name.endswith("_activation_scale"): # Mistral-Small-4-119B-2602, unused
|
||||
@@ -440,6 +445,8 @@ class ModelBase:
|
||||
s = self.model_tensors[name]
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
|
||||
tensors_to_remove.append(name)
|
||||
if self._fp8_as_q8:
|
||||
self._fp8_dequantized.add(weight_name)
|
||||
if name.endswith(".qscale_act"):
|
||||
tensors_to_remove.append(name)
|
||||
elif quant_method == "gptq":
|
||||
@@ -467,7 +474,14 @@ class ModelBase:
|
||||
elif quant_method == "compressed-tensors":
|
||||
quant_format = quant_config["format"]
|
||||
groups = quant_config["config_groups"]
|
||||
if len(groups) > 1:
|
||||
nvfp4_compressed_tensors = (
|
||||
quant_format == "nvfp4-pack-quantized"
|
||||
or quant_format == "mixed-precision"
|
||||
and bool(groups)
|
||||
and all(g.get("format") == "nvfp4-pack-quantized" for g in groups.values() if isinstance(g, dict))
|
||||
)
|
||||
|
||||
if len(groups) > 1 and not nvfp4_compressed_tensors:
|
||||
raise NotImplementedError("Can't handle multiple config groups for compressed-tensors yet")
|
||||
weight_config = tuple(groups.values())[0]["weights"]
|
||||
|
||||
@@ -476,6 +490,11 @@ class ModelBase:
|
||||
strategy = weight_config.get("strategy")
|
||||
assert strategy == "channel" or strategy == "block"
|
||||
assert weight_config.get("group_size") is None # didn't find a model using this yet
|
||||
is_fp8 = (
|
||||
quant_format == "float-quantized"
|
||||
and weight_config.get("type") == "float"
|
||||
and weight_config.get("num_bits") == 8
|
||||
)
|
||||
for name in self.model_tensors.keys():
|
||||
if name.endswith(".weight_scale"):
|
||||
weight_name = name.removesuffix("_scale")
|
||||
@@ -483,6 +502,8 @@ class ModelBase:
|
||||
s = self.model_tensors[name]
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), block_size)
|
||||
tensors_to_remove.append(name)
|
||||
if self._fp8_as_q8 and is_fp8:
|
||||
self._fp8_dequantized.add(weight_name)
|
||||
elif quant_format == "pack-quantized":
|
||||
assert weight_config.get("strategy") == "group"
|
||||
assert weight_config.get("type", "int") == "int"
|
||||
@@ -505,6 +526,9 @@ class ModelBase:
|
||||
tensors_to_remove += [base_name + n for n in ("_packed", "_shape", "_scale")]
|
||||
if (base_name + "_zero_point") in self.model_tensors:
|
||||
tensors_to_remove.append(base_name + "_zero_point")
|
||||
elif nvfp4_compressed_tensors:
|
||||
# Don't error from compressed-tensors, we'll handle them in _generate_nvfp4_tensors
|
||||
pass
|
||||
else:
|
||||
raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported")
|
||||
elif quant_method == "modelopt":
|
||||
@@ -514,10 +538,18 @@ class ModelBase:
|
||||
for name in self.model_tensors.keys():
|
||||
if name.endswith(".weight_scale"):
|
||||
weight_name = name.removesuffix("_scale")
|
||||
if weight_name not in self.model_tensors:
|
||||
tensors_to_remove.append(name)
|
||||
continue
|
||||
w = self.model_tensors[weight_name]
|
||||
s = self.model_tensors[name]
|
||||
is_fp8_weight = False
|
||||
if self._fp8_as_q8:
|
||||
is_fp8_weight = w().dtype in (torch.float8_e4m3fn, torch.float8_e5m2)
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None)
|
||||
tensors_to_remove.append(name)
|
||||
if is_fp8_weight:
|
||||
self._fp8_dequantized.add(weight_name)
|
||||
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
|
||||
tensors_to_remove.append(name)
|
||||
elif quant_method is not None:
|
||||
@@ -605,8 +637,10 @@ class ModelBase:
|
||||
return [(new_name, data_torch)]
|
||||
|
||||
def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
|
||||
del name, new_name, bid, n_dims # unused
|
||||
|
||||
del new_name, bid # unused
|
||||
# Force FP8-original tensors to Q8_0 when requested; Q8_0 is faster than F16/BF16.
|
||||
if self._fp8_as_q8 and name in self._fp8_dequantized and n_dims >= 2:
|
||||
return gguf.GGMLQuantizationType.Q8_0
|
||||
return False
|
||||
|
||||
# some models need extra generated tensors (like rope_freqs)
|
||||
@@ -746,10 +780,13 @@ class ModelBase:
|
||||
del experts, merged
|
||||
|
||||
def prepare_tensors(self):
|
||||
# detect NVFP4 quantization (ModelOpt format)
|
||||
quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
|
||||
quant_method = (self.hparams.get("quantization_config") or {}).get("quant_method")
|
||||
quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {}
|
||||
# detect NVFP4 quantization (ModelOpt and Compressed-tensors formats)
|
||||
quantization_config = self.hparams.get("quantization_config") or {}
|
||||
quant_algo = quantization_config.get("quant_algo")
|
||||
quant_method = quantization_config.get("quant_method")
|
||||
quant_format = quantization_config.get("format")
|
||||
quant_groups = quantization_config.get("config_groups") or {}
|
||||
quant_layers = quantization_config.get("quantized_layers") or {}
|
||||
quant_config_file = self.dir_model / "hf_quant_config.json"
|
||||
|
||||
if (not quant_algo or not quant_layers) and quant_config_file.is_file():
|
||||
@@ -760,13 +797,25 @@ class ModelBase:
|
||||
producer_name = (producer.get("name") or "").lower()
|
||||
if quant_method is None:
|
||||
self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name
|
||||
quant_method = producer_name
|
||||
quant_algo = quant_config.get("quant_algo", quant_algo)
|
||||
quant_method = quant_config.get("quant_method", quant_method)
|
||||
quant_format = quant_config.get("format", quant_format)
|
||||
quant_groups = quant_config.get("config_groups", quant_groups) or {}
|
||||
quant_layers = quant_config.get("quantized_layers", quant_layers) or {}
|
||||
|
||||
# Some models use per-tensor quant_algo (e.g. "MIXED_PRECISION" with
|
||||
# per-layer NVFP4/FP8) instead of a single global "NVFP4" value.
|
||||
nvfp4_compressed_tensors = quant_method == "compressed-tensors" and (
|
||||
quant_format == "nvfp4-pack-quantized"
|
||||
or quant_format == "mixed-precision"
|
||||
and bool(quant_groups)
|
||||
and all(g.get("format") == "nvfp4-pack-quantized" for g in quant_groups.values() if isinstance(g, dict))
|
||||
)
|
||||
if quant_algo != "NVFP4":
|
||||
if any(v.get("quant_algo") == "NVFP4" for v in quant_layers.values() if isinstance(v, dict)):
|
||||
if nvfp4_compressed_tensors:
|
||||
quant_algo = "NVFP4"
|
||||
elif any(str(v.get("quant_algo")).endswith("NVFP4") for v in quant_layers.values() if isinstance(v, dict)):
|
||||
quant_algo = "NVFP4"
|
||||
|
||||
self._is_nvfp4 = quant_algo == "NVFP4"
|
||||
@@ -776,6 +825,28 @@ class ModelBase:
|
||||
# This must run before dequant_model so NVFP4 tensors are removed
|
||||
# from model_tensors, leaving only non-NVFP4 (e.g. FP8) for dequant.
|
||||
if self._is_nvfp4:
|
||||
if nvfp4_compressed_tensors:
|
||||
# Convert compressed-tensors 'global' scales into the reciprocal
|
||||
def inverse_scale(gen):
|
||||
def load():
|
||||
scale = LazyTorchTensor.to_eager(gen()).float()
|
||||
return 1.0 / scale
|
||||
return load
|
||||
|
||||
# Change the compressed-tensors names to the ModelOpt names for handling consistently later
|
||||
for name in list(self.model_tensors.keys()):
|
||||
if name.endswith(".weight_packed"):
|
||||
weight_name = name.removesuffix("_packed")
|
||||
if weight_name not in self.model_tensors:
|
||||
self.model_tensors[weight_name] = self.model_tensors.pop(name)
|
||||
elif name.endswith(".weight_global_scale"):
|
||||
scale2_name = name.replace(".weight_global_scale", ".weight_scale_2")
|
||||
if scale2_name not in self.model_tensors:
|
||||
self.model_tensors[scale2_name] = inverse_scale(self.model_tensors.pop(name))
|
||||
elif name.endswith(".input_global_scale"):
|
||||
input_scale_name = name.replace(".input_global_scale", ".input_scale")
|
||||
if input_scale_name not in self.model_tensors:
|
||||
self.model_tensors[input_scale_name] = inverse_scale(self.model_tensors.pop(name))
|
||||
self._generate_nvfp4_tensors()
|
||||
|
||||
self.dequant_model()
|
||||
@@ -1575,6 +1646,12 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57":
|
||||
# ref: https://huggingface.co/sarvamai/sarvam-30b
|
||||
res = "sarvam-moe"
|
||||
if chkhsh == "f728162c1315c26e40249849799b4ba3fe584c32084b4795b03eb295e63cb5af":
|
||||
# ref: https://huggingface.co/lewtun/talkie-1930-13b-it-hf
|
||||
res = "talkie"
|
||||
if chkhsh == "36f3066e97b7f3994b379aaacde306c1444c6ae84e81a5ae3cd2b7ed3b8c42d4":
|
||||
# ref: https://huggingface.co/openbmb/MiniCPM5-1B
|
||||
res = "minicpm5"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -2364,10 +2441,9 @@ class MmprojModel(ModelBase):
|
||||
raise KeyError(f"could not find any of: {keys}")
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
del bid, name, n_dims # unused
|
||||
if ".patch_embd.weight" in new_name or ".patch_merger.weight" in new_name:
|
||||
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
|
||||
return False
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
|
||||
class LazyTorchTensor(gguf.LazyBase):
|
||||
|
||||
@@ -614,7 +614,7 @@ class Gemma3NModel(Gemma3Model):
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Gemma4ForConditionalGeneration")
|
||||
@ModelBase.register("Gemma4ForConditionalGeneration", "Gemma4ForCausalLM")
|
||||
class Gemma4Model(Gemma3Model):
|
||||
model_arch = gguf.MODEL_ARCH.GEMMA4
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Iterable, TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
@@ -549,6 +548,7 @@ class _Qwen35MtpMixin:
|
||||
tensor_map: gguf.TensorNameMap
|
||||
no_mtp: bool
|
||||
mtp_only: bool
|
||||
_original_block_count: int | None = None
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
@@ -557,22 +557,44 @@ class _Qwen35MtpMixin:
|
||||
self.block_count += self.hparams.get("mtp_num_hidden_layers", 0)
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
|
||||
def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
|
||||
hparams = {**self.hparams, **self.hparams.get("text_config", {})}
|
||||
key = next((k for k in ["n_layers", "num_hidden_layers", "n_layer", "num_layers"] if k in hparams), None)
|
||||
type(self)._original_block_count = hparams.get(key)
|
||||
return super().index_tensors(remote_hf_model_id=remote_hf_model_id) # ty: ignore[unresolved-attribute]
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item):
|
||||
name, _ = item
|
||||
assert cls._original_block_count is not None
|
||||
# TODO: change TextModel to super()
|
||||
if (titem := TextModel.filter_tensors(item)) is None:
|
||||
return None
|
||||
name, gen = titem
|
||||
if name.startswith("model.mtp."):
|
||||
name = name.replace("model.", "", 1)
|
||||
if name.startswith("mtp."):
|
||||
if cls.no_mtp:
|
||||
return None
|
||||
return item
|
||||
if cls.mtp_only:
|
||||
canonical = name.replace("language_model.", "")
|
||||
keep = canonical in (
|
||||
remapper = {
|
||||
"fc": "eh_proj",
|
||||
"pre_fc_norm_embedding": "enorm",
|
||||
"pre_fc_norm_hidden": "hnorm",
|
||||
"norm": "shared_head.norm",
|
||||
}
|
||||
parts = name.split(".", 3)
|
||||
if len(parts) == 4 and parts[1] == "layers" and parts[2].isdecimal():
|
||||
mtp_idx = int(parts[2])
|
||||
name = f"model.layers.{cls._original_block_count + mtp_idx}.{parts[3]}"
|
||||
elif len(parts) == 3 and parts[1] in remapper:
|
||||
name = f"model.layers.{cls._original_block_count}.{remapper[parts[1]]}.{parts[2]}"
|
||||
elif cls.mtp_only:
|
||||
keep = name in (
|
||||
"model.embed_tokens.weight", "model.norm.weight", "lm_head.weight",
|
||||
"embed_tokens.weight", "norm.weight",
|
||||
)
|
||||
if not keep:
|
||||
return None
|
||||
return super().filter_tensors(item) # ty: ignore[unresolved-attribute]
|
||||
return name, gen
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters() # ty: ignore[unresolved-attribute]
|
||||
@@ -594,29 +616,6 @@ class _Qwen35MtpMixin:
|
||||
self.metadata.version, size_label=None, output_type=output_type, model_type=None) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
|
||||
self.fname_out = self.fname_out.parent / f"mtp-{fname_default}.gguf"
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name.startswith("mtp."):
|
||||
n_layer = self.hparams["num_hidden_layers"]
|
||||
if name.find("layers.") != -1:
|
||||
assert bid is not None
|
||||
name = name.replace(f"mtp.layers.{bid}", f"model.layers.{bid + n_layer}")
|
||||
bid = bid + n_layer
|
||||
else:
|
||||
remapper = {
|
||||
"mtp.fc": "model.layers.{bid}.eh_proj",
|
||||
"mtp.pre_fc_norm_embedding": "model.layers.{bid}.enorm",
|
||||
"mtp.pre_fc_norm_hidden": "model.layers.{bid}.hnorm",
|
||||
"mtp.norm": "model.layers.{bid}.shared_head.norm",
|
||||
}
|
||||
stem = Path(name).stem
|
||||
suffix = Path(name).suffix
|
||||
tmpl = remapper[stem] + suffix
|
||||
for b in range(n_layer, self.block_count):
|
||||
yield from super().modify_tensors(data_torch, tmpl.format(bid=b), b) # ty: ignore[unresolved-attribute]
|
||||
return
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid) # ty: ignore[unresolved-attribute]
|
||||
|
||||
|
||||
@ModelBase.register("Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM")
|
||||
class Qwen3_5TextModel(_Qwen35MtpMixin, _Qwen35MRopeMixin, _LinearAttentionVReorderBase):
|
||||
|
||||
53
conversion/talkie.py
Normal file
53
conversion/talkie.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Iterable, TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from torch import Tensor
|
||||
|
||||
from .base import LazyTorchTensor, ModelBase, TextModel, gguf
|
||||
|
||||
|
||||
@ModelBase.register("TalkieForCausalLM")
|
||||
class TalkieModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.TALKIE
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
# Talkie used F.rms_norm without an explicit eps
|
||||
self.gguf_writer.add_layer_norm_rms_eps(torch.finfo(torch.float32).eps)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
prefix = f"model.blocks.{bid}." if bid is not None else ""
|
||||
suffix = name.removeprefix(prefix)
|
||||
|
||||
if suffix == "attn_gain.a_g":
|
||||
yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid, ".scale"), data_torch
|
||||
return
|
||||
elif suffix == "mlp_gain.a_g":
|
||||
yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid, ".scale"), data_torch
|
||||
return
|
||||
elif suffix == "lm_head_gain.w_g":
|
||||
self.gguf_writer.add_logit_scale(LazyTorchTensor.to_eager(data_torch).item())
|
||||
return
|
||||
elif suffix in ("attn.attn_query.weight", "attn.attn_key.weight"):
|
||||
# absorb inverse rope
|
||||
head_dim = self.hparams["head_dim"]
|
||||
shape = data_torch.shape
|
||||
data_torch = torch.reshape(data_torch, (-1, head_dim, shape[-1]))
|
||||
signs = torch.ones((1, head_dim, 1), dtype=data_torch.dtype)
|
||||
signs[:, head_dim // 2 :, :] = -1
|
||||
if self.lazy:
|
||||
signs = LazyTorchTensor.from_eager(signs)
|
||||
# (n_head, head_dim, n_in) -> (n_out, n_in)
|
||||
data_torch = torch.reshape(data_torch * signs, shape)
|
||||
elif suffix == "attn.head_gain.head_g":
|
||||
# allow head gain to broadcast
|
||||
data_torch = data_torch.unsqueeze(-1)
|
||||
|
||||
if not name.endswith(".weight"):
|
||||
name += ".weight"
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
@@ -148,6 +148,10 @@ def parse_args() -> argparse.Namespace:
|
||||
"--fuse-gate-up-exps", action="store_true",
|
||||
help="Fuse gate_exps and up_exps tensors into a single gate_up_exps tensor for MoE models.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fp8-as-q8", action="store_true",
|
||||
help="Store tensors dequantized from FP8 as Q8_0 instead of BF16/F16.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
if not args.print_supported_models and args.model is None:
|
||||
@@ -264,7 +268,8 @@ def main() -> None:
|
||||
small_first_shard=args.no_tensor_first_split,
|
||||
remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
|
||||
sentence_transformers_dense_modules=args.sentence_transformers_dense_modules,
|
||||
fuse_gate_up_exps=args.fuse_gate_up_exps
|
||||
fuse_gate_up_exps=args.fuse_gate_up_exps,
|
||||
fp8_as_q8=args.fp8_as_q8,
|
||||
)
|
||||
|
||||
if args.vocab_only:
|
||||
|
||||
@@ -156,6 +156,8 @@ models = [
|
||||
{"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
|
||||
{"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
|
||||
{"name": "sarvam-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
|
||||
{"name": "talkie", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/lewtun/talkie-1930-13b-it-hf", },
|
||||
{"name": "minicpm5", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM5-1B"},
|
||||
]
|
||||
|
||||
# some models are known to be broken upstream, so we will skip them as exceptions
|
||||
|
||||
@@ -208,6 +208,16 @@ class LoraTorchTensor:
|
||||
def to(self, *args, **kwargs):
|
||||
return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))
|
||||
|
||||
def __mul__(self, other) -> LoraTorchTensor:
|
||||
# Only output-side multiplication for now
|
||||
# W = B @ A, so M_out * W == (M_out * B) @ A
|
||||
if not isinstance(other, (int, float)) and other.shape and other.shape[-1] != 1:
|
||||
raise NotImplementedError
|
||||
return LoraTorchTensor(self._lora_A, self._lora_B * other)
|
||||
|
||||
def __rmul__(self, other) -> LoraTorchTensor:
|
||||
return self * other
|
||||
|
||||
@classmethod
|
||||
def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
|
||||
del types # unused
|
||||
|
||||
@@ -459,7 +459,7 @@ Each returned parser is wrapped by `wrap_for_generation_prompt()`, which prepend
|
||||
|
||||
- Usage: `./bin/llama-template-analysis path/to/template.jinja`
|
||||
|
||||
**Debug Logging**: Enable with `LLAMA_LOG_VERBOSITY=2`
|
||||
**Debug Logging**: Enable with `LLAMA_ARG_LOG_VERBOSITY=2`
|
||||
|
||||
- Shows detailed analysis steps, pattern extraction results, and generated parser structure
|
||||
|
||||
|
||||
@@ -743,6 +743,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. |
|
||||
| GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. |
|
||||
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
|
||||
| GGML_SYCL_ENABLE_VMM | 0 or 1 (default) | Enable the virtual-memory device pool. |
|
||||
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
|
||||
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Allow SYCL/Unified Runtime Level Zero device allocations larger than 4 GiB. llama.cpp's direct Level Zero allocation path requests the relaxed maximum-size limit itself when GGML_SYCL_ENABLE_LEVEL_ZERO=1. |
|
||||
|
||||
@@ -753,6 +754,7 @@ Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spo
|
||||
| Name | Function |
|
||||
|-----------------|----------------------------------------------------------------------------------|
|
||||
| DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. |
|
||||
| DEBUG_SYCL_MALLOC | Enable verbose per-call logging of device pool alloc/free operations. |
|
||||
|
||||
## Design Rule
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.
|
||||
This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop.
|
||||
|
||||
```
|
||||
~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.6
|
||||
~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.7
|
||||
[d]/> cd /workspace
|
||||
```
|
||||
|
||||
|
||||
@@ -176,7 +176,7 @@ Note that currently you cannot quantize the visual encoder because granite visio
|
||||
|
||||
|
||||
### 5. Running the Model in Llama cpp
|
||||
Build llama cpp normally; you should have a target binary named `llama-mtmd-cli`, which you can pass two binaries to. As an example, we pass the the llama.cpp banner.
|
||||
Build llama cpp normally; you should have a target binary named `llama-mtmd-cli`, which you can pass two binaries to. As an example, we pass the llama.cpp banner.
|
||||
|
||||
```bash
|
||||
$ ./build/bin/llama-mtmd-cli -m $LLM_GGUF_PATH \
|
||||
|
||||
@@ -335,7 +335,7 @@ $ make perplexity-run-full QUANTIZED_MODEL=~/path/to/quantized/model-Qxx.gguf LO
|
||||
|
||||
## HuggingFace utilities
|
||||
The following targets are useful for creating collections and model repositories
|
||||
on Hugging Face in the the ggml-org. These can be used when preparing a release
|
||||
on Hugging Face in the ggml-org. These can be used when preparing a release
|
||||
to script the process for new model releases.
|
||||
|
||||
For the following targets a `HF_TOKEN` environment variable is required.
|
||||
|
||||
@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
|
||||
|
||||
### GGML Version
|
||||
set(GGML_VERSION_MAJOR 0)
|
||||
set(GGML_VERSION_MINOR 12)
|
||||
set(GGML_VERSION_MINOR 13)
|
||||
set(GGML_VERSION_PATCH 0)
|
||||
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
|
||||
|
||||
|
||||
@@ -1189,8 +1189,8 @@ extern "C" {
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// a - x
|
||||
// b - dy
|
||||
// a - dy
|
||||
// b - x
|
||||
GGML_API struct ggml_tensor * ggml_silu_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
||||
@@ -150,7 +150,7 @@ static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t o
|
||||
|
||||
static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
|
||||
// shift all elements after idx by 1 to the left, overwriting the element at idx
|
||||
for (int i = idx; i < chunk->n_free_blocks; i++) {
|
||||
for (int i = idx; i < chunk->n_free_blocks - 1; i++) {
|
||||
chunk->free_blocks[i] = chunk->free_blocks[i+1];
|
||||
}
|
||||
chunk->n_free_blocks--;
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include <cstring>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
@@ -392,64 +393,100 @@ static ggml_backend_buffer_type_t ggml_backend_meta_device_get_host_buffer_type(
|
||||
// meta backend buffer
|
||||
//
|
||||
|
||||
// Container to hold the tensor slices per simple ggml backend buffer.
|
||||
struct ggml_backend_meta_simple_tensor_container {
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
std::map<const ggml_tensor *, std::vector<ggml_tensor *>> simple_tensors;
|
||||
|
||||
ggml_backend_meta_simple_tensor_container(const ggml_init_params & params, const int n_simple) {
|
||||
ctxs.reserve(n_simple);
|
||||
for (int i = 0; i < n_simple; i++) {
|
||||
ctxs.emplace_back(ggml_init(params));
|
||||
}
|
||||
}
|
||||
ggml_backend_meta_simple_tensor_container() {}
|
||||
};
|
||||
|
||||
struct ggml_backend_meta_buffer_context {
|
||||
// FIXME
|
||||
// Most tensors can simply be stored statically in their own buffer.
|
||||
// Externally created views however also need a mapping to simple tensors but they use the buffer of the view source.
|
||||
// If external views are simply using that buffer they will slowly deplete its memory.
|
||||
// Current solution: rotating set of 2 "compute" containers to hold external views, works correctly for llama.cpp.
|
||||
// Long-term: tie the lifetime of external views to the meta backend executing the graph instead,
|
||||
// currently not possible due to graph-external operations in the backend scheduler.
|
||||
ggml_backend_meta_simple_tensor_container stc_static;
|
||||
ggml_backend_meta_simple_tensor_container stc_compute[2];
|
||||
int stc_compute_index = 0;
|
||||
int stc_compute_index_next = 0;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
// FIXME
|
||||
// The size of the split state cache is unbounded and can theoretically grow infinitely large.
|
||||
// However, it is also expensive to build and clearing it on every rebuild in ggml_backend_meta_graph_compute is too expensive.
|
||||
static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::padding);
|
||||
|
||||
std::map<std::pair<const ggml_tensor *, bool>, std::pair<ggml_backend_meta_split_state, char[nbtc]>> split_state_cache;
|
||||
std::map< const ggml_tensor *, std::vector<ggml_tensor *>> simple_tensors;
|
||||
|
||||
struct buffer_config {
|
||||
ggml_context * ctx;
|
||||
ggml_backend_buffer_t buf;
|
||||
|
||||
buffer_config(ggml_context * ctx, ggml_backend_buffer_t buf) : ctx(ctx), buf(buf) {}
|
||||
};
|
||||
std::vector<buffer_config> buf_configs;
|
||||
|
||||
int debug;
|
||||
|
||||
ggml_backend_meta_buffer_context() {
|
||||
ggml_backend_meta_buffer_context(
|
||||
ggml_backend_meta_simple_tensor_container & stc_static,
|
||||
ggml_backend_meta_simple_tensor_container & stc_compute_0,
|
||||
ggml_backend_meta_simple_tensor_container & stc_compute_1,
|
||||
const std::vector<ggml_backend_buffer_t> & bufs)
|
||||
: stc_static(std::move(stc_static)), stc_compute{std::move(stc_compute_0), std::move(stc_compute_1)} {
|
||||
this->bufs.reserve(bufs.size());
|
||||
for (ggml_backend_buffer_t buf : bufs) {
|
||||
this->bufs.emplace_back(buf);
|
||||
}
|
||||
const char * GGML_META_DEBUG = getenv("GGML_META_DEBUG");
|
||||
debug = GGML_META_DEBUG ? atoi(GGML_META_DEBUG) : 0;
|
||||
}
|
||||
|
||||
ggml_backend_meta_simple_tensor_container & get_simple_tensor_container(const ggml_tensor * tensor) {
|
||||
if (stc_static.simple_tensors.find(tensor) != stc_static.simple_tensors.end()) {
|
||||
return stc_static;
|
||||
}
|
||||
return stc_compute[stc_compute_index];
|
||||
}
|
||||
};
|
||||
|
||||
static void ggml_backend_meta_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
|
||||
for (auto & [ctx, buf] : buf_ctx->buf_configs) {
|
||||
ggml_backend_buffer_free(buf);
|
||||
ggml_free(ctx);
|
||||
}
|
||||
delete buf_ctx;
|
||||
}
|
||||
|
||||
static size_t ggml_backend_meta_buffer_n_bufs(ggml_backend_buffer_t meta_buf) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_meta(meta_buf));
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) meta_buf->context;
|
||||
return buf_ctx->buf_configs.size();
|
||||
return buf_ctx->bufs.size();
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_meta_buffer_simple_buffer(ggml_backend_buffer_t meta_buf, size_t index) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_meta(meta_buf));
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) meta_buf->context;
|
||||
GGML_ASSERT(index < buf_ctx->buf_configs.size());
|
||||
return buf_ctx->buf_configs[index].buf;
|
||||
GGML_ASSERT(index < buf_ctx->bufs.size());
|
||||
return buf_ctx->bufs[index].get();
|
||||
}
|
||||
|
||||
static struct ggml_tensor * ggml_backend_meta_buffer_simple_tensor(const struct ggml_tensor * tensor, size_t index) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
|
||||
GGML_ASSERT(index < buf_ctx->buf_configs.size());
|
||||
GGML_ASSERT(index < buf_ctx->bufs.size());
|
||||
|
||||
auto it = buf_ctx->simple_tensors.find(tensor);
|
||||
if (it == buf_ctx->simple_tensors.end()) {
|
||||
ggml_backend_meta_simple_tensor_container & stc = buf_ctx->get_simple_tensor_container(tensor);
|
||||
auto it = stc.simple_tensors.find(tensor);
|
||||
if (it == stc.simple_tensors.end()) {
|
||||
return nullptr;
|
||||
}
|
||||
return it->second[index];
|
||||
}
|
||||
|
||||
static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) {
|
||||
static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync);
|
||||
|
||||
static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
ggml_backend_meta_simple_tensor_container & stc, const struct ggml_tensor * tensor, bool assume_sync) {
|
||||
const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer);
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
|
||||
|
||||
@@ -785,7 +822,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
|
||||
src_ss[i] = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
continue;
|
||||
}
|
||||
src_ss[i] = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true);
|
||||
src_ss[i] = ggml_backend_meta_get_split_state(stc, tensor->src[i], /*assume_sync =*/ true);
|
||||
GGML_ASSERT(src_ss[i].axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN);
|
||||
}
|
||||
|
||||
@@ -1079,17 +1116,23 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
|
||||
return ggml_backend_meta_get_split_state(buf_ctx->get_simple_tensor_container(tensor), tensor, assume_sync);
|
||||
}
|
||||
|
||||
static void * ggml_backend_meta_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
GGML_UNUSED(buffer);
|
||||
return (void *) 0x1000000000000000; // FIXME
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
|
||||
const size_t n_simple_bufs = ggml_backend_meta_buffer_n_bufs(buffer);
|
||||
static enum ggml_status ggml_backend_meta_buffer_init_tensor_impl(ggml_backend_meta_simple_tensor_container & stc, ggml_tensor * tensor) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
|
||||
const size_t n_simple_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer);
|
||||
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ true);
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(stc, tensor, /*assume_sync =*/ true);
|
||||
GGML_ASSERT(ggml_nelements(tensor) == 0 || split_state.axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN);
|
||||
GGML_ASSERT(split_state.n_segments <= 16);
|
||||
|
||||
@@ -1104,8 +1147,8 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
|
||||
std::vector<ggml_tensor *> simple_tensors;
|
||||
simple_tensors.reserve(n_simple_bufs);
|
||||
for (size_t j = 0; j < n_simple_bufs; j++) {
|
||||
ggml_context * simple_ctx = buf_ctx->buf_configs[j].ctx;
|
||||
ggml_backend_buffer_t simple_buf = buf_ctx->buf_configs[j].buf;
|
||||
ggml_context * simple_ctx = stc.ctxs[j].get();
|
||||
ggml_backend_buffer_t simple_buf = buf_ctx->bufs[j].get();
|
||||
|
||||
if (split_dim >= 0 && split_dim < GGML_MAX_DIMS) {
|
||||
// TODO: the following assert fails for llama-parallel even though the results are correct:
|
||||
@@ -1158,7 +1201,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
|
||||
t_ij->data = (char *) t_ij->view_src->data + t_ij->view_offs;
|
||||
} else if (simple_buf != nullptr) {
|
||||
t_ij->data = (char *) ggml_backend_buffer_get_base(simple_buf)
|
||||
+ size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(buffer));
|
||||
+ size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(tensor->buffer));
|
||||
}
|
||||
t_ij->extra = tensor->extra;
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
@@ -1194,11 +1237,18 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
|
||||
}
|
||||
}
|
||||
|
||||
buf_ctx->simple_tensors[tensor] = simple_tensors;
|
||||
stc.simple_tensors[tensor] = simple_tensors;
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
|
||||
buf_ctx->stc_compute_index = buf_ctx->stc_compute_index_next;
|
||||
return ggml_backend_meta_buffer_init_tensor_impl(buf_ctx->get_simple_tensor_container(tensor), tensor);
|
||||
}
|
||||
|
||||
static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(buffer);
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor));
|
||||
@@ -1413,8 +1463,9 @@ static void ggml_backend_meta_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
|
||||
}
|
||||
|
||||
static void ggml_backend_meta_buffer_reset(ggml_backend_buffer_t buffer) {
|
||||
const size_t n_buffers = ggml_backend_meta_buffer_n_bufs(buffer);
|
||||
for (size_t i = 0; i < n_buffers; i++) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
|
||||
for (size_t i = 0; i < buf_ctx->bufs.size(); i++) {
|
||||
ggml_backend_buffer_reset(ggml_backend_meta_buffer_simple_buffer(buffer, i));
|
||||
}
|
||||
}
|
||||
@@ -1440,21 +1491,24 @@ bool ggml_backend_buffer_is_meta(ggml_backend_buffer_t buf) {
|
||||
static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft);
|
||||
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ 1024*1024*1024, // FIXME
|
||||
const ggml_init_params params = {
|
||||
/*.mem_size =*/ 1024*1024*ggml_tensor_overhead(), // FIXME
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
ggml_backend_meta_simple_tensor_container stc_static;
|
||||
ggml_backend_meta_simple_tensor_container stc_compute_0(params, n_simple_bufts);
|
||||
ggml_backend_meta_simple_tensor_container stc_compute_1(params, n_simple_bufts);
|
||||
|
||||
ggml_backend_meta_buffer_context * buf_ctx = new ggml_backend_meta_buffer_context();
|
||||
size_t max_size = 0;
|
||||
buf_ctx->buf_configs.reserve(n_simple_bufts);
|
||||
std::vector<ggml_backend_buffer_t> bufs;
|
||||
bufs.reserve(n_simple_bufts);
|
||||
for (size_t i = 0; i < n_simple_bufts; i++) {
|
||||
ggml_backend_buffer_t simple_buf = ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size);
|
||||
GGML_ASSERT(simple_buf != nullptr);
|
||||
max_size = std::max(max_size, ggml_backend_buffer_get_size(simple_buf));
|
||||
buf_ctx->buf_configs.emplace_back(ggml_init(params), simple_buf);
|
||||
bufs.push_back(ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size));
|
||||
GGML_ASSERT(bufs.back() != nullptr);
|
||||
max_size = std::max(max_size, ggml_backend_buffer_get_size(bufs.back()));
|
||||
}
|
||||
ggml_backend_meta_buffer_context * buf_ctx = new ggml_backend_meta_buffer_context(stc_static, stc_compute_0, stc_compute_1, bufs);
|
||||
|
||||
return ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, buf_ctx, max_size);
|
||||
}
|
||||
@@ -1462,26 +1516,32 @@ static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_bac
|
||||
struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
|
||||
const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft);
|
||||
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ 1024*1024*1024, // FIXME
|
||||
constexpr size_t compute_headroom = 16; // Maximum number of views per statically allocated tensor that can be created between evals.
|
||||
const ggml_init_params params_static = {
|
||||
/*.mem_size =*/ ggml_get_mem_size(ctx),
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
const ggml_init_params params_compute = {
|
||||
/*.mem_size =*/ compute_headroom*ggml_get_mem_size(ctx),
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
ggml_backend_meta_simple_tensor_container stc_static (params_static, n_simple_bufts);
|
||||
ggml_backend_meta_simple_tensor_container stc_compute_0(params_compute, n_simple_bufts);
|
||||
ggml_backend_meta_simple_tensor_container stc_compute_1(params_compute, n_simple_bufts);
|
||||
|
||||
ggml_backend_meta_buffer_context * meta_buf_ctx = new ggml_backend_meta_buffer_context();
|
||||
meta_buf_ctx->buf_configs.reserve(n_simple_bufts);
|
||||
for (size_t i = 0; i < n_simple_bufts; i++) {
|
||||
meta_buf_ctx->buf_configs.emplace_back(ggml_init(params), nullptr);
|
||||
}
|
||||
std::vector<ggml_backend_buffer_t> bufs(n_simple_bufts, nullptr);
|
||||
ggml_backend_meta_buffer_context * meta_buf_ctx = new ggml_backend_meta_buffer_context(stc_static, stc_compute_0, stc_compute_1, bufs);
|
||||
|
||||
ggml_backend_buffer_t meta_buf = ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, meta_buf_ctx, 0);
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
||||
t->buffer = meta_buf;
|
||||
ggml_backend_meta_buffer_init_tensor(meta_buf, t);
|
||||
ggml_backend_meta_buffer_init_tensor_impl(meta_buf_ctx->stc_static, t);
|
||||
t->data = (void *) 0x2000000000000000; // FIXME
|
||||
}
|
||||
for (size_t i = 0; i < n_simple_bufts; i++) {
|
||||
ggml_context * ctx = meta_buf_ctx->buf_configs[i].ctx;
|
||||
ggml_context * ctx = meta_buf_ctx->stc_static.ctxs[i].get();
|
||||
ggml_backend_buffer_type_t simple_buft = ggml_backend_meta_buft_simple_buft(buft, i);
|
||||
|
||||
// If a ggml_context only has zero-sized tensors, ggml_backend_alloc_ctx_tensors_from_buft returns NULL.
|
||||
@@ -1494,15 +1554,15 @@ struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struc
|
||||
}
|
||||
}
|
||||
if (any_nonzero_slice) {
|
||||
meta_buf_ctx->buf_configs[i].buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, simple_buft);
|
||||
meta_buf_ctx->bufs[i].reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx, simple_buft));
|
||||
} else {
|
||||
meta_buf_ctx->buf_configs[i].buf = ggml_backend_buft_alloc_buffer(simple_buft, 0);
|
||||
meta_buf_ctx->bufs[i].reset(ggml_backend_buft_alloc_buffer(simple_buft, 0));
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
||||
t->buffer = meta_buf_ctx->buf_configs[i].buf;
|
||||
t->buffer = meta_buf_ctx->bufs[i].get();
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(meta_buf_ctx->buf_configs[i].buf != nullptr);
|
||||
meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->buf_configs[i].buf));
|
||||
GGML_ASSERT(meta_buf_ctx->bufs[i]);
|
||||
meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->bufs[i].get()));
|
||||
}
|
||||
return meta_buf;
|
||||
}
|
||||
@@ -1724,6 +1784,26 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
}
|
||||
|
||||
if (needs_rebuild) {
|
||||
std::set<ggml_backend_buffer_t> used_buffers;
|
||||
for (int i = 0; i < cgraph->n_leafs; i++) {
|
||||
if (ggml_backend_buffer_is_meta(cgraph->leafs[i]->buffer)) {
|
||||
used_buffers.emplace(cgraph->leafs[i]->buffer);
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
if (ggml_backend_buffer_is_meta(cgraph->nodes[i]->buffer)) {
|
||||
used_buffers.emplace(cgraph->nodes[i]->buffer);
|
||||
}
|
||||
}
|
||||
for (ggml_backend_buffer_t buf : used_buffers) {
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buf->context;
|
||||
buf_ctx->stc_compute_index_next = buf_ctx->stc_compute_index ^ 1;
|
||||
ggml_backend_meta_simple_tensor_container & stc = buf_ctx->stc_compute[buf_ctx->stc_compute_index_next];
|
||||
for (ggml_context_ptr & ctx : stc.ctxs) {
|
||||
ggml_reset(ctx.get());
|
||||
}
|
||||
stc.simple_tensors.clear();
|
||||
}
|
||||
size_t n_subgraphs = 0;
|
||||
size_t max_tmp_size = 0;
|
||||
|
||||
@@ -1909,7 +1989,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
|
||||
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
|
||||
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
|
||||
ggml_init_params params = {
|
||||
const ggml_init_params params = {
|
||||
/*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
|
||||
@@ -273,67 +273,51 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
|
||||
|
||||
#if defined(GGML_SIMD)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
const int sve_register_length = svcntb() * 8; //get vector length
|
||||
const int ggml_f16_epr = sve_register_length / 16; // running when 16
|
||||
const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
|
||||
const int ggml_f16_epr = svcnth();
|
||||
const int ggml_f16_step = 8 * ggml_f16_epr;
|
||||
const int np = n - (n % ggml_f16_step);
|
||||
const int np2 = n - (n % ggml_f16_epr);
|
||||
|
||||
const int np= (n & ~(ggml_f16_step - 1));
|
||||
svfloat16_t sum1 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum2 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum3 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum4 = svdup_n_f16(0.0f);
|
||||
svfloat32_t sum1_lo = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum1_hi = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum2_lo = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum2_hi = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum3_lo = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum3_hi = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum4_lo = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum4_hi = svdup_n_f32(0.0f);
|
||||
|
||||
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
|
||||
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
|
||||
for (int i = 0; i < np; i += ggml_f16_step) {
|
||||
ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
|
||||
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
|
||||
sum1 = GGML_F16x_VEC_FMA(sum1, ax1, ay1);
|
||||
|
||||
ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
|
||||
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
|
||||
sum2 = GGML_F16x_VEC_FMA(sum2, ax2, ay2);
|
||||
|
||||
ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
|
||||
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
|
||||
sum3 = GGML_F16x_VEC_FMA(sum3, ax3, ay3);
|
||||
|
||||
ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
|
||||
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
|
||||
sum4 = GGML_F16x_VEC_FMA(sum4, ax4, ay4);
|
||||
|
||||
ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
|
||||
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
|
||||
sum1 = GGML_F16x_VEC_FMA(sum1, ax5, ay5);
|
||||
|
||||
ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
|
||||
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
|
||||
sum2 = GGML_F16x_VEC_FMA(sum2, ax6, ay6);
|
||||
|
||||
ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
|
||||
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
|
||||
sum3 = GGML_F16x_VEC_FMA(sum3, ax7, ay7);
|
||||
|
||||
ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
|
||||
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
|
||||
sum4 = GGML_F16x_VEC_FMA(sum4, ax8, ay8);
|
||||
ggml_sve_f16_fma_widened(&sum1_lo, &sum1_hi, GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0), GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0));
|
||||
ggml_sve_f16_fma_widened(&sum2_lo, &sum2_hi, GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1), GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1));
|
||||
ggml_sve_f16_fma_widened(&sum3_lo, &sum3_hi, GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2), GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2));
|
||||
ggml_sve_f16_fma_widened(&sum4_lo, &sum4_hi, GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3), GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3));
|
||||
ggml_sve_f16_fma_widened(&sum1_lo, &sum1_hi, GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4), GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4));
|
||||
ggml_sve_f16_fma_widened(&sum2_lo, &sum2_hi, GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5), GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5));
|
||||
ggml_sve_f16_fma_widened(&sum3_lo, &sum3_hi, GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6), GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6));
|
||||
ggml_sve_f16_fma_widened(&sum4_lo, &sum4_hi, GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7), GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7));
|
||||
}
|
||||
|
||||
const int np2 = (n & ~(ggml_f16_epr - 1)); // round down to multiple of 8
|
||||
for (int k = np; k < np2; k += ggml_f16_epr) {
|
||||
svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
|
||||
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
|
||||
sum1 = GGML_F16x_VEC_FMA(sum1, rx, ry);
|
||||
for (int i = np; i < np2; i += ggml_f16_epr) {
|
||||
ggml_sve_f16_fma_widened(&sum1_lo, &sum1_hi, GGML_F16x_VEC_LOAD(x + i, 0), GGML_F16x_VEC_LOAD(y + i, 0));
|
||||
}
|
||||
|
||||
if (np2 < n) {
|
||||
svbool_t pg = svwhilelt_b16(np2, n);
|
||||
svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
|
||||
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
|
||||
const svbool_t pg = svwhilelt_b16(np2, n);
|
||||
const svfloat16_t rx = svld1_f16(pg, (const __fp16 *)(x + np2));
|
||||
const svfloat16_t ry = svld1_f16(pg, (const __fp16 *)(y + np2));
|
||||
|
||||
sum1 = svmad_f16_x(pg, hx, hy, sum1);
|
||||
ggml_sve_f16_fma_widened(&sum1_lo, &sum1_hi, rx, ry);
|
||||
}
|
||||
GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4);
|
||||
|
||||
sum1_lo = svadd_f32_m(DEFAULT_PG32, sum1_lo, sum2_lo);
|
||||
sum1_hi = svadd_f32_m(DEFAULT_PG32, sum1_hi, sum2_hi);
|
||||
sum3_lo = svadd_f32_m(DEFAULT_PG32, sum3_lo, sum4_lo);
|
||||
sum3_hi = svadd_f32_m(DEFAULT_PG32, sum3_hi, sum4_hi);
|
||||
sum1_lo = svadd_f32_m(DEFAULT_PG32, sum1_lo, sum3_lo);
|
||||
sum1_hi = svadd_f32_m(DEFAULT_PG32, sum1_hi, sum3_hi);
|
||||
|
||||
sumf = ggml_sve_sum_f32x2(sum1_lo, sum1_hi);
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#if defined(__riscv_zvfh)
|
||||
int vl = __riscv_vsetvlmax_e32m2();
|
||||
|
||||
@@ -14,6 +14,35 @@
|
||||
// floating point type used to accumulate sums
|
||||
typedef double ggml_float;
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
inline static void ggml_sve_f16_fma_widened(
|
||||
svfloat32_t * acc_lo,
|
||||
svfloat32_t * acc_hi,
|
||||
svfloat16_t x,
|
||||
svfloat16_t y) {
|
||||
#if defined(__ARM_FEATURE_SVE2)
|
||||
*acc_lo = svmlalb_f32(*acc_lo, x, y);
|
||||
*acc_hi = svmlalt_f32(*acc_hi, x, y);
|
||||
#else
|
||||
// Plain SVE fallback path if SVE2 instructions not available
|
||||
svfloat16_t x_even = svtrn1_f16(x, x);
|
||||
svfloat16_t x_odd = svtrn2_f16(x, x);
|
||||
|
||||
svfloat16_t y_even = svtrn1_f16(y, y);
|
||||
svfloat16_t y_odd = svtrn2_f16(y, y);
|
||||
|
||||
svbool_t pg = svptrue_b32();
|
||||
|
||||
*acc_lo = svmla_f32_x(pg, *acc_lo, svcvt_f32_f16_x(pg, x_even), svcvt_f32_f16_x(pg, y_even));
|
||||
*acc_hi = svmla_f32_x(pg, *acc_hi, svcvt_f32_f16_x(pg, x_odd), svcvt_f32_f16_x(pg, y_odd));
|
||||
#endif
|
||||
}
|
||||
|
||||
inline static ggml_float ggml_sve_sum_f32x2(svfloat32_t sum_lo, svfloat32_t sum_hi) {
|
||||
return (ggml_float) (svaddv_f32(svptrue_b32(), sum_lo) + svaddv_f32(svptrue_b32(), sum_hi));
|
||||
}
|
||||
#endif
|
||||
|
||||
#define GGML_GELU_FP16
|
||||
#define GGML_GELU_QUICK_FP16
|
||||
|
||||
@@ -122,108 +151,61 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
|
||||
#if defined(GGML_SIMD)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
|
||||
const int sve_register_length = svcntb() * 8;
|
||||
const int ggml_f16_epr = sve_register_length / 16; // running when 16
|
||||
const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
|
||||
const int ggml_f16_epr = svcnth();
|
||||
const int ggml_f16_step = 2 * ggml_f16_epr;
|
||||
int np = n - (n % ggml_f16_step);
|
||||
int np2 = n - (n % ggml_f16_epr);
|
||||
|
||||
int np = (n & ~(ggml_f16_step - 1));
|
||||
|
||||
svfloat16_t sum_00 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_01 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_02 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_03 = svdup_n_f16(0.0f);
|
||||
|
||||
svfloat16_t sum_10 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_11 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_12 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_13 = svdup_n_f16(0.0f);
|
||||
|
||||
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
|
||||
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
|
||||
svfloat32_t sum_0_0_lo = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum_0_0_hi = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum_0_1_lo = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum_0_1_hi = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum_1_0_lo = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum_1_0_hi = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum_1_1_lo = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum_1_1_hi = svdup_n_f32(0.0f);
|
||||
|
||||
for (int i = 0; i < np; i += ggml_f16_step) {
|
||||
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
|
||||
const svfloat16_t ay0 = GGML_F16x_VEC_LOAD(y + i, 0);
|
||||
const svfloat16_t ax00 = GGML_F16x_VEC_LOAD(x[0] + i, 0);
|
||||
const svfloat16_t ax01 = GGML_F16x_VEC_LOAD(x[1] + i, 0);
|
||||
|
||||
ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements
|
||||
sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
|
||||
ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
|
||||
sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
|
||||
ggml_sve_f16_fma_widened(&sum_0_0_lo, &sum_0_0_hi, ax00, ay0);
|
||||
ggml_sve_f16_fma_widened(&sum_1_0_lo, &sum_1_0_hi, ax01, ay0);
|
||||
|
||||
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
|
||||
const svfloat16_t ay1 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 0);
|
||||
const svfloat16_t ax10 = GGML_F16x_VEC_LOAD(x[0] + i + 1 * ggml_f16_epr, 0);
|
||||
const svfloat16_t ax11 = GGML_F16x_VEC_LOAD(x[1] + i + 1 * ggml_f16_epr, 0);
|
||||
|
||||
ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements
|
||||
sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
|
||||
ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
|
||||
sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
|
||||
|
||||
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
|
||||
|
||||
ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
|
||||
sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
|
||||
ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
|
||||
sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
|
||||
|
||||
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
|
||||
|
||||
ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
|
||||
sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
|
||||
ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
|
||||
sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
|
||||
|
||||
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
|
||||
|
||||
ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
|
||||
|
||||
sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
|
||||
ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
|
||||
sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
|
||||
|
||||
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
|
||||
|
||||
ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
|
||||
|
||||
sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
|
||||
ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
|
||||
sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
|
||||
|
||||
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
|
||||
|
||||
ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
|
||||
|
||||
sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
|
||||
ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
|
||||
sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
|
||||
|
||||
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
|
||||
|
||||
ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
|
||||
|
||||
sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
|
||||
ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
|
||||
sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
|
||||
ggml_sve_f16_fma_widened(&sum_0_1_lo, &sum_0_1_hi, ax10, ay1);
|
||||
ggml_sve_f16_fma_widened(&sum_1_1_lo, &sum_1_1_hi, ax11, ay1);
|
||||
}
|
||||
|
||||
const int np2 = (n & ~(ggml_f16_epr - 1));
|
||||
for (int k = np; k < np2; k += ggml_f16_epr) {
|
||||
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
|
||||
for (int i = np; i < np2; i += ggml_f16_epr) {
|
||||
const svfloat16_t ry = GGML_F16x_VEC_LOAD(y + i, 0);
|
||||
const svfloat16_t rx0 = GGML_F16x_VEC_LOAD(x[0] + i, 0);
|
||||
const svfloat16_t rx1 = GGML_F16x_VEC_LOAD(x[1] + i, 0);
|
||||
|
||||
svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
|
||||
sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
|
||||
rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
|
||||
sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
|
||||
ggml_sve_f16_fma_widened(&sum_0_0_lo, &sum_0_0_hi, rx0, ry);
|
||||
ggml_sve_f16_fma_widened(&sum_1_0_lo, &sum_1_0_hi, rx1, ry);
|
||||
}
|
||||
|
||||
if (np2 < n) {
|
||||
svbool_t pg = svwhilelt_b16(np2, n);
|
||||
svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
|
||||
svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
|
||||
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
|
||||
const svbool_t pg = svwhilelt_b16(np2, n);
|
||||
const svfloat16_t ay = svld1_f16(pg, (const __fp16 *)(y + np2));
|
||||
const svfloat16_t ax0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
|
||||
const svfloat16_t ax1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
|
||||
|
||||
sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
|
||||
sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
|
||||
ggml_sve_f16_fma_widened(&sum_0_0_lo, &sum_0_0_hi, ax0, ay);
|
||||
ggml_sve_f16_fma_widened(&sum_1_0_lo, &sum_1_0_hi, ax1, ay);
|
||||
}
|
||||
GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
|
||||
GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
|
||||
|
||||
svfloat32_t sum_0_lo = svadd_f32_x(DEFAULT_PG32, sum_0_0_lo, sum_0_1_lo);
|
||||
svfloat32_t sum_0_hi = svadd_f32_x(DEFAULT_PG32, sum_0_0_hi, sum_0_1_hi);
|
||||
svfloat32_t sum_1_lo = svadd_f32_x(DEFAULT_PG32, sum_1_0_lo, sum_1_1_lo);
|
||||
svfloat32_t sum_1_hi = svadd_f32_x(DEFAULT_PG32, sum_1_0_hi, sum_1_1_hi);
|
||||
sumf[0] = ggml_sve_sum_f32x2(sum_0_lo, sum_0_hi);
|
||||
sumf[1] = ggml_sve_sum_f32x2(sum_1_lo, sum_1_hi);
|
||||
np = n;
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#if defined(__riscv_zvfh)
|
||||
|
||||
@@ -110,11 +110,14 @@
|
||||
# define GGML_CUDA_USE_CUB
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
|
||||
|
||||
// PDL host-side support (cudaLaunchKernelEx) requires CUDART >= 11.8 and excludes HIP/MUSA.
|
||||
// PDL host-side support (cudaLaunchKernelEx) requires CUDART >= 11.8.
|
||||
// However, this has been bugged in CTK < 12.3 for MSVC builds, see
|
||||
// https://github.com/ggml-org/llama.cpp/pull/22522#discussion_r3302393293
|
||||
// __CUDA_ARCH__ is undefined in host passes; GPU arch check happens in device-side code.
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11080
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && \
|
||||
(CUDART_VERSION >= 12030 || (!(defined(_MSC_VER) && !defined(__clang__)) && CUDART_VERSION >= 11080))
|
||||
# define GGML_CUDA_USE_PDL
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11080
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && (CUDART_VERSION >= 12030 || (!(defined(_MSC_VER) && !defined(__clang__)) && CUDART_VERSION >= 11080))
|
||||
|
||||
static __device__ __forceinline__ void ggml_cuda_pdl_sync() {
|
||||
#if defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
|
||||
|
||||
@@ -472,7 +472,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
|
||||
|
||||
const int i = 8 * (threadIdx.x % (nbatch_fa/8));
|
||||
|
||||
cp_async_cg_16<preload>(tile_mask_32 + j_sram*(nbatch_fa*sizeof(half) + 16) + i*sizeof(half), mask_h + j_vram*stride_mask + i);
|
||||
cp_async_cg_16<preload>(tile_mask_32 + j_sram*(nbatch_fa*sizeof(half) + 16) + i*sizeof(half), mask_h + int64_t(j_vram)*stride_mask + i);
|
||||
}
|
||||
} else if constexpr (oob_check) {
|
||||
#pragma unroll
|
||||
@@ -488,7 +488,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
|
||||
for (int i0 = 0; i0 < nbatch_fa; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
tile_mask[j_sram*(nbatch_fa + 8) + i] = i < i_sup ? mask_h[j_vram*stride_mask + i] : half(0.0f);
|
||||
tile_mask[j_sram*(nbatch_fa + 8) + i] = i < i_sup ? mask_h[int64_t(j_vram)*stride_mask + i] : half(0.0f);
|
||||
}
|
||||
}
|
||||
} else if constexpr (nbatch_fa < 2*warp_size) {
|
||||
@@ -505,7 +505,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
|
||||
|
||||
const int i = threadIdx.x % (warp_size/cols_per_warp);
|
||||
|
||||
ggml_cuda_memcpy_1<sizeof(half2)>(tile_mask + j_sram*(nbatch_fa + 8) + 2*i, mask_h + j_vram*stride_mask + 2*i);
|
||||
ggml_cuda_memcpy_1<sizeof(half2)>(tile_mask + j_sram*(nbatch_fa + 8) + 2*i, mask_h + int64_t(j_vram)*stride_mask + 2*i);
|
||||
}
|
||||
} else {
|
||||
#pragma unroll
|
||||
@@ -521,7 +521,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
|
||||
for (int i0 = 0; i0 < nbatch_fa; i0 += 2*warp_size) {
|
||||
const int i = i0 + 2*threadIdx.x;
|
||||
|
||||
ggml_cuda_memcpy_1<sizeof(half2)>(tile_mask + j_sram*(nbatch_fa + 8) + i, mask_h + j_vram*stride_mask + i);
|
||||
ggml_cuda_memcpy_1<sizeof(half2)>(tile_mask + j_sram*(nbatch_fa + 8) + i, mask_h + int64_t(j_vram)*stride_mask + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
101
ggml/src/ggml-cuda/fwht.cu
Normal file
101
ggml/src/ggml-cuda/fwht.cu
Normal file
@@ -0,0 +1,101 @@
|
||||
#include "common.cuh"
|
||||
#include "fwht.cuh"
|
||||
|
||||
template <int N>
|
||||
__launch_bounds__(4*ggml_cuda_get_physical_warp_size(), 1)
|
||||
__global__ void fwht_cuda(const float * src, float * dst, const int64_t n_rows, const float scale) {
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
const int64_t r = (int64_t) blockIdx.x * blockDim.y + threadIdx.y;
|
||||
|
||||
if (r >= n_rows) {
|
||||
return;
|
||||
}
|
||||
|
||||
src += r * N;
|
||||
dst += r * N;
|
||||
|
||||
static constexpr int el_w = N / warp_size;
|
||||
float reg[el_w];
|
||||
const int lane = threadIdx.x;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
#pragma unroll
|
||||
for (int i = 0; i < el_w; ++i) {
|
||||
reg[i] = src[i * warp_size + lane] * scale;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int h = 1; h < warp_size; h *= 2) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < el_w; j++) {
|
||||
const float val = reg[j];
|
||||
const float val2 = __shfl_xor_sync(0xFFFFFFFF, val, h, warp_size);
|
||||
|
||||
reg[j] = (lane & h) == 0 ? val + val2 : val2 - val;
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int h = warp_size; h < N; h *= 2) {
|
||||
const int step = h / warp_size;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < el_w; j += 2 * step) {
|
||||
#pragma unroll
|
||||
for (int k = 0; k < step; k++) {
|
||||
const float x = reg[j + k];
|
||||
const float y = reg[j + k + step];
|
||||
|
||||
reg[j + k] = x + y;
|
||||
reg[j + k + step] = x - y;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < el_w; ++i) {
|
||||
dst[i * warp_size + lane] = reg[i];
|
||||
}
|
||||
}
|
||||
|
||||
bool ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
GGML_ASSERT(ggml_are_same_shape(src, dst));
|
||||
if (!ggml_is_contiguous(src) || !ggml_is_contiguous(dst)) {
|
||||
return false;
|
||||
}
|
||||
const int n = src->ne[0];
|
||||
const int64_t rows = ggml_nrows(src);
|
||||
|
||||
const float * src_d = (const float *) src->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
|
||||
const int rows_per_block = 4;
|
||||
|
||||
const int64_t num_blocks = (rows + rows_per_block - 1) / rows_per_block;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
dim3 grid_dims(num_blocks, 1, 1);
|
||||
dim3 block_dims(warp_size, rows_per_block, 1);
|
||||
const ggml_cuda_kernel_launch_params launch_params =
|
||||
ggml_cuda_kernel_launch_params(grid_dims, block_dims, 0, stream);
|
||||
|
||||
const float scale = 1 / sqrtf(n);
|
||||
|
||||
switch (n) {
|
||||
case 64:
|
||||
ggml_cuda_kernel_launch(fwht_cuda<64>, launch_params, src_d, dst_d, rows, scale);
|
||||
return true;
|
||||
case 128:
|
||||
ggml_cuda_kernel_launch(fwht_cuda<128>, launch_params, src_d, dst_d, rows, scale);
|
||||
return true;
|
||||
case 256:
|
||||
ggml_cuda_kernel_launch(fwht_cuda<256>, launch_params, src_d, dst_d, rows, scale);
|
||||
return true;
|
||||
case 512:
|
||||
ggml_cuda_kernel_launch(fwht_cuda<512>, launch_params, src_d, dst_d, rows, scale);
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
4
ggml/src/ggml-cuda/fwht.cuh
Normal file
4
ggml/src/ggml-cuda/fwht.cuh
Normal file
@@ -0,0 +1,4 @@
|
||||
#include "common.cuh"
|
||||
|
||||
// Returns whether the Fast Walsh-Hadamard transform could be used.
|
||||
bool ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst);
|
||||
@@ -24,6 +24,7 @@
|
||||
#include "ggml-cuda/diagmask.cuh"
|
||||
#include "ggml-cuda/diag.cuh"
|
||||
#include "ggml-cuda/fattn.cuh"
|
||||
#include "ggml-cuda/fwht.cuh"
|
||||
#include "ggml-cuda/getrows.cuh"
|
||||
#include "ggml-cuda/im2col.cuh"
|
||||
#include "ggml-cuda/mmf.cuh"
|
||||
@@ -2569,6 +2570,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
|
||||
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
|
||||
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
|
||||
use_mul_mat_vec_q = use_mul_mat_vec_q && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
|
||||
}
|
||||
} else {
|
||||
@@ -2577,6 +2579,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
|
||||
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
|
||||
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
|
||||
use_mul_mat_vec_q = use_mul_mat_vec_q && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
|
||||
}
|
||||
|
||||
@@ -2594,6 +2597,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
|
||||
bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32;
|
||||
|
||||
const int32_t hint = ggml_get_op_params_i32(dst, 1);
|
||||
if (hint == GGML_HINT_SRC0_IS_HADAMARD && !split && ggml_cuda_op_fwht(ctx, src1, dst)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!split && use_mul_mat_vec_f) {
|
||||
// the custom F16 vector kernel can be used over batched cuBLAS GEMM
|
||||
// but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
|
||||
@@ -4986,8 +4994,14 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
|
||||
GGML_UNUSED(dev);
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *) dev->context;
|
||||
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, ctx->device));
|
||||
|
||||
return prop.integrated
|
||||
? GGML_BACKEND_DEVICE_TYPE_IGPU
|
||||
: GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
|
||||
@@ -63,6 +63,7 @@ static constexpr __host__ __device__ int get_vdr_mmvq(ggml_type type) {
|
||||
|
||||
enum mmvq_parameter_table_id {
|
||||
MMVQ_PARAMETERS_GENERIC = 0,
|
||||
MMVQ_PARAMETERS_TURING,
|
||||
MMVQ_PARAMETERS_GCN,
|
||||
MMVQ_PARAMETERS_RDNA2,
|
||||
MMVQ_PARAMETERS_RDNA3_0,
|
||||
@@ -78,6 +79,8 @@ static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
|
||||
return MMVQ_PARAMETERS_RDNA2;
|
||||
#elif defined(GCN) || defined(CDNA)
|
||||
return MMVQ_PARAMETERS_GCN;
|
||||
#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING && __CUDA_ARCH__ < GGML_CUDA_CC_AMPERE
|
||||
return MMVQ_PARAMETERS_TURING;
|
||||
#else
|
||||
return MMVQ_PARAMETERS_GENERIC;
|
||||
#endif
|
||||
@@ -96,6 +99,9 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
|
||||
if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
|
||||
return MMVQ_PARAMETERS_GCN;
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING && ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_AMPERE) {
|
||||
return MMVQ_PARAMETERS_TURING;
|
||||
}
|
||||
return MMVQ_PARAMETERS_GENERIC;
|
||||
}
|
||||
|
||||
@@ -271,6 +277,53 @@ int get_mmvq_mmid_max_batch(ggml_type type, int cc) {
|
||||
return MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
|
||||
bool ggml_cuda_should_use_mmvq(enum ggml_type type, int cc, int64_t ne11) {
|
||||
if (GGML_CUDA_CC_IS_CDNA(cc)) {
|
||||
if (GGML_CUDA_CC_IS_CDNA1(cc)) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
return ne11 <= 7;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return ne11 <= 7;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return ne11 <= 6;
|
||||
case GGML_TYPE_Q2_K:
|
||||
return ne11 <= 4;
|
||||
case GGML_TYPE_Q3_K:
|
||||
return ne11 <= 3;
|
||||
case GGML_TYPE_Q4_K:
|
||||
return ne11 <= 2;
|
||||
case GGML_TYPE_Q5_K:
|
||||
return ne11 <= 3;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return ne11 <= 4;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
return ne11 <= 5;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
return ne11 <= 6;
|
||||
default:
|
||||
return ne11 <= MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
}
|
||||
switch (type) { // tuned for CDNA2
|
||||
case GGML_TYPE_Q2_K:
|
||||
return ne11 <= 5;
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
return ne11 <= 3;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return ne11 <= 5;
|
||||
default:
|
||||
return ne11 <= MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
}
|
||||
return ne11 <= MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
|
||||
// Device constexpr: returns the max batch size for the current arch+type at compile time.
|
||||
template <ggml_type type>
|
||||
static constexpr __device__ int get_mmvq_mmid_max_batch_for_device() {
|
||||
@@ -370,11 +423,38 @@ static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_d
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
if (table_id == MMVQ_PARAMETERS_TURING) {
|
||||
if (ncols_dst == 1) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
return 2;
|
||||
default:
|
||||
return 4;
|
||||
}
|
||||
}
|
||||
switch (ncols_dst) {
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
return 4;
|
||||
case 5:
|
||||
case 6:
|
||||
case 7:
|
||||
case 8:
|
||||
return 2;
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int table_id, bool small_k = false, int nwarps = 1) {
|
||||
if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN) {
|
||||
if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN || table_id == MMVQ_PARAMETERS_TURING) {
|
||||
switch (ncols_dst) {
|
||||
case 1:
|
||||
return small_k ? nwarps : 1;
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
|
||||
|
||||
bool ggml_cuda_should_use_mmvq(enum ggml_type type, int cc, int64_t ne11);
|
||||
|
||||
// Returns the maximum batch size for which MMVQ should be used for MUL_MAT_ID,
|
||||
// based on the quantization type and GPU architecture (compute capability).
|
||||
int get_mmvq_mmid_max_batch(ggml_type type, int cc);
|
||||
|
||||
@@ -68,6 +68,7 @@ static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C }
|
||||
static int opt_opstage = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE;
|
||||
static int opt_opbatch = 1024; // max number of ops in a batch
|
||||
static int opt_opqueue = 16; // max number of pending batches
|
||||
static int opt_oppoll = 0; // polling for batch completions
|
||||
|
||||
static std::regex* opt_opfilter = NULL; // regex of ops to not claim
|
||||
|
||||
@@ -550,7 +551,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
|
||||
|
||||
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
||||
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
|
||||
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
|
||||
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
|
||||
|
||||
// Ensure we don't try to read more data than is available in the source buffer 'data'
|
||||
// or write more than the tensor can hold.
|
||||
@@ -611,7 +612,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
|
||||
|
||||
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
||||
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
|
||||
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
|
||||
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
|
||||
|
||||
// Ensure we don't try to copy more data than the tensor actually contains.
|
||||
const size_t total_tensor_size = (size_t)nrows * row_size;
|
||||
@@ -660,6 +661,239 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
|
||||
ggml_aligned_free(buf_rp, row_size_rp);
|
||||
}
|
||||
|
||||
static void unpack_q4_1_quants(uint8_t * qs, const block_q4_1 * x, unsigned int bi) {
|
||||
static const int qk = QK4_1;
|
||||
|
||||
for (unsigned int i = 0; i < qk / 2; ++i) {
|
||||
const int x0 = (x->qs[i] & 0x0F);
|
||||
const int x1 = (x->qs[i] >> 4);
|
||||
qs[bi * qk + i + 0] = x0;
|
||||
qs[bi * qk + i + qk / 2] = x1;
|
||||
}
|
||||
}
|
||||
|
||||
static void pack_q4_1_quants(block_q4_1 * x, const uint8_t * qs, unsigned int bi) {
|
||||
static const int qk = QK4_1;
|
||||
|
||||
for (unsigned int i = 0; i < qk / 2; ++i) {
|
||||
const uint8_t x0 = qs[bi * qk + i + 0];
|
||||
const uint8_t x1 = qs[bi * qk + i + qk / 2];
|
||||
x->qs[i] = x0 | (x1 << 4);
|
||||
}
|
||||
}
|
||||
|
||||
static void repack_row_q4_1x4x2(uint8_t * y, const block_q4_1 * x, int64_t k) {
|
||||
static const int qk = QK_Q4_0x4x2;
|
||||
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
||||
const int nloe = k % qk; // leftovers
|
||||
|
||||
const int dblk_size = 8 * 4; // 8x (d, m) __fp16 = 32 bytes
|
||||
const int qblk_size = qk / 2; // int4 = 128 bytes
|
||||
const int qrow_size = k / 2; // int4 (not padded to blocks)
|
||||
|
||||
uint8_t * y_q = y + 0; // quants first
|
||||
uint8_t * y_d = y + qrow_size; // then scales/offsets
|
||||
|
||||
// Repack the quants
|
||||
for (int i = 0; i < nb; i++) {
|
||||
uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
|
||||
unpack_q4_1_quants(qs, &x[i * 8 + 0], 0);
|
||||
unpack_q4_1_quants(qs, &x[i * 8 + 1], 1);
|
||||
unpack_q4_1_quants(qs, &x[i * 8 + 2], 2);
|
||||
unpack_q4_1_quants(qs, &x[i * 8 + 3], 3);
|
||||
unpack_q4_1_quants(qs, &x[i * 8 + 4], 4);
|
||||
unpack_q4_1_quants(qs, &x[i * 8 + 5], 5);
|
||||
unpack_q4_1_quants(qs, &x[i * 8 + 6], 6);
|
||||
unpack_q4_1_quants(qs, &x[i * 8 + 7], 7);
|
||||
|
||||
bool partial = (nloe && i == nb-1);
|
||||
|
||||
uint8_t * q = y_q + (i * qblk_size);
|
||||
for (int j = 0; j < qk / 2; j++) {
|
||||
q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
|
||||
}
|
||||
}
|
||||
|
||||
// Repack the scales and offsets
|
||||
for (int i = 0; i < nb; i++) {
|
||||
ggml_half * d_m = (ggml_half *) (y_d + i * dblk_size);
|
||||
for (int j = 0; j < 8; j++) {
|
||||
d_m[j * 2 + 0] = x[i * 8 + j].d;
|
||||
d_m[j * 2 + 1] = x[i * 8 + j].m;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void unpack_row_q4_1x4x2(block_q4_1 * x, const uint8_t * y, int64_t k) {
|
||||
static const int qk = QK_Q4_0x4x2;
|
||||
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
||||
const int nloe = k % qk; // leftovers
|
||||
|
||||
const int dblk_size = 8 * 4; // 8x (d, m) __fp16 = 32 bytes
|
||||
const int qblk_size = qk / 2; // int4 = 128 bytes
|
||||
const int qrow_size = k / 2; // int4 (not padded to blocks)
|
||||
|
||||
const uint8_t * y_q = y + 0; // quants first
|
||||
const uint8_t * y_d = y + qrow_size; // then scales/offsets
|
||||
|
||||
// Unpack the quants
|
||||
for (int i = 0; i < nb; i++) {
|
||||
uint8_t qs[QK_Q4_0x4x2];
|
||||
bool partial = (nloe && i == nb-1);
|
||||
|
||||
const uint8_t * q = y_q + (i * qblk_size);
|
||||
for (int j = 0; j < qk / 2; j++) {
|
||||
if (partial) {
|
||||
qs[j*2+0] = q[j] & 0x0F;
|
||||
qs[j*2+1] = q[j] >> 4;
|
||||
} else {
|
||||
qs[j+000] = q[j] & 0x0F;
|
||||
qs[j+128] = q[j] >> 4;
|
||||
}
|
||||
}
|
||||
|
||||
pack_q4_1_quants(&x[i * 8 + 0], qs, 0);
|
||||
pack_q4_1_quants(&x[i * 8 + 1], qs, 1);
|
||||
pack_q4_1_quants(&x[i * 8 + 2], qs, 2);
|
||||
pack_q4_1_quants(&x[i * 8 + 3], qs, 3);
|
||||
pack_q4_1_quants(&x[i * 8 + 4], qs, 4);
|
||||
pack_q4_1_quants(&x[i * 8 + 5], qs, 5);
|
||||
pack_q4_1_quants(&x[i * 8 + 6], qs, 6);
|
||||
pack_q4_1_quants(&x[i * 8 + 7], qs, 7);
|
||||
}
|
||||
|
||||
// Unpack the scales and offsets
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const ggml_half * d_m = (const ggml_half *) (y_d + i * dblk_size);
|
||||
for (int j = 0; j < 8; j++) {
|
||||
x[i * 8 + j].d = d_m[j * 2 + 0];
|
||||
x[i * 8 + j].m = d_m[j * 2 + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void init_row_q4_1x4x2(block_q4_1 * x, int64_t k) {
|
||||
static const int qk = QK_Q4_0x4x2;
|
||||
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
|
||||
|
||||
uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
|
||||
memset(qs, 0, sizeof(qs));
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
pack_q4_1_quants(&x[i * 8 + 0], qs, 0);
|
||||
pack_q4_1_quants(&x[i * 8 + 1], qs, 1);
|
||||
pack_q4_1_quants(&x[i * 8 + 2], qs, 2);
|
||||
pack_q4_1_quants(&x[i * 8 + 3], qs, 3);
|
||||
pack_q4_1_quants(&x[i * 8 + 4], qs, 4);
|
||||
pack_q4_1_quants(&x[i * 8 + 5], qs, 5);
|
||||
pack_q4_1_quants(&x[i * 8 + 6], qs, 6);
|
||||
pack_q4_1_quants(&x[i * 8 + 7], qs, 7);
|
||||
}
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
for (int j = 0; j < 8; j++) {
|
||||
x[i * 8 + j].d = 0;
|
||||
x[i * 8 + j].m = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void repack_q4_1_q4x4x2(ggml_tensor * t, const void * data, size_t size) {
|
||||
int64_t nrows = ggml_nrows(t);
|
||||
|
||||
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
||||
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));
|
||||
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
|
||||
|
||||
const size_t total_tensor_size = (size_t)nrows * row_size;
|
||||
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
|
||||
|
||||
const int64_t n_full_rows = n_bytes_to_copy / row_size;
|
||||
const size_t n_rem_bytes = n_bytes_to_copy % row_size;
|
||||
|
||||
void * buf_pd = ggml_aligned_malloc(row_size_pd);
|
||||
GGML_ASSERT(buf_pd != NULL);
|
||||
|
||||
void * buf_rp = ggml_aligned_malloc(row_size_rp);
|
||||
GGML_ASSERT(buf_rp != NULL);
|
||||
|
||||
HEX_VERBOSE("ggml-hex: repack-q4_1-q4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
|
||||
t->ne[0], nrows, row_size);
|
||||
|
||||
init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]);
|
||||
|
||||
for (int64_t i = 0; i < n_full_rows; i++) {
|
||||
const uint8_t * src = (const uint8_t *) data + (i * row_size);
|
||||
uint8_t * dst = (uint8_t *) t->data + (i * row_size);
|
||||
|
||||
memcpy(buf_pd, src, row_size);
|
||||
repack_row_q4_1x4x2((uint8_t *) buf_rp, (const block_q4_1 *) buf_pd, t->ne[0]);
|
||||
memcpy(dst, buf_rp, row_size);
|
||||
}
|
||||
|
||||
if (n_rem_bytes > 0) {
|
||||
const int64_t i = n_full_rows;
|
||||
const uint8_t * src = (const uint8_t *) data + (i * row_size);
|
||||
uint8_t * dst = (uint8_t *) t->data + (i * row_size);
|
||||
|
||||
init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]);
|
||||
memcpy(buf_pd, src, n_rem_bytes);
|
||||
repack_row_q4_1x4x2((uint8_t *) buf_rp, (const block_q4_1 *) buf_pd, t->ne[0]);
|
||||
memcpy(dst, buf_rp, n_rem_bytes);
|
||||
}
|
||||
|
||||
ggml_aligned_free(buf_pd, row_size_pd);
|
||||
ggml_aligned_free(buf_rp, row_size_rp);
|
||||
}
|
||||
|
||||
static void repack_q4x4x2_q4_1(void * data, const ggml_tensor * t, size_t size) {
|
||||
int64_t nrows = ggml_nrows(t);
|
||||
|
||||
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
||||
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2));
|
||||
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
|
||||
|
||||
const size_t total_tensor_size = (size_t)nrows * row_size;
|
||||
const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size;
|
||||
|
||||
const int64_t n_full_rows = n_bytes_to_copy / row_size;
|
||||
const size_t n_rem_bytes = n_bytes_to_copy % row_size;
|
||||
|
||||
void * buf_pd = ggml_aligned_malloc(row_size_pd);
|
||||
GGML_ASSERT(buf_pd != NULL);
|
||||
|
||||
void * buf_rp = ggml_aligned_malloc(row_size_rp);
|
||||
GGML_ASSERT(buf_rp != NULL);
|
||||
|
||||
HEX_VERBOSE("ggml-hex: repack-q4x4x2-q4_1 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size,
|
||||
t->ne[0], nrows, row_size);
|
||||
|
||||
memset(buf_rp, 0, row_size_rp); // clear-out padded buffer to make sure the tail is all zeros
|
||||
|
||||
for (int64_t i = 0; i < n_full_rows; i++) {
|
||||
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
|
||||
uint8_t * dst = (uint8_t *) data + (i * row_size);
|
||||
|
||||
memcpy(buf_rp, src, row_size);
|
||||
unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]);
|
||||
memcpy(dst, buf_pd, row_size);
|
||||
}
|
||||
|
||||
if (n_rem_bytes > 0) {
|
||||
const int64_t i = n_full_rows;
|
||||
const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
|
||||
uint8_t * dst = (uint8_t *) data + (i * row_size);
|
||||
|
||||
// We still need to read and unpack the entire source row because quantization is block-based.
|
||||
memcpy(buf_rp, src, row_size);
|
||||
unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]);
|
||||
memcpy(dst, buf_pd, n_rem_bytes);
|
||||
}
|
||||
|
||||
ggml_aligned_free(buf_pd, row_size_pd);
|
||||
ggml_aligned_free(buf_rp, row_size_rp);
|
||||
}
|
||||
|
||||
// ======== Q8x4x2 ====================
|
||||
static void dump_block_q8_0(const block_q8_0 * b, int i) {
|
||||
HEX_VERBOSE("ggml-hex: repack q8_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, b->qs[0], b->qs[1], b->qs[2],
|
||||
@@ -876,7 +1110,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
|
||||
|
||||
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
||||
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
|
||||
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
|
||||
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size quants + scales)
|
||||
|
||||
// Ensure we don't try to read more data than is available in the source buffer 'data'
|
||||
// or write more than the tensor can hold.
|
||||
@@ -937,7 +1171,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
|
||||
|
||||
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
||||
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
|
||||
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
|
||||
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size quants + scales)
|
||||
|
||||
// Ensure we don't try to copy more data than the tensor actually contains.
|
||||
const size_t total_tensor_size = (size_t)nrows * row_size;
|
||||
@@ -1238,7 +1472,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
|
||||
|
||||
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
||||
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
|
||||
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
|
||||
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
|
||||
|
||||
// Ensure we don't try to read more data than is available in the source buffer 'data'
|
||||
// or write more than the tensor can hold.
|
||||
@@ -1299,7 +1533,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
|
||||
|
||||
size_t row_size = ggml_row_size(t->type, t->ne[0]);
|
||||
size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
|
||||
size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)
|
||||
size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales)
|
||||
|
||||
// Ensure we don't try to copy more data than the tensor actually contains.
|
||||
const size_t total_tensor_size = (size_t)nrows * row_size;
|
||||
@@ -1365,6 +1599,12 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
repack_q4_0_q4x4x2(tensor, data, size);
|
||||
break;
|
||||
|
||||
case GGML_TYPE_Q4_1:
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
||||
repack_q4_1_q4x4x2(tensor, data, size);
|
||||
break;
|
||||
|
||||
case GGML_TYPE_Q8_0:
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
||||
@@ -1407,6 +1647,12 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
repack_q4x4x2_q4_0(data, tensor, size);
|
||||
break;
|
||||
|
||||
case GGML_TYPE_Q4_1:
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
||||
repack_q4x4x2_q4_1(data, tensor, size);
|
||||
break;
|
||||
|
||||
case GGML_TYPE_Q8_0:
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
|
||||
@@ -1886,7 +2132,8 @@ void ggml_hexagon_session::flush_pending(bool all) {
|
||||
uint32_t n_dbufs;
|
||||
|
||||
// Read response packet from queue
|
||||
int err = dspqueue_read(this->queue, &flags, 1, &n_dbufs, &dbuf, sizeof(rsp), &rsp_size, (uint8_t *) &rsp, DSPQUEUE_TIMEOUT);
|
||||
const uint32_t timeo = opt_oppoll ? 0 : DSPQUEUE_TIMEOUT;
|
||||
int err = dspqueue_read(this->queue, &flags, 1, &n_dbufs, &dbuf, sizeof(rsp), &rsp_size, (uint8_t *) &rsp, timeo);
|
||||
if (err == AEE_EEXPIRED) {
|
||||
continue;
|
||||
}
|
||||
@@ -2290,6 +2537,7 @@ static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_ses
|
||||
const int64_t H = v->ne[1];
|
||||
const int64_t n_tokens = v->ne[2];
|
||||
const int64_t n_seqs = v->ne[3];
|
||||
const int64_t K = state->ne[1];
|
||||
|
||||
if (S_v <= 0 || S_v > 128 || H <= 0 || n_tokens <= 0 || n_seqs <= 0) {
|
||||
return false;
|
||||
@@ -2302,10 +2550,10 @@ static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_ses
|
||||
if ((g->ne[0] != 1 && g->ne[0] != S_v) || beta->ne[0] != 1) {
|
||||
return false;
|
||||
}
|
||||
if (ggml_nelements(state) != S_v * S_v * H * n_seqs) {
|
||||
if (ggml_nelements(state) != S_v * S_v * H * n_seqs * K) {
|
||||
return false;
|
||||
}
|
||||
if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs) {
|
||||
if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs * K) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2327,6 +2575,7 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_MXFP4:
|
||||
@@ -2377,6 +2626,7 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_MXFP4:
|
||||
@@ -2874,6 +3124,7 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
|
||||
case GGML_OP_NORM: return HTP_OP_NORM;
|
||||
case GGML_OP_L2_NORM: return HTP_OP_L2_NORM;
|
||||
case GGML_OP_RMS_NORM: return HTP_OP_RMS_NORM;
|
||||
case GGML_OP_CONCAT: return HTP_OP_CONCAT;
|
||||
case GGML_OP_SCALE: return HTP_OP_SCALE;
|
||||
case GGML_OP_SQR: return HTP_OP_SQR;
|
||||
case GGML_OP_SQRT: return HTP_OP_SQRT;
|
||||
@@ -3286,6 +3537,25 @@ static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * se
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_concat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
int dim = ((const int32_t *) op->op_params)[0];
|
||||
if (dim < 0 || dim >= GGML_MAX_DIMS) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < GGML_MAX_SRC; ++i) {
|
||||
const struct ggml_tensor * src = op->src[i];
|
||||
if (!src) {
|
||||
continue;
|
||||
}
|
||||
if (src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_I32 && src->type != GGML_TYPE_F16) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_fill(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * dst = op;
|
||||
|
||||
@@ -3434,6 +3704,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
||||
supp = ggml_hexagon_supported_cumsum(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_CONCAT:
|
||||
supp = ggml_hexagon_supported_concat(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_FILL:
|
||||
supp = ggml_hexagon_supported_fill(sess, op);
|
||||
break;
|
||||
@@ -3598,6 +3872,8 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
|
||||
// Basic sanity checks to make sure definitions match
|
||||
static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0,
|
||||
"please update hexagon_type to match ggml_type");
|
||||
static_assert((unsigned int) HTP_TYPE_Q4_1 == (unsigned int) GGML_TYPE_Q4_1,
|
||||
"please update hexagon_type to match ggml_type");
|
||||
static_assert((unsigned int) HTP_TYPE_Q8_0 == (unsigned int) GGML_TYPE_Q8_0,
|
||||
"please update hexagon_type to match ggml_type");
|
||||
static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4,
|
||||
@@ -3610,6 +3886,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
|
||||
const char * str_opstage = getenv("GGML_HEXAGON_OPSTAGE");
|
||||
const char * str_opbatch = getenv("GGML_HEXAGON_OPBATCH");
|
||||
const char * str_opqueue = getenv("GGML_HEXAGON_OPQUEUE");
|
||||
const char * str_oppoll = getenv("GGML_HEXAGON_OPPOLL");
|
||||
const char * str_opfilter = getenv("GGML_HEXAGON_OPFILTER");
|
||||
const char * str_profile = getenv("GGML_HEXAGON_PROFILE");
|
||||
const char * str_etm = getenv("GGML_HEXAGON_ETM");
|
||||
@@ -3647,6 +3924,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
|
||||
opt_opstage = str_opstage ? strtoul(str_opstage, NULL, 0) : opt_opstage;
|
||||
opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
|
||||
opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
|
||||
opt_oppoll = str_oppoll ? strtoul(str_oppoll, NULL, 0) : opt_oppoll;
|
||||
opt_profile = str_profile ? atoi(str_profile) : 0;
|
||||
opt_etm = str_etm ? atoi(str_etm) : 0;
|
||||
opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
|
||||
|
||||
@@ -35,6 +35,7 @@ add_library(${HTP_LIB} SHARED
|
||||
ssm-conv.c
|
||||
cumsum-ops.c
|
||||
fill-ops.c
|
||||
concat-ops.c
|
||||
diag-ops.c
|
||||
solve-tri-ops.c
|
||||
gated-delta-net-ops.c
|
||||
@@ -57,15 +58,16 @@ list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)
|
||||
|
||||
if (_hmx_idx GREATER_EQUAL 0)
|
||||
target_sources(${HTP_LIB} PRIVATE
|
||||
hmx-queue.c
|
||||
hmx-matmul-ops.c
|
||||
hmx-flash-attn-ops.c
|
||||
hmx-matmul-ops.c
|
||||
hmx-queue.c
|
||||
)
|
||||
|
||||
# -mhmx enables HMX instruction set (needed by files that include hmx-utils.h)
|
||||
set_source_files_properties(
|
||||
hmx-matmul-ops.c
|
||||
hmx-flash-attn-ops.c
|
||||
hmx-matmul-ops.c
|
||||
hmx-queue.c
|
||||
PROPERTIES COMPILE_OPTIONS "-mhmx"
|
||||
)
|
||||
|
||||
|
||||
275
ggml/src/ggml-hexagon/htp/concat-ops.c
Normal file
275
ggml/src/ggml-hexagon/htp/concat-ops.c
Normal file
@@ -0,0 +1,275 @@
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-ops.h"
|
||||
#include "hexagon_types.h"
|
||||
#include "hexagon_protos.h"
|
||||
#include "hvx_hexagon_protos.h"
|
||||
#include "hex-dma.h"
|
||||
#include "vtcm-utils.h"
|
||||
#include "hvx-utils.h"
|
||||
#include "hex-fastdiv.h"
|
||||
#include <string.h>
|
||||
|
||||
struct htp_concat_context {
|
||||
struct htp_ops_context * octx;
|
||||
uint32_t dim;
|
||||
uint32_t nrows_per_thread;
|
||||
struct fastdiv_values div_ne0;
|
||||
struct fastdiv_values div_ne1;
|
||||
struct fastdiv_values div_ne2;
|
||||
};
|
||||
|
||||
static void concat_2d_f32_transposed(unsigned int nth, unsigned int ith, void * data) {
|
||||
struct htp_concat_context * cctx = (struct htp_concat_context *) data;
|
||||
struct htp_ops_context * octx = cctx->octx;
|
||||
|
||||
const struct htp_tensor * src0 = octx->src[0];
|
||||
const struct htp_tensor * src1 = octx->src[1];
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
const uint32_t src0_ne0 = src0->ne[0];
|
||||
const uint32_t src1_ne0 = src1->ne[0];
|
||||
const uint32_t ne1 = dst->ne[1];
|
||||
|
||||
const uint32_t start_i = ith * cctx->nrows_per_thread;
|
||||
const uint32_t end_i = (start_i + cctx->nrows_per_thread < ne1) ? (start_i + cctx->nrows_per_thread) : ne1;
|
||||
if (start_i >= end_i) return;
|
||||
|
||||
dma_queue * q = octx->ctx->dma[ith];
|
||||
|
||||
uint8_t * spad0_base = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread;
|
||||
uint8_t * spad1_base = octx->src1_spad.data + ith * octx->src1_spad.size_per_thread;
|
||||
|
||||
const uint32_t block_i = 32;
|
||||
const uint32_t spad1_stride = block_i * sizeof(float);
|
||||
|
||||
int32_t offsets[32] __attribute__((aligned(128)));
|
||||
for(int k=0; k<32; k++) {
|
||||
offsets[k] = k * spad1_stride;
|
||||
}
|
||||
HVX_Vector vv = *(HVX_Vector*)offsets;
|
||||
const uint32_t src1_ne0_padded = hex_round_up(src1_ne0, 32);
|
||||
const uint32_t spad0_row_bytes = hex_round_up((src0_ne0 + src1_ne0_padded) * sizeof(float), VLEN);
|
||||
uint32_t mu = src1_ne0_padded * spad1_stride;
|
||||
|
||||
for (uint32_t i = start_i; i < end_i; i += block_i) {
|
||||
uint32_t current_block_i = (end_i - i < block_i) ? (end_i - i) : block_i;
|
||||
|
||||
uint32_t src1_width_bytes = current_block_i * sizeof(float);
|
||||
uint8_t * src1_ptr = (uint8_t *)src1->data + i * src1->nb[1];
|
||||
dma_queue_push(q, dma_make_ptr(spad1_base, src1_ptr), spad1_stride, src1->nb[0], src1_width_bytes, src1_ne0);
|
||||
|
||||
uint32_t src0_row_bytes = src0_ne0 * sizeof(float);
|
||||
uint8_t * src0_ptr = (uint8_t *)src0->data + i * src0->nb[1];
|
||||
dma_queue_push(q, dma_make_ptr(spad0_base, src0_ptr), spad0_row_bytes, src0->nb[1], src0_row_bytes, current_block_i);
|
||||
|
||||
dma_queue_pop(q); // src1
|
||||
|
||||
HVX_Vector * vtcm_tmp = (HVX_Vector *)(spad1_base + src1_ne0_padded * spad1_stride);
|
||||
|
||||
for (uint32_t j = 0; j < src1_ne0_padded; j += 32) {
|
||||
#pragma unroll(4)
|
||||
for (uint32_t ii = 0; ii < current_block_i; ii++) {
|
||||
size_t rt = (size_t)(spad1_base + j * spad1_stride + ii * sizeof(float));
|
||||
Q6_vgather_ARMVw(&vtcm_tmp[ii], rt, mu, vv);
|
||||
uint8_t * dst_ptr = spad0_base + ii * spad0_row_bytes + (src0_ne0 + j) * sizeof(float);
|
||||
hvx_vmemu(dst_ptr) = vtcm_tmp[ii];
|
||||
}
|
||||
}
|
||||
|
||||
dma_queue_pop(q); // src0
|
||||
|
||||
uint8_t * dst_ptr = (uint8_t *)dst->data + i * dst->nb[1];
|
||||
dma_queue_push(q, dma_make_ptr(dst_ptr, spad0_base), dst->nb[1], spad0_row_bytes, (src0_ne0 + src1_ne0) * sizeof(float), current_block_i);
|
||||
|
||||
dma_queue_pop(q);
|
||||
}
|
||||
}
|
||||
|
||||
static void concat_2d_f16_transposed(unsigned int nth, unsigned int ith, void * data) {
|
||||
struct htp_concat_context * cctx = (struct htp_concat_context *) data;
|
||||
struct htp_ops_context * octx = cctx->octx;
|
||||
|
||||
const struct htp_tensor * src0 = octx->src[0];
|
||||
const struct htp_tensor * src1 = octx->src[1];
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
const uint32_t src0_ne0 = src0->ne[0];
|
||||
const uint32_t src1_ne0 = src1->ne[0];
|
||||
const uint32_t ne1 = dst->ne[1];
|
||||
|
||||
const uint32_t start_i = ith * cctx->nrows_per_thread;
|
||||
const uint32_t end_i = (start_i + cctx->nrows_per_thread < ne1) ? (start_i + cctx->nrows_per_thread) : ne1;
|
||||
if (start_i >= end_i) return;
|
||||
|
||||
dma_queue * q = octx->ctx->dma[ith];
|
||||
|
||||
uint8_t * spad0_base = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread;
|
||||
uint8_t * spad1_base = octx->src1_spad.data + ith * octx->src1_spad.size_per_thread;
|
||||
|
||||
const uint32_t block_i = 64;
|
||||
const uint32_t spad1_stride = block_i * sizeof(__fp16);
|
||||
|
||||
int16_t offsets[64] __attribute__((aligned(128)));
|
||||
for(int k=0; k<64; k++) {
|
||||
offsets[k] = k * spad1_stride;
|
||||
}
|
||||
HVX_Vector vv = *(HVX_Vector*)offsets;
|
||||
const uint32_t src1_ne0_padded = hex_round_up(src1_ne0, 64);
|
||||
const uint32_t spad0_row_bytes = hex_round_up((src0_ne0 + src1_ne0_padded) * sizeof(__fp16), VLEN);
|
||||
uint32_t mu = src1_ne0_padded * spad1_stride;
|
||||
|
||||
for (uint32_t i = start_i; i < end_i; i += block_i) {
|
||||
uint32_t current_block_i = (end_i - i < block_i) ? (end_i - i) : block_i;
|
||||
|
||||
uint32_t src1_width_bytes = current_block_i * sizeof(__fp16);
|
||||
uint8_t * src1_ptr = (uint8_t *)src1->data + i * src1->nb[1];
|
||||
dma_queue_push(q, dma_make_ptr(spad1_base, src1_ptr), spad1_stride, src1->nb[0], src1_width_bytes, src1_ne0);
|
||||
|
||||
uint32_t src0_row_bytes = src0_ne0 * sizeof(__fp16);
|
||||
uint8_t * src0_ptr = (uint8_t *)src0->data + i * src0->nb[1];
|
||||
dma_queue_push(q, dma_make_ptr(spad0_base, src0_ptr), spad0_row_bytes, src0->nb[1], src0_row_bytes, current_block_i);
|
||||
|
||||
dma_queue_pop(q); // src1
|
||||
|
||||
HVX_Vector * vtcm_tmp = (HVX_Vector *)(spad1_base + src1_ne0_padded * spad1_stride);
|
||||
|
||||
for (uint32_t j = 0; j < src1_ne0_padded; j += 64) {
|
||||
#pragma unroll(4)
|
||||
for (uint32_t ii = 0; ii < current_block_i; ii++) {
|
||||
size_t rt = (size_t)(spad1_base + j * spad1_stride + ii * sizeof(__fp16));
|
||||
Q6_vgather_ARMVh(&vtcm_tmp[ii], rt, mu, vv);
|
||||
uint8_t * dst_ptr = spad0_base + ii * spad0_row_bytes + (src0_ne0 + j) * sizeof(__fp16);
|
||||
hvx_vmemu(dst_ptr) = vtcm_tmp[ii];
|
||||
}
|
||||
}
|
||||
|
||||
dma_queue_pop(q); // src0
|
||||
|
||||
uint8_t * dst_ptr = (uint8_t *)dst->data + i * dst->nb[1];
|
||||
dma_queue_push(q, dma_make_ptr(dst_ptr, spad0_base), dst->nb[1], spad0_row_bytes, (src0_ne0 + src1_ne0) * sizeof(__fp16), current_block_i);
|
||||
|
||||
dma_queue_pop(q);
|
||||
}
|
||||
}
|
||||
|
||||
static void concat_generic(unsigned int nth, unsigned int ith, void * data) {
|
||||
struct htp_concat_context * cctx = (struct htp_concat_context *) data;
|
||||
struct htp_ops_context * octx = cctx->octx;
|
||||
|
||||
const struct htp_tensor * src0 = octx->src[0];
|
||||
const struct htp_tensor * src1 = octx->src[1];
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
const int dim = cctx->dim;
|
||||
const uint32_t type_size = (dst->type == HTP_TYPE_F32 || dst->type == HTP_TYPE_I32) ? 4 : 2;
|
||||
|
||||
const uint32_t ne[4] = {dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]};
|
||||
const uint32_t total_elements = ne[0] * ne[1] * ne[2] * ne[3];
|
||||
const uint32_t chunk_size = (total_elements + nth - 1) / nth;
|
||||
|
||||
const uint32_t start_idx = MIN(ith * chunk_size, total_elements);
|
||||
const uint32_t end_idx = MIN(start_idx + chunk_size, total_elements);
|
||||
|
||||
// Naive scalar element-wise copy
|
||||
for (uint32_t idx = start_idx; idx < end_idx; idx++) {
|
||||
uint32_t idx_div_ne0 = fastdiv(idx, &cctx->div_ne0);
|
||||
uint32_t i0 = idx - idx_div_ne0 * ne[0];
|
||||
|
||||
uint32_t idx_div_ne01 = fastdiv(idx_div_ne0, &cctx->div_ne1);
|
||||
uint32_t i1 = idx_div_ne0 - idx_div_ne01 * ne[1];
|
||||
|
||||
uint32_t idx_div_ne012 = fastdiv(idx_div_ne01, &cctx->div_ne2);
|
||||
uint32_t i2 = idx_div_ne01 - idx_div_ne012 * ne[2];
|
||||
uint32_t i3 = idx_div_ne012;
|
||||
|
||||
uint8_t * dst_ptr = (uint8_t *)dst->data + i3 * dst->nb[3] + i2 * dst->nb[2] + i1 * dst->nb[1] + i0 * dst->nb[0];
|
||||
|
||||
uint32_t idx_dim = 0;
|
||||
if (dim == 0) idx_dim = i0;
|
||||
else if (dim == 1) idx_dim = i1;
|
||||
else if (dim == 2) idx_dim = i2;
|
||||
else if (dim == 3) idx_dim = i3;
|
||||
|
||||
const struct htp_tensor * src = (idx_dim < src0->ne[dim]) ? src0 : src1;
|
||||
|
||||
uint32_t s0 = i0;
|
||||
uint32_t s1 = i1;
|
||||
uint32_t s2 = i2;
|
||||
uint32_t s3 = i3;
|
||||
|
||||
if (dim == 0 && src == src1) s0 -= src0->ne[0];
|
||||
if (dim == 1 && src == src1) s1 -= src0->ne[1];
|
||||
if (dim == 2 && src == src1) s2 -= src0->ne[2];
|
||||
if (dim == 3 && src == src1) s3 -= src0->ne[3];
|
||||
|
||||
uint8_t * src_ptr = (uint8_t *)src->data + s3 * src->nb[3] + s2 * src->nb[2] + s1 * src->nb[1] + s0 * src->nb[0];
|
||||
|
||||
if (type_size == 4) {
|
||||
*(float*)dst_ptr = *(float*)src_ptr;
|
||||
} else {
|
||||
*(__fp16*)dst_ptr = *(__fp16*)src_ptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int op_concat(struct htp_ops_context * octx) {
|
||||
const struct htp_tensor * src0 = octx->src[0];
|
||||
const struct htp_tensor * src1 = octx->src[1];
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
int dim = octx->op_params[0];
|
||||
|
||||
bool is_2d = dst->ne[2] == 1 && dst->ne[3] == 1;
|
||||
|
||||
const uint32_t type_size = (dst->type == HTP_TYPE_F32 || dst->type == HTP_TYPE_I32) ? 4 : 2;
|
||||
bool is_src1_transposed = (src1->nb[0] > src1->nb[1]);
|
||||
bool is_src0_transposed = (src0->nb[0] > src0->nb[1]);
|
||||
|
||||
uint32_t n_threads = octx->n_threads;
|
||||
struct htp_concat_context cctx;
|
||||
cctx.octx = octx;
|
||||
cctx.dim = dim;
|
||||
cctx.div_ne0 = init_fastdiv_values(dst->ne[0]);
|
||||
cctx.div_ne1 = init_fastdiv_values(dst->ne[1]);
|
||||
cctx.div_ne2 = init_fastdiv_values(dst->ne[2]);
|
||||
|
||||
void (*worker_func)(unsigned int, unsigned int, void *) = concat_generic;
|
||||
|
||||
if (dim == 0 && is_2d && is_src1_transposed && !is_src0_transposed) {
|
||||
n_threads = MIN(dst->ne[1], n_threads);
|
||||
if (n_threads < 1) {
|
||||
n_threads = 1;
|
||||
}
|
||||
uint32_t block_i = (type_size == 4) ? 32 : 64;
|
||||
|
||||
cctx.nrows_per_thread = hmx_ceil_div(dst->ne[1], n_threads);
|
||||
|
||||
// Allocate VTCM
|
||||
uint32_t spad1_stride = block_i * type_size;
|
||||
|
||||
uint32_t src1_ne0_padded = hex_round_up(src1->ne[0], block_i);
|
||||
uint32_t spad0_row_bytes = hex_round_up((src0->ne[0] + src1_ne0_padded) * type_size, VLEN);
|
||||
|
||||
octx->src0_spad.size_per_thread = block_i * spad0_row_bytes;
|
||||
octx->src1_spad.size_per_thread = src1_ne0_padded * spad1_stride + block_i * VLEN;
|
||||
|
||||
octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
|
||||
octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread;
|
||||
|
||||
if (octx->src0_spad.size + octx->src1_spad.size > octx->ctx->vtcm_size) {
|
||||
return HTP_STATUS_VTCM_TOO_SMALL;
|
||||
}
|
||||
|
||||
octx->src0_spad.data = octx->ctx->vtcm_base;
|
||||
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
|
||||
|
||||
if (type_size == 4) {
|
||||
worker_func = concat_2d_f32_transposed;
|
||||
} else {
|
||||
worker_func = concat_2d_f16_transposed;
|
||||
}
|
||||
}
|
||||
|
||||
worker_pool_run_func(octx->ctx->worker_pool, worker_func, &cctx, n_threads);
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
@@ -28,158 +28,170 @@ struct htp_copy_context {
|
||||
uint32_t dst_blocks_per_row;
|
||||
|
||||
uint32_t src0_nrows_per_thread;
|
||||
|
||||
void (*copy)(struct htp_copy_context * ct, struct htp_ops_context * octx, int nth, int ith);
|
||||
};
|
||||
|
||||
#define cpy_preamble \
|
||||
const struct htp_tensor *src0 = octx->src[0]; \
|
||||
const struct htp_tensor *dst = octx->dst; \
|
||||
\
|
||||
const uint32_t ne00 = src0->ne[0]; \
|
||||
const uint32_t ne01 = src0->ne[1]; \
|
||||
const uint32_t ne02 = src0->ne[2]; \
|
||||
const uint32_t ne03 = src0->ne[3]; \
|
||||
\
|
||||
const uint32_t nb00 = src0->nb[0]; \
|
||||
const uint32_t nb01 = src0->nb[1]; \
|
||||
const uint32_t nb02 = src0->nb[2]; \
|
||||
const uint32_t nb03 = src0->nb[3]; \
|
||||
\
|
||||
const uint32_t ne0 = dst->ne[0]; \
|
||||
const uint32_t ne1 = dst->ne[1]; \
|
||||
const uint32_t ne2 = dst->ne[2]; \
|
||||
const uint32_t ne3 = dst->ne[3]; \
|
||||
\
|
||||
const uint32_t nb0 = dst->nb[0]; \
|
||||
const uint32_t nb1 = dst->nb[1]; \
|
||||
const uint32_t nb2 = dst->nb[2]; \
|
||||
const uint32_t nb3 = dst->nb[3]; \
|
||||
\
|
||||
const uint32_t ne00 = src0->ne[0]; \
|
||||
const uint32_t ne01 = src0->ne[1]; \
|
||||
const uint32_t ne02 = src0->ne[2]; \
|
||||
const uint32_t ne03 = src0->ne[3]; \
|
||||
\
|
||||
const uint32_t nb00 = src0->nb[0]; \
|
||||
const uint32_t nb01 = src0->nb[1]; \
|
||||
const uint32_t nb02 = src0->nb[2]; \
|
||||
const uint32_t nb03 = src0->nb[3]; \
|
||||
\
|
||||
const uint32_t ne0 = dst->ne[0]; \
|
||||
const uint32_t ne1 = dst->ne[1]; \
|
||||
const uint32_t ne2 = dst->ne[2]; \
|
||||
const uint32_t ne3 = dst->ne[3]; \
|
||||
\
|
||||
const uint32_t nb0 = dst->nb[0]; \
|
||||
const uint32_t nb1 = dst->nb[1]; \
|
||||
const uint32_t nb2 = dst->nb[2]; \
|
||||
const uint32_t nb3 = dst->nb[3]; \
|
||||
\
|
||||
const uint32_t nr = ne01;
|
||||
|
||||
static void cpy_thread_sametype_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) {
|
||||
cpy_preamble;
|
||||
|
||||
// parallelize by src0 rows
|
||||
const uint32_t dr = ct->src0_nrows_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
|
||||
|
||||
// copy by rows
|
||||
for (uint32_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (uint32_t i02 = 0; i02 < ne02; i02++) {
|
||||
#pragma unroll(2)
|
||||
for (uint32_t i01 = ir0; i01 < ir1; i01++) {
|
||||
uint8_t* dst_ptr = (uint8_t*) dst->data + i01*nb1 + i02*nb2 + i03*nb3;
|
||||
uint8_t* src0_ptr = (uint8_t*) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
|
||||
hex_l2fetch(src0_ptr, ne00 * ct->src0_type_size, nb01, 2);
|
||||
hvx_copy_uu(dst_ptr, src0_ptr, ne00, ct->src0_type_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
#define DEFINE_CPY_SAMESHAPE(NAME, ELEM_TYPE, ELEM_SIZE) \
|
||||
static void cpy_thread_##NAME##_sameshape(unsigned int nth, unsigned int ith, void * data) { \
|
||||
struct htp_copy_context * ct = (struct htp_copy_context *) data; \
|
||||
struct htp_ops_context * octx = ct->octx; \
|
||||
cpy_preamble; \
|
||||
const uint32_t dr = ct->src0_nrows_per_thread; \
|
||||
const uint32_t ir0 = dr * ith; \
|
||||
const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr; \
|
||||
if (ir0 >= nr) return; \
|
||||
for (uint32_t i03 = 0; i03 < ne03; i03++) { \
|
||||
for (uint32_t i02 = 0; i02 < ne02; i02++) { \
|
||||
_Pragma("unroll(4)") \
|
||||
for (uint32_t i01 = ir0; i01 < ir1; i01++) { \
|
||||
uint8_t* dst_ptr = (uint8_t*) dst->data + i01*nb1 + i02*nb2 + i03*nb3; \
|
||||
uint8_t* src0_ptr = (uint8_t*) src0->data + i01*nb01 + i02*nb02 + i03*nb03; \
|
||||
hex_l2fetch(src0_ptr, ne00 * ELEM_SIZE, nb01, 2); \
|
||||
hvx_copy_uu(dst_ptr, src0_ptr, ne00, ELEM_SIZE); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
static void cpy_thread_sametype_reshape(struct htp_copy_context * ct, struct htp_ops_context * octx, int nth, int ith) {
|
||||
cpy_preamble;
|
||||
DEFINE_CPY_SAMESHAPE(f32, float, 4)
|
||||
DEFINE_CPY_SAMESHAPE(f16, __fp16, 2)
|
||||
|
||||
// parallelize by src0 rows
|
||||
const uint32_t dr = ct->src0_nrows_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
|
||||
|
||||
// Fast path: when both src0 and dst are contiguous in memory
|
||||
// Replace the element-by-element loop with a single bulk HVX copy per (i03, i02) slice.
|
||||
const bool src0_contig = (nb00 == ct->src0_type_size) &&
|
||||
(nb01 == ne00 * nb00) &&
|
||||
(nb02 == ne01 * nb01) &&
|
||||
(nb03 == ne02 * nb02);
|
||||
const bool dst_contig = (nb0 == ct->dst_type_size) &&
|
||||
(nb1 == ne0 * nb0) &&
|
||||
(nb2 == ne1 * nb1) &&
|
||||
(nb3 == ne2 * nb2);
|
||||
|
||||
if (src0_contig && dst_contig) {
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
uint8_t * src_ptr = (uint8_t *) src0->data + i03*nb03 + i02*nb02 + ir0*nb01;
|
||||
uint32_t flat = ((i03*ne02 + i02)*ne01 + ir0) * ne00;
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + flat * ct->src0_type_size;
|
||||
hvx_copy_uu(dst_ptr, src_ptr, (ir1 - ir0) * ne00, ct->src0_type_size);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// dst counters
|
||||
int64_t k10 = 0;
|
||||
int64_t i11 = 0;
|
||||
int64_t i12 = 0;
|
||||
int64_t i13 = 0;
|
||||
|
||||
// number of blocks in a row
|
||||
const int64_t nk00 = ct->src0_blocks_per_row;
|
||||
const int64_t nk0 = ct->dst_blocks_per_row;
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
k10 += nk00 * ir0;
|
||||
while (k10 >= nk0) {
|
||||
k10 -= nk0;
|
||||
if (++i11 == ne1) {
|
||||
i11 = 0;
|
||||
if (++i12 == ne2) {
|
||||
i12 = 0;
|
||||
if (++i13 == ne3) {
|
||||
i13 = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int64_t i01 = ir0; i01 < ir1; i01++) {
|
||||
for (int64_t k00 = 0; k00 < nk00; k00++) {
|
||||
const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
char * dst_ptr = ((char *) dst->data + k10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
|
||||
memcpy(dst_ptr, src0_ptr, ct->dst_type_size);
|
||||
|
||||
if (++k10 == nk0) {
|
||||
k10 = 0;
|
||||
if (++i11 == ne1) {
|
||||
i11 = 0;
|
||||
if (++i12 == ne2) {
|
||||
i12 = 0;
|
||||
if (++i13 == ne3) {
|
||||
i13 = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
k10 += nk00 * (ne01 - ir1);
|
||||
while (k10 >= nk0) {
|
||||
k10 -= nk0;
|
||||
if (++i11 == ne1) {
|
||||
i11 = 0;
|
||||
if (++i12 == ne2) {
|
||||
i12 = 0;
|
||||
if (++i13 == ne3) {
|
||||
i13 = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#define DEFINE_CPY_RESHAPE(NAME, ELEM_TYPE, ELEM_SIZE) \
|
||||
static void cpy_thread_##NAME##_reshape(unsigned int nth, unsigned int ith, void * data) { \
|
||||
struct htp_copy_context * ct = (struct htp_copy_context *) data; \
|
||||
struct htp_ops_context * octx = ct->octx; \
|
||||
cpy_preamble; \
|
||||
const uint32_t dr = ct->src0_nrows_per_thread; \
|
||||
const uint32_t ir0 = dr * ith; \
|
||||
const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr; \
|
||||
if (ir0 >= nr) return; \
|
||||
const bool src0_contig = (nb00 == ELEM_SIZE) && \
|
||||
(nb01 == ne00 * nb00) && \
|
||||
(nb02 == ne01 * nb01) && \
|
||||
(nb03 == ne02 * nb02); \
|
||||
const bool dst_contig = (nb0 == ELEM_SIZE) && \
|
||||
(nb1 == ne0 * nb0) && \
|
||||
(nb2 == ne1 * nb1) && \
|
||||
(nb3 == ne2 * nb2); \
|
||||
if (src0_contig && dst_contig) { \
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) { \
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) { \
|
||||
uint8_t * src_ptr = (uint8_t *) src0->data + i03*nb03 + i02*nb02 + ir0*nb01; \
|
||||
uint32_t flat = ((i03*ne02 + i02)*ne01 + ir0) * ne00; \
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + flat * ELEM_SIZE; \
|
||||
hvx_copy_uu(dst_ptr, src_ptr, (ir1 - ir0) * ne00, ELEM_SIZE); \
|
||||
} \
|
||||
} \
|
||||
return; \
|
||||
} \
|
||||
const bool reshape_flat_fast = (ne03 == 1 && ne2 == 1 && ne3 == 1) && \
|
||||
(ne0 == ne00 * ne01) && (ne1 == ne02) && \
|
||||
(nb00 == ELEM_SIZE) && (nb0 == ELEM_SIZE); \
|
||||
if (reshape_flat_fast) { \
|
||||
for (uint32_t i02 = 0; i02 < ne02; i02++) { \
|
||||
for (uint32_t i01 = ir0; i01 < ir1; i01++) { \
|
||||
uint8_t * src0_ptr = (uint8_t *) src0->data + i01 * nb01 + i02 * nb02; \
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + i01 * ne00 * ELEM_SIZE + i02 * nb1; \
|
||||
hvx_copy_uu(dst_ptr, src0_ptr, ne00, ELEM_SIZE); \
|
||||
} \
|
||||
} \
|
||||
return; \
|
||||
} \
|
||||
int64_t k10 = 0; \
|
||||
int64_t i11 = 0; \
|
||||
int64_t i12 = 0; \
|
||||
int64_t i13 = 0; \
|
||||
const int64_t nk00 = ct->src0_blocks_per_row; \
|
||||
const int64_t nk0 = ct->dst_blocks_per_row; \
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) { \
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) { \
|
||||
k10 += nk00 * ir0; \
|
||||
while (k10 >= nk0) { \
|
||||
k10 -= nk0; \
|
||||
if (++i11 == ne1) { \
|
||||
i11 = 0; \
|
||||
if (++i12 == ne2) { \
|
||||
i12 = 0; \
|
||||
if (++i13 == ne3) { \
|
||||
i13 = 0; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
for (int64_t i01 = ir0; i01 < ir1; i01++) { \
|
||||
for (int64_t k00 = 0; k00 < nk00; k00++) { \
|
||||
const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); \
|
||||
char * dst_ptr = ((char *) dst->data + k10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); \
|
||||
memcpy(dst_ptr, src0_ptr, ELEM_SIZE); \
|
||||
if (++k10 == nk0) { \
|
||||
k10 = 0; \
|
||||
if (++i11 == ne1) { \
|
||||
i11 = 0; \
|
||||
if (++i12 == ne2) { \
|
||||
i12 = 0; \
|
||||
if (++i13 == ne3) { \
|
||||
i13 = 0; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
k10 += nk00 * (ne01 - ir1); \
|
||||
while (k10 >= nk0) { \
|
||||
k10 -= nk0; \
|
||||
if (++i11 == ne1) { \
|
||||
i11 = 0; \
|
||||
if (++i12 == ne2) { \
|
||||
i12 = 0; \
|
||||
if (++i13 == ne3) { \
|
||||
i13 = 0; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
static void cpy_thread_f16_f32_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) {
|
||||
DEFINE_CPY_RESHAPE(f32, float, 4)
|
||||
DEFINE_CPY_RESHAPE(f16, __fp16, 2)
|
||||
|
||||
static void cpy_thread_f16_f32_sameshape(unsigned int nth, unsigned int ith, void * data) {
|
||||
struct htp_copy_context * ct = (struct htp_copy_context *) data;
|
||||
struct htp_ops_context * octx = ct->octx;
|
||||
cpy_preamble;
|
||||
|
||||
// parallelize by src0 rows
|
||||
const uint32_t dr = ct->src0_nrows_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
|
||||
if (ir0 >= nr) return;
|
||||
|
||||
// copy by rows
|
||||
for (uint32_t i03 = 0; i03 < ne03; i03++) {
|
||||
@@ -195,13 +207,16 @@ static void cpy_thread_f16_f32_sameshape(struct htp_copy_context * ct, struct ht
|
||||
}
|
||||
}
|
||||
|
||||
static void cpy_thread_f32_f16_sameshape(struct htp_copy_context * ct, struct htp_ops_context * octx, const int nth, const int ith) {
|
||||
static void cpy_thread_f32_f16_sameshape(unsigned int nth, unsigned int ith, void * data) {
|
||||
struct htp_copy_context * ct = (struct htp_copy_context *) data;
|
||||
struct htp_ops_context * octx = ct->octx;
|
||||
cpy_preamble;
|
||||
|
||||
// parallelize by src0 rows
|
||||
const uint32_t dr = ct->src0_nrows_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
const uint32_t ir1 = (ir0 + dr) < nr ? (ir0 + dr) : nr;
|
||||
if (ir0 >= nr) return;
|
||||
|
||||
// copy by rows
|
||||
for (uint32_t i03 = 0; i03 < ne03; i03++) {
|
||||
@@ -217,11 +232,6 @@ static void cpy_thread_f32_f16_sameshape(struct htp_copy_context * ct, struct ht
|
||||
}
|
||||
}
|
||||
|
||||
static void cpy_work_func(unsigned int n, unsigned int i, void *data) {
|
||||
struct htp_copy_context *ct = (struct htp_copy_context *) data;
|
||||
ct->copy(ct, ct->octx, n, i);
|
||||
}
|
||||
|
||||
int op_cpy(struct htp_ops_context * octx) {
|
||||
cpy_preamble;
|
||||
|
||||
@@ -254,22 +264,32 @@ int op_cpy(struct htp_ops_context * octx) {
|
||||
|
||||
ct.src0_nrows_per_thread = (nr + n_threads - 1) / n_threads;
|
||||
|
||||
worker_callback_t copy_fun;
|
||||
|
||||
if (sametype && sameshape) {
|
||||
ct.copy = cpy_thread_sametype_sameshape;
|
||||
if (src0->type == HTP_TYPE_F32) {
|
||||
copy_fun = cpy_thread_f32_sameshape;
|
||||
} else {
|
||||
copy_fun = cpy_thread_f16_sameshape;
|
||||
}
|
||||
} else if (sameshape) {
|
||||
/**/ if (dst->type == HTP_TYPE_F16 && src0->type == HTP_TYPE_F32)
|
||||
ct.copy = cpy_thread_f16_f32_sameshape;
|
||||
copy_fun = cpy_thread_f16_f32_sameshape;
|
||||
else if (dst->type == HTP_TYPE_F32 && src0->type == HTP_TYPE_F16)
|
||||
ct.copy = cpy_thread_f32_f16_sameshape;
|
||||
copy_fun = cpy_thread_f32_f16_sameshape;
|
||||
else
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
} else if (sametype) {
|
||||
ct.copy = cpy_thread_sametype_reshape;
|
||||
if (src0->type == HTP_TYPE_F32) {
|
||||
copy_fun = cpy_thread_f32_reshape;
|
||||
} else {
|
||||
copy_fun = cpy_thread_f16_reshape;
|
||||
}
|
||||
} else {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
worker_pool_run_func(octx->ctx->worker_pool, cpy_work_func, &ct, n_threads);
|
||||
worker_pool_run_func(octx->ctx->worker_pool, copy_fun, &ct, n_threads);
|
||||
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
@@ -22,6 +22,16 @@
|
||||
// Must be multiple of 32
|
||||
#define FLASH_ATTN_BLOCK_SIZE (32 * 2)
|
||||
|
||||
#if __HVX_ARCH__ < 79
|
||||
#define HVX_OP_ADD_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b))
|
||||
#define HVX_OP_SUB_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a, b))
|
||||
#define HVX_OP_MUL_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b))
|
||||
#else
|
||||
#define HVX_OP_ADD_F32(a, b) Q6_Vsf_vadd_VsfVsf(a, b)
|
||||
#define HVX_OP_SUB_F32(a, b) Q6_Vsf_vsub_VsfVsf(a, b)
|
||||
#define HVX_OP_MUL_F32(a, b) Q6_Vsf_vmpy_VsfVsf(a, b)
|
||||
#endif
|
||||
|
||||
// This is a bit of a hack because the compiler is strugling to properly inline
|
||||
// the default hvx_vec_f32_to_f16 with output into the local array.
|
||||
static __attribute__((noinline)) void hvx_vec_f32_to_f16_a(void *ptr, HVX_Vector v0, HVX_Vector v1)
|
||||
@@ -54,8 +64,8 @@ static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict
|
||||
rsum_p = hvx_vec_mpyacc_f32_f16(rsum_p, x_hf, y_hf);
|
||||
}
|
||||
|
||||
HVX_Vector rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum_p), Q6_V_hi_W(rsum_p)));
|
||||
rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum)));
|
||||
HVX_Vector rsum = HVX_OP_ADD_F32(Q6_V_lo_W(rsum_p), Q6_V_hi_W(rsum_p));
|
||||
rsum = HVX_OP_MUL_F32(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum));
|
||||
hvx_vec_store_u(r, 4, rsum);
|
||||
}
|
||||
|
||||
@@ -105,10 +115,10 @@ static inline HVX_Vector hvx_dot_f16_f16_aa_rx4(const void * restrict y,
|
||||
rsum3_p = hvx_vec_mpyacc_f32_f16(rsum3_p, x3_hf, y_hf);
|
||||
}
|
||||
|
||||
HVX_Vector rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum0_p), Q6_V_hi_W(rsum0_p)));
|
||||
HVX_Vector rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum1_p), Q6_V_hi_W(rsum1_p)));
|
||||
HVX_Vector rsum2 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum2_p), Q6_V_hi_W(rsum2_p)));
|
||||
HVX_Vector rsum3 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(rsum3_p), Q6_V_hi_W(rsum3_p)));
|
||||
HVX_Vector rsum0 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum0_p), Q6_V_hi_W(rsum0_p));
|
||||
HVX_Vector rsum1 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum1_p), Q6_V_hi_W(rsum1_p));
|
||||
HVX_Vector rsum2 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum2_p), Q6_V_hi_W(rsum2_p));
|
||||
HVX_Vector rsum3 = HVX_OP_ADD_F32(Q6_V_lo_W(rsum3_p), Q6_V_hi_W(rsum3_p));
|
||||
|
||||
HVX_Vector_x4 rsum0123 = { .v = { rsum0, rsum1, rsum2, rsum3 } };
|
||||
return hvx_vec_reduce_sum_f32x4(rsum0123);
|
||||
@@ -123,7 +133,7 @@ static inline HVX_Vector hvx_dot_f16_f16_aa_rx32(const void * restrict y,
|
||||
const size_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
|
||||
const size_t nloe = n % VLEN_FP16; // leftover elements
|
||||
|
||||
HVX_Vector sums; // initialize at j = 0
|
||||
HVX_Vector sums = Q6_V_vzero();
|
||||
const size_t stride_x_4 = stride_x * 4;
|
||||
for (uint32_t j = 0; j < VLEN_FP32; j += 4) {
|
||||
HVX_Vector sums_x4 = hvx_dot_f16_f16_aa_rx4(y, x, stride_x, nvec, nloe);
|
||||
@@ -132,8 +142,7 @@ static inline HVX_Vector hvx_dot_f16_f16_aa_rx32(const void * restrict y,
|
||||
x += stride_x_4;
|
||||
}
|
||||
|
||||
sums = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), sums);
|
||||
return Q6_Vsf_equals_Vqf32(sums);
|
||||
return HVX_OP_MUL_F32(hvx_vec_splat_f32(s), sums);
|
||||
}
|
||||
|
||||
// MAD: y (F32) += x (F16) * s (F16)
|
||||
@@ -268,11 +277,10 @@ static inline void hvx_scale_vec_f32_aa(uint8_t * restrict dst, const uint8_t *
|
||||
uint32_t i = 0;
|
||||
#pragma unroll(4)
|
||||
for (; i < nvec; ++i) {
|
||||
vdst[i] = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs));
|
||||
vdst[i] = HVX_OP_MUL_F32(vsrc[i], vs);
|
||||
}
|
||||
if (nloe) {
|
||||
HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);
|
||||
hvx_vec_store_a(&vdst[i], nloe * sizeof(float), Q6_Vsf_equals_Vqf32(v));
|
||||
hvx_vec_store_a(&vdst[i], nloe * sizeof(float), HVX_OP_MUL_F32(vsrc[i], vs));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -438,25 +446,44 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
|
||||
// Process in sub-blocks of 32 (VLEN_FP32)
|
||||
HVX_Vector sb_scores[FLASH_ATTN_BLOCK_SIZE / VLEN_FP32];
|
||||
HVX_Vector v_max = hvx_vec_splat_f32(-INFINITY);
|
||||
for (uint32_t iv = 0; ic + VLEN_FP32 <= current_block_size; ic += VLEN_FP32, ++iv) {
|
||||
for (uint32_t iv = 0; ic < current_block_size; ic += VLEN_FP32, ++iv) {
|
||||
// 1. Compute scores
|
||||
HVX_Vector scores = hvx_dot_f16_f16_aa_rx32(q_ptr_vtcm, k_base + ic * factx->size_k_row_padded, factx->size_k_row_padded, DK, factx->scale);
|
||||
|
||||
// 2. Softcap
|
||||
if (factx->logit_softcap != 0.0f) {
|
||||
scores = hvx_vec_tanh_f32(scores);
|
||||
scores = Q6_Vqf32_vmpy_VsfVsf(scores, logit_cap);
|
||||
scores = Q6_Vsf_equals_Vqf32(scores);
|
||||
scores = HVX_OP_MUL_F32(scores, logit_cap);
|
||||
}
|
||||
|
||||
// 3. Mask
|
||||
if (mask) {
|
||||
const __fp16 * mp = m_base + ic;
|
||||
HVX_Vector m_vals_f16 = *(const HVX_UVector *) mp;
|
||||
HVX_VectorPair m_vals_f32_pair = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_f16), slope_vec);
|
||||
HVX_Vector add_val = Q6_V_lo_W(m_vals_f32_pair);
|
||||
scores = Q6_Vqf32_vadd_Vqf32Vsf(add_val, scores);
|
||||
scores = Q6_Vsf_equals_Vqf32(scores);
|
||||
|
||||
// Multiplying -INFINITY (0xFC00) by a slope in VhfVhf instructions can incorrectly produce NaN on v79.
|
||||
// Clamp -INFINITY to the max negative fp16 finite value (-65504.0f).
|
||||
HVX_Vector vinf = Q6_Vh_vsplat_R(0xFC00);
|
||||
HVX_Vector vmin = Q6_Vh_vsplat_R(0xFBFF);
|
||||
HVX_VectorPred is_inf = Q6_Q_vcmp_eq_VhVh(m_vals_f16, vinf);
|
||||
m_vals_f16 = Q6_V_vmux_QVV(is_inf, vmin, m_vals_f16);
|
||||
|
||||
#if __HVX_ARCH__ >= 79
|
||||
HVX_VectorPair m_vals_f32_pair = Q6_Wsf_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_f16), slope_vec);
|
||||
HVX_Vector add_val = Q6_V_lo_W(m_vals_f32_pair);
|
||||
scores = Q6_Vsf_vadd_VsfVsf(add_val, scores);
|
||||
#else
|
||||
HVX_VectorPair m_vals_f32_pair = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(m_vals_f16), slope_vec);
|
||||
HVX_Vector add_val = Q6_V_lo_W(m_vals_f32_pair);
|
||||
scores = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(add_val, scores));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Mask out invalid lanes for leftover handling
|
||||
uint32_t valid_lanes = current_block_size - ic;
|
||||
if (valid_lanes < VLEN_FP32) {
|
||||
HVX_VectorPred valid_pred = Q6_Q_vsetq_R(valid_lanes * 4); // 4 bytes per fp32 lane
|
||||
scores = Q6_V_vmux_QVV(valid_pred, scores, hvx_vec_splat_f32(-INFINITY));
|
||||
}
|
||||
|
||||
sb_scores[iv] = scores;
|
||||
@@ -466,78 +493,55 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
|
||||
{
|
||||
// 4. Online Softmax Update
|
||||
HVX_Vector M_new_vec = Q6_Vsf_vmax_VsfVsf(v_max, M_vec);
|
||||
HVX_Vector diff_vec = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(M_vec, M_new_vec));
|
||||
HVX_Vector diff_vec = HVX_OP_SUB_F32(M_vec, M_new_vec);
|
||||
HVX_Vector ms_vec = hvx_vec_exp_f32(diff_vec);
|
||||
M_vec = M_new_vec;
|
||||
|
||||
hvx_scale_vec_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms_vec);
|
||||
|
||||
HVX_Vector p_sum_vec = hvx_vec_splat_f32(0.0f);
|
||||
for (uint32_t ic2 = 0, iv = 0; ic2 + VLEN_FP32 <= current_block_size; ic2 += VLEN_FP32, ++iv) {
|
||||
for (uint32_t ic2 = 0, iv = 0; ic2 < current_block_size; ic2 += VLEN_FP32, ++iv) {
|
||||
HVX_Vector scores = sb_scores[iv];
|
||||
HVX_Vector scores_shifted = Q6_Vqf32_vsub_VsfVsf(scores, M_vec);
|
||||
HVX_Vector P = hvx_vec_exp_f32(Q6_Vsf_equals_Vqf32(scores_shifted));
|
||||
HVX_Vector scores_shifted = HVX_OP_SUB_F32(scores, M_vec);
|
||||
HVX_Vector P = hvx_vec_exp_f32(scores_shifted);
|
||||
|
||||
p_sum_vec = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(p_sum_vec, P));
|
||||
p_sum_vec = HVX_OP_ADD_F32(p_sum_vec, P);
|
||||
|
||||
// 5. Accumulate V
|
||||
__fp16 __attribute__((aligned(VLEN))) p_arr[VLEN_FP16];
|
||||
hvx_vec_f32_to_f16_a(p_arr, P, hvx_vec_splat_f32(0));
|
||||
|
||||
float __attribute__((aligned(128))) P_arr[VLEN_FP32];
|
||||
hvx_vec_store_a(P_arr, 128, P);
|
||||
|
||||
for (uint32_t j = 0; j < VLEN_FP32; j += 2) {
|
||||
const uint32_t cur_ic = ic2 + j;
|
||||
const uint8_t * v_ptr = v_base + cur_ic * factx->size_v_row_padded;
|
||||
const uint32_t cur_ic = ic2 + j;
|
||||
if (cur_ic >= current_block_size) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (cur_ic + 1 == current_block_size) {
|
||||
// Odd leftover, process single row
|
||||
if (P_arr[j] != 0.0f) {
|
||||
const uint8_t * v_ptr = v_base + cur_ic * factx->size_v_row_padded;
|
||||
hvx_mad_f32_f16_aa(VKQ32, v_ptr, (p_arr + j), DV);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// Avoid NaN * 0.0 = NaN for uninitialized V cache rows.
|
||||
// Check the f32 values to safely avoid strict aliasing violations.
|
||||
if (P_arr[j] == 0.0f && P_arr[j + 1] == 0.0f) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const uint8_t * v_ptr = v_base + cur_ic * factx->size_v_row_padded;
|
||||
hvx_mad_f32_f16_aa_rx2(VKQ32, v_ptr, v_ptr + factx->size_v_row_padded, (p_arr + j), (p_arr + j + 1), DV);
|
||||
}
|
||||
}
|
||||
|
||||
p_sum_vec = hvx_vec_reduce_sum_f32(p_sum_vec);
|
||||
S_vec = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(S_vec, ms_vec)), p_sum_vec));
|
||||
}
|
||||
|
||||
if (ic < current_block_size) {
|
||||
// Sync scalars for leftover/next block if needed
|
||||
float M = hvx_vec_get_f32(M_vec);
|
||||
float S = hvx_vec_get_f32(S_vec);
|
||||
|
||||
// Leftover
|
||||
for (; ic < current_block_size; ++ic) {
|
||||
float s_val;
|
||||
const uint8_t * k_ptr = k_base + ic * factx->size_k_row_padded;
|
||||
hvx_dot_f16_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, factx->scale);
|
||||
if (factx->logit_softcap != 0.0f) {
|
||||
s_val = factx->logit_softcap * tanhf(s_val);
|
||||
}
|
||||
|
||||
if (mask) {
|
||||
const float m_val = m_base[ic];
|
||||
s_val += slope * m_val;
|
||||
}
|
||||
|
||||
const float Mold = M;
|
||||
__fp16 vs = 1.0f;
|
||||
|
||||
if (s_val > M) {
|
||||
M = s_val;
|
||||
HVX_Vector diff_vec = hvx_vec_splat_f32(Mold - M);
|
||||
HVX_Vector ms_vec = hvx_vec_exp_f32(diff_vec);
|
||||
hvx_scale_vec_f32_aa((uint8_t *) VKQ32, (const uint8_t *) VKQ32, DV, ms_vec);
|
||||
|
||||
float ms = hvx_vec_get_f32(ms_vec);
|
||||
S = S * ms + vs;
|
||||
} else {
|
||||
HVX_Vector diff_vec = hvx_vec_splat_f32(s_val - M);
|
||||
vs = hvx_vec_get_f32(hvx_vec_exp_f32(diff_vec));
|
||||
S += vs;
|
||||
}
|
||||
|
||||
const uint8_t * v_ptr = v_base + ic * factx->size_v_row_padded;
|
||||
|
||||
hvx_mad_f32_f16_aa(VKQ32, v_ptr, &vs, DV);
|
||||
}
|
||||
|
||||
M_vec = hvx_vec_splat_f32(M);
|
||||
S_vec = hvx_vec_splat_f32(S);
|
||||
S_vec = HVX_OP_ADD_F32(HVX_OP_MUL_F32(S_vec, ms_vec), p_sum_vec);
|
||||
}
|
||||
|
||||
// Issue DMA for next+1 block (if exists)
|
||||
@@ -599,8 +603,9 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
|
||||
const int i2 = iq2;
|
||||
const int i3 = iq3;
|
||||
|
||||
// dst is permuted
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1) * nb1;
|
||||
// dst is permuted: [DV, n_heads, n_tokens, n_seq]
|
||||
// head stride is nb[1], token stride is nb[2], batch stride is nb[3]
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + i2 * dst->nb[1] + i1 * dst->nb[2] + i3 * dst->nb[3];
|
||||
|
||||
if (dst->type == HTP_TYPE_F32) {
|
||||
hvx_copy_f32_ua(dst_ptr, (uint8_t *) VKQ32, DV);
|
||||
@@ -623,8 +628,8 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
}
|
||||
|
||||
#ifdef HTP_HAS_HMX
|
||||
// HMX path: prefill (neq1 >= 32), head_dim multiple of 32, F16 KV
|
||||
if (k->type == HTP_TYPE_F16 && v->type == HTP_TYPE_F16 && k->ne[0] % 32 == 0 && q->ne[1] >= 32) {
|
||||
// HMX path: head_dim multiple of 32, F16 KV
|
||||
if (k->type == HTP_TYPE_F16 && v->type == HTP_TYPE_F16 && k->ne[0] % 32 == 0) {
|
||||
int ret = hmx_flash_attn_ext(octx);
|
||||
if (ret == HTP_STATUS_OK) {
|
||||
return ret;
|
||||
|
||||
@@ -586,6 +586,7 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
|
||||
const uint32_t H = v->ne[1];
|
||||
const uint32_t n_tokens = v->ne[2];
|
||||
const uint32_t n_seqs = v->ne[3];
|
||||
const uint32_t K = state->ne[1];
|
||||
|
||||
const uint32_t total_rows = H * n_seqs;
|
||||
if (ith >= total_rows) {
|
||||
@@ -606,6 +607,10 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
|
||||
float local_k[HTP_GDN_MAX_SV] __attribute__((aligned(128)));
|
||||
float local_sums[4] __attribute__((aligned(128)));
|
||||
|
||||
const uint64_t state_seq_stride = state->nb[2] / sizeof(float);
|
||||
const uint64_t state_size_per_snap = (uint64_t) S_v * S_v * H * n_seqs;
|
||||
const int64_t shift = (int64_t) n_tokens - (int64_t) K;
|
||||
|
||||
for (uint32_t ir = ith; ir < total_rows; ir += nth) {
|
||||
const uint32_t iv1 = ir % H;
|
||||
const uint32_t iv3 = ir / H;
|
||||
@@ -615,8 +620,8 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
|
||||
const uint32_t iq3 = iv3 / rq3;
|
||||
const uint32_t ik3 = iv3 / rk3;
|
||||
|
||||
float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
const float * s_in = state_in_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
float * s_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
const float * s_in = state_in_base + (uint64_t) iv3 * state_seq_stride + (uint64_t) iv1 * S_v * S_v;
|
||||
|
||||
memcpy(s_out, s_in, gctx->state_bytes);
|
||||
float * s_work = s_out;
|
||||
@@ -689,6 +694,16 @@ static void gated_delta_net_f32_pp_thread(unsigned int nth, unsigned int ith, vo
|
||||
}
|
||||
}
|
||||
|
||||
if (K > 1) {
|
||||
const int64_t target_slot = (int64_t) t - shift;
|
||||
if (target_slot >= 0 && target_slot < (int64_t) K) {
|
||||
float * curr_state_o = state_out_base + (uint64_t) target_slot * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
if (curr_state_o != s_work) {
|
||||
memcpy(curr_state_o, s_work, gctx->state_bytes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
attn_data += (uint64_t) S_v * H;
|
||||
}
|
||||
}
|
||||
@@ -709,6 +724,7 @@ static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, vo
|
||||
const uint32_t S_v = v->ne[0];
|
||||
const uint32_t H = v->ne[1];
|
||||
const uint32_t n_seqs = v->ne[3];
|
||||
const uint32_t K = state->ne[1];
|
||||
|
||||
const uint32_t total_rows = H * n_seqs;
|
||||
if (ith >= total_rows) {
|
||||
@@ -736,6 +752,9 @@ static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, vo
|
||||
spad = gctx->vtcm_state_base + gctx->vtcm_state_per_thread * ith;
|
||||
}
|
||||
|
||||
const uint64_t state_seq_stride = state->nb[2] / sizeof(float);
|
||||
const uint64_t state_size_per_snap = (uint64_t) S_v * S_v * H * n_seqs;
|
||||
|
||||
for (uint32_t ir = ith; ir < total_rows; ir += nth) {
|
||||
const uint32_t iv1 = ir % H;
|
||||
const uint32_t iv3 = ir / H;
|
||||
@@ -745,8 +764,8 @@ static void gated_delta_net_f32_tg_thread(unsigned int nth, unsigned int ith, vo
|
||||
const uint32_t iq3 = iv3 / rq3;
|
||||
const uint32_t ik3 = iv3 / rk3;
|
||||
|
||||
float * s_out = state_out_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
const float * s_in = state_in_base + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
float * s_out = state_out_base + (uint64_t) (K - 1) * state_size_per_snap + ((uint64_t) iv3 * H + iv1) * S_v * S_v;
|
||||
const float * s_in = state_in_base + (uint64_t) iv3 * state_seq_stride + (uint64_t) iv1 * S_v * S_v;
|
||||
float * s_work;
|
||||
|
||||
if (spad) {
|
||||
@@ -901,6 +920,7 @@ int op_gated_delta_net(struct htp_ops_context * octx) {
|
||||
const uint32_t H = v->ne[1];
|
||||
const uint32_t n_tokens = v->ne[2];
|
||||
const uint32_t n_seqs = v->ne[3];
|
||||
const uint32_t K = state->ne[1];
|
||||
|
||||
if (S_v == 0 || S_v > HTP_GDN_MAX_SV || H == 0 || n_tokens == 0 || n_seqs == 0) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
@@ -913,10 +933,10 @@ int op_gated_delta_net(struct htp_ops_context * octx) {
|
||||
(n_seqs % q->ne[3]) != 0 || (n_seqs % k->ne[3]) != 0) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
if (state->ne[0] * state->ne[1] * state->ne[2] * state->ne[3] != S_v * S_v * H * n_seqs) {
|
||||
if (state->ne[0] * state->ne[2] * state->ne[3] != S_v * S_v * H * n_seqs) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs) {
|
||||
if (dst->ne[0] != S_v * H || dst->ne[1] != n_tokens * n_seqs + S_v * n_seqs * K) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
|
||||
@@ -17,9 +17,13 @@
|
||||
|
||||
struct get_rows_context {
|
||||
struct htp_ops_context * octx;
|
||||
uint32_t src1_nrows_per_thread;
|
||||
uint32_t tasks_per_thread;
|
||||
uint32_t total_tasks;
|
||||
uint32_t chunks_per_row;
|
||||
uint32_t chunk_size;
|
||||
struct fastdiv_values get_rows_div_ne10;
|
||||
struct fastdiv_values get_rows_div_ne10_ne11;
|
||||
struct fastdiv_values get_rows_div_chunks_per_row;
|
||||
};
|
||||
|
||||
#define get_rows_preamble \
|
||||
@@ -52,20 +56,23 @@ struct get_rows_context {
|
||||
\
|
||||
const uint32_t nr = ne10 * ne11 * ne12;
|
||||
|
||||
static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *data) {
|
||||
static void get_rows_thread_f32_f32_dma(unsigned int nth, unsigned int ith, void *data) {
|
||||
struct get_rows_context * grctx = (struct get_rows_context *)data;
|
||||
struct htp_ops_context * octx = grctx->octx;
|
||||
get_rows_preamble;
|
||||
|
||||
uint64_t qt = HAP_perf_get_qtimer_count();
|
||||
|
||||
// parallelize by src1 elements (which correspond to dst rows)
|
||||
const uint32_t dr = grctx->src1_nrows_per_thread;
|
||||
const uint32_t dr = grctx->tasks_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
|
||||
if (ir0 >= grctx->total_tasks) {
|
||||
return;
|
||||
}
|
||||
const uint32_t ir1 = MIN(ir0 + dr, grctx->total_tasks);
|
||||
|
||||
const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
|
||||
|
||||
dma_queue * dma_queue = octx->ctx->dma[ith];
|
||||
for (uint32_t i = ir0; i < ir1; ++i) {
|
||||
const uint32_t i12 = fastdiv(i, &grctx->get_rows_div_ne10_ne11);
|
||||
const uint32_t rem = i - i12 * ne11 * ne10;
|
||||
@@ -73,29 +80,77 @@ static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da
|
||||
const uint32_t i10 = rem - i11 * ne10;
|
||||
|
||||
const uintptr_t src1_addr = octx->src[1]->data + i10*nb10 + i11*nb11 + i12*nb12;
|
||||
|
||||
uint32_t i01 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr;
|
||||
|
||||
if (i01 >= ne01) {
|
||||
// invalid index, skip for now to avoid crash
|
||||
continue;
|
||||
}
|
||||
|
||||
const uintptr_t src0_ptr = octx->src[0]->data + i01*nb01 + i11*nb02 + i12*nb03;
|
||||
const uintptr_t dst_ptr = octx->dst->data + i10*nb1 + i11*nb2 + i12*nb3;
|
||||
hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00);
|
||||
|
||||
while (!dma_queue_push(dma_queue, dma_make_ptr((void *)dst_ptr, (const void *)src0_ptr), nb1, nb01, ne00 * sizeof(float), 1)) {
|
||||
dma_queue_pop(dma_queue);
|
||||
}
|
||||
}
|
||||
dma_queue_flush(dma_queue);
|
||||
|
||||
qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt);
|
||||
FARF(HIGH, "get-rows-f32-f32-dma %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
|
||||
ne00, ne01, ne02, ne03, ir0, ir1, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) qt);
|
||||
}
|
||||
|
||||
static void get_rows_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void *data) {
|
||||
struct get_rows_context * grctx = (struct get_rows_context *)data;
|
||||
struct htp_ops_context * octx = grctx->octx;
|
||||
get_rows_preamble;
|
||||
|
||||
uint64_t qt = HAP_perf_get_qtimer_count();
|
||||
|
||||
const uint32_t dr = grctx->tasks_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
if (ir0 >= grctx->total_tasks) {
|
||||
return;
|
||||
}
|
||||
const uint32_t ir1 = MIN(ir0 + dr, grctx->total_tasks);
|
||||
|
||||
const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
|
||||
|
||||
const uint32_t chunks_per_row = grctx->chunks_per_row;
|
||||
const uint32_t chunk_size = grctx->chunk_size;
|
||||
for (uint32_t i = ir0; i < ir1; ++i) {
|
||||
const uint32_t row_idx = fastdiv(i, &grctx->get_rows_div_chunks_per_row);
|
||||
const uint32_t chunk_idx = i - row_idx * chunks_per_row;
|
||||
|
||||
const uint32_t i12 = fastdiv(row_idx, &grctx->get_rows_div_ne10_ne11);
|
||||
const uint32_t rem = row_idx - i12 * ne11 * ne10;
|
||||
const uint32_t i11 = fastdiv(rem, &grctx->get_rows_div_ne10);
|
||||
const uint32_t i10 = rem - i11 * ne10;
|
||||
|
||||
const uintptr_t src1_addr = octx->src[1]->data + i10*nb10 + i11*nb11 + i12*nb12;
|
||||
uint32_t i01 = is_i32 ? *(int32_t *)src1_addr : *(int64_t *)src1_addr;
|
||||
|
||||
if (i01 >= ne01) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const uint32_t offset = chunk_idx * chunk_size;
|
||||
if (offset < ne00) {
|
||||
const uint32_t copy_size = MIN(chunk_size, ne00 - offset);
|
||||
const uintptr_t src0_ptr = octx->src[0]->data + i01*nb01 + i11*nb02 + i12*nb03 + offset * sizeof(float);
|
||||
const uintptr_t dst_ptr = octx->dst->data + i10*nb1 + i11*nb2 + i12*nb3 + offset * sizeof(float);
|
||||
hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, copy_size);
|
||||
}
|
||||
}
|
||||
|
||||
qt = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - qt);
|
||||
FARF(HIGH, "get-rows-f32-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
|
||||
FARF(HIGH, "get-rows-f32-f32-hvx %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
|
||||
ne00, ne01, ne02, ne03, ir0, ir1, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) qt);
|
||||
}
|
||||
|
||||
int op_get_rows(struct htp_ops_context * octx) {
|
||||
get_rows_preamble;
|
||||
|
||||
const uint32_t n_threads = MIN(nr, octx->n_threads);
|
||||
|
||||
if (octx->src[0]->type != HTP_TYPE_F32) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
@@ -112,13 +167,52 @@ int op_get_rows(struct htp_ops_context * octx) {
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
const uint32_t nb00 = octx->src[0]->nb[0];
|
||||
const uint32_t nb0 = octx->dst->nb[0];
|
||||
|
||||
const bool can_use_dma = (nb00 == sizeof(float)) && (nb0 == sizeof(float));
|
||||
const bool use_dma = can_use_dma && (ne00 >= 2048);
|
||||
|
||||
struct get_rows_context grctx;
|
||||
grctx.octx = octx;
|
||||
grctx.get_rows_div_ne10 = init_fastdiv_values(octx->src[1]->ne[0]);
|
||||
grctx.get_rows_div_ne10_ne11 = init_fastdiv_values(octx->src[1]->ne[0] * octx->src[1]->ne[1]);
|
||||
|
||||
grctx.src1_nrows_per_thread = (nr + n_threads - 1) / n_threads;
|
||||
if (use_dma) {
|
||||
grctx.chunks_per_row = 1;
|
||||
grctx.chunk_size = ne00;
|
||||
grctx.total_tasks = nr;
|
||||
grctx.get_rows_div_chunks_per_row = init_fastdiv_values(1);
|
||||
|
||||
worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32, &grctx, n_threads);
|
||||
const uint32_t n_threads = MIN(nr, octx->n_threads);
|
||||
grctx.tasks_per_thread = (nr + n_threads - 1) / n_threads;
|
||||
|
||||
worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32_dma, &grctx, n_threads);
|
||||
} else {
|
||||
uint32_t chunks_per_row = 1;
|
||||
uint32_t chunk_size = ne00;
|
||||
uint32_t total_tasks = nr;
|
||||
|
||||
if (nr < octx->n_threads) {
|
||||
const uint32_t min_chunk_size = 1024;
|
||||
uint32_t max_chunks = ne00 / min_chunk_size;
|
||||
if (max_chunks == 0) {
|
||||
max_chunks = 1;
|
||||
}
|
||||
chunks_per_row = MIN((octx->n_threads + nr - 1) / nr, max_chunks);
|
||||
chunk_size = (ne00 + chunks_per_row - 1) / chunks_per_row;
|
||||
total_tasks = nr * chunks_per_row;
|
||||
}
|
||||
|
||||
grctx.chunks_per_row = chunks_per_row;
|
||||
grctx.chunk_size = chunk_size;
|
||||
grctx.total_tasks = total_tasks;
|
||||
grctx.get_rows_div_chunks_per_row = init_fastdiv_values(chunks_per_row);
|
||||
|
||||
const uint32_t n_threads = MIN(total_tasks, octx->n_threads);
|
||||
grctx.tasks_per_thread = (total_tasks + n_threads - 1) / n_threads;
|
||||
|
||||
worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32_hvx, &grctx, n_threads);
|
||||
}
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
@@ -50,8 +50,8 @@ static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV,
|
||||
const size_t g_br = hex_align_up(gqa_factor * Br, HMX_FP16_TILE_N_ROWS);
|
||||
const size_t q_tile_size = hex_align_up(g_br * DK * sizeof(__fp16), 4096); // Q: [g_br, DK]
|
||||
const size_t o_tile_size = hex_align_up(g_br * DV * sizeof(__fp16), 4096); // O: [g_br, DV] x2 ping-pong
|
||||
const size_t k_dma_size = hex_align_up(Bc * DK * sizeof(__fp16), 4096); // K DMA: [Bc, DK] x2 double-buf
|
||||
const size_t v_dma_size = hex_align_up(Bc * DV * sizeof(__fp16), 4096); // V DMA: [Bc, DV] x2 double-buf
|
||||
const size_t k_dma_size = hex_align_up(Bc * hex_round_up(DK * sizeof(__fp16), 128), 4096); // K DMA: [Bc, DK] x2 double-buf
|
||||
const size_t v_dma_size = hex_align_up(Bc * hex_round_up(DV * sizeof(__fp16), 128), 4096); // V DMA: [Bc, DV] x2 double-buf
|
||||
const size_t k_tile_size = hex_align_up(Bc * DK * sizeof(__fp16), 4096); // K tiles: [Bc, DK] interleaved
|
||||
const size_t v_tile_size = hex_align_up(Bc * DV * sizeof(__fp16), 4096); // V tiles: [Bc, DV] interleaved
|
||||
const size_t s_tile_size = hex_align_up(g_br * Bc * sizeof(__fp16), 4096); // S/P:[g_br, Bc]
|
||||
@@ -1248,9 +1248,6 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
if (DK % 32 != 0 || DV % 32 != 0) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
if (neq1 < 32) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
// GQA factor
|
||||
const uint32_t n_kv_heads = k->ne[2];
|
||||
@@ -1278,7 +1275,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
struct hmx_fa_context factx;
|
||||
memset(&factx, 0, sizeof(factx));
|
||||
factx.octx = octx;
|
||||
factx.n_threads = octx->ctx->n_threads;
|
||||
factx.n_threads = n_threads;
|
||||
factx.DK = DK;
|
||||
factx.DV = DV;
|
||||
factx.n_kv = nek1;
|
||||
@@ -1328,10 +1325,15 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
factx.m1 = powf(2.0f, -(max_bias / 2.0f) / factx.n_head_log2);
|
||||
|
||||
// ======== VTCM allocation (GQA-aware) ========
|
||||
const size_t size_k_row = DK * sizeof(__fp16);
|
||||
const size_t size_v_row = DV * sizeof(__fp16);
|
||||
const size_t size_k_row_padded = hex_round_up(size_k_row, 128);
|
||||
const size_t size_v_row_padded = hex_round_up(size_v_row, 128);
|
||||
|
||||
const size_t q_tile_bytes = hex_align_up(g_br * DK * sizeof(__fp16), 4096);
|
||||
const size_t o_tile_bytes = hex_align_up(g_br * DV * sizeof(__fp16), 4096);
|
||||
const size_t k_dma_bytes = hex_align_up(Bc * DK * sizeof(__fp16), 4096);
|
||||
const size_t v_dma_bytes = hex_align_up(Bc * DV * sizeof(__fp16), 4096);
|
||||
const size_t k_dma_bytes = hex_align_up(Bc * size_k_row_padded, 4096);
|
||||
const size_t v_dma_bytes = hex_align_up(Bc * size_v_row_padded, 4096);
|
||||
const size_t k_tile_bytes = hex_align_up(Bc * DK * sizeof(__fp16), 4096);
|
||||
const size_t v_tile_bytes = hex_align_up(Bc * DV * sizeof(__fp16), 4096);
|
||||
const size_t s_tile_bytes = hex_align_up(g_br * Bc * sizeof(__fp16), 4096);
|
||||
@@ -1401,11 +1403,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
|
||||
// ======== DMA setup ========
|
||||
dma_queue * const dma = ctx->dma[0];
|
||||
|
||||
// Padded row sizes for DMA
|
||||
const size_t size_k_row = nek0 * sizeof(__fp16);
|
||||
const size_t size_v_row = nev0 * sizeof(__fp16);
|
||||
const size_t size_k_row_padded = hex_round_up(nek0 * sizeof(__fp16), 128);
|
||||
const size_t size_v_row_padded = hex_round_up(nev0 * sizeof(__fp16), 128);
|
||||
// Padded row sizes for DMA (defined in outer scope)
|
||||
|
||||
const size_t n_row_tiles_g_br = g_br / HMX_FP16_TILE_N_ROWS;
|
||||
const size_t n_tiles_per_bc = Bc / HMX_FP16_TILE_N_COLS;
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "ggml-common.h"
|
||||
|
||||
#include "hex-dma.h"
|
||||
#include "hex-fastdiv.h"
|
||||
#include "worker-pool.h"
|
||||
|
||||
#include "hvx-utils.h"
|
||||
@@ -34,6 +35,10 @@ static const __fp16 q4_0_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
|
||||
-8, 0, -7, 0, -6, 0, -5, 0, -4, 0, -3, 0, -2, 0, -1, 0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0,
|
||||
};
|
||||
|
||||
static const __fp16 q4_1_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
|
||||
0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0,
|
||||
};
|
||||
|
||||
// MXFP4 dequantization LUT: maps 4-bit index to fp16 mantissa value
|
||||
// kvalues: 0, 0.5, 1, 1.5, 2, 3, 4, 6, 0, -0.5, -1, -1.5, -2, -3, -4, -6
|
||||
static const __fp16 mxfp4_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
|
||||
@@ -62,6 +67,8 @@ static inline size_t get_x4x2_row_stride(int weight_type, int k) {
|
||||
case HTP_TYPE_Q4_0:
|
||||
case HTP_TYPE_IQ4_NL:
|
||||
return (size_t) nb * (QK_Q4_0x4x2 / 2 + HMX_X4X2_DBLK_SIZE); // 144 * nb
|
||||
case HTP_TYPE_Q4_1:
|
||||
return (size_t) nb * (QK_Q4_0x4x2 / 2 + 32); // 160 * nb
|
||||
case HTP_TYPE_Q8_0:
|
||||
return (size_t) nb * (QK_Q8_0x4x2 + HMX_X4X2_DBLK_SIZE); // 272 * nb
|
||||
case HTP_TYPE_MXFP4:
|
||||
@@ -181,45 +188,44 @@ next_nc:
|
||||
// In x4x2, sub-blocks 0..3 use lower nibbles, sub-blocks 4..7 use upper nibbles
|
||||
// of the same 32 packed bytes.
|
||||
static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale, const HVX_Vector vlut_cvt) {
|
||||
(void)vlut_cvt;
|
||||
HVX_Vector vq = hvx_vmemu(packed_32);
|
||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
const HVX_Vector i8 = Q6_Vb_vsplat_R(8);
|
||||
HVX_Vector v_scales = hvx_vec_repl_f16(hvx_vmemu(scale));
|
||||
// q4x4x2 stores two int4 values per byte. Keep only the selected nibble.
|
||||
HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
|
||||
|
||||
HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
|
||||
v_quants = Q6_V_vand_VV(v_quants, mask_h4);
|
||||
// Shuffle before LUT
|
||||
v_quants = Q6_Vb_vshuff_Vb(v_quants);
|
||||
// Use standard vlut16 (not _nomatch) to avoid stale-register NaN.
|
||||
// _nomatch retains the previous destination-register value for colliding
|
||||
// indices, but the C intrinsic doesn't model the implicit read so the
|
||||
// compiler may allocate a register containing garbage/NaN.
|
||||
HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
|
||||
HVX_Vector v_hf = Q6_V_lo_W(vp);
|
||||
|
||||
HVX_Vector v_int8 = Q6_Vb_vsub_VbVb(v_quants, i8);
|
||||
HVX_Vector v0 = Q6_V_lo_W(Q6_Wh_vunpack_Vb(v_int8));
|
||||
HVX_Vector v_hf = Q6_Vhf_equals_Vh(v0);
|
||||
|
||||
return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales));
|
||||
}
|
||||
|
||||
// Batch-dequantize 4 contiguous x4x2 Q4_0 groups (4x32 = 128 packed bytes) using
|
||||
// full HVX vector width. One vmemu + one vlut16 replaces 4 separate calls.
|
||||
// full HVX vector width.
|
||||
// Output: vector_x2 each hold 32 FP16 values in the first 64 bytes.
|
||||
static inline HVX_Vector_x2 dequantize_x4x2_q4_0_x4groups_hvx(
|
||||
const uint8_t *packed_128, bool upper_nibbles,
|
||||
const __fp16 *scales_4, const HVX_Vector vlut_cvt) {
|
||||
// Load all 128 packed bytes (4 contiguous 32-byte groups)
|
||||
(void)vlut_cvt;
|
||||
HVX_Vector vq = hvx_vmemu(packed_128);
|
||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
const HVX_Vector i8 = Q6_Vb_vsplat_R(8);
|
||||
HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
|
||||
v_quants = Q6_V_vand_VV(v_quants, mask_h4);
|
||||
|
||||
// Shuffle before LUT
|
||||
v_quants = Q6_Vb_vshuff_Vb(v_quants);
|
||||
HVX_Vector v_int8 = Q6_Vb_vsub_VbVb(v_quants, i8);
|
||||
|
||||
// Full-width vlut16: 128 byte lookups -> 128 fp16 results in a VectorPair
|
||||
HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
|
||||
HVX_Vector v_lo = Q6_V_lo_W(vp); // [group0: 32 fp16 | group1: 32 fp16]
|
||||
HVX_Vector v_hi = Q6_V_hi_W(vp); // [group2: 32 fp16 | group3: 32 fp16]
|
||||
HVX_VectorPair vp_int16 = Q6_Wh_vunpack_Vb(v_int8);
|
||||
HVX_Vector v_lo = Q6_V_lo_W(vp_int16);
|
||||
HVX_Vector v_hi = Q6_V_hi_W(vp_int16);
|
||||
|
||||
v_lo = Q6_Vhf_equals_Vh(v_lo);
|
||||
v_hi = Q6_Vhf_equals_Vh(v_hi);
|
||||
|
||||
// Build per-group scale vectors: first 64 bytes use scale_a, last 64 use scale_b
|
||||
HVX_Vector vscale = hvx_vmemu(scales_4);
|
||||
HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vscale);
|
||||
HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vscale, 4));
|
||||
@@ -227,9 +233,97 @@ static inline HVX_Vector_x2 dequantize_x4x2_q4_0_x4groups_hvx(
|
||||
v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01));
|
||||
v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
|
||||
|
||||
// Extract individual groups: scatter uses q_mask64 so only first 64 bytes matter
|
||||
HVX_Vector_x2 r = { v_lo,/* group1 already in [0:63] */
|
||||
v_hi /* group2 already in [0:63] */ };
|
||||
HVX_Vector_x2 r = { v_lo, v_hi };
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline HVX_Vector dequantize_x4x2_q4_1_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale_offset, const HVX_Vector vlut_cvt) {
|
||||
(void)vlut_cvt;
|
||||
HVX_Vector vq = hvx_vmemu(packed_32);
|
||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
HVX_Vector v_dm = hvx_vmemu(scale_offset);
|
||||
HVX_Vector v_scales = hvx_vec_repl_f16(v_dm);
|
||||
HVX_Vector v_offsets = hvx_vec_repl_f16(Q6_V_vror_VR(v_dm, 2));
|
||||
|
||||
HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
|
||||
v_quants = Q6_V_vand_VV(v_quants, mask_h4);
|
||||
|
||||
HVX_Vector v0 = Q6_V_lo_W(Q6_Wh_vunpack_Vb(v_quants));
|
||||
HVX_Vector v_hf = Q6_Vhf_equals_Vh(v0);
|
||||
|
||||
return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales), v_offsets));
|
||||
}
|
||||
|
||||
static inline HVX_Vector_x2 dequantize_x4x2_q4_1_x4groups_hvx(
|
||||
const uint8_t *packed_128, bool upper_nibbles,
|
||||
const __fp16 *scales_offsets_4, const HVX_Vector vlut_cvt) {
|
||||
(void)vlut_cvt;
|
||||
HVX_Vector vq = hvx_vmemu(packed_128);
|
||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
|
||||
v_quants = Q6_V_vand_VV(v_quants, mask_h4);
|
||||
|
||||
HVX_VectorPair vp_int16 = Q6_Wh_vunpack_Vb(v_quants);
|
||||
HVX_Vector v_lo = Q6_V_lo_W(vp_int16);
|
||||
HVX_Vector v_hi = Q6_V_hi_W(vp_int16);
|
||||
|
||||
v_lo = Q6_Vhf_equals_Vh(v_lo);
|
||||
v_hi = Q6_Vhf_equals_Vh(v_hi);
|
||||
|
||||
HVX_Vector vscale_offset = hvx_vmemu(scales_offsets_4);
|
||||
HVX_VectorPair dm_deal = Q6_W_vdeal_VVR(vscale_offset, vscale_offset, -2);
|
||||
HVX_Vector vd = Q6_V_lo_W(dm_deal);
|
||||
HVX_Vector vm = Q6_V_hi_W(dm_deal);
|
||||
|
||||
HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vd);
|
||||
HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vd, 4));
|
||||
|
||||
HVX_Vector v_os01 = hvx_vec_repl_2x_f16(vm);
|
||||
HVX_Vector v_os23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vm, 4));
|
||||
|
||||
v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01), v_os01));
|
||||
v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23), v_os23));
|
||||
|
||||
HVX_Vector_x2 r = { v_lo, v_hi };
|
||||
return r;
|
||||
}
|
||||
|
||||
// LUT-based dequantizers for non-linear IQ4_NL format.
|
||||
static inline HVX_Vector dequantize_x4x2_iq4_nl_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale, const HVX_Vector vlut_cvt) {
|
||||
HVX_Vector vq = hvx_vmemu(packed_32);
|
||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
HVX_Vector v_scales = hvx_vec_repl_f16(hvx_vmemu(scale));
|
||||
HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
|
||||
v_quants = Q6_V_vand_VV(v_quants, mask_h4);
|
||||
v_quants = Q6_Vb_vshuff_Vb(v_quants);
|
||||
HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
|
||||
HVX_Vector v_hf = Q6_V_lo_W(vp);
|
||||
|
||||
return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales));
|
||||
}
|
||||
|
||||
static inline HVX_Vector_x2 dequantize_x4x2_iq4_nl_x4groups_hvx(
|
||||
const uint8_t *packed_128, bool upper_nibbles,
|
||||
const __fp16 *scales_4, const HVX_Vector vlut_cvt) {
|
||||
HVX_Vector vq = hvx_vmemu(packed_128);
|
||||
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
|
||||
HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
|
||||
v_quants = Q6_V_vand_VV(v_quants, mask_h4);
|
||||
|
||||
v_quants = Q6_Vb_vshuff_Vb(v_quants);
|
||||
|
||||
HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
|
||||
HVX_Vector v_lo = Q6_V_lo_W(vp);
|
||||
HVX_Vector v_hi = Q6_V_hi_W(vp);
|
||||
|
||||
HVX_Vector vscale = hvx_vmemu(scales_4);
|
||||
HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vscale);
|
||||
HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vscale, 4));
|
||||
|
||||
v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01));
|
||||
v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
|
||||
|
||||
HVX_Vector_x2 r = { v_lo, v_hi };
|
||||
return r;
|
||||
}
|
||||
|
||||
@@ -320,100 +414,176 @@ static inline HVX_Vector_x4 dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t *
|
||||
return r;
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
__fp16 *dst;
|
||||
const uint8_t *src;
|
||||
int n_cols;
|
||||
int k_block;
|
||||
size_t row_stride;
|
||||
int weight_type;
|
||||
int n_tot_tiles;
|
||||
int n_tiles_per_task;
|
||||
int n_tasks;
|
||||
int n_k_tiles;
|
||||
struct fastdiv_values n_k_tiles_div;
|
||||
} x4x2_dequantize_state_t;
|
||||
|
||||
// Dequantize a tile range from x4x2 weight data (already in VTCM) to tile-major FP16.
|
||||
// Input: vtcm_src has n_cols rows of x4x2 data, each row_stride bytes.
|
||||
// Output: vtcm_dst in tile-major FP16 layout.
|
||||
static void dequantize_x4x2_weight_to_fp16_tiles_task(
|
||||
__fp16 *restrict vtcm_dst,
|
||||
const uint8_t *restrict vtcm_src,
|
||||
int n_cols, int k_block,
|
||||
size_t row_stride, int weight_type,
|
||||
|
||||
#define DEFINE_DEQUANTIZE_Q4_TASK(suffix, lut_name, helper_prefix, dblk_size, scale_step) \
|
||||
static void dequantize_x4x2_weight_to_fp16_tiles_task_##suffix( \
|
||||
const x4x2_dequantize_state_t *state, \
|
||||
int start_tile, int end_tile) { \
|
||||
\
|
||||
const int n_k_tiles = state->n_k_tiles; \
|
||||
const int qrow_size = (unsigned)state->k_block / 2; \
|
||||
const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div; \
|
||||
const HVX_Vector vlut_cvt = hvx_vmem(lut_name); \
|
||||
\
|
||||
const HVX_Vector v_scat_base = hvx_vmem(hmx_transpose_scatter_offsets); \
|
||||
const HVX_Vector v_scat_step = Q6_V_vsplat_R(4); \
|
||||
const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64); \
|
||||
\
|
||||
unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div); \
|
||||
unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div); \
|
||||
\
|
||||
for (unsigned t = start_tile; t < (unsigned)end_tile; ) { \
|
||||
if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; } \
|
||||
\
|
||||
if ((kt % 4 == 0) && (t + 4 <= (unsigned)end_tile) && (fastdiv(t + 3, &n_k_tiles_div) == ct)) { \
|
||||
unsigned blk_idx = ((kt * 32) / QK_Q4_0x4x2); \
|
||||
unsigned sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32; \
|
||||
bool upper = (sub_blk_base >= 4); \
|
||||
unsigned packed_off = blk_idx * (QK_Q4_0x4x2 / 2); \
|
||||
unsigned scale_off = qrow_size + blk_idx * (dblk_size) + sub_blk_base * (scale_step); \
|
||||
\
|
||||
__fp16 *tile_bases[4]; \
|
||||
for (unsigned g = 0; g < 4; g++) { \
|
||||
tile_bases[g] = state->dst + (t + g) * HMX_FP16_TILE_N_ELMS; \
|
||||
} \
|
||||
\
|
||||
HVX_Vector v_off = v_scat_base; \
|
||||
unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * state->row_stride; \
|
||||
\
|
||||
for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) { \
|
||||
const uint8_t *r0 = state->src + row_offset; row_offset += state->row_stride; \
|
||||
const uint8_t *r1 = state->src + row_offset; row_offset += state->row_stride; \
|
||||
\
|
||||
HVX_Vector_x2 dv0 = dequantize_x4x2_##helper_prefix##_x4groups_hvx( \
|
||||
r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt); \
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[0]); \
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[1]); \
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); \
|
||||
\
|
||||
HVX_Vector_x2 dv1 = dequantize_x4x2_##helper_prefix##_x4groups_hvx( \
|
||||
r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt); \
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[0]); \
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[1]); \
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); \
|
||||
} \
|
||||
\
|
||||
for (int g = 0; g < 4; g++) { (void) *(volatile HVX_Vector *)(tile_bases[g]); } \
|
||||
t += 4; kt += 4; \
|
||||
continue; \
|
||||
} \
|
||||
\
|
||||
__fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS; \
|
||||
{ \
|
||||
unsigned blk_idx = (kt * 32) / QK_Q4_0x4x2; \
|
||||
unsigned sub_blk = ((kt * 32) % QK_Q4_0x4x2) / 32; \
|
||||
bool upper = (sub_blk >= 4); \
|
||||
unsigned byte_off = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32; \
|
||||
unsigned scale_off = qrow_size + blk_idx * (dblk_size) + sub_blk * (scale_step); \
|
||||
\
|
||||
HVX_Vector v_off = v_scat_base; \
|
||||
unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * state->row_stride; \
|
||||
unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1; \
|
||||
\
|
||||
for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) { \
|
||||
const uint8_t *r0 = state->src + row_offset; row_offset += state->row_stride; \
|
||||
const uint8_t *r1 = state->src + row_offset; row_offset += state->row_stride; \
|
||||
\
|
||||
HVX_Vector v0 = dequantize_x4x2_##helper_prefix##_group_hvx( \
|
||||
r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt); \
|
||||
HVX_Vector v1 = (row1 < (unsigned)state->n_cols) \
|
||||
? dequantize_x4x2_##helper_prefix##_group_hvx( \
|
||||
r1 + byte_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt) \
|
||||
: Q6_V_vzero(); \
|
||||
\
|
||||
Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0); \
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); \
|
||||
Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1); \
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); \
|
||||
} \
|
||||
(void) *(volatile HVX_Vector *)(tile_base); \
|
||||
} \
|
||||
++t; ++kt; \
|
||||
} \
|
||||
\
|
||||
if (start_tile < end_tile) { \
|
||||
(void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
static void dequantize_x4x2_worker_loop_##suffix(unsigned int n, unsigned int i, void *data) { \
|
||||
x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data; \
|
||||
for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) { \
|
||||
int start = task_id * state->n_tiles_per_task; \
|
||||
int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles); \
|
||||
dequantize_x4x2_weight_to_fp16_tiles_task_##suffix(state, start, end); \
|
||||
} \
|
||||
}
|
||||
|
||||
DEFINE_DEQUANTIZE_Q4_TASK(q4_0, q4_0_to_fp16_lut, q4_0, HMX_X4X2_DBLK_SIZE, (int)sizeof(__fp16))
|
||||
DEFINE_DEQUANTIZE_Q4_TASK(q4_1, q4_1_to_fp16_lut, q4_1, 32, 4)
|
||||
DEFINE_DEQUANTIZE_Q4_TASK(iq4_nl, iq4_nl_to_fp16_lut, iq4_nl, HMX_X4X2_DBLK_SIZE, (int)sizeof(__fp16))
|
||||
|
||||
static void dequantize_x4x2_weight_to_fp16_tiles_task_mxfp4(
|
||||
const x4x2_dequantize_state_t *state,
|
||||
int start_tile, int end_tile) {
|
||||
|
||||
const int n_k_tiles = (unsigned)k_block / HMX_FP16_TILE_N_COLS;
|
||||
const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL);
|
||||
const int qrow_size = is_q4 ? ((unsigned)k_block / 2) : k_block;
|
||||
const int n_k_tiles = state->n_k_tiles;
|
||||
const int qrow_size = state->k_block;
|
||||
const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div;
|
||||
const HVX_Vector vlut_cvt = hvx_vmem(mxfp4_to_fp16_lut);
|
||||
|
||||
const HVX_Vector vlut_cvt = (weight_type == HTP_TYPE_IQ4_NL) ? hvx_vmem(iq4_nl_to_fp16_lut) :
|
||||
(weight_type == HTP_TYPE_MXFP4) ? hvx_vmem(mxfp4_to_fp16_lut) :
|
||||
hvx_vmem(q4_0_to_fp16_lut);
|
||||
|
||||
// vscatter setup: write dequantized K-values directly to transposed [K][N] tile positions.
|
||||
// Each int32 element holds a K-row-pair (2 adjacent fp16 values). word[i] at offset i*128
|
||||
// maps to K-rows 2i and 2i+1. Column offset (n*4) added per row.
|
||||
const HVX_Vector v_scat_base = hvx_vmem(hmx_transpose_scatter_offsets);
|
||||
const HVX_Vector v_scat_step = Q6_V_vsplat_R(4); // 4 bytes = 1 column step
|
||||
const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64); // first 16 words (64 bytes)
|
||||
const HVX_Vector v_scat_step = Q6_V_vsplat_R(4);
|
||||
const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64);
|
||||
|
||||
unsigned ct = (unsigned)start_tile / n_k_tiles; // column tile index
|
||||
unsigned kt = (unsigned)start_tile % n_k_tiles; // K tile index
|
||||
for (unsigned t = start_tile; t < end_tile; ) {
|
||||
if (kt >= n_k_tiles) { kt = 0; ct++; }
|
||||
unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div);
|
||||
unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div);
|
||||
|
||||
// --- Batch-4 fast path for Q4: process 4 contiguous K-tiles with one vlut16 per row ---
|
||||
if (is_q4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) {
|
||||
unsigned blk_idx = (kt * 32) / QK_Q4_0x4x2;
|
||||
unsigned sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32; // 0 or 4
|
||||
bool upper = (sub_blk_base >= 4);
|
||||
unsigned packed_off = blk_idx * (QK_Q4_0x4x2 / 2); // 128 contiguous packed bytes
|
||||
unsigned scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE
|
||||
+ sub_blk_base * (int)sizeof(__fp16); // 4 consecutive scales
|
||||
for (unsigned t = start_tile; t < (unsigned)end_tile; ) {
|
||||
if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; }
|
||||
|
||||
__fp16 *tile_bases[4];
|
||||
for (unsigned g = 0; g < 4; g++) { tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; }
|
||||
|
||||
HVX_Vector v_off = v_scat_base;
|
||||
|
||||
unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
|
||||
unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
|
||||
|
||||
for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
|
||||
const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
|
||||
const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
|
||||
|
||||
HVX_Vector_x2 dv0 = dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
|
||||
HVX_Vector_x2 dv1 = dequantize_x4x2_q4_0_x4groups_hvx(r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt);
|
||||
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[0]);
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[1]);
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
|
||||
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[0]);
|
||||
Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[1]);
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
|
||||
}
|
||||
|
||||
for (int g = 0; g < 4; g++) { (void) *(volatile HVX_Vector *)(tile_bases[g]); }
|
||||
t += 4; kt += 4;
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- Batch-4 fast path for MXFP4: same nibble layout but E8M0 scales ---
|
||||
if (weight_type == HTP_TYPE_MXFP4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) {
|
||||
// Batch-4 fast path for MXFP4
|
||||
if ((kt % 4 == 0) && (t + 4 <= (unsigned)end_tile) && (fastdiv(t + 3, &n_k_tiles_div) == ct)) {
|
||||
int blk_idx = (kt * 32) / QK_MXFP4x4x2;
|
||||
int sub_blk_base = ((kt * 32) % QK_MXFP4x4x2) / 32; // 0 or 4
|
||||
int sub_blk_base = ((kt * 32) % QK_MXFP4x4x2) / 32;
|
||||
bool upper = (sub_blk_base >= 4);
|
||||
int packed_off = blk_idx * (QK_MXFP4x4x2 / 2); // 128 contiguous packed bytes
|
||||
int e8m0_blk_off = qrow_size + blk_idx * HMX_X4X2_MXFP4_EBLK_SIZE; // all 8 E8M0 scales
|
||||
int packed_off = blk_idx * (QK_MXFP4x4x2 / 2);
|
||||
int e8m0_blk_off = qrow_size + blk_idx * HMX_X4X2_MXFP4_EBLK_SIZE;
|
||||
|
||||
__fp16 * tile_bases[4];
|
||||
for (int g = 0; g < 4; g++) {
|
||||
tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS;
|
||||
tile_bases[g] = state->dst + (t + g) * HMX_FP16_TILE_N_ELMS;
|
||||
}
|
||||
|
||||
HVX_Vector v_off = v_scat_base;
|
||||
for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
|
||||
int row0 = ct * HMX_FP16_TILE_N_COLS + r;
|
||||
int row1 = row0 + 1;
|
||||
const uint8_t * r0 = vtcm_src + row0 * row_stride;
|
||||
const uint8_t * r1 = vtcm_src + row1 * row_stride;
|
||||
const uint8_t * r0 = state->src + row0 * state->row_stride;
|
||||
const uint8_t * r1 = state->src + row1 * state->row_stride;
|
||||
|
||||
// Batch-convert all 8 E8M0 scales once per row (stays in HVX register)
|
||||
mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off);
|
||||
|
||||
HVX_Vector_x4 dv0, dv1;
|
||||
dv0 = dequantize_x4x2_mxfp4_x4groups_hvx(r0 + packed_off, upper, sub_blk_base, vlut_cvt, r0_e8);
|
||||
if (row1 < n_cols) {
|
||||
if (row1 < state->n_cols) {
|
||||
mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off);
|
||||
dv1 = dequantize_x4x2_mxfp4_x4groups_hvx(r1 + packed_off, upper, sub_blk_base, vlut_cvt, r1_e8);
|
||||
} else {
|
||||
@@ -434,41 +604,13 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
|
||||
(void) *(volatile HVX_Vector *) (tile_bases[g]);
|
||||
}
|
||||
|
||||
t += 4;
|
||||
t += 4; kt += 4;
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- Single-tile fallback ---
|
||||
__fp16 *tile_base = vtcm_dst + t * HMX_FP16_TILE_N_ELMS;
|
||||
|
||||
if (is_q4) {
|
||||
unsigned blk_idx = (kt * 32) / QK_Q4_0x4x2;
|
||||
unsigned sub_blk = ((kt * 32) % QK_Q4_0x4x2) / 32;
|
||||
bool upper = (sub_blk >= 4);
|
||||
unsigned byte_off = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;
|
||||
unsigned scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16);
|
||||
|
||||
HVX_Vector v_off = v_scat_base; // reset to column 0
|
||||
unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
|
||||
unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
|
||||
for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
|
||||
const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
|
||||
const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
|
||||
|
||||
HVX_Vector v0 = dequantize_x4x2_q4_0_group_hvx(
|
||||
r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
|
||||
HVX_Vector v1 = (row1 < n_cols)
|
||||
? dequantize_x4x2_q4_0_group_hvx(
|
||||
r1 + byte_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt)
|
||||
: Q6_V_vzero();
|
||||
|
||||
Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
|
||||
Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1);
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
|
||||
}
|
||||
(void) *(volatile HVX_Vector *)(tile_base);
|
||||
} else if (weight_type == HTP_TYPE_MXFP4) {
|
||||
// Single-tile fallback
|
||||
__fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS;
|
||||
{
|
||||
int blk_idx = (kt * 32) / QK_MXFP4x4x2;
|
||||
int sub_blk = ((kt * 32) % QK_MXFP4x4x2) / 32;
|
||||
bool upper = (sub_blk >= 4);
|
||||
@@ -480,15 +622,14 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
|
||||
int row0 = ct * HMX_FP16_TILE_N_COLS + r;
|
||||
int row1 = row0 + 1;
|
||||
|
||||
const uint8_t * r0 = vtcm_src + row0 * row_stride;
|
||||
const uint8_t * r1 = vtcm_src + row1 * row_stride;
|
||||
const uint8_t * r0 = state->src + row0 * state->row_stride;
|
||||
const uint8_t * r1 = state->src + row1 * state->row_stride;
|
||||
|
||||
// Batch-convert all 8 E8M0 scales once per row (stays in HVX register)
|
||||
mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off);
|
||||
|
||||
HVX_Vector v0 = dequantize_x4x2_mxfp4_group_hvx(r0 + byte_off, upper, sub_blk, vlut_cvt, r0_e8);
|
||||
HVX_Vector v1;
|
||||
if (row1 < n_cols) {
|
||||
if (row1 < state->n_cols) {
|
||||
mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off);
|
||||
v1 = dequantize_x4x2_mxfp4_group_hvx(r1 + byte_off, upper, sub_blk, vlut_cvt, r1_e8);
|
||||
} else {
|
||||
@@ -501,23 +642,59 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
|
||||
}
|
||||
(void) *(volatile HVX_Vector *) (tile_base);
|
||||
} else {
|
||||
// Q8_0
|
||||
}
|
||||
++t; ++kt;
|
||||
}
|
||||
|
||||
if (start_tile < end_tile) {
|
||||
(void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS);
|
||||
}
|
||||
}
|
||||
|
||||
static void dequantize_x4x2_worker_loop_mxfp4(unsigned int n, unsigned int i, void *data) {
|
||||
x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;
|
||||
for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {
|
||||
int start = task_id * state->n_tiles_per_task;
|
||||
int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);
|
||||
dequantize_x4x2_weight_to_fp16_tiles_task_mxfp4(state, start, end);
|
||||
}
|
||||
}
|
||||
|
||||
static void dequantize_x4x2_weight_to_fp16_tiles_task_q8_0(
|
||||
const x4x2_dequantize_state_t *state,
|
||||
int start_tile, int end_tile) {
|
||||
|
||||
const int n_k_tiles = state->n_k_tiles;
|
||||
const int qrow_size = state->k_block;
|
||||
const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div;
|
||||
|
||||
const HVX_Vector v_scat_base = hvx_vmem(hmx_transpose_scatter_offsets);
|
||||
const HVX_Vector v_scat_step = Q6_V_vsplat_R(4);
|
||||
const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64);
|
||||
|
||||
unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div);
|
||||
unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div);
|
||||
|
||||
for (unsigned t = start_tile; t < (unsigned)end_tile; ) {
|
||||
if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; }
|
||||
|
||||
__fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS;
|
||||
{
|
||||
int blk_idx = (kt * 32) / QK_Q8_0x4x2;
|
||||
int sub_blk = ((kt * 32) % QK_Q8_0x4x2) / 32;
|
||||
int byte_off = blk_idx * QK_Q8_0x4x2 + sub_blk * 32;
|
||||
int scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16);
|
||||
|
||||
HVX_Vector v_off = v_scat_base; // reset to column 0
|
||||
HVX_Vector v_off = v_scat_base;
|
||||
for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
|
||||
int row0 = ct * HMX_FP16_TILE_N_COLS + r;
|
||||
int row1 = row0 + 1;
|
||||
|
||||
const uint8_t *r0 = vtcm_src + row0 * row_stride;
|
||||
const uint8_t *r1 = vtcm_src + row1 * row_stride;
|
||||
const uint8_t *r0 = state->src + row0 * state->row_stride;
|
||||
const uint8_t *r1 = state->src + row1 * state->row_stride;
|
||||
|
||||
HVX_Vector v0 = dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r0 + byte_off), (const __fp16 *)(r0 + scale_off));
|
||||
HVX_Vector v1 = (row1 < n_cols) ? dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r1 + byte_off), (const __fp16 *)(r1 + scale_off)) : Q6_V_vzero();
|
||||
HVX_Vector v1 = (row1 < state->n_cols) ? dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r1 + byte_off), (const __fp16 *)(r1 + scale_off)) : Q6_V_vzero();
|
||||
|
||||
Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);
|
||||
v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
|
||||
@@ -529,50 +706,31 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
|
||||
++t; ++kt;
|
||||
}
|
||||
|
||||
// Drain HVX scatter write buffer: a vmem load on the same HW thread retires
|
||||
// all pending scatter entries to VTCM. Without this, the main thread's HMX
|
||||
// reads may see stale data because atomic_fetch_sub (release) only orders
|
||||
// regular stores, not the HVX scatter buffer.
|
||||
if (start_tile < end_tile) {
|
||||
(void) *(volatile HVX_Vector *)(vtcm_dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS);
|
||||
(void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS);
|
||||
}
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
__fp16 *dst;
|
||||
const uint8_t *src;
|
||||
int n_cols;
|
||||
int k_block;
|
||||
size_t row_stride;
|
||||
int weight_type;
|
||||
int n_tot_tiles;
|
||||
int n_tiles_per_task;
|
||||
int n_tasks;
|
||||
} x4x2_dequantize_state_t;
|
||||
|
||||
static void dequantize_x4x2_worker_loop(unsigned int n, unsigned int i, void *data) {
|
||||
static void dequantize_x4x2_worker_loop_q8_0(unsigned int n, unsigned int i, void *data) {
|
||||
x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data;
|
||||
|
||||
for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) {
|
||||
int start = task_id * state->n_tiles_per_task;
|
||||
int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles);
|
||||
|
||||
dequantize_x4x2_weight_to_fp16_tiles_task(
|
||||
state->dst, state->src, state->n_cols, state->k_block,
|
||||
state->row_stride, state->weight_type, start, end);
|
||||
dequantize_x4x2_weight_to_fp16_tiles_task_q8_0(state, start, end);
|
||||
}
|
||||
}
|
||||
|
||||
static void dequantize_x4x2_weight_chunk_to_fp16_tiles(
|
||||
struct htp_context *ctx, __fp16 *vtcm_dst,
|
||||
const void *vtcm_src, int n_cols, int k_block,
|
||||
size_t row_stride, int weight_type) {
|
||||
size_t row_stride, int weight_type,
|
||||
int n_k_tiles, struct fastdiv_values n_k_tiles_div,
|
||||
worker_callback_t dequant_worker_fn) {
|
||||
|
||||
assert(n_cols % HMX_FP16_TILE_N_COLS == 0);
|
||||
assert(k_block % HMX_FP16_TILE_N_COLS == 0);
|
||||
|
||||
size_t n_col_tiles = n_cols / HMX_FP16_TILE_N_COLS;
|
||||
size_t n_k_tiles = k_block / HMX_FP16_TILE_N_COLS;
|
||||
size_t n_tot_tiles = n_col_tiles * n_k_tiles;
|
||||
|
||||
size_t n_tiles_per_task = hmx_ceil_div(n_tot_tiles, ctx->n_threads);
|
||||
@@ -587,12 +745,16 @@ static void dequantize_x4x2_weight_chunk_to_fp16_tiles(
|
||||
state.k_block = k_block;
|
||||
state.row_stride = row_stride;
|
||||
state.weight_type = weight_type;
|
||||
state.n_k_tiles = n_k_tiles;
|
||||
state.n_k_tiles_div = n_k_tiles_div;
|
||||
|
||||
worker_pool_run_func(ctx->worker_pool, dequantize_x4x2_worker_loop, &state, ctx->n_threads);
|
||||
worker_pool_run_func(ctx->worker_pool, dequant_worker_fn, &state, ctx->n_threads);
|
||||
}
|
||||
|
||||
// --- End x4x2 dequantizers ---
|
||||
|
||||
#pragma clang diagnostic ignored "-Wbackend-plugin" // spurios warning for hmx intrinsics
|
||||
|
||||
// requires external HMX lock
|
||||
static void core_dot_chunk_fp16(__fp16 *restrict output, const __fp16 *restrict activation, const __fp16 *restrict weight, const __fp16 *restrict scales,
|
||||
int n_row_tiles, int n_col_tiles, int n_dot_tiles) {
|
||||
@@ -883,6 +1045,20 @@ int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *
|
||||
return -1;
|
||||
}
|
||||
|
||||
worker_callback_t dequant_worker_fn = NULL;
|
||||
switch (weight_type) {
|
||||
case HTP_TYPE_Q4_0: dequant_worker_fn = dequantize_x4x2_worker_loop_q4_0; break;
|
||||
case HTP_TYPE_IQ4_NL: dequant_worker_fn = dequantize_x4x2_worker_loop_iq4_nl; break;
|
||||
case HTP_TYPE_Q4_1: dequant_worker_fn = dequantize_x4x2_worker_loop_q4_1; break;
|
||||
case HTP_TYPE_MXFP4: dequant_worker_fn = dequantize_x4x2_worker_loop_mxfp4; break;
|
||||
case HTP_TYPE_Q8_0: dequant_worker_fn = dequantize_x4x2_worker_loop_q8_0; break;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
|
||||
const int n_k_tiles = k / HMX_FP16_TILE_N_COLS;
|
||||
const struct fastdiv_values n_k_tiles_div = init_fastdiv_values(n_k_tiles);
|
||||
|
||||
// --- Dynamic VTCM layout ---
|
||||
const size_t vec_dot_size = k * sizeof(__fp16);
|
||||
const size_t vtcm_budget = ctx->vtcm_size;
|
||||
@@ -975,7 +1151,7 @@ int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *
|
||||
{
|
||||
// B0: wait for DMA, dequant weight chunk 0
|
||||
dma_queue_pop(ctx->dma[0]);
|
||||
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type);
|
||||
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn);
|
||||
|
||||
// A1: issue DMA for weight chunk 1
|
||||
const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols);
|
||||
@@ -994,7 +1170,7 @@ int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *
|
||||
// B1: DMA pop + dequant (runs in parallel with C0 on HMX worker)
|
||||
if (1 < n_chunk_cnt) {
|
||||
dma_queue_pop(ctx->dma[0]);
|
||||
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type);
|
||||
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1036,7 +1212,7 @@ int hmx_matmul_q_f32(struct htp_context *ctx, float *restrict dst, const float *
|
||||
// B_{i+2}: DMA pop + dequant (multi-thread HVX, parallel with C_{i+1})
|
||||
if (i + 2 < n_chunk_cnt) {
|
||||
dma_queue_pop(ctx->dma[0]);
|
||||
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type);
|
||||
dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -104,6 +104,7 @@ int op_argsort(struct htp_ops_context * octx);
|
||||
int op_ssm_conv(struct htp_ops_context * octx);
|
||||
int op_cumsum(struct htp_ops_context * octx);
|
||||
int op_fill(struct htp_ops_context * octx);
|
||||
int op_concat(struct htp_ops_context * octx);
|
||||
int op_diag(struct htp_ops_context * octx);
|
||||
int op_solve_tri(struct htp_ops_context * octx);
|
||||
int op_gated_delta_net(struct htp_ops_context * octx);
|
||||
|
||||
@@ -20,6 +20,7 @@ enum htp_data_type {
|
||||
HTP_TYPE_F32 = 0,
|
||||
HTP_TYPE_F16 = 1,
|
||||
HTP_TYPE_Q4_0 = 2,
|
||||
HTP_TYPE_Q4_1 = 3,
|
||||
HTP_TYPE_Q8_0 = 8,
|
||||
HTP_TYPE_IQ4_NL = 20,
|
||||
HTP_TYPE_I32 = 26,
|
||||
@@ -28,6 +29,7 @@ enum htp_data_type {
|
||||
|
||||
// types used internally for repack, dyn.quant, etc
|
||||
HTP_TYPE_Q4_0x4x2 = 200,
|
||||
HTP_TYPE_Q4_1x4x2,
|
||||
HTP_TYPE_Q8_0x4x2,
|
||||
HTP_TYPE_MXFP4x4x2,
|
||||
|
||||
@@ -89,6 +91,7 @@ enum htp_op_code {
|
||||
HTP_OP_TRI,
|
||||
HTP_OP_PAD,
|
||||
HTP_OP_NORM,
|
||||
HTP_OP_CONCAT,
|
||||
|
||||
HTP_OP_INVALID
|
||||
};
|
||||
|
||||
90
ggml/src/ggml-hexagon/htp/hvx-sin-cos.h
Normal file
90
ggml/src/ggml-hexagon/htp/hvx-sin-cos.h
Normal file
@@ -0,0 +1,90 @@
|
||||
#ifndef HVX_SIN_COS_H
|
||||
#define HVX_SIN_COS_H
|
||||
|
||||
#include "hvx-base.h"
|
||||
#include "hvx-floor.h"
|
||||
|
||||
static inline HVX_Vector hvx_vec_cos_f32(HVX_Vector x) {
|
||||
HVX_Vector const_inv_pi = hvx_vec_splat_f32(0.3183098861837907f);
|
||||
HVX_Vector const_half = hvx_vec_splat_f32(0.5f);
|
||||
HVX_Vector const_pi = hvx_vec_splat_f32(3.141592653589793f);
|
||||
HVX_Vector const_one = hvx_vec_splat_f32(1.0f);
|
||||
HVX_Vector const_neg_one = hvx_vec_splat_f32(-1.0f);
|
||||
|
||||
// n = floor(x * (1/pi) + 0.5)
|
||||
HVX_Vector n_float = hvx_vec_floor_f32(hvx_vec_add_f32_f32(hvx_vec_mul_f32_f32(x, const_inv_pi), const_half));
|
||||
|
||||
// y = x - n * pi
|
||||
HVX_Vector y = hvx_vec_sub_f32_f32(x, hvx_vec_mul_f32_f32(n_float, const_pi));
|
||||
|
||||
// Sign determination: if n is odd, sign is -1.0f, else 1.0f
|
||||
// half_n = n * 0.5f
|
||||
HVX_Vector half_n = hvx_vec_mul_f32_f32(n_float, const_half);
|
||||
// floor_half_n = floor(half_n)
|
||||
HVX_Vector floor_half_n = hvx_vec_floor_f32(half_n);
|
||||
// is_odd = half_n > floor_half_n
|
||||
HVX_VectorPred is_odd = Q6_Q_vcmp_gt_VsfVsf(half_n, floor_half_n);
|
||||
// sign = vmux(is_odd, -1.0f, 1.0f)
|
||||
HVX_Vector sign = Q6_V_vmux_QVV(is_odd, const_neg_one, const_one);
|
||||
|
||||
// z = y^2
|
||||
HVX_Vector z = hvx_vec_mul_f32_f32(y, y);
|
||||
|
||||
// Chebyshev approximation for cos(y)
|
||||
HVX_Vector c4 = hvx_vec_splat_f32(2.3557242013849433e-05f);
|
||||
HVX_Vector c3 = hvx_vec_splat_f32(-0.0013871428263450528f);
|
||||
HVX_Vector c2 = hvx_vec_splat_f32(0.041665895266688284f);
|
||||
HVX_Vector c1 = hvx_vec_splat_f32(-0.4999999360426369f);
|
||||
HVX_Vector c0 = hvx_vec_splat_f32(0.9999999999071725f);
|
||||
|
||||
HVX_Vector cos_y = hvx_vec_add_f32_f32(c3, hvx_vec_mul_f32_f32(z, c4));
|
||||
cos_y = hvx_vec_add_f32_f32(c2, hvx_vec_mul_f32_f32(z, cos_y));
|
||||
cos_y = hvx_vec_add_f32_f32(c1, hvx_vec_mul_f32_f32(z, cos_y));
|
||||
cos_y = hvx_vec_add_f32_f32(c0, hvx_vec_mul_f32_f32(z, cos_y));
|
||||
|
||||
return hvx_vec_mul_f32_f32(cos_y, sign);
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_sin_f32(HVX_Vector x) {
|
||||
HVX_Vector const_inv_pi = hvx_vec_splat_f32(0.3183098861837907f);
|
||||
HVX_Vector const_half = hvx_vec_splat_f32(0.5f);
|
||||
HVX_Vector const_pi = hvx_vec_splat_f32(3.141592653589793f);
|
||||
HVX_Vector const_one = hvx_vec_splat_f32(1.0f);
|
||||
HVX_Vector const_neg_one = hvx_vec_splat_f32(-1.0f);
|
||||
|
||||
// n = floor(x * (1/pi) + 0.5)
|
||||
HVX_Vector n_float = hvx_vec_floor_f32(hvx_vec_add_f32_f32(hvx_vec_mul_f32_f32(x, const_inv_pi), const_half));
|
||||
|
||||
// y = x - n * pi
|
||||
HVX_Vector y = hvx_vec_sub_f32_f32(x, hvx_vec_mul_f32_f32(n_float, const_pi));
|
||||
|
||||
// Sign determination: if n is odd, sign is -1.0f, else 1.0f
|
||||
// half_n = n * 0.5f
|
||||
HVX_Vector half_n = hvx_vec_mul_f32_f32(n_float, const_half);
|
||||
// floor_half_n = floor(half_n)
|
||||
HVX_Vector floor_half_n = hvx_vec_floor_f32(half_n);
|
||||
// is_odd = half_n > floor_half_n
|
||||
HVX_VectorPred is_odd = Q6_Q_vcmp_gt_VsfVsf(half_n, floor_half_n);
|
||||
// sign = vmux(is_odd, -1.0f, 1.0f)
|
||||
HVX_Vector sign = Q6_V_vmux_QVV(is_odd, const_neg_one, const_one);
|
||||
|
||||
// z = y^2
|
||||
HVX_Vector z = hvx_vec_mul_f32_f32(y, y);
|
||||
|
||||
// Chebyshev approximation for sin(y)
|
||||
HVX_Vector s4 = hvx_vec_splat_f32(2.642186986152672e-06f);
|
||||
HVX_Vector s3 = hvx_vec_splat_f32(-0.00019825318964070864f);
|
||||
HVX_Vector s2 = hvx_vec_splat_f32(0.00833326283319605f);
|
||||
HVX_Vector s1 = hvx_vec_splat_f32(-0.16666666082087775f);
|
||||
HVX_Vector s0 = hvx_vec_splat_f32(0.999999999915155f);
|
||||
|
||||
HVX_Vector sin_y = hvx_vec_add_f32_f32(s3, hvx_vec_mul_f32_f32(z, s4));
|
||||
sin_y = hvx_vec_add_f32_f32(s2, hvx_vec_mul_f32_f32(z, sin_y));
|
||||
sin_y = hvx_vec_add_f32_f32(s1, hvx_vec_mul_f32_f32(z, sin_y));
|
||||
sin_y = hvx_vec_add_f32_f32(s0, hvx_vec_mul_f32_f32(z, sin_y));
|
||||
sin_y = hvx_vec_mul_f32_f32(y, sin_y);
|
||||
|
||||
return hvx_vec_mul_f32_f32(sin_y, sign);
|
||||
}
|
||||
|
||||
#endif /* HVX_SIN_COS_H */
|
||||
@@ -14,6 +14,8 @@
|
||||
#include "hvx-sqrt.h"
|
||||
#include "hvx-arith.h"
|
||||
#include "hvx-div.h"
|
||||
#include "hvx-floor.h"
|
||||
#include "hvx-sin-cos.h"
|
||||
#include "hvx-base.h"
|
||||
|
||||
#endif /* HVX_UTILS_H */
|
||||
|
||||
@@ -420,8 +420,7 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
|
||||
|
||||
ctx->n_threads = n_hvx;
|
||||
for (int i = 0; i < ctx->n_threads; i++) {
|
||||
// see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
|
||||
ctx->dma[i] = dma_queue_create(128);
|
||||
ctx->dma[i] = dma_queue_create(256); // queue depth
|
||||
}
|
||||
|
||||
// init worker pool
|
||||
@@ -601,6 +600,9 @@ static int execute_op(struct htp_ops_context * octx) {
|
||||
case HTP_OP_PAD:
|
||||
return op_pad(octx);
|
||||
|
||||
case HTP_OP_CONCAT:
|
||||
return op_concat(octx);
|
||||
|
||||
case HTP_OP_GATED_DELTA_NET:
|
||||
return op_gated_delta_net(octx);
|
||||
|
||||
@@ -851,6 +853,11 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
|
||||
for (uint32_t i=0; i < n_ops; i++) {
|
||||
struct profile_data prof;
|
||||
|
||||
if (i == (n_ops-1)) {
|
||||
// wake up the host before starting the last op
|
||||
dspqueue_write_early_wakeup_noblock(queue, 0, 0);
|
||||
}
|
||||
|
||||
profile_start(ctx->profiler, &prof);
|
||||
|
||||
proc_op_req(octx, tens, i, &ops[i]);
|
||||
@@ -867,8 +874,6 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
|
||||
}
|
||||
}
|
||||
|
||||
// dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);
|
||||
|
||||
struct htp_opbatch_rsp rsp;
|
||||
rsp.id = req.id;
|
||||
rsp.status = HTP_STATUS_OK;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -7,6 +7,7 @@
|
||||
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "hex-dma.h"
|
||||
#include "hvx-utils.h"
|
||||
@@ -75,6 +76,9 @@ struct htp_rope_context {
|
||||
size_t theta_cache_offset;
|
||||
uint32_t src0_nrows;
|
||||
|
||||
struct fastdiv_values div_ne2_ne1;
|
||||
struct fastdiv_values div_ne1;
|
||||
|
||||
uint64_t t_start;
|
||||
};
|
||||
|
||||
@@ -117,13 +121,84 @@ static __attribute__((noinline)) void rope_cache_init(const float theta_base,
|
||||
float * cache,
|
||||
const float theta_scale) {
|
||||
// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
|
||||
float theta = theta_base;
|
||||
#if __HVX_ARCH__ >= 79
|
||||
const bool is_v79_or_newer = true;
|
||||
#else
|
||||
const bool is_v79_or_newer = false;
|
||||
#endif
|
||||
|
||||
for (uint32_t i0 = 0; i0 < ne0; i0 += 2) {
|
||||
const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f;
|
||||
rope_yarn_one(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, cache);
|
||||
if (is_v79_or_newer && ext_factor == 0.0f) {
|
||||
// Fast path: fully vectorized
|
||||
// We process 32 pairs (64 elements) per iteration.
|
||||
const uint32_t n_blocks = ne0 / 64;
|
||||
|
||||
theta *= theta_scale;
|
||||
// Initialize theta scale powers: [1.0f, theta_scale, theta_scale^2, ..., theta_scale^31]
|
||||
float __attribute__((aligned(128))) theta_powers[32];
|
||||
theta_powers[0] = 1.0f;
|
||||
for (int j = 1; j < 32; j++) {
|
||||
theta_powers[j] = theta_powers[j - 1] * theta_scale;
|
||||
}
|
||||
HVX_Vector v_theta_powers = hvx_vmem(theta_powers);
|
||||
|
||||
HVX_Vector v_freq_scale = hvx_vec_splat_f32(freq_scale);
|
||||
HVX_Vector v_mscale = hvx_vec_splat_f32(mscale);
|
||||
|
||||
// Base theta starts at theta_base
|
||||
float theta_block = theta_base;
|
||||
// The scale factor for the next block is theta_scale^32
|
||||
float theta_scale_32 = 1.0f;
|
||||
for (int j = 0; j < 32; j++) {
|
||||
theta_scale_32 *= theta_scale;
|
||||
}
|
||||
|
||||
for (uint32_t b = 0; b < n_blocks; b++) {
|
||||
uint32_t i0 = b * 64;
|
||||
HVX_Vector v_theta_base = hvx_vec_splat_f32(theta_block);
|
||||
HVX_Vector v_theta = hvx_vec_mul_f32_f32(v_theta_base, v_theta_powers);
|
||||
|
||||
if (freq_factors) {
|
||||
// Load 32 elements of freq_factors
|
||||
HVX_Vector v_ff = hvx_vmemu(freq_factors + i0 / 2);
|
||||
HVX_Vector v_inv_ff = hvx_vec_inverse_f32(v_ff);
|
||||
v_theta = hvx_vec_mul_f32_f32(v_theta, v_inv_ff);
|
||||
}
|
||||
|
||||
HVX_Vector v_theta_final = hvx_vec_mul_f32_f32(v_theta, v_freq_scale);
|
||||
|
||||
HVX_Vector vcos = hvx_vec_cos_f32(v_theta_final);
|
||||
HVX_Vector vsin = hvx_vec_sin_f32(v_theta_final);
|
||||
|
||||
vcos = hvx_vec_mul_f32_f32(vcos, v_mscale);
|
||||
vsin = hvx_vec_mul_f32_f32(vsin, v_mscale);
|
||||
|
||||
HVX_VectorPair vstore = Q6_W_vshuff_VVR(vsin, vcos, -4);
|
||||
|
||||
if (((uintptr_t)cache) % 128 == 0) {
|
||||
hvx_vmem(cache + i0 + 0) = Q6_V_lo_W(vstore);
|
||||
hvx_vmem(cache + i0 + 32) = Q6_V_hi_W(vstore);
|
||||
} else {
|
||||
hvx_vec_store_u(cache + i0 + 0, 32 * sizeof(float), Q6_V_lo_W(vstore));
|
||||
hvx_vec_store_u(cache + i0 + 32, 32 * sizeof(float), Q6_V_hi_W(vstore));
|
||||
}
|
||||
|
||||
theta_block *= theta_scale_32;
|
||||
}
|
||||
|
||||
// Leftovers
|
||||
float theta = theta_block;
|
||||
for (uint32_t i0 = n_blocks * 64; i0 < ne0; i0 += 2) {
|
||||
const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f;
|
||||
rope_yarn_one(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, cache);
|
||||
theta *= theta_scale;
|
||||
}
|
||||
} else {
|
||||
// Fallback to original scalar loop
|
||||
float theta = theta_base;
|
||||
for (uint32_t i0 = 0; i0 < ne0; i0 += 2) {
|
||||
const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f;
|
||||
rope_yarn_one(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, cache);
|
||||
theta *= theta_scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -195,24 +270,18 @@ static void rope_corr_dims(int n_dims,
|
||||
}
|
||||
|
||||
static inline void hvx_rope_neox_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) {
|
||||
const HVX_Vector * restrict vsrc = (const HVX_Vector *) src0;
|
||||
const HVX_Vector * restrict vtheta = (const HVX_Vector *) theta_cache;
|
||||
HVX_Vector * restrict vdst = (HVX_Vector *) dst;
|
||||
const uint32_t he = ne / 2;
|
||||
const uint32_t nvec = he / 32;
|
||||
const uint32_t nloe = he % 32;
|
||||
|
||||
uint32_t nvec = (ne / (VLEN_FP32 * 2) * 2); // 2 vecs per loop, step of 2
|
||||
for (uint32_t i = 0; i < nvec; i++) {
|
||||
HVX_Vector v0 = ((const HVX_Vector *) src0)[i];
|
||||
HVX_Vector v1 = hvx_vmemu(src0 + he + i * 32);
|
||||
|
||||
uint32_t he = ne / 2; // half_dims offset in elements
|
||||
uint32_t hv = he / VLEN_FP32; // half_dims offset in vectors
|
||||
HVX_Vector v2 = ((const HVX_Vector *) theta_cache)[i * 2 + 0];
|
||||
HVX_Vector v3 = ((const HVX_Vector *) theta_cache)[i * 2 + 1];
|
||||
|
||||
#pragma unroll(2)
|
||||
for (uint32_t i = 0; i < nvec; i += 2) {
|
||||
HVX_Vector v0 = vsrc[i/2+0];
|
||||
HVX_Vector v1 = vsrc[i/2+hv];
|
||||
|
||||
HVX_Vector v2 = vtheta[i+0];
|
||||
HVX_Vector v3 = vtheta[i+1];
|
||||
|
||||
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta
|
||||
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);
|
||||
|
||||
HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_lo_W(vcos_sin));
|
||||
HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_hi_W(vcos_sin));
|
||||
@@ -222,37 +291,45 @@ static inline void hvx_rope_neox_f32_aa(float * restrict dst, const float * rest
|
||||
HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
|
||||
HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
|
||||
|
||||
vdst[i/2+0] = Q6_Vsf_equals_Vqf32(v4);
|
||||
vdst[i/2+hv] = Q6_Vsf_equals_Vqf32(v5);
|
||||
((HVX_Vector *) dst)[i] = Q6_Vsf_equals_Vqf32(v4);
|
||||
hvx_vmemu(dst + he + i * 32) = Q6_Vsf_equals_Vqf32(v5);
|
||||
}
|
||||
|
||||
for (uint32_t i = nvec * VLEN_FP32; i < ne; i += 2) {
|
||||
const float cos_theta = theta_cache[i+0];
|
||||
const float sin_theta = theta_cache[i+1];
|
||||
float x0 = src0[i/2];
|
||||
float x1 = src0[i/2 + he];
|
||||
dst[i/2] = x0 * cos_theta - x1 * sin_theta;
|
||||
dst[i/2 + he] = x0 * sin_theta + x1 * cos_theta;
|
||||
if (nloe > 0) {
|
||||
HVX_Vector v0 = hvx_vmemu(src0 + nvec * 32);
|
||||
HVX_Vector v1 = hvx_vmemu(src0 + he + nvec * 32);
|
||||
|
||||
HVX_Vector v2 = ((const HVX_Vector *) theta_cache)[nvec * 2 + 0];
|
||||
HVX_Vector v3 = ((const HVX_Vector *) theta_cache)[nvec * 2 + 1];
|
||||
|
||||
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);
|
||||
|
||||
HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_lo_W(vcos_sin));
|
||||
HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(v0, Q6_V_hi_W(vcos_sin));
|
||||
HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(v1, Q6_V_lo_W(vcos_sin));
|
||||
HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(v1, Q6_V_hi_W(vcos_sin));
|
||||
|
||||
HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
|
||||
HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
|
||||
|
||||
hvx_vec_store_u(dst + nvec * 32, nloe * sizeof(float), Q6_Vsf_equals_Vqf32(v4));
|
||||
hvx_vec_store_u(dst + he + nvec * 32, nloe * sizeof(float), Q6_Vsf_equals_Vqf32(v5));
|
||||
}
|
||||
}
|
||||
|
||||
static inline void hvx_rope_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) {
|
||||
const HVX_Vector * restrict vsrc = (const HVX_Vector *) src0;
|
||||
const HVX_Vector * restrict vtheta = (const HVX_Vector *) theta_cache;
|
||||
HVX_Vector * restrict vdst = (HVX_Vector *) dst;
|
||||
const uint32_t nvec = ne / 64;
|
||||
const uint32_t nloe = ne % 64;
|
||||
|
||||
uint32_t nvec = (ne / (VLEN_FP32 * 2)) * 2; // 2 vecs per loop, step of two
|
||||
for (uint32_t i = 0; i < nvec; i++) {
|
||||
HVX_Vector v0 = ((const HVX_Vector *) src0)[i * 2 + 0];
|
||||
HVX_Vector v1 = ((const HVX_Vector *) src0)[i * 2 + 1];
|
||||
|
||||
#pragma unroll(2)
|
||||
for (uint32_t i = 0; i < nvec; i+=2) {
|
||||
HVX_Vector v0 = vsrc[i+0];
|
||||
HVX_Vector v1 = vsrc[i+1];
|
||||
HVX_Vector v2 = ((const HVX_Vector *) theta_cache)[i * 2 + 0];
|
||||
HVX_Vector v3 = ((const HVX_Vector *) theta_cache)[i * 2 + 1];
|
||||
|
||||
HVX_Vector v2 = vtheta[i+0];
|
||||
HVX_Vector v3 = vtheta[i+1];
|
||||
|
||||
HVX_VectorPair vx0_x1 = Q6_W_vdeal_VVR(v1, v0, -4); // vx0_x1[0] = x0, vx0_x1[1] = x1
|
||||
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta
|
||||
HVX_VectorPair vx0_x1 = Q6_W_vdeal_VVR(v1, v0, -4);
|
||||
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);
|
||||
|
||||
HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_lo_W(vcos_sin));
|
||||
HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_hi_W(vcos_sin));
|
||||
@@ -264,17 +341,52 @@ static inline void hvx_rope_f32_aa(float * restrict dst, const float * restrict
|
||||
|
||||
HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4);
|
||||
|
||||
vdst[i+0] = Q6_V_lo_W(vstore);
|
||||
vdst[i+1] = Q6_V_hi_W(vstore);
|
||||
((HVX_Vector *) dst)[i * 2 + 0] = Q6_V_lo_W(vstore);
|
||||
((HVX_Vector *) dst)[i * 2 + 1] = Q6_V_hi_W(vstore);
|
||||
}
|
||||
|
||||
for (uint32_t i = nvec * VLEN_FP32; i < ne; i += 2) {
|
||||
const float cos_theta = theta_cache[i+0];
|
||||
const float sin_theta = theta_cache[i+1];
|
||||
float x0 = src0[i+0];
|
||||
float x1 = src0[i+1];
|
||||
dst[i+0] = x0 * cos_theta - x1 * sin_theta;
|
||||
dst[i+1] = x0 * sin_theta + x1 * cos_theta;
|
||||
if (nloe > 0) {
|
||||
if (nloe <= 32) {
|
||||
HVX_Vector v0 = hvx_vmemu(src0 + nvec * 64);
|
||||
HVX_Vector v2 = hvx_vmemu(theta_cache + nvec * 64);
|
||||
|
||||
HVX_VectorPair vx0_x1 = Q6_W_vdeal_VVR(Q6_V_vzero(), v0, -4);
|
||||
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(Q6_V_vzero(), v2, -4);
|
||||
|
||||
HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_lo_W(vcos_sin));
|
||||
HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_hi_W(vcos_sin));
|
||||
HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_lo_W(vcos_sin));
|
||||
HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_hi_W(vcos_sin));
|
||||
|
||||
HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
|
||||
HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
|
||||
|
||||
HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4);
|
||||
|
||||
hvx_vec_store_u(dst + nvec * 64, nloe * sizeof(float), Q6_V_lo_W(vstore));
|
||||
} else {
|
||||
HVX_Vector v0 = hvx_vmemu(src0 + nvec * 64);
|
||||
HVX_Vector v1 = hvx_vmemu(src0 + nvec * 64 + 32);
|
||||
|
||||
HVX_Vector v2 = hvx_vmemu(theta_cache + nvec * 64);
|
||||
HVX_Vector v3 = hvx_vmemu(theta_cache + nvec * 64 + 32);
|
||||
|
||||
HVX_VectorPair vx0_x1 = Q6_W_vdeal_VVR(v1, v0, -4);
|
||||
HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4);
|
||||
|
||||
HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_lo_W(vcos_sin));
|
||||
HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_hi_W(vcos_sin));
|
||||
HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_lo_W(vcos_sin));
|
||||
HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_hi_W(vcos_sin));
|
||||
|
||||
HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
|
||||
HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
|
||||
|
||||
HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4);
|
||||
|
||||
((HVX_Vector *) dst)[nvec * 2 + 0] = Q6_V_lo_W(vstore);
|
||||
hvx_vec_store_u(dst + nvec * 64 + 32, (nloe - 32) * sizeof(float), Q6_V_hi_W(vstore));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -348,13 +460,19 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) {
|
||||
const int32_t * pos = (const int32_t *) src1->data;
|
||||
const float * freq_factors = src2 ? (const float *) src2->data : NULL;
|
||||
|
||||
uint32_t ir = 0;
|
||||
const uint32_t i3_start = fastdiv(src0_start_row, &rctx->div_ne2_ne1);
|
||||
const uint32_t rem = fastmodulo(src0_start_row, ne2 * ne1, &rctx->div_ne2_ne1);
|
||||
const uint32_t i2_start = fastdiv(rem, &rctx->div_ne1);
|
||||
const uint32_t i1_start = fastmodulo(rem, ne1, &rctx->div_ne1);
|
||||
|
||||
uint32_t ir = src0_start_row;
|
||||
uint32_t prev_i2 = (uint32_t) -1;
|
||||
|
||||
for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch
|
||||
for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len
|
||||
for (uint32_t i1 = 0; i1 < ne1; ) { // attn-heads
|
||||
if (ir < src0_start_row) { ir++; i1++; continue; }
|
||||
for (uint32_t i3 = i3_start; i3 < ne3; i3++) { // batch
|
||||
const uint32_t i2_init = (i3 == i3_start) ? i2_start : 0;
|
||||
for (uint32_t i2 = i2_init; i2 < ne2; i2++) { // seq-len
|
||||
const uint32_t i1_init = (i3 == i3_start && i2 == i2_start) ? i1_start : 0;
|
||||
for (uint32_t i1 = i1_init; i1 < ne1; ) { // attn-heads
|
||||
if (ir >= src0_end_row) goto done;
|
||||
|
||||
// Rows in this block
|
||||
@@ -407,9 +525,6 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) {
|
||||
ne0, rctx->ext_factor, rctx->attn_factor,
|
||||
theta_cache, rctx->theta_scale);
|
||||
}
|
||||
|
||||
// FARF(HIGH, "rope-theta %u: ir %u i1 %u i2 %u i3 %u cache %p : usec %u", ith, ir, i1, i2, i3, theta_cache,
|
||||
// (unsigned) HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - rctx->t_start));
|
||||
}
|
||||
|
||||
// Skip output DMA transactions from prev block (if any)
|
||||
@@ -489,7 +604,7 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) {
|
||||
// Aligned row sizes for VTCM
|
||||
const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN);
|
||||
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
|
||||
const size_t theta_cache_size_aligned = hex_round_up(src0->ne[0] * sizeof(float), 128);
|
||||
const size_t theta_cache_size_aligned = hex_round_up(src0->ne[0] * sizeof(float), 256);
|
||||
|
||||
// Calculate spad sizes per thread
|
||||
size_t src0_spad_per_thread = theta_cache_size_aligned + HTP_ROPE_SPAD_NROWS * src0_row_size_aligned;
|
||||
@@ -546,6 +661,11 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) {
|
||||
rctx.src0_nrows = src0_nrows;
|
||||
rctx.src0_nrows_per_thread = (src0_nrows + n_threads - 1) / n_threads;
|
||||
|
||||
if (src0_nrows > 0) {
|
||||
rctx.div_ne2_ne1 = init_fastdiv_values(dst->ne[2] * dst->ne[1]);
|
||||
rctx.div_ne1 = init_fastdiv_values(dst->ne[1]);
|
||||
}
|
||||
|
||||
FARF(HIGH, "rope-f32 n-rows %u n-dims %d ne0 %u ext-factor %.6f theta-scale %.6f attn-factor %.6f\n", rctx.src0_nrows, rctx.n_dims, ne0,
|
||||
rctx.ext_factor, rctx.theta_scale, rctx.attn_factor);
|
||||
|
||||
|
||||
@@ -65,6 +65,9 @@ static void set_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da
|
||||
// parallelize by rows of src0
|
||||
const uint32_t dr = srctx->src0_nrows_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
if (ir0 >= nr) {
|
||||
return;
|
||||
}
|
||||
const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
|
||||
|
||||
const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
|
||||
@@ -109,6 +112,9 @@ static void set_rows_thread_f16_f32(unsigned int nth, unsigned int ith, void *da
|
||||
// parallelize by rows of src0
|
||||
const uint32_t dr = srctx->src0_nrows_per_thread;
|
||||
const uint32_t ir0 = dr * ith;
|
||||
if (ir0 >= nr) {
|
||||
return;
|
||||
}
|
||||
const uint32_t ir1 = (ir0 + dr < nr) ? (ir0 + dr) : nr;
|
||||
|
||||
const bool is_i32 = (octx->src[1]->type == HTP_TYPE_I32);
|
||||
|
||||
@@ -207,7 +207,7 @@ static void hvx_fast_norm_f32(const uint8_t * restrict src,
|
||||
|
||||
// scale = rsqrt(variance + epsilon), mean_x broadcast for subtraction
|
||||
HVX_Vector scale_v = hvx_vec_rsqrt_f32(Q6_Vsf_equals_Vqf32(var_epsilon_v));
|
||||
HVX_Vector mean_x_b = hvx_vec_splat_f32(hvx_vec_get_f32(Q6_Vsf_equals_Vqf32(mean_x_v)));
|
||||
HVX_Vector mean_x_b = hvx_vec_repl_f32(Q6_Vsf_equals_Vqf32(mean_x_v));
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < nvec; i++) {
|
||||
|
||||
@@ -215,6 +215,30 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets);
|
||||
// device
|
||||
//
|
||||
|
||||
enum ggml_metal_device_id {
|
||||
GGML_METAL_DEVICE_GENERIC = 0,
|
||||
|
||||
GGML_METAL_DEVICE_M1,
|
||||
GGML_METAL_DEVICE_M1_PRO,
|
||||
GGML_METAL_DEVICE_M1_MAX,
|
||||
GGML_METAL_DEVICE_M1_ULTRA,
|
||||
GGML_METAL_DEVICE_M2,
|
||||
GGML_METAL_DEVICE_M2_PRO,
|
||||
GGML_METAL_DEVICE_M2_MAX,
|
||||
GGML_METAL_DEVICE_M2_ULTRA,
|
||||
GGML_METAL_DEVICE_M3,
|
||||
GGML_METAL_DEVICE_M3_PRO,
|
||||
GGML_METAL_DEVICE_M3_MAX,
|
||||
GGML_METAL_DEVICE_M3_ULTRA,
|
||||
GGML_METAL_DEVICE_M4,
|
||||
GGML_METAL_DEVICE_M4_PRO,
|
||||
GGML_METAL_DEVICE_M4_MAX,
|
||||
GGML_METAL_DEVICE_M5,
|
||||
GGML_METAL_DEVICE_M5_PRO,
|
||||
GGML_METAL_DEVICE_M5_MAX,
|
||||
GGML_METAL_DEVICE_M5_ULTRA,
|
||||
};
|
||||
|
||||
struct ggml_metal_device_props {
|
||||
int device;
|
||||
char name[128];
|
||||
@@ -234,6 +258,8 @@ struct ggml_metal_device_props {
|
||||
|
||||
bool supports_gpu_family_apple7;
|
||||
|
||||
enum ggml_metal_device_id device_id;
|
||||
|
||||
int op_offload_min_batch_size;
|
||||
};
|
||||
|
||||
|
||||
@@ -628,6 +628,50 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) {
|
||||
free(rsets);
|
||||
}
|
||||
|
||||
static enum ggml_metal_device_id ggml_metal_device_id_parse(const char * name) {
|
||||
if (!name) {
|
||||
return GGML_METAL_DEVICE_GENERIC;
|
||||
}
|
||||
|
||||
static const char prefix[] = "Apple ";
|
||||
if (strncmp(name, prefix, sizeof(prefix) - 1) != 0) {
|
||||
return GGML_METAL_DEVICE_GENERIC;
|
||||
}
|
||||
const char * suffix = name + sizeof(prefix) - 1;
|
||||
|
||||
static const struct {
|
||||
const char * name;
|
||||
enum ggml_metal_device_id id;
|
||||
} table[] = {
|
||||
{"M1", GGML_METAL_DEVICE_M1},
|
||||
{"M1 Pro", GGML_METAL_DEVICE_M1_PRO},
|
||||
{"M1 Max", GGML_METAL_DEVICE_M1_MAX},
|
||||
{"M1 Ultra", GGML_METAL_DEVICE_M1_ULTRA},
|
||||
{"M2", GGML_METAL_DEVICE_M2},
|
||||
{"M2 Pro", GGML_METAL_DEVICE_M2_PRO},
|
||||
{"M2 Max", GGML_METAL_DEVICE_M2_MAX},
|
||||
{"M2 Ultra", GGML_METAL_DEVICE_M2_ULTRA},
|
||||
{"M3", GGML_METAL_DEVICE_M3},
|
||||
{"M3 Pro", GGML_METAL_DEVICE_M3_PRO},
|
||||
{"M3 Max", GGML_METAL_DEVICE_M3_MAX},
|
||||
{"M3 Ultra", GGML_METAL_DEVICE_M3_ULTRA},
|
||||
{"M4", GGML_METAL_DEVICE_M4},
|
||||
{"M4 Pro", GGML_METAL_DEVICE_M4_PRO},
|
||||
{"M4 Max", GGML_METAL_DEVICE_M4_MAX},
|
||||
{"M5", GGML_METAL_DEVICE_M5},
|
||||
{"M5 Pro", GGML_METAL_DEVICE_M5_PRO},
|
||||
{"M5 Max", GGML_METAL_DEVICE_M5_MAX},
|
||||
{"M5 Ultra", GGML_METAL_DEVICE_M5_ULTRA},
|
||||
};
|
||||
|
||||
for (size_t i = 0; i < sizeof(table)/sizeof(table[0]); ++i) {
|
||||
if (strcmp(suffix, table[i].name) == 0) {
|
||||
return table[i].id;
|
||||
}
|
||||
}
|
||||
return GGML_METAL_DEVICE_GENERIC;
|
||||
}
|
||||
|
||||
ggml_metal_device_t ggml_metal_device_init(int device) {
|
||||
ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device));
|
||||
|
||||
@@ -795,6 +839,8 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
|
||||
|
||||
dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
|
||||
|
||||
dev->props.device_id = ggml_metal_device_id_parse([[dev->mtl_device name] UTF8String]);
|
||||
|
||||
dev->props.op_offload_min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
|
||||
|
||||
dev->props.max_buffer_size = dev->mtl_device.maxBufferLength;
|
||||
|
||||
@@ -164,6 +164,7 @@ set(GGML_OPENCL_KERNELS
|
||||
sqr
|
||||
sqrt
|
||||
ssm_conv
|
||||
gated_delta_net
|
||||
sub
|
||||
sum_rows
|
||||
cumsum
|
||||
|
||||
@@ -412,6 +412,7 @@ struct ggml_backend_opencl_context {
|
||||
size_t max_workgroup_size;
|
||||
bool fp16_support;
|
||||
bool has_vector_subgroup_broadcast;
|
||||
bool has_qcom_subgroup_shuffle = false; // cl_qcom_subgroup_shuffle
|
||||
bool disable_fusion;
|
||||
|
||||
std::regex *opfilter = nullptr; // regex of ops to not claim
|
||||
@@ -634,6 +635,10 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_conv_2d_f32;
|
||||
cl_kernel kernel_conv_2d_f16_f32;
|
||||
cl_kernel kernel_ssm_conv_f32_f32, kernel_ssm_conv_f32_f32_4;
|
||||
// [size_idx][kda][tgpp] where size_idx: 0=S_V=16, 1=32, 2=64, 3=128; kda: 0 or 1.
|
||||
// tgpp 0 = TG variant (COLS_PER_LANE_GROUP=1), tgpp 1 = prefill variant (COLS_PER_LANE_GROUP=4).
|
||||
cl_kernel kernel_gated_delta_net_f32[4][2][2] = {};
|
||||
|
||||
cl_kernel kernel_timestep_embedding;
|
||||
cl_kernel kernel_gemv_moe_q4_0_f32_ns, kernel_gemm_moe_q4_0_f32_ns;
|
||||
cl_kernel kernel_gemv_moe_q4_1_f32_ns, kernel_gemm_moe_q4_1_f32_ns;
|
||||
@@ -837,16 +842,16 @@ static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
|
||||
static std::vector<std::unique_ptr<ggml_backend_opencl_device_context>> g_ggml_backend_opencl_dev_ctxs;
|
||||
|
||||
inline std::string read_file(const std::string &path) {
|
||||
std::ifstream ifs(path);
|
||||
if (!ifs) {
|
||||
return "";
|
||||
}
|
||||
std::string text;
|
||||
ifs.seekg(0, std::ios::end);
|
||||
text.resize(ifs.tellg());
|
||||
ifs.seekg(0, std::ios::beg);
|
||||
ifs.read(&text[0], text.size());
|
||||
return text;
|
||||
std::ifstream ifs(path);
|
||||
if (!ifs) {
|
||||
return "";
|
||||
}
|
||||
std::string text;
|
||||
ifs.seekg(0, std::ios::end);
|
||||
text.resize(ifs.tellg());
|
||||
ifs.seekg(0, std::ios::beg);
|
||||
ifs.read(&text[0], text.size());
|
||||
return text;
|
||||
}
|
||||
|
||||
static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts) {
|
||||
@@ -2463,12 +2468,12 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err));
|
||||
if (backend_ctx->program_upscale) {
|
||||
cl_int err_bilinear;
|
||||
backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear);
|
||||
if (err_bilinear != CL_SUCCESS) {
|
||||
cl_int err_bilinear;
|
||||
backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear);
|
||||
if (err_bilinear != CL_SUCCESS) {
|
||||
GGML_LOG_WARN("ggml_opencl: kernel_upscale_bilinear not found in upscale.cl. Bilinear upscale will not be available. Error: %d\n", err_bilinear);
|
||||
backend_ctx->kernel_upscale_bilinear = nullptr;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
backend_ctx->kernel_upscale_bilinear = nullptr;
|
||||
}
|
||||
@@ -2538,8 +2543,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// conv2d
|
||||
{
|
||||
// conv2d
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "conv2d.cl.h"
|
||||
@@ -2597,6 +2602,86 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gated_delta_net: one kernel per (S_V, KDA, tgpp) triple.
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gated_delta_net.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gated_delta_net.cl");
|
||||
#endif
|
||||
|
||||
const int gdn_sizes[4] = { 16, 32, 64, 128 };
|
||||
const int sg_size = backend_ctx->gpu_family == GPU_FAMILY::ADRENO ? 64 : backend_ctx->gpu_family == GPU_FAMILY::INTEL ? 32 : -1;
|
||||
if (sg_size < 0) {
|
||||
GGML_LOG_ERROR("Unsupported GPU Family: only Adreno and Intel are supported.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
for (int si = 0; si < 4; si++) {
|
||||
const int S_V = gdn_sizes[si];
|
||||
|
||||
// MUST match the dispatcher heuristic in ggml_cl_gated_delta_net exactly.
|
||||
int lanes_per_column;
|
||||
if (S_V >= 128) {
|
||||
lanes_per_column = 8;
|
||||
} else {
|
||||
lanes_per_column = std::min(S_V, sg_size);
|
||||
}
|
||||
|
||||
// Round LANES_PER_COLUMN down until it is:
|
||||
// * power-of-two
|
||||
// * divides both S_V and sg_size
|
||||
while (lanes_per_column > 1 &&
|
||||
(((lanes_per_column & (lanes_per_column - 1)) != 0) ||
|
||||
(S_V % lanes_per_column) != 0 ||
|
||||
(sg_size % lanes_per_column) != 0)) {
|
||||
lanes_per_column >>= 1;
|
||||
}
|
||||
|
||||
GGML_ASSERT(lanes_per_column >= 1);
|
||||
GGML_ASSERT(((lanes_per_column & (lanes_per_column - 1)) == 0));
|
||||
GGML_ASSERT((S_V % lanes_per_column) == 0);
|
||||
GGML_ASSERT((sg_size % lanes_per_column) == 0);
|
||||
|
||||
const bool is_partial_reduce = (lanes_per_column != 1) && (lanes_per_column < sg_size);
|
||||
int use_qcom_shuffle = 0;
|
||||
if (is_partial_reduce) {
|
||||
if (backend_ctx->has_qcom_subgroup_shuffle) {
|
||||
use_qcom_shuffle = 1;
|
||||
}
|
||||
}
|
||||
for (int kda = 0; kda < 2; kda++) {
|
||||
for (int tgpp = 0; tgpp < 2; tgpp++) {
|
||||
const int cpl = (tgpp == 0) ? 1 : 4;
|
||||
const int spw = (tgpp == 0) ? 1 : 1;
|
||||
|
||||
std::string opts = compile_opts;
|
||||
opts += " -DS_V=" + std::to_string(S_V);
|
||||
opts += " -DKDA=" + std::to_string(kda);
|
||||
opts += " -DSUBGROUP_SIZE=" + std::to_string(sg_size);
|
||||
opts += " -DLANES_PER_COLUMN=" + std::to_string(lanes_per_column);
|
||||
opts += " -DCOLS_PER_LANE_GROUP=" + std::to_string(cpl);
|
||||
opts += " -DUSE_QCOM_SUBGROUP_SHUFFLE=" + std::to_string(use_qcom_shuffle);
|
||||
|
||||
// Since spw=1 is found to be optimal, SUBGROUPS_PER_WG > 1 code in
|
||||
// the kernel is removed. If you want to experiment with spw > 1,
|
||||
// Please remember to implement code to handle it.
|
||||
opts += " -DSUBGROUPS_PER_WG=" + std::to_string(spw);
|
||||
|
||||
cl_program prog = build_program_from_source(
|
||||
backend_ctx->context, backend_ctx->device, kernel_src.c_str(), opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_gated_delta_net_f32[si][kda][tgpp] =
|
||||
clCreateKernel(prog, "kernel_gated_delta_net", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mv_id_q4_0_f32_8x_flat
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -2827,7 +2912,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemm_noshuffle_q4_1_f32.cl.h"
|
||||
};
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemm_noshuffle_q4_1_f32.cl");
|
||||
#endif
|
||||
@@ -2866,7 +2951,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemm_noshuffle_iq4_nl_f32.cl.h"
|
||||
};
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemm_noshuffle_iq4_nl_f32.cl");
|
||||
#endif
|
||||
@@ -2905,7 +2990,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemm_noshuffle_q8_0_f32.cl.h"
|
||||
};
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemm_noshuffle_q8_0_f32.cl");
|
||||
#endif
|
||||
@@ -2946,7 +3031,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemm_noshuffle_q4_k_f32.cl.h"
|
||||
};
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemm_noshuffle_q4_k_f32.cl");
|
||||
#endif
|
||||
@@ -3781,6 +3866,16 @@ static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) {
|
||||
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
|
||||
ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
|
||||
|
||||
// check support for qcom_subgroup_shuffle
|
||||
if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") != NULL) {
|
||||
GGML_LOG_INFO("ggml_opencl: cl_khr_subgroups support: true\n");
|
||||
if (strstr(ext_buffer, "cl_qcom_subgroup_shuffle") != NULL) {
|
||||
backend_ctx->has_qcom_subgroup_shuffle = true;
|
||||
}
|
||||
}
|
||||
GGML_LOG_INFO("ggml_opencl: cl_qcom_subgroup_shuffle support: %s\n",
|
||||
backend_ctx->has_qcom_subgroup_shuffle ? "true" : "false");
|
||||
|
||||
// Check if ext_buffer contains cl_khr_fp16
|
||||
backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
|
||||
GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
|
||||
@@ -4832,17 +4927,17 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
case GGML_UNARY_OP_RELU:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
case GGML_UNARY_OP_TANH:
|
||||
case GGML_UNARY_OP_NEG:
|
||||
case GGML_UNARY_OP_EXP:
|
||||
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
|
||||
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
|
||||
case GGML_UNARY_OP_EXPM1:
|
||||
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
|
||||
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
|
||||
case GGML_UNARY_OP_SOFTPLUS:
|
||||
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
|
||||
return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -4891,6 +4986,15 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
(op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32);
|
||||
case GGML_OP_SSM_CONV:
|
||||
return (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32);
|
||||
case GGML_OP_GATED_DELTA_NET:
|
||||
{
|
||||
// Match the Vulkan backend: only F32 -> F32, S_v in {16, 32, 64, 128}.
|
||||
if (op->src[0]->type != GGML_TYPE_F32 || op->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
const int64_t S_v = op->src[2]->ne[0];
|
||||
return S_v == 16 || S_v == 32 || S_v == 64 || S_v == 128;
|
||||
}
|
||||
case GGML_OP_CONCAT:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
@@ -10555,7 +10659,7 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
|
||||
size_t local_work_size[] = { lws0, 1, 1 };
|
||||
|
||||
size_t * local_work_size_ptr = local_work_size;
|
||||
if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
local_work_size_ptr = nullptr;
|
||||
}
|
||||
|
||||
@@ -17052,6 +17156,185 @@ static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_gated_delta_net(ggml_backend_t backend, ggml_tensor * dst) {
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
const ggml_tensor * src_q = dst->src[0];
|
||||
const ggml_tensor * src_k = dst->src[1];
|
||||
const ggml_tensor * src_v = dst->src[2];
|
||||
const ggml_tensor * src_g = dst->src[3];
|
||||
const ggml_tensor * src_beta = dst->src[4];
|
||||
const ggml_tensor * src_state = dst->src[5];
|
||||
|
||||
GGML_ASSERT(src_q && src_q->extra);
|
||||
GGML_ASSERT(src_k && src_k->extra);
|
||||
GGML_ASSERT(src_v && src_v->extra);
|
||||
GGML_ASSERT(src_g && src_g->extra);
|
||||
GGML_ASSERT(src_beta && src_beta->extra);
|
||||
GGML_ASSERT(src_state && src_state->extra);
|
||||
|
||||
ggml_backend_opencl_context * backend_ctx = (ggml_backend_opencl_context *) backend->context;
|
||||
|
||||
const cl_uint S_v = (cl_uint) src_v->ne[0];
|
||||
const cl_uint H_v = (cl_uint) src_v->ne[1];
|
||||
const cl_uint n_tokens = (cl_uint) src_v->ne[2];
|
||||
const cl_uint n_seqs = (cl_uint) src_v->ne[3];
|
||||
const cl_uint K = (cl_uint) src_state->ne[1];
|
||||
|
||||
int si;
|
||||
switch (S_v) {
|
||||
case 16: si = 0; break;
|
||||
case 32: si = 1; break;
|
||||
case 64: si = 2; break;
|
||||
case 128: si = 3; break;
|
||||
default:
|
||||
GGML_ASSERT(false && "ggml_cl_gated_delta_net: unsupported S_v");
|
||||
}
|
||||
|
||||
const int kda = (src_g->ne[0] == (int64_t) S_v) ? 1 : 0;
|
||||
|
||||
// TODO: Optimize when S_v!=128. Not necessary for now as Qwen3.5/6 are all S_v=128
|
||||
// token generation mode (tgpp=0):
|
||||
// process 1 token at a time, so columns per lane (cpl) == 1
|
||||
// prompt processing mode (tgpp=1):
|
||||
// cpl=4 to process 4 tokens for single-token. 4 is chosen for Adreno 750 as per
|
||||
// work-item/thread has at most 128 registers.
|
||||
// All Qwen3.5/6 models are S_v == 128, so LANES_PER_COLUMN == 8
|
||||
// such that ROWS_PER_LANE = 128/8 = 16
|
||||
// Variables in the kernel:
|
||||
// k_reg, q_reg, g_exp are all 16 floats
|
||||
// s_shard has cpl*ROWS_PER_LANE = 4*16 = 64 floats
|
||||
// Total 112 registers used.
|
||||
// subgroups_per_workgroup (spw) can be set to 1,2,4,8,16 for tg and 1,2,4 for pp
|
||||
// for S_v=128.
|
||||
// Empirically found that when spw=1, we get the best performance for both tg and pp
|
||||
const int tgpp = (n_tokens == 1) ? 0 : 1;
|
||||
const int cpl = (tgpp == 0) ? 1 : 4;
|
||||
// spw needs adjustment when S_v != 128
|
||||
const int spw = (tgpp == 0) ? 1 : 1;
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_gated_delta_net_f32[si][kda][tgpp];
|
||||
GGML_ASSERT(kernel != nullptr);
|
||||
|
||||
const cl_uint s_off = S_v * H_v * n_tokens * n_seqs;
|
||||
|
||||
const cl_uint sq1 = (cl_uint)(src_q->nb[1] / sizeof(float));
|
||||
const cl_uint sq2 = (cl_uint)(src_q->nb[2] / sizeof(float));
|
||||
const cl_uint sq3 = (cl_uint)(src_q->nb[3] / sizeof(float));
|
||||
const cl_uint sv1 = (cl_uint)(src_v->nb[1] / sizeof(float));
|
||||
const cl_uint sv2 = (cl_uint)(src_v->nb[2] / sizeof(float));
|
||||
const cl_uint sv3 = (cl_uint)(src_v->nb[3] / sizeof(float));
|
||||
const cl_uint sb1 = (cl_uint)(src_beta->nb[1] / sizeof(float));
|
||||
const cl_uint sb2 = (cl_uint)(src_beta->nb[2] / sizeof(float));
|
||||
const cl_uint sb3 = (cl_uint)(src_beta->nb[3] / sizeof(float));
|
||||
|
||||
const cl_uint H_k = (cl_uint) src_q->ne[1];
|
||||
const cl_uint rq3 = (cl_uint)(src_v->ne[3] / src_q->ne[3]);
|
||||
|
||||
const float scale = 1.0f / sqrtf((float) S_v);
|
||||
|
||||
ggml_tensor_extra_cl * extra_q = (ggml_tensor_extra_cl *) src_q->extra;
|
||||
ggml_tensor_extra_cl * extra_k = (ggml_tensor_extra_cl *) src_k->extra;
|
||||
ggml_tensor_extra_cl * extra_v = (ggml_tensor_extra_cl *) src_v->extra;
|
||||
ggml_tensor_extra_cl * extra_g = (ggml_tensor_extra_cl *) src_g->extra;
|
||||
ggml_tensor_extra_cl * extra_beta = (ggml_tensor_extra_cl *) src_beta->extra;
|
||||
ggml_tensor_extra_cl * extra_state = (ggml_tensor_extra_cl *) src_state->extra;
|
||||
ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *) dst->extra;
|
||||
|
||||
const cl_ulong off_q = extra_q->offset + src_q->view_offs;
|
||||
const cl_ulong off_k = extra_k->offset + src_k->view_offs;
|
||||
const cl_ulong off_v = extra_v->offset + src_v->view_offs;
|
||||
const cl_ulong off_g = extra_g->offset + src_g->view_offs;
|
||||
const cl_ulong off_beta = extra_beta->offset + src_beta->view_offs;
|
||||
const cl_ulong off_state = extra_state->offset + src_state->view_offs;
|
||||
const cl_ulong off_dst = extra_dst->offset + dst->view_offs;
|
||||
|
||||
int idx = 0;
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_q->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_q));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_k->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_k));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_v->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_v));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_g->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_g));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_beta->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_beta));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_state->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_state));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra_dst->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &off_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &H_v));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &n_tokens));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &n_seqs));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &s_off));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sq1));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sq2));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sq3));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sv1));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sv2));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sv3));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &sb3));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &H_k));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &rq3));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(float), &scale));
|
||||
CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &K));
|
||||
|
||||
// Subgroup size is 64 for Adreno and 32 for Intel
|
||||
const int sg_size = backend_ctx->gpu_family == GPU_FAMILY::ADRENO ? 64 : backend_ctx->gpu_family == GPU_FAMILY::INTEL ? 32 : -1;
|
||||
if (sg_size < 0) {
|
||||
GGML_LOG_ERROR("Unsupported GPU Family: only Adreno and Intel are supported.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// For the subgroup-shuffle kernel, we can safely prefer 8 lanes/column for S_v>=128
|
||||
// For the subgroup-shuffle kernel:
|
||||
// S_v >= 128 -> prefer 8 lanes/column (good occupancy & register pressure tradeoff)
|
||||
// else -> min(S_v, subgroup_size)
|
||||
int lanes_per_column;
|
||||
if ((int)S_v >= 128) {
|
||||
lanes_per_column = 8;
|
||||
} else {
|
||||
lanes_per_column = std::min((int)S_v, sg_size);
|
||||
}
|
||||
|
||||
// Max workgroup size for Adreno 750 is 1024
|
||||
const int wg_size = sg_size * spw;
|
||||
|
||||
// Ensure lanes_per_column is a power-of-two and divides both S_v and subgroup_size.
|
||||
// (Required for lane-group shuffle-xor reduction correctness.)
|
||||
while (lanes_per_column > 1 &&
|
||||
(((lanes_per_column & (lanes_per_column - 1)) != 0) ||
|
||||
(((int)S_v % lanes_per_column) != 0) ||
|
||||
(sg_size % lanes_per_column) != 0)) {
|
||||
lanes_per_column >>= 1;
|
||||
}
|
||||
GGML_ASSERT(lanes_per_column >= 1);
|
||||
GGML_ASSERT(((lanes_per_column & (lanes_per_column - 1)) == 0));
|
||||
GGML_ASSERT(((int)S_v % lanes_per_column) == 0);
|
||||
GGML_ASSERT((sg_size % lanes_per_column) == 0);
|
||||
|
||||
const int cols_per_wg = spw * (sg_size / lanes_per_column) * cpl;
|
||||
GGML_ASSERT(cols_per_wg > 0);
|
||||
GGML_ASSERT(((int)S_v % cols_per_wg) == 0);
|
||||
|
||||
size_t global_work_size[3];
|
||||
size_t local_work_size[3];
|
||||
|
||||
global_work_size[0] = (size_t) H_v * (size_t) wg_size;
|
||||
global_work_size[1] = (size_t) n_seqs;
|
||||
global_work_size[2] = (size_t) S_v / (size_t) cols_per_wg;
|
||||
|
||||
local_work_size[0] = (size_t) wg_size;
|
||||
local_work_size[1] = 1;
|
||||
local_work_size[2] = 1;
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Op offloading
|
||||
//------------------------------------------------------------------------------
|
||||
@@ -17267,8 +17550,8 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_group_norm;
|
||||
break;
|
||||
case GGML_OP_REPEAT:
|
||||
if (!any_on_device) {
|
||||
case GGML_OP_REPEAT:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_repeat;
|
||||
@@ -17297,6 +17580,14 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_ssm_conv;
|
||||
break;
|
||||
case GGML_OP_GATED_DELTA_NET:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
// GDN has 6 source tensors, so it cannot use the standard
|
||||
// (src0, src1, dst) func signature. Dispatch directly and return.
|
||||
ggml_cl_gated_delta_net(backend, tensor);
|
||||
return true;
|
||||
case GGML_OP_CONCAT:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
|
||||
247
ggml/src/ggml-opencl/kernels/gated_delta_net.cl
Normal file
247
ggml/src/ggml-opencl/kernels/gated_delta_net.cl
Normal file
@@ -0,0 +1,247 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#ifndef S_V
|
||||
#define S_V 128
|
||||
#endif
|
||||
#ifndef KDA
|
||||
#define KDA 0
|
||||
#endif
|
||||
#ifndef SUBGROUP_SIZE
|
||||
#define SUBGROUP_SIZE 64
|
||||
#endif
|
||||
#ifndef LANES_PER_COLUMN
|
||||
#define LANES_PER_COLUMN 8
|
||||
#endif
|
||||
#ifndef COLS_PER_LANE_GROUP
|
||||
#define COLS_PER_LANE_GROUP 1
|
||||
#endif
|
||||
#ifndef SUBGROUPS_PER_WG
|
||||
#define SUBGROUPS_PER_WG 1
|
||||
#endif
|
||||
#ifndef USE_QCOM_SUBGROUP_SHUFFLE
|
||||
#define USE_QCOM_SUBGROUP_SHUFFLE 0
|
||||
#endif
|
||||
|
||||
#define WG_SIZE (SUBGROUP_SIZE * SUBGROUPS_PER_WG)
|
||||
#define LANE_GROUPS_PER_SG (SUBGROUP_SIZE / LANES_PER_COLUMN)
|
||||
#define COLS_PER_SG (LANE_GROUPS_PER_SG * COLS_PER_LANE_GROUP)
|
||||
#define COLS_PER_WG (SUBGROUPS_PER_WG * COLS_PER_SG)
|
||||
#define ROWS_PER_LANE (S_V / LANES_PER_COLUMN)
|
||||
|
||||
#if USE_QCOM_SUBGROUP_SHUFFLE
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_shuffle : enable
|
||||
#endif
|
||||
|
||||
// XOR-based parallel sum
|
||||
// This does a reduction across groups of LANES_PER_COLUMN
|
||||
static inline float reduce_add_shmem(float partial, __local float * temp, uint lane) {
|
||||
#if USE_QCOM_SUBGROUP_SHUFFLE
|
||||
#pragma unroll
|
||||
for (uint s = LANES_PER_COLUMN / 2u; s > 0u; s >>= 1u) {
|
||||
partial += qcom_sub_group_shuffle_xor(partial, s, CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, partial);
|
||||
}
|
||||
return partial;
|
||||
#else
|
||||
temp[lane] = partial;
|
||||
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
#pragma unroll
|
||||
for (uint s = LANES_PER_COLUMN / 2u; s > 0u; s >>= 1u) {
|
||||
float other = temp[lane ^ s];
|
||||
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
temp[lane] += other;
|
||||
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
const float result = temp[lane];
|
||||
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
return result;
|
||||
#endif
|
||||
}
|
||||
|
||||
#define REDUCE_PARTIAL(partial, temp_ptr, lid) \
|
||||
((LANES_PER_COLUMN == 1u) ? (partial) : reduce_add_shmem((partial), (temp_ptr), (lid)))
|
||||
|
||||
// force compiler to optimize kernel for a specific fixed work-group size
|
||||
__attribute__((reqd_work_group_size(WG_SIZE, 1, 1)))
|
||||
#ifdef INTEL_GPU
|
||||
REQD_SUBGROUP_SIZE_32
|
||||
#elif defined (ADRENO_GPU)
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_gated_delta_net(
|
||||
global const char * q_buf, ulong off_q,
|
||||
global const char * k_buf, ulong off_k,
|
||||
global const char * v_buf, ulong off_v,
|
||||
global const char * g_buf, ulong off_g,
|
||||
global const char * beta_buf, ulong off_beta,
|
||||
global const char * state_buf, ulong off_state,
|
||||
global char * dst_buf, ulong off_dst,
|
||||
uint H_v,
|
||||
uint n_tokens,
|
||||
uint n_seqs,
|
||||
uint s_off,
|
||||
uint sq1, uint sq2, uint sq3,
|
||||
uint sv1, uint sv2, uint sv3,
|
||||
uint sb1, uint sb2, uint sb3,
|
||||
uint H_k,
|
||||
uint rq3,
|
||||
float scale,
|
||||
uint K) {
|
||||
|
||||
global const float * data_q = (global const float *)(q_buf + off_q);
|
||||
global const float * data_k = (global const float *)(k_buf + off_k);
|
||||
global const float * data_v = (global const float *)(v_buf + off_v);
|
||||
global const float * data_g = (global const float *)(g_buf + off_g);
|
||||
global const float * data_beta = (global const float *)(beta_buf + off_beta);
|
||||
global const float * data_state = (global const float *)(state_buf + off_state);
|
||||
global float * data_dst = (global float *)(dst_buf + off_dst);
|
||||
|
||||
const uint head_id = get_group_id(0);
|
||||
const uint seq_id = get_group_id(1);
|
||||
const uint tid = (uint)get_local_id(0);
|
||||
|
||||
const uint sg_id = get_sub_group_id(); // subgroup id
|
||||
const uint sg_lid = get_sub_group_local_id(); // subgroup lane id
|
||||
|
||||
const uint lane = sg_lid % LANES_PER_COLUMN;
|
||||
const uint lane_group = sg_lid / LANES_PER_COLUMN;
|
||||
const uint wg_col_base = get_group_id(2) * COLS_PER_WG;
|
||||
const uint sg_col_base = wg_col_base + sg_id * COLS_PER_SG;
|
||||
|
||||
const uint iq1 = head_id % H_k; // head index for Q and K
|
||||
const uint iq3 = seq_id / rq3; // seq index for Q and K
|
||||
|
||||
const uint state_size = S_V * S_V;
|
||||
const uint state_base = (seq_id * K * H_v + head_id) * state_size;
|
||||
const uint q_off_base = iq3 * sq3 + iq1 * sq1;
|
||||
const uint v_off_base = seq_id * sv3 + head_id * sv1;
|
||||
const uint gb_off_base = seq_id * sb3 + head_id * sb1;
|
||||
const uint state_out_base = (seq_id * H_v + head_id) * state_size;
|
||||
const uint state_size_per_snap = state_size * H_v * n_seqs;
|
||||
|
||||
__local float reduce_temp[WG_SIZE];
|
||||
__local float * temp_ptr = reduce_temp + sg_id * SUBGROUP_SIZE;
|
||||
|
||||
float s_shard[COLS_PER_LANE_GROUP][ROWS_PER_LANE];
|
||||
#pragma unroll
|
||||
for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) {
|
||||
const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group;
|
||||
#pragma unroll
|
||||
for (uint r = 0; r < ROWS_PER_LANE; r++) {
|
||||
s_shard[cg][r] = data_state[state_base + col * S_V + r * LANES_PER_COLUMN + lane];
|
||||
}
|
||||
}
|
||||
|
||||
const int shift = (int)n_tokens - (int)K;
|
||||
uint attn_off = (seq_id * n_tokens * H_v + head_id) * S_V;
|
||||
|
||||
for (uint t = 0; t < n_tokens; t++) {
|
||||
const uint q_off = q_off_base + t * sq2;
|
||||
const uint k_off = q_off;
|
||||
const uint v_off = v_off_base + t * sv2;
|
||||
const uint gb_off = gb_off_base + t * sb2;
|
||||
const float beta_val = data_beta[gb_off];
|
||||
|
||||
float k_reg[ROWS_PER_LANE];
|
||||
float q_reg[ROWS_PER_LANE];
|
||||
#if KDA
|
||||
float g_exp[ROWS_PER_LANE];
|
||||
#pragma unroll
|
||||
for (uint r = 0; r < ROWS_PER_LANE; r++) {
|
||||
const uint i = r * LANES_PER_COLUMN + lane;
|
||||
k_reg[r] = data_k[k_off + i];
|
||||
q_reg[r] = data_q[q_off + i];
|
||||
g_exp[r] = exp(data_g[gb_off * S_V + i]);
|
||||
}
|
||||
#else
|
||||
const float g_val = exp(data_g[gb_off]);
|
||||
|
||||
#pragma unroll
|
||||
for (uint r = 0; r < ROWS_PER_LANE; r++) {
|
||||
const uint i = r * LANES_PER_COLUMN + lane;
|
||||
k_reg[r] = data_k[k_off + i];
|
||||
q_reg[r] = data_q[q_off + i];
|
||||
}
|
||||
#endif
|
||||
|
||||
#pragma unroll
|
||||
for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) {
|
||||
const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group;
|
||||
float v_val = data_v[v_off + col];
|
||||
|
||||
float kv_shard = 0.0f;
|
||||
#pragma unroll
|
||||
for (uint r = 0; r < ROWS_PER_LANE; r++) {
|
||||
#if KDA
|
||||
float gs = g_exp[r] * s_shard[cg][r];
|
||||
kv_shard += gs * k_reg[r];
|
||||
#else
|
||||
kv_shard += s_shard[cg][r] * k_reg[r];
|
||||
#endif
|
||||
}
|
||||
|
||||
#if !KDA
|
||||
kv_shard *= g_val; // Applied once instead of ROWS_PER_LANE times
|
||||
#endif
|
||||
|
||||
const float kv_col = REDUCE_PARTIAL(kv_shard, temp_ptr, sg_lid);
|
||||
|
||||
const float delta_col = (v_val - kv_col) * beta_val;
|
||||
|
||||
float attn_partial = 0.0f;
|
||||
#pragma unroll
|
||||
for (uint r = 0; r < ROWS_PER_LANE; r++) {
|
||||
#if KDA
|
||||
float gs = g_exp[r] * s_shard[cg][r];
|
||||
#else
|
||||
float gs = g_val * s_shard[cg][r];
|
||||
#endif
|
||||
s_shard[cg][r] = gs + k_reg[r] * delta_col;
|
||||
attn_partial += s_shard[cg][r] * q_reg[r];
|
||||
}
|
||||
const float attn_col = REDUCE_PARTIAL(attn_partial, temp_ptr, sg_lid);
|
||||
|
||||
if (lane == 0) {
|
||||
data_dst[attn_off + col] = attn_col * scale;
|
||||
}
|
||||
}
|
||||
attn_off += S_V * H_v;
|
||||
|
||||
if (K > 1u) {
|
||||
const int target_slot = (int)t - shift;
|
||||
if (target_slot >= 0 && target_slot < (int)K) {
|
||||
#pragma unroll
|
||||
for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) {
|
||||
const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group;
|
||||
const uint slot_base = s_off + (uint)target_slot * state_size_per_snap + state_out_base;
|
||||
#pragma unroll
|
||||
for (uint r = 0; r < ROWS_PER_LANE; r++) {
|
||||
data_dst[slot_base + col * S_V + r * LANES_PER_COLUMN + lane] = s_shard[cg][r];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (K == 1u) {
|
||||
#pragma unroll
|
||||
for (uint cg = 0; cg < COLS_PER_LANE_GROUP; cg++) {
|
||||
const uint col = sg_col_base + cg * LANE_GROUPS_PER_SG + lane_group;
|
||||
#pragma unroll
|
||||
for (uint r = 0; r < ROWS_PER_LANE; r++) {
|
||||
data_dst[s_off + state_base + col * S_V + r * LANES_PER_COLUMN + lane] = s_shard[cg][r];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -224,6 +224,7 @@ struct sycl_device_info {
|
||||
int max_wg_per_cu; // max work groups per compute unit - refer to
|
||||
// cudaOccupancyMaxActiveBlocksPerMultiprocessor
|
||||
bool vmm; // virtual memory support
|
||||
size_t vmm_granularity; // granularity of virtual memory
|
||||
size_t total_vram;
|
||||
sycl_hw_info hw_info;
|
||||
optimize_feature opt_feature;
|
||||
@@ -244,6 +245,8 @@ struct ggml_sycl_device_info {
|
||||
|
||||
const ggml_sycl_device_info & ggml_sycl_info();
|
||||
|
||||
static constexpr size_t SYCL_BUFFER_ALIGNMENT = 128;
|
||||
|
||||
struct ggml_sycl_pool {
|
||||
virtual ~ggml_sycl_pool() = default;
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#include <cstdlib>
|
||||
#include <float.h>
|
||||
#include <limits>
|
||||
#include <optional>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <vector>
|
||||
@@ -37,6 +38,11 @@
|
||||
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
|
||||
# include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
|
||||
#endif
|
||||
#if SYCL_EXT_ONEAPI_VIRTUAL_MEM
|
||||
# include <sycl/ext/oneapi/virtual_mem/physical_mem.hpp>
|
||||
# include <sycl/ext/oneapi/virtual_mem/virtual_mem.hpp>
|
||||
# define GGML_SYCL_USE_VMM
|
||||
#endif
|
||||
#include <sycl/half_type.hpp>
|
||||
|
||||
#include "ggml.h"
|
||||
@@ -70,6 +76,7 @@ int g_ggml_sycl_debug = 0;
|
||||
int g_ggml_sycl_disable_optimize = 0;
|
||||
int g_ggml_sycl_disable_graph = 0;
|
||||
int g_ggml_sycl_disable_dnn = 0;
|
||||
int g_ggml_sycl_enable_vmm = 1;
|
||||
int g_ggml_sycl_prioritize_dmmv = 0;
|
||||
int g_ggml_sycl_use_async_mem_op = 0;
|
||||
int g_ggml_sycl_use_async_mem_op_requested = 1;
|
||||
@@ -96,13 +103,30 @@ static ggml_sycl_device_info ggml_sycl_init() {
|
||||
// GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
|
||||
// #endif
|
||||
for (int i = 0; i < info.device_count; ++i) {
|
||||
info.devices[i].vmm = 0;
|
||||
dpct::device_info prop;
|
||||
auto & device = dpct::dev_mgr::instance().get_device(i);
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
||||
prop, device)));
|
||||
|
||||
#if !defined(GGML_SYCL_USE_VMM)
|
||||
info.devices[i].vmm = 0;
|
||||
#else
|
||||
info.devices[i].vmm = device.has(sycl::aspect::ext_oneapi_virtual_mem);
|
||||
if (info.devices[i].vmm) {
|
||||
// NB: SYCL's get_mem_granularity always returns the _minimum_ granularity,
|
||||
// but the L0 API requires a larger page size for allocs above 2 MiB and
|
||||
// rejects non-multiples with UR_RESULT_ERROR_INVALID_VALUE [sic].
|
||||
// Here we clamp it to 2 MiB for simplicity, but other devices may require
|
||||
// calling zeVirtualMemQueryPageSize or yet unexposed public API.
|
||||
const size_t physical_page = 2ull << 20; // 2 MiB
|
||||
info.devices[i].vmm_granularity = std::max<size_t>(
|
||||
sycl::ext::oneapi::experimental::get_mem_granularity(
|
||||
device, sycl::context(device)),
|
||||
physical_page);
|
||||
}
|
||||
#endif
|
||||
|
||||
info.default_tensor_split[i] = total_vram;
|
||||
total_vram += prop.get_global_mem_size();
|
||||
|
||||
@@ -234,6 +258,7 @@ static void ggml_check_sycl() try {
|
||||
g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
|
||||
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
|
||||
g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
|
||||
g_ggml_sycl_enable_vmm = get_sycl_env("GGML_SYCL_ENABLE_VMM", 1);
|
||||
g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", ggml_sycl_info().ext_oneapi_level_zero);
|
||||
@@ -275,6 +300,11 @@ static void ggml_check_sycl() try {
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n");
|
||||
#endif
|
||||
#if defined(GGML_SYCL_USE_VMM)
|
||||
GGML_LOG_INFO(" GGML_SYCL_USE_VMM: yes\n");
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_USE_VMM: no\n");
|
||||
#endif
|
||||
|
||||
GGML_LOG_INFO("Running with Environment Variables:\n");
|
||||
GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
|
||||
@@ -293,6 +323,11 @@ static void ggml_check_sycl() try {
|
||||
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n");
|
||||
#endif
|
||||
#if defined(GGML_SYCL_USE_VMM)
|
||||
GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: %d\n", g_ggml_sycl_enable_vmm);
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: virtual memory extension is not available\n");
|
||||
#endif
|
||||
GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
|
||||
g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1);
|
||||
@@ -754,7 +789,7 @@ catch (sycl::exception const &exc) {
|
||||
}
|
||||
|
||||
static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
return 128;
|
||||
return SYCL_BUFFER_ALIGNMENT;
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
@@ -1177,7 +1212,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(gg
|
||||
}
|
||||
|
||||
static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
return 128;
|
||||
return SYCL_BUFFER_ALIGNMENT;
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
@@ -1462,6 +1497,121 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
||||
}
|
||||
};
|
||||
|
||||
// pool with virtual memory management
|
||||
#if defined(GGML_SYCL_USE_VMM)
|
||||
struct ggml_sycl_pool_vmm : public ggml_sycl_pool {
|
||||
static const size_t SYCL_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
||||
|
||||
int device;
|
||||
sycl::context ctx;
|
||||
sycl::device dev;
|
||||
|
||||
uintptr_t pool_addr = 0;
|
||||
size_t pool_used = 0;
|
||||
size_t pool_size = 0;
|
||||
size_t granularity;
|
||||
|
||||
// physical_mem owns the commits (unlike cuMemMap)
|
||||
struct mapping {
|
||||
sycl::ext::oneapi::experimental::physical_mem phys;
|
||||
void * map_ptr;
|
||||
};
|
||||
std::vector<mapping> mappings;
|
||||
|
||||
explicit ggml_sycl_pool_vmm(queue_ptr qptr_, int device_) :
|
||||
device(device_),
|
||||
ctx(qptr_->get_context()),
|
||||
dev(qptr_->get_device()),
|
||||
granularity(ggml_sycl_info().devices[device_].vmm_granularity) {
|
||||
}
|
||||
|
||||
~ggml_sycl_pool_vmm() {
|
||||
if (pool_addr == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Per spec, unmap must (a) match the exact (ptr, size) of an earlier
|
||||
// physical_mem::map() call and (b) precede destruction of the
|
||||
// physical_mem objects (their dtors won't unmap).
|
||||
for (auto & m : mappings) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::unmap(
|
||||
m.map_ptr, m.phys.size(), ctx)));
|
||||
}
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(sycl::ext::oneapi::experimental::free_virtual_mem(
|
||||
pool_addr, SYCL_POOL_VMM_MAX_SIZE, ctx)));
|
||||
}
|
||||
|
||||
void * alloc(size_t size, size_t * actual_size) override {
|
||||
// round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
|
||||
size = GGML_PAD(size, SYCL_BUFFER_ALIGNMENT);
|
||||
|
||||
size_t avail = pool_size - pool_used;
|
||||
|
||||
if (size > avail) {
|
||||
// round up to the next multiple of the granularity
|
||||
size_t reserve_size = GGML_PAD(size - avail, granularity);
|
||||
|
||||
GGML_ASSERT(pool_size + reserve_size <= SYCL_POOL_VMM_MAX_SIZE);
|
||||
|
||||
// allocate more physical memory
|
||||
std::optional<sycl::ext::oneapi::experimental::physical_mem> phys;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(phys.emplace(dev, ctx, reserve_size)));
|
||||
|
||||
// reserve virtual address space (if not already reserved)
|
||||
if (pool_addr == 0) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
pool_addr = sycl::ext::oneapi::experimental::reserve_virtual_mem(
|
||||
SYCL_POOL_VMM_MAX_SIZE, ctx)));
|
||||
}
|
||||
|
||||
// map at the end of the pool
|
||||
void * map_ptr = nullptr;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
map_ptr = phys->map(pool_addr + pool_size, reserve_size,
|
||||
sycl::ext::oneapi::experimental::address_access_mode::read_write)));
|
||||
|
||||
// stash these so we could unmap this exact range in dtor
|
||||
mappings.push_back({
|
||||
std::move(*phys),
|
||||
map_ptr,
|
||||
});
|
||||
|
||||
// add to the pool
|
||||
pool_size += reserve_size;
|
||||
|
||||
#ifdef DEBUG_SYCL_MALLOC
|
||||
GGML_LOG_INFO("sycl pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
|
||||
device, (unsigned long long) (pool_size/1024/1024),
|
||||
(unsigned long long) (reserve_size/1024/1024));
|
||||
#endif
|
||||
}
|
||||
|
||||
GGML_ASSERT(pool_addr != 0);
|
||||
|
||||
void * ptr = reinterpret_cast<void *>(pool_addr + pool_used);
|
||||
*actual_size = size;
|
||||
pool_used += size;
|
||||
|
||||
#ifdef DEBUG_SYCL_MALLOC
|
||||
GGML_LOG_INFO("sycl pool[%d]: allocated %llu bytes at %p\n", device, (unsigned long long) size, ptr);
|
||||
#endif
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void free(void * ptr, size_t size) override {
|
||||
#ifdef DEBUG_SYCL_MALLOC
|
||||
GGML_LOG_INFO("sycl pool[%d]: freed %llu bytes at %p\n", device, (unsigned long long) size, ptr);
|
||||
#endif
|
||||
|
||||
pool_used -= size;
|
||||
|
||||
// all deallocations must be in reverse order of the allocations
|
||||
GGML_ASSERT(ptr == reinterpret_cast<void *>(pool_addr + pool_used));
|
||||
}
|
||||
};
|
||||
#endif // defined(GGML_SYCL_USE_VMM)
|
||||
|
||||
struct ggml_sycl_pool_host : public ggml_sycl_pool {
|
||||
queue_ptr qptr;
|
||||
int device;
|
||||
@@ -1542,20 +1692,19 @@ std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_host(que
|
||||
}
|
||||
|
||||
std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
|
||||
// TBD: NO VMM support
|
||||
// if (ggml_sycl_info().devices[device].vmm) {
|
||||
// return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(device));
|
||||
// }
|
||||
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
|
||||
#if defined(GGML_SYCL_USE_VMM)
|
||||
if (g_ggml_sycl_enable_vmm && ggml_sycl_info().devices[device].vmm) {
|
||||
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_vmm(qptr, device));
|
||||
}
|
||||
#endif // defined(GGML_SYCL_USE_VMM)
|
||||
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
|
||||
}
|
||||
|
||||
|
||||
std::unique_ptr<ggml_sycl_fattn_kv_buffers> ggml_backend_sycl_context::new_fattn_kv_buffers(queue_ptr qptr, int device) {
|
||||
return std::unique_ptr<ggml_sycl_fattn_kv_buffers>(new ggml_sycl_fattn_kv_buffers(qptr, device));
|
||||
}
|
||||
|
||||
// TBD pool with virtual memory management
|
||||
// struct ggml_sycl_pool_vmm : public ggml_sycl_pool
|
||||
|
||||
/// kernels
|
||||
typedef void (*ggml_sycl_op_mul_mat_t)(
|
||||
ggml_backend_sycl_context & ctx,
|
||||
|
||||
@@ -79,6 +79,12 @@ if (Vulkan_FOUND)
|
||||
"GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT"
|
||||
)
|
||||
|
||||
test_shader_extension_support(
|
||||
"GL_NV_cooperative_matrix_decode_vector"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/feature-tests/coopmat2_decode_vector.comp"
|
||||
"GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT"
|
||||
)
|
||||
|
||||
test_shader_extension_support(
|
||||
"GL_EXT_integer_dot_product"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/feature-tests/integer_dot.comp"
|
||||
|
||||
@@ -21,6 +21,19 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
|
||||
|
||||
#include <vulkan/vulkan.hpp>
|
||||
|
||||
// Fallback definitions for VK_NV_cooperative_matrix_decode_vector in case the
|
||||
// installed Vulkan headers predate the extension.
|
||||
#ifndef VK_NV_cooperative_matrix_decode_vector
|
||||
#define VK_NV_cooperative_matrix_decode_vector 1
|
||||
#define VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME "VK_NV_cooperative_matrix_decode_vector"
|
||||
#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV ((VkStructureType)1000689000)
|
||||
typedef struct VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV {
|
||||
VkStructureType sType;
|
||||
void* pNext;
|
||||
VkBool32 cooperativeMatrixDecodeVector;
|
||||
} VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV;
|
||||
#endif
|
||||
|
||||
// SPIR-V Headers: different SDK installations expose different include paths.
|
||||
// LunarG Vulkan SDK on Windows typically provides <spirv-headers/spirv.hpp>.
|
||||
// Linux packages, MSYS2 and MinGW often use the Khronos layout <spirv/unified1/spirv.hpp>.
|
||||
@@ -398,6 +411,7 @@ enum vk_conv_shapes {
|
||||
CONV_SHAPE_128x128,
|
||||
CONV_SHAPE_64x32,
|
||||
CONV_SHAPE_32x256,
|
||||
CONV_SHAPE_64x128,
|
||||
CONV_SHAPE_COUNT,
|
||||
};
|
||||
|
||||
@@ -412,6 +426,7 @@ vk_conv_block_size vk_conv_block_sizes[CONV_SHAPE_COUNT] = {
|
||||
{ 128, 128, 16 }, // CONV_SHAPE_128x128
|
||||
{ 64, 32, 32 }, // CONV_SHAPE_64x32
|
||||
{ 32, 256, 16 }, // CONV_SHAPE_32x256
|
||||
{ 64, 128, 16 }, // CONV_SHAPE_64x128
|
||||
};
|
||||
|
||||
enum dmmv_wg_sizes {
|
||||
@@ -447,14 +462,16 @@ struct vk_fa_pipeline_state {
|
||||
};
|
||||
|
||||
struct vk_conv2d_pipeline_state {
|
||||
vk_conv2d_pipeline_state(uint32_t s0, uint32_t s1, uint32_t p0, uint32_t p1, uint32_t d0, uint32_t d1, uint32_t KW, uint32_t KH)
|
||||
: s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), KW(KW), KH(KH) {}
|
||||
vk_conv2d_pipeline_state(uint32_t s0, uint32_t s1, uint32_t p0, uint32_t p1, uint32_t d0, uint32_t d1, uint32_t KW, uint32_t KH, uint32_t aligned)
|
||||
: s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), KW(KW), KH(KH), aligned(aligned) {}
|
||||
|
||||
uint32_t s0, s1, p0, p1, d0, d1, KW, KH;
|
||||
// when set, shader can skip K/CRS/NPQ bounds checks and address clamps
|
||||
uint32_t aligned;
|
||||
|
||||
bool operator<(const vk_conv2d_pipeline_state &b) const {
|
||||
return std::tie(s0, s1, p0, p1, d0, d1, KW, KH) <
|
||||
std::tie(b.s0, b.s1, b.p0, b.p1, b.d0, b.d1, b.KW, b.KH);
|
||||
return std::tie(s0, s1, p0, p1, d0, d1, KW, KH, aligned) <
|
||||
std::tie(b.s0, b.s1, b.p0, b.p1, b.d0, b.d1, b.KW, b.KH, b.aligned);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -674,6 +691,7 @@ struct vk_device_struct {
|
||||
uint32_t coopmat_int_k;
|
||||
|
||||
bool coopmat2;
|
||||
bool coopmat2_decode_vector;
|
||||
|
||||
bool pipeline_executable_properties_support {};
|
||||
|
||||
@@ -764,7 +782,8 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_clamp_f32;
|
||||
vk_pipeline pipeline_pad_f32;
|
||||
vk_pipeline pipeline_roll_f32;
|
||||
vk_pipeline pipeline_repeat_f32, pipeline_repeat_back_f32;
|
||||
vk_pipeline pipeline_repeat_i32, pipeline_repeat_back_f32;
|
||||
vk_pipeline pipeline_repeat_i16;
|
||||
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16, pipeline_cpy_bf16_f32, pipeline_cpy_f32_i32, pipeline_cpy_i32_f32;
|
||||
vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16, pipeline_contig_cpy_bf16_f32, pipeline_contig_cpy_f32_i32, pipeline_contig_cpy_i32_f32;
|
||||
vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT];
|
||||
@@ -841,6 +860,7 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_argsort_large_f32[num_argsort_pipelines];
|
||||
vk_pipeline pipeline_topk_f32[num_topk_pipelines];
|
||||
vk_pipeline pipeline_sum_rows_f32;
|
||||
vk_pipeline pipeline_fwht_f32[4];
|
||||
vk_pipeline pipeline_cumsum_f32;
|
||||
vk_pipeline pipeline_cumsum_small_f32;
|
||||
vk_pipeline pipeline_cumsum_multipass1_f32;
|
||||
@@ -1131,6 +1151,13 @@ struct vk_op_push_constants {
|
||||
float param4;
|
||||
};
|
||||
|
||||
struct vk_op_fwht_push_constants {
|
||||
uint32_t n_rows;
|
||||
uint32_t src_offset;
|
||||
uint32_t dst_offset;
|
||||
float scale;
|
||||
};
|
||||
|
||||
struct vk_op_count_experts_push_constants {
|
||||
uint32_t ne00;
|
||||
uint32_t ne01;
|
||||
@@ -2036,6 +2063,15 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk
|
||||
GGML_UNUSED(src3);
|
||||
}
|
||||
|
||||
template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_fwht_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
||||
p.src_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
||||
p.dst_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
||||
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(src2);
|
||||
GGML_UNUSED(src3);
|
||||
}
|
||||
|
||||
struct ggml_backend_vk_buffer_context {
|
||||
vk_device_ref device;
|
||||
vk_buffer dev_buffer;
|
||||
@@ -2076,9 +2112,9 @@ void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
|
||||
const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
std::string type = device ? "device" : "host";
|
||||
auto it = allocations.find(buf->buffer);
|
||||
total_device -= device ? it->second : 0;
|
||||
total_host -= device ? 0 : it->second;
|
||||
if (it != allocations.end()) {
|
||||
total_device -= device ? it->second : 0;
|
||||
total_host -= device ? 0 : it->second;
|
||||
VK_LOG_MEMORY(buf->device->name << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
|
||||
allocations.erase(it);
|
||||
} else {
|
||||
@@ -2162,6 +2198,136 @@ static uint32_t compile_count = 0;
|
||||
static std::mutex compile_count_mutex;
|
||||
static std::condition_variable compile_count_cond;
|
||||
|
||||
static constexpr uint32_t kSpvOpCooperativeMatrixLoadTensorNV = 5367;
|
||||
static constexpr uint32_t kSpvCapabilityCooperativeMatrixDecodeVectorNV = 5447;
|
||||
static constexpr uint32_t kSpvTensorAddressingDecodeVectorFuncBit = 0x4;
|
||||
|
||||
// Remove SPV_NV_cooperative_matrix_decode_vector usage from a SPIR-V module so it
|
||||
// can be loaded on drivers that only support SPV_NV_cooperative_matrix2. Drops the
|
||||
// OpExtension declaration, the CooperativeMatrixDecodeVectorNV OpCapability, and the
|
||||
// DecodeVectorFunc operand from any OpCooperativeMatrixLoadTensorNV instruction.
|
||||
// Returns true when the input used the extension (and `out` was populated with a
|
||||
// stripped copy); returns false otherwise without touching `out`.
|
||||
static bool ggml_vk_strip_decode_vector(const uint32_t * code, size_t word_count, std::vector<uint32_t> & out) {
|
||||
static const char kDecodeVectorExt[] = "SPV_NV_cooperative_matrix_decode_vector";
|
||||
|
||||
if (word_count < 5) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool uses_decode_vector = false;
|
||||
for (size_t pos = 5; pos < word_count; ) {
|
||||
uint32_t word = code[pos];
|
||||
uint32_t wc = word >> spv::WordCountShift;
|
||||
uint32_t op = word & spv::OpCodeMask;
|
||||
GGML_ASSERT(wc > 0 && pos + wc <= word_count);
|
||||
if (op == spv::OpExtension && wc >= 2) {
|
||||
const char * s = reinterpret_cast<const char *>(&code[pos + 1]);
|
||||
if (strcmp(s, kDecodeVectorExt) == 0) {
|
||||
uses_decode_vector = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
pos += wc;
|
||||
}
|
||||
|
||||
if (!uses_decode_vector) {
|
||||
return false;
|
||||
}
|
||||
|
||||
VK_LOG_DEBUG("ggml_vk_strip_decode_vector: stripping SPV_NV_cooperative_matrix_decode_vector");
|
||||
|
||||
// Bulk-copy unchanged runs and only break the run when an instruction needs to
|
||||
// be dropped or patched. Use reserve + insert/push_back so the destination buffer
|
||||
// is touched exactly once (no zero-initialization pass from resize()).
|
||||
out.clear();
|
||||
out.reserve(word_count);
|
||||
|
||||
size_t run_start = 0;
|
||||
auto flush_run = [&](size_t up_to) {
|
||||
if (up_to > run_start) {
|
||||
out.insert(out.end(), code + run_start, code + up_to);
|
||||
}
|
||||
};
|
||||
|
||||
for (size_t pos = 5; pos < word_count; ) {
|
||||
uint32_t word = code[pos];
|
||||
uint32_t wc = word >> spv::WordCountShift;
|
||||
uint32_t op = word & spv::OpCodeMask;
|
||||
GGML_ASSERT(wc > 0 && pos + wc <= word_count);
|
||||
|
||||
if (op == spv::OpExtension && wc >= 2) {
|
||||
const char * s = reinterpret_cast<const char *>(&code[pos + 1]);
|
||||
if (strcmp(s, kDecodeVectorExt) == 0) {
|
||||
flush_run(pos);
|
||||
pos += wc;
|
||||
run_start = pos;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (op == spv::OpCapability && wc == 2 && code[pos + 1] == kSpvCapabilityCooperativeMatrixDecodeVectorNV) {
|
||||
flush_run(pos);
|
||||
pos += wc;
|
||||
run_start = pos;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (op == kSpvOpCooperativeMatrixLoadTensorNV) {
|
||||
// [opcode/wc][ResultType][Result][Pointer][Object][TensorLayout][MemOperand mask][mem extras...][TA mask][ta extras...]
|
||||
GGML_ASSERT(wc >= 8);
|
||||
|
||||
uint32_t mem_mask = code[pos + 6];
|
||||
size_t cur = pos + 7;
|
||||
// Each of these MemoryAccess bits (when set) carries one trailing operand.
|
||||
cur += (mem_mask & 0x2) ? 1 : 0; // Aligned
|
||||
cur += (mem_mask & 0x8) ? 1 : 0; // MakePointerAvailable
|
||||
cur += (mem_mask & 0x10) ? 1 : 0; // MakePointerVisible
|
||||
cur += (mem_mask & 0x10000) ? 1 : 0; // AliasScopeINTELMask
|
||||
cur += (mem_mask & 0x20000) ? 1 : 0; // NoAliasINTELMask
|
||||
GGML_ASSERT(cur < pos + wc);
|
||||
|
||||
uint32_t ta_mask = code[cur];
|
||||
if ((ta_mask & kSpvTensorAddressingDecodeVectorFuncBit) == 0) {
|
||||
pos += wc;
|
||||
continue; // leave instruction inside the current unchanged run
|
||||
}
|
||||
|
||||
flush_run(pos);
|
||||
|
||||
// Append unchanged prefix of the instruction (header through the mem-extras).
|
||||
size_t inst_start = out.size();
|
||||
size_t pre_n = cur - pos;
|
||||
out.insert(out.end(), code + pos, code + pos + pre_n);
|
||||
|
||||
// Emit TA mask with the DecodeVectorFunc bit cleared.
|
||||
out.push_back(ta_mask & ~kSpvTensorAddressingDecodeVectorFuncBit);
|
||||
|
||||
// TA extras: TensorView (0x1) and DecodeFunc (0x2) are kept verbatim;
|
||||
// DecodeVectorFunc (0x4) is dropped along with its trailing id operand.
|
||||
size_t keep_ta_extras = ((ta_mask & 0x1) ? 1 : 0) + ((ta_mask & 0x2) ? 1 : 0);
|
||||
if (keep_ta_extras) {
|
||||
out.insert(out.end(), code + cur + 1, code + cur + 1 + keep_ta_extras);
|
||||
}
|
||||
|
||||
GGML_ASSERT(wc == pre_n + 1 + keep_ta_extras + 1);
|
||||
|
||||
// Patch the instruction header with the new (one-shorter) word count.
|
||||
uint32_t new_wc = wc - 1;
|
||||
out[inst_start] = (new_wc << spv::WordCountShift) | op;
|
||||
|
||||
pos += wc;
|
||||
run_start = pos;
|
||||
continue;
|
||||
}
|
||||
|
||||
pos += wc;
|
||||
}
|
||||
|
||||
flush_run(word_count);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, size_t spv_size, const void* spv_data, const std::string entrypoint,
|
||||
uint32_t parameter_count, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants,
|
||||
bool disable_robustness, bool require_full_subgroups, uint32_t required_subgroup_size) {
|
||||
@@ -2233,6 +2399,18 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||
shader_module_create_info = vk::ShaderModuleCreateInfo({}, spirv.size() * sizeof(uint32_t), spirv.data());
|
||||
}
|
||||
|
||||
#if defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT)
|
||||
if (device->coopmat2 && !device->coopmat2_decode_vector) {
|
||||
const uint32_t * src = spirv.empty() ? reinterpret_cast<const uint32_t *>(spv_data) : spirv.data();
|
||||
size_t src_n = spirv.empty() ? spv_size / sizeof(uint32_t) : spirv.size();
|
||||
std::vector<uint32_t> stripped;
|
||||
if (ggml_vk_strip_decode_vector(src, src_n, stripped)) {
|
||||
spirv = std::move(stripped);
|
||||
shader_module_create_info = vk::ShaderModuleCreateInfo({}, spirv.size() * sizeof(uint32_t), spirv.data());
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
|
||||
|
||||
vk::PushConstantRange pcr(
|
||||
@@ -4704,9 +4882,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_roll_f32, "roll_f32", roll_f32_len, roll_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_repeat_i32, "repeat_i32", repeat_i32_len, repeat_i32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_repeat_back_f32, "repeat_back_f32", repeat_back_f32_len, repeat_back_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_repeat_i16, "repeat_i16", repeat_i16_len, repeat_i16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
#define CREATE_UNARY(name) \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \
|
||||
ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
@@ -4819,6 +4999,16 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_argmax_f32, "argmax_f32", argmax_f32_len, argmax_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
|
||||
// Intel Arc B390 was observed segfaulting with this shader.
|
||||
if (device->subgroup_basic && device->subgroup_shuffle && device->vendor_id != VK_VENDOR_ID_INTEL) {
|
||||
int idx = 0;
|
||||
for (uint32_t n : {64, 128, 256, 512}) {
|
||||
if (device->subgroup_size <= n) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_fwht_f32[idx], "fwht_f32", fwht_f32_len, fwht_f32_data, "main", 2, sizeof(vk_op_fwht_push_constants), {1, 1, 1}, { device->subgroup_size, n }, 1, true, true, device->subgroup_size);
|
||||
}
|
||||
++idx;
|
||||
}
|
||||
}
|
||||
|
||||
const uint32_t cumsum_elem_per_thread = (device->vendor_id == VK_VENDOR_ID_AMD || device->vendor_id == VK_VENDOR_ID_INTEL) ? 2 : 4;
|
||||
ggml_vk_create_pipeline(device, device->pipeline_cumsum_f32, "cumsum_f32", cumsum_f32_len, cumsum_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { 256, device->subgroup_size, cumsum_elem_per_thread }, 1, true, true, device->subgroup_size);
|
||||
@@ -4934,7 +5124,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
// conv2d, conv_transpose_2d
|
||||
for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) {
|
||||
uint32_t conv2d_WG_SIZE = 256;
|
||||
// smaller WG for the small-tile fallback gives more concurrent WGs per SM
|
||||
uint32_t conv2d_WG_SIZE = (s == CONV_SHAPE_64x32) ? 128 : 256;
|
||||
uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
|
||||
uint32_t conv2d_TS_K = (s == CONV_SHAPE_64x32) ? 4 : 8;
|
||||
uint32_t conv2d_SHMEM_PAD = 4;
|
||||
@@ -4973,18 +5164,77 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
conv2d_BS.CRS); // CRS block size should be capped at subgroup size for correctness when shuffle is used.
|
||||
}
|
||||
|
||||
uint32_t conv2d_shmem_req =
|
||||
(conv2d_BS.K * (conv2d_BS.CRS + conv2d_SHMEM_PAD) + conv2d_BS.CRS * (conv2d_BS.NPQ + conv2d_SHMEM_PAD)) * sizeof(float);
|
||||
if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
|
||||
// cm1 is used only when cm2 is unavailable; capped at 64x128 (due to shared memory size).
|
||||
// Requires 16x16x16 f16-acc since that's the fragment shape hard-coded in the shader.
|
||||
// Subgroup size must be 32 or 64 (to keep WG_SIZE sane) and we need
|
||||
// subgroup_size_control to force the driver to actually use it.
|
||||
bool conv2d_use_cm1 = false;
|
||||
#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
conv2d_use_cm1 = !device->coopmat2 &&
|
||||
device->coopmat_support && device->coopmat_support_16x16x16_f16acc &&
|
||||
device->subgroup_size_control &&
|
||||
(device->subgroup_size == 32 || device->subgroup_size == 64) &&
|
||||
s != CONV_SHAPE_128x128;
|
||||
#endif
|
||||
|
||||
const uint32_t conv2d_cm1_shmem_pad = 8;
|
||||
|
||||
auto shmem_req = [&](uint32_t pad, bool csh_store, bool fp16_shmem) {
|
||||
const uint32_t elem_size = fp16_shmem ? (uint32_t)sizeof(uint16_t) : (uint32_t)sizeof(float);
|
||||
const uint32_t csh_elems = csh_store ? conv2d_BS.K * conv2d_BS.NPQ : 0u;
|
||||
return (conv2d_BS.K * (conv2d_BS.CRS + pad) + conv2d_BS.CRS * (conv2d_BS.NPQ + pad) + csh_elems) * elem_size;
|
||||
};
|
||||
|
||||
// coopmat1 needs to store the output through shared memory, so check up front
|
||||
// whether it'll fit and disable it before applying coopmat1 parameters.
|
||||
if (conv2d_use_cm1 && device->properties.limits.maxComputeSharedMemorySize < shmem_req(conv2d_cm1_shmem_pad, true, true)) {
|
||||
conv2d_use_cm1 = false;
|
||||
}
|
||||
|
||||
uint32_t conv2d_WM = 16, conv2d_WN = 16; // cm1 subgroup tile, ignored otherwise
|
||||
if (conv2d_use_cm1) {
|
||||
conv2d_SHMEM_PAD = conv2d_cm1_shmem_pad;
|
||||
// 16x16x16 fragments; pick WM/WN to keep WG_SIZE at 256
|
||||
// (i.e. 8 subgroups for sg=32, 4 subgroups for sg=64).
|
||||
const bool sg64 = (device->subgroup_size == 64);
|
||||
switch (s) {
|
||||
case CONV_SHAPE_64x32: conv2d_WM = sg64 ? 32 : 16; conv2d_WN = 16; break;
|
||||
case CONV_SHAPE_64x128: conv2d_WM = 32; conv2d_WN = sg64 ? 64 : 32; break;
|
||||
case CONV_SHAPE_32x256: conv2d_WM = sg64 ? 16 : 32; conv2d_WN = sg64 ? 128 : 32; break;
|
||||
default: break;
|
||||
}
|
||||
const uint32_t warps_M = conv2d_BS.K / conv2d_WM;
|
||||
const uint32_t warps_N = conv2d_BS.NPQ / conv2d_WN;
|
||||
conv2d_WG_SIZE = warps_M * warps_N * device->subgroup_size;
|
||||
}
|
||||
|
||||
// stage cm2 accumulator through shmem for coalesced global stores;
|
||||
// skipped on 128x128 where the extra Csh footprint hurts occupancy.
|
||||
// cm1 always uses the staged path.
|
||||
uint32_t conv2d_csh_store = (device->coopmat2 && s != CONV_SHAPE_128x128) ? 1u : 0u;
|
||||
if (conv2d_use_cm1) {
|
||||
conv2d_csh_store = 1;
|
||||
}
|
||||
|
||||
// shmem is fp16 on cm2/cm1 (matches Csh), fp32 on scalar
|
||||
const bool conv2d_use_fp16_shmem = device->coopmat2 || conv2d_use_cm1;
|
||||
|
||||
// shrink CRS if the non-cm1 config still doesn't fit
|
||||
if (device->properties.limits.maxComputeSharedMemorySize < shmem_req(conv2d_SHMEM_PAD, conv2d_csh_store, conv2d_use_fp16_shmem)) {
|
||||
GGML_ASSERT(!conv2d_use_cm1);
|
||||
conv2d_BS.CRS = 8;
|
||||
if (use_collectives) {
|
||||
conv2d_BS.CRS = std::min(device->subgroup_size, conv2d_BS.CRS);
|
||||
}
|
||||
conv2d_csh_store = 0;
|
||||
}
|
||||
|
||||
std::array<uint32_t, 3> wg_denoms = { conv2d_BS.K, 1, 1 };
|
||||
std::vector<uint32_t> spec_constants = { conv2d_WG_SIZE, conv2d_BS.K, conv2d_BS.CRS, conv2d_BS.NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD };
|
||||
|
||||
// cm1 needs a fixed subgroup width to match the WG_SIZE we computed
|
||||
const uint32_t conv2d_required_subgroup_size = conv2d_use_cm1 ? device->subgroup_size : 0;
|
||||
|
||||
#define CREATE_CONV(name, type_suffix, spv_suffix) \
|
||||
for (auto &c : device->pipeline_##name##type_suffix[s]) { \
|
||||
const vk_conv2d_pipeline_state &state = c.first; \
|
||||
@@ -4997,10 +5247,14 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
spec_constants_cpy.push_back(state.d1); \
|
||||
spec_constants_cpy.push_back(state.KW); \
|
||||
spec_constants_cpy.push_back(state.KH); \
|
||||
spec_constants_cpy.push_back(state.aligned); \
|
||||
spec_constants_cpy.push_back(conv2d_csh_store); \
|
||||
spec_constants_cpy.push_back(conv2d_WM); \
|
||||
spec_constants_cpy.push_back(conv2d_WN); \
|
||||
ggml_vk_create_pipeline( \
|
||||
device, c.second, #name #type_suffix, \
|
||||
name##type_suffix##spv_suffix##_len, name##type_suffix##spv_suffix##_data, "main", 3, \
|
||||
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants_cpy, 1, true, use_collectives); \
|
||||
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants_cpy, 1, true, use_collectives || conv2d_required_subgroup_size, conv2d_required_subgroup_size); \
|
||||
}
|
||||
#define CREATE_CONVS(spv_suffix) \
|
||||
CREATE_CONV(conv2d, _f32, spv_suffix) \
|
||||
@@ -5011,6 +5265,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
if (device->coopmat2) {
|
||||
CREATE_CONVS(_cm2)
|
||||
} else
|
||||
#endif
|
||||
#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
if (conv2d_use_cm1) {
|
||||
CREATE_CONVS(_cm1)
|
||||
} else
|
||||
#endif
|
||||
if (conv2d_UNROLL) {
|
||||
CREATE_CONVS(_unroll)
|
||||
@@ -5083,6 +5342,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
bool amd_shader_core_properties2 = false;
|
||||
bool pipeline_robustness = false;
|
||||
bool coopmat2_support = false;
|
||||
bool coopmat2_decode_vector_support = false;
|
||||
bool pipeline_executable_properties_support = false;
|
||||
device->coopmat_support = false;
|
||||
device->integer_dot_product = false;
|
||||
@@ -5117,6 +5377,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
!getenv("GGML_VK_DISABLE_COOPMAT2")) {
|
||||
coopmat2_support = true;
|
||||
#endif
|
||||
} else if (strcmp(VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME, properties.extensionName) == 0 &&
|
||||
!getenv("GGML_VK_DISABLE_COOPMAT2_DECODE_VECTOR")) {
|
||||
coopmat2_decode_vector_support = true;
|
||||
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
|
||||
} else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 &&
|
||||
!getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) {
|
||||
@@ -5394,6 +5657,14 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
}
|
||||
#endif
|
||||
|
||||
VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV coopmat2_decode_vector_features {};
|
||||
coopmat2_decode_vector_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV;
|
||||
if (coopmat2_decode_vector_support) {
|
||||
last_struct->pNext = (VkBaseOutStructure *)&coopmat2_decode_vector_features;
|
||||
last_struct = (VkBaseOutStructure *)&coopmat2_decode_vector_features;
|
||||
device_extensions.push_back(VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME);
|
||||
}
|
||||
|
||||
#if defined(VK_KHR_shader_bfloat16)
|
||||
VkPhysicalDeviceShaderBfloat16FeaturesKHR bfloat16_features {};
|
||||
bfloat16_features.pNext = nullptr;
|
||||
@@ -5553,6 +5824,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
found_fp32_128 && found_fp32_256 &&
|
||||
coopmat2_props.cooperativeMatrixFlexibleDimensionsMaxDimension >= 512) {
|
||||
device->coopmat2 = true;
|
||||
device->coopmat2_decode_vector = coopmat2_decode_vector_support && coopmat2_decode_vector_features.cooperativeMatrixDecodeVector;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -5768,8 +6040,12 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
|
||||
ggml_vk_load_shaders(device);
|
||||
|
||||
// Only use transfer queue on AMD non-GCN, when the graphics queue is not enabled
|
||||
const bool prefers_transfer_queue = device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != AMD_GCN && !allow_graphics_queue;
|
||||
// Prefer a dedicated transfer queue on AMD dGPUs (non-GCN) when graphics queue use is disabled.
|
||||
const bool prefers_transfer_queue =
|
||||
device->vendor_id == VK_VENDOR_ID_AMD &&
|
||||
device->architecture != AMD_GCN &&
|
||||
!device->uma &&
|
||||
!allow_graphics_queue;
|
||||
|
||||
if (!device->single_queue) {
|
||||
const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
|
||||
@@ -5835,6 +6111,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
bool fp16_compute = false;
|
||||
bool coopmat_support = false;
|
||||
bool coopmat2_support = false;
|
||||
bool coopmat2_decode_vector_support = false;
|
||||
bool integer_dot_product = false;
|
||||
bool bfloat16_support = false;
|
||||
|
||||
@@ -5853,6 +6130,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
!getenv("GGML_VK_DISABLE_COOPMAT2")) {
|
||||
coopmat2_support = true;
|
||||
#endif
|
||||
} else if (strcmp(VK_NV_COOPERATIVE_MATRIX_DECODE_VECTOR_EXTENSION_NAME, properties.extensionName) == 0 &&
|
||||
!getenv("GGML_VK_DISABLE_COOPMAT2_DECODE_VECTOR")) {
|
||||
coopmat2_decode_vector_support = true;
|
||||
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
|
||||
} else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 &&
|
||||
!getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) {
|
||||
@@ -5937,6 +6217,13 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
}
|
||||
#endif
|
||||
|
||||
VkPhysicalDeviceCooperativeMatrixDecodeVectorFeaturesNV coopmat2_decode_vector_features {};
|
||||
coopmat2_decode_vector_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_DECODE_VECTOR_FEATURES_NV;
|
||||
if (coopmat2_decode_vector_support) {
|
||||
last_struct->pNext = (VkBaseOutStructure *)&coopmat2_decode_vector_features;
|
||||
last_struct = (VkBaseOutStructure *)&coopmat2_decode_vector_features;
|
||||
}
|
||||
|
||||
vkGetPhysicalDeviceFeatures2(physical_device, &device_features2);
|
||||
|
||||
fp16 = fp16 && vk12_features.shaderFloat16;
|
||||
@@ -5961,7 +6248,14 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
#endif
|
||||
&& ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props, device_architecture);
|
||||
|
||||
std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none";
|
||||
coopmat2_decode_vector_support = coopmat2_decode_vector_support && coopmat2_decode_vector_features.cooperativeMatrixDecodeVector;
|
||||
#if !defined(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT)
|
||||
coopmat2_decode_vector_support = false;
|
||||
#endif
|
||||
|
||||
std::string matrix_cores = coopmat2_support ? (coopmat2_decode_vector_support ? "NV_coopmat2v" : "NV_coopmat2")
|
||||
: coopmat_support ? "KHR_coopmat"
|
||||
: "none";
|
||||
|
||||
std::string device_name = props2.properties.deviceName.data();
|
||||
GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | bf16: %d | warp size: %zu | shared memory: %d | int dot: %d | matrix cores: %s\n",
|
||||
@@ -6966,7 +7260,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
|
||||
const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
|
||||
const uint64_t d_off = offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
|
||||
for (uint64_t i0 = 0; i0 < ne0; i0++) {
|
||||
slices.push_back({ s_off + i1*nb0, d_off + i0*dstnb0, dstnb0 });
|
||||
slices.push_back({ s_off + i0*nb0, d_off + i0*dstnb0, dstnb0 });
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -8474,6 +8768,68 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
||||
}, pc, { (uint32_t)ne03, (uint32_t)ne01, (uint32_t)ne12 });
|
||||
}
|
||||
|
||||
static int ggml_vk_fwht_pipeline_idx(int64_t n) {
|
||||
switch (n) {
|
||||
case 64: return 0;
|
||||
case 128: return 1;
|
||||
case 256: return 2;
|
||||
case 512: return 3;
|
||||
default: return -1;
|
||||
}
|
||||
}
|
||||
|
||||
static bool ggml_vk_can_use_fwht(const ggml_backend_vk_context * ctx, const ggml_tensor * src1, const ggml_tensor * dst) {
|
||||
if (ctx->num_additional_fused_ops != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (ggml_get_op_params_i32(dst, 1) != GGML_HINT_SRC0_IS_HADAMARD) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const int idx = ggml_vk_fwht_pipeline_idx(src1->ne[0]);
|
||||
if (idx < 0 || ctx->device->pipeline_fwht_f32[idx] == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ggml_is_contiguous(src1)) {
|
||||
return false;
|
||||
}
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void ggml_vk_fwht(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
const int idx = ggml_vk_fwht_pipeline_idx(src->ne[0]);
|
||||
vk_pipeline pipeline = ctx->device->pipeline_fwht_f32[idx];
|
||||
|
||||
const uint32_t rows_per_workgroup = 4;
|
||||
const uint32_t n_rows = (uint32_t)ggml_nrows(src);
|
||||
const uint32_t max_workgroups_x = ctx->device->properties.limits.maxComputeWorkGroupCount[0];
|
||||
|
||||
const uint32_t total_workgroups = CEIL_DIV(n_rows, rows_per_workgroup);
|
||||
const uint32_t workgroups_x = std::min(total_workgroups, max_workgroups_x);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
||||
|
||||
const vk_subbuffer src_buf = ggml_vk_tensor_subbuffer(ctx, src, true);
|
||||
const vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, true);
|
||||
|
||||
vk_op_fwht_push_constants pc = {
|
||||
n_rows,
|
||||
0,
|
||||
0,
|
||||
1.0f / std::sqrt((float)src->ne[0]),
|
||||
};
|
||||
init_pushconst_tensor_offsets(ctx, pc, src, nullptr, nullptr, nullptr, dst);
|
||||
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src_buf, dst_buf }, pc, { workgroups_x, 1, 1 });
|
||||
}
|
||||
|
||||
static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx) {
|
||||
ggml_tensor * dst = cgraph->nodes[node_idx];
|
||||
ggml_tensor * src0 = dst->src[0];
|
||||
@@ -8507,6 +8863,8 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, c
|
||||
|
||||
m_offset += cur_M_size;
|
||||
}
|
||||
} else if (ggml_vk_can_use_fwht(ctx, src1, dst)) {
|
||||
ggml_vk_fwht(ctx, subctx, src1, dst);
|
||||
} else if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1 &&
|
||||
// detect 0213 permutation, and batch size of 1
|
||||
src0->nb[0] <= src0->nb[2] &&
|
||||
@@ -9473,10 +9831,23 @@ static vk_conv_shapes ggml_vk_conv_select_shape(ggml_backend_vk_context * ctx, u
|
||||
// so small convolutions will still choose a smaller tile.
|
||||
const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32;
|
||||
|
||||
if (K > 64 && n_tiles(CONV_SHAPE_128x128) >= shader_core_count * 2) {
|
||||
// 128x128 isn't used with cm1 due to shared memory size; fall through to a smaller tile.
|
||||
bool allow_128x128 = true;
|
||||
#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
if (!ctx->device->coopmat2 && ctx->device->coopmat_support && ctx->device->coopmat_support_16x16x16_f16acc) {
|
||||
allow_128x128 = false;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (allow_128x128 && K > 64 && n_tiles(CONV_SHAPE_128x128) >= shader_core_count * 2) {
|
||||
return CONV_SHAPE_128x128;
|
||||
} else if (K <= 32 && n_tiles(CONV_SHAPE_32x256) >= shader_core_count * 2) {
|
||||
return CONV_SHAPE_32x256;
|
||||
} else if (K <= 64 && n_tiles(CONV_SHAPE_64x128) >= shader_core_count * 2) {
|
||||
return CONV_SHAPE_64x128;
|
||||
} else if (!allow_128x128 && K > 64 && n_tiles(CONV_SHAPE_64x128) >= shader_core_count * 2) {
|
||||
// cm1 fallback for large K when 128x128 isn't available
|
||||
return CONV_SHAPE_64x128;
|
||||
} else {
|
||||
return CONV_SHAPE_64x32;
|
||||
}
|
||||
@@ -9648,7 +10019,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return nullptr;
|
||||
case GGML_OP_REPEAT:
|
||||
if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) {
|
||||
return ctx->device->pipeline_repeat_f32;
|
||||
return ctx->device->pipeline_repeat_i32;
|
||||
}
|
||||
if (ggml_type_size(src0->type) == 2 && ggml_type_size(dst->type) == 2) {
|
||||
return ctx->device->pipeline_repeat_i16;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_REPEAT_BACK:
|
||||
@@ -10008,7 +10382,18 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
uint32_t p1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 3) : 0;
|
||||
uint32_t d0 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 4) : 1;
|
||||
uint32_t d1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 5) : 1;
|
||||
vk_conv2d_pipeline_state conv2d_pipeline_state(s0, s1, p0, p1, d0, d1, KW, KH);
|
||||
|
||||
// tile-aligned shapes let the shader skip bounds checks
|
||||
const uint32_t Cin = (uint32_t)src1->ne[2];
|
||||
const uint32_t CRS = Cin * KW * KH;
|
||||
const uint32_t BS_K = vk_conv_block_sizes[shape].K;
|
||||
const uint32_t BS_CRS = vk_conv_block_sizes[shape].CRS;
|
||||
const uint32_t BS_NPQ = vk_conv_block_sizes[shape].NPQ;
|
||||
const uint32_t aligned = ((K % BS_K == 0) &&
|
||||
(CRS % BS_CRS == 0) &&
|
||||
(NPQ % BS_NPQ == 0)) ? 1u : 0u;
|
||||
|
||||
vk_conv2d_pipeline_state conv2d_pipeline_state(s0, s1, p0, p1, d0, d1, KW, KH, aligned);
|
||||
|
||||
std::map<vk_conv2d_pipeline_state, vk_pipeline> *pipelines = nullptr;
|
||||
if (op == GGML_OP_CONV_2D) {
|
||||
@@ -16152,7 +16537,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
return false;
|
||||
}
|
||||
case GGML_OP_REPEAT:
|
||||
return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float);
|
||||
return ggml_type_size(op->type) == ggml_type_size(op->src[0]->type) &&
|
||||
(ggml_type_size(op->type) == sizeof(float) || ggml_type_size(op->type) == 2);
|
||||
case GGML_OP_REPEAT_BACK:
|
||||
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_ROPE:
|
||||
|
||||
@@ -11,6 +11,10 @@ if (GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
add_compile_definitions(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
message(STATUS "Enabling coopmat2 glslc support")
|
||||
endif()
|
||||
if (GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT)
|
||||
add_compile_definitions(GGML_VULKAN_COOPMAT2_DECODE_VECTOR_GLSLC_SUPPORT)
|
||||
message(STATUS "Enabling coopmat2 decode_vector glslc support")
|
||||
endif()
|
||||
if (GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
|
||||
add_compile_definitions(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
|
||||
message(STATUS "Enabling dot glslc support")
|
||||
|
||||
@@ -7,6 +7,13 @@
|
||||
#extension GL_KHR_memory_scope_semantics : enable
|
||||
#endif
|
||||
|
||||
#ifdef COOPMAT
|
||||
#extension GL_KHR_cooperative_matrix : enable
|
||||
#extension GL_KHR_shader_subgroup_basic : enable
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
|
||||
#extension GL_KHR_memory_scope_semantics : enable
|
||||
#endif
|
||||
|
||||
#ifdef USE_COLLECTIVES
|
||||
# extension GL_KHR_shader_subgroup_shuffle : enable
|
||||
#endif
|
||||
@@ -77,6 +84,39 @@ layout(constant_id = 12) const uint d1 = 1;
|
||||
// Kernel spatial sizes
|
||||
layout(constant_id = 13) const uint KW = 1;
|
||||
layout(constant_id = 14) const uint KH = 1;
|
||||
// when set, skip bounds checks and address clamps (K/CRS/NPQ are tile-aligned)
|
||||
layout(constant_id = 15) const uint aligned = 0;
|
||||
// stage cm2 result through shmem (Csh) for coalesced stores. cm1 always does this.
|
||||
layout(constant_id = 16) const uint csh_store = 0;
|
||||
|
||||
#ifdef COOPMAT
|
||||
// cm1 subgroup tile: each subgroup computes a WM x WN region as a grid of
|
||||
// TM x TN x TK fragments. Requires WM%TM == WN%TN == BS_K%WM == BS_NPQ%WN ==
|
||||
// BS_CRS%TK == 0, and WG_SIZE == (BS_K/WM) * (BS_NPQ/WN) * subgroup_size.
|
||||
layout(constant_id = 17) const uint WM = 32;
|
||||
layout(constant_id = 18) const uint WN = 32;
|
||||
const uint TM = 16;
|
||||
const uint TN = 16;
|
||||
const uint TK = 16;
|
||||
const uint cms_per_row = WM / TM;
|
||||
const uint cms_per_col = WN / TN;
|
||||
const uint warps_M = BS_K / WM;
|
||||
const uint warps_N = BS_NPQ / WN;
|
||||
#endif
|
||||
|
||||
// without padding, H_idx/W_idx are in bounds by construction (non-TRANSPOSE only)
|
||||
#ifdef TRANSPOSE
|
||||
const bool hw_in_bounds = false;
|
||||
#else
|
||||
const bool hw_in_bounds = (p0 == 0) && (p1 == 0);
|
||||
#endif
|
||||
|
||||
// TRANSPOSE stride alignment is trivially satisfied for stride 1
|
||||
#ifdef TRANSPOSE
|
||||
const bool stride_in_bounds = (s0 == 1) && (s1 == 1);
|
||||
#else
|
||||
const bool stride_in_bounds = true;
|
||||
#endif
|
||||
|
||||
uint32_t tid = gl_LocalInvocationID.x;
|
||||
const uint32_t WG_SIZE = gl_WorkGroupSize.x;
|
||||
@@ -94,7 +134,7 @@ uint32_t n_elems_out = K * NPQ;
|
||||
// Number of blocktiles per input
|
||||
uint32_t NB_CRS = splitWork(CRS, BS_CRS);
|
||||
|
||||
#ifdef COOPMAT2
|
||||
#if defined(COOPMAT2) || defined(COOPMAT)
|
||||
#define SHMEM_TYPE float16_t
|
||||
#else
|
||||
#define SHMEM_TYPE float
|
||||
@@ -112,6 +152,17 @@ const uint32_t Bsh_len = BS_CRS * Bsh_stride;
|
||||
shared SHMEM_TYPE Ash[Ash_len]; // K x CRS
|
||||
shared SHMEM_TYPE Bsh[Bsh_len]; // CRS x NPQ
|
||||
|
||||
#if defined(COOPMAT2) || defined(COOPMAT)
|
||||
// stage matC through shmem so global stores are row-major (NPQ-contiguous)
|
||||
const uint32_t Csh_stride = BS_NPQ;
|
||||
#ifdef COOPMAT
|
||||
const uint32_t Csh_len = BS_K * Csh_stride;
|
||||
#else
|
||||
const uint32_t Csh_len = csh_store != 0 ? BS_K * Csh_stride : 1;
|
||||
#endif
|
||||
shared SHMEM_TYPE Csh[Csh_len]; // K x NPQ
|
||||
#endif
|
||||
|
||||
// Threadtile sizes
|
||||
const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K;
|
||||
|
||||
@@ -161,7 +212,7 @@ ACC_TYPE perElemOpStore(const in uint32_t r, const in uint32_t c, const in ACC_T
|
||||
uint32_t OH_idx = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL); // divide by p.OW;
|
||||
uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
|
||||
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
|
||||
if (K_idx < K && NPQ_idx < NPQ) {
|
||||
if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) {
|
||||
dst_data[dst_idx] = D_TYPE(elem);
|
||||
}
|
||||
return elem;
|
||||
@@ -176,6 +227,13 @@ void main() {
|
||||
#ifdef COOPMAT2
|
||||
coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator> matC;
|
||||
matC = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator>(0.0);
|
||||
#elif defined(COOPMAT)
|
||||
coopmat<float16_t, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> sums[cms_per_row * cms_per_col];
|
||||
[[unroll]] for (uint i = 0; i < cms_per_row * cms_per_col; i++) {
|
||||
sums[i] = coopmat<float16_t, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(0.0);
|
||||
}
|
||||
const uint warp_r = gl_SubgroupID / warps_N;
|
||||
const uint warp_c = gl_SubgroupID % warps_N;
|
||||
#else
|
||||
float regC[TS_K][TS_NPQ];
|
||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||
@@ -228,12 +286,15 @@ void main() {
|
||||
uint32_t B_lx = Ac;
|
||||
uint32_t K_idx = B_idx_K * BS_K + B_ly; /* Global K_idx (row index of A)*/
|
||||
#ifdef TRANSPOSE
|
||||
uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + K_idx * p.nb02 + Cin_idx_a * p.nb03, K * CRS - 1);
|
||||
uint32_t knl_idx = KW_idx_a + KH_idx_a * p.nb01 + K_idx * p.nb02 + Cin_idx_a * p.nb03;
|
||||
#else
|
||||
uint32_t knl_idx = min(KW_idx_a + KH_idx_a * p.nb01 + Cin_idx_a * p.nb02 + K_idx * p.nb03, K * CRS - 1);
|
||||
uint32_t knl_idx = KW_idx_a + KH_idx_a * p.nb01 + Cin_idx_a * p.nb02 + K_idx * p.nb03;
|
||||
#endif
|
||||
if (aligned == 0) {
|
||||
knl_idx = min(knl_idx, K * CRS - 1);
|
||||
}
|
||||
float val = knl_data[knl_idx];
|
||||
if (K_idx >= K || CRS_idx_a >= CRS) {
|
||||
if (aligned == 0 && (K_idx >= K || CRS_idx_a >= CRS)) {
|
||||
val = 0.0;
|
||||
}
|
||||
Ash[B_ly * Ash_stride + B_lx] = SHMEM_TYPE(val);
|
||||
@@ -282,15 +343,27 @@ void main() {
|
||||
uint32_t H_idx = OH_idx * s1 + KH_idx_b * d1 - p1;
|
||||
uint32_t W_idx = OW_idx * s0 + KW_idx_b * d0 - p0;
|
||||
#endif
|
||||
uint32_t src_idx =
|
||||
min(max(W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13, 0), p.Cin * p.N * p.W * p.H - 1);
|
||||
uint32_t src_idx = W_idx + H_idx * p.nb11 + Cin_idx_b * p.nb12 + N_idx * p.nb13;
|
||||
// skip clamp when address can't go OOB
|
||||
if (aligned == 0 || !hw_in_bounds || !stride_in_bounds) {
|
||||
src_idx = min(max(src_idx, 0), p.Cin * p.N * p.W * p.H - 1);
|
||||
}
|
||||
float val = src_data[src_idx];
|
||||
if (CRS_idx_b >= CRS || NPQ_idx >= NPQ
|
||||
|| H_idx >= p.H || W_idx >= p.W // Lower bound checks aren't necessary. (idx >= 0x80000000 for such case)
|
||||
bool oob = false;
|
||||
if (aligned == 0 && (CRS_idx_b >= CRS || NPQ_idx >= NPQ)) {
|
||||
oob = true;
|
||||
}
|
||||
// also catches lower-bound underflow (idx wraps to 0x80000000+)
|
||||
if (!hw_in_bounds && (H_idx >= p.H || W_idx >= p.W)) {
|
||||
oob = true;
|
||||
}
|
||||
#ifdef TRANSPOSE
|
||||
|| (H_idx_x_s1 - H_idx * s1 != 0) || (W_idx_x_s0 - W_idx * s0 != 0)
|
||||
if (!stride_in_bounds &&
|
||||
((H_idx_x_s1 - H_idx * s1 != 0) || (W_idx_x_s0 - W_idx * s0 != 0))) {
|
||||
oob = true;
|
||||
}
|
||||
#endif
|
||||
) {
|
||||
if (oob) {
|
||||
val = 0.0;
|
||||
}
|
||||
Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val);
|
||||
@@ -303,6 +376,23 @@ void main() {
|
||||
coopMatLoad(matA, Ash, 0, Ash_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
coopMatLoad(matB, Bsh, 0, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
matC = coopMatMulAdd(matA, matB, matC);
|
||||
#elif defined(COOPMAT)
|
||||
// each subgroup multiplies its grid of fragments per TK-sized CRS chunk
|
||||
[[unroll]] for (uint k_step = 0; k_step < BS_CRS / TK; k_step++) {
|
||||
coopmat<float16_t, gl_ScopeSubgroup, TM, TK, gl_MatrixUseA> cache_a[cms_per_row];
|
||||
[[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
|
||||
const uint a_off = (warp_r * WM + cm_row * TM) * Ash_stride + k_step * TK;
|
||||
coopMatLoad(cache_a[cm_row], Ash, a_off, Ash_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
}
|
||||
[[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
|
||||
coopmat<float16_t, gl_ScopeSubgroup, TK, TN, gl_MatrixUseB> cache_b;
|
||||
const uint b_off = k_step * TK * Bsh_stride + warp_c * WN + cm_col * TN;
|
||||
coopMatLoad(cache_b, Bsh, b_off, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
[[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
|
||||
sums[cm_col * cms_per_row + cm_row] = coopMatMulAdd(cache_a[cm_row], cache_b, sums[cm_col * cms_per_row + cm_row]);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (T_y * TS_K < K) {
|
||||
UNROLL for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) {
|
||||
@@ -325,8 +415,51 @@ void main() {
|
||||
barrier();
|
||||
}
|
||||
/* Save C* */
|
||||
#if defined(COOPMAT2) || defined(COOPMAT)
|
||||
// stage matC into Csh, then write to dst with coalesced NPQ-contiguous stores
|
||||
#ifdef COOPMAT
|
||||
const bool use_staged_store = true;
|
||||
#else
|
||||
const bool use_staged_store = (csh_store != 0);
|
||||
#endif
|
||||
if (use_staged_store) {
|
||||
#ifdef COOPMAT
|
||||
// cm1: each subgroup stores its fragment grid into its Csh slot
|
||||
[[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
|
||||
[[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
|
||||
const uint csh_off = (warp_r * WM + cm_row * TM) * Csh_stride + warp_c * WN + cm_col * TN;
|
||||
coopMatStore(sums[cm_col * cms_per_row + cm_row], Csh, csh_off, Csh_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
}
|
||||
}
|
||||
#else
|
||||
coopMatStore(matC, Csh, 0, Csh_stride, gl_CooperativeMatrixLayoutRowMajor);
|
||||
#endif
|
||||
barrier();
|
||||
|
||||
// cooperative shmem->global: WG threads spread across BS_NPQ (the
|
||||
// contiguous direction of dst), each iter covers store_rows_per_iter K-rows
|
||||
const uint32_t store_rows_per_iter = WG_SIZE / BS_NPQ;
|
||||
const uint32_t store_iters = BS_K / store_rows_per_iter;
|
||||
const uint32_t k_thread_offset = tid / BS_NPQ;
|
||||
const uint32_t npq_thread = tid % BS_NPQ;
|
||||
[[unroll]] for (uint32_t i = 0; i < store_iters; i++) {
|
||||
uint32_t k_local = i * store_rows_per_iter + k_thread_offset;
|
||||
uint32_t K_idx = B_idx_K * BS_K + k_local;
|
||||
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + npq_thread;
|
||||
uint32_t N_idx = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL);
|
||||
uint32_t OH_idx = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL);
|
||||
uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
|
||||
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
|
||||
if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) {
|
||||
dst_data[dst_idx] = D_TYPE(Csh[k_local * Csh_stride + npq_thread]);
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef COOPMAT2
|
||||
coopMatPerElementNV(matC, matC, perElemOpStore);
|
||||
else {
|
||||
coopMatPerElementNV(matC, matC, perElemOpStore);
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
if (T_y * TS_K < K) {
|
||||
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
|
||||
@@ -337,7 +470,7 @@ void main() {
|
||||
uint32_t OH_idx = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL); // divide by p.OW;
|
||||
uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
|
||||
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
|
||||
if (K_idx < K && NPQ_idx < NPQ) {
|
||||
if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) {
|
||||
dst_data[dst_idx] = regC[T_ly][T_lx];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,21 +5,60 @@
|
||||
#include "types.glsl"
|
||||
|
||||
#if defined(DATA_A_F32)
|
||||
FLOAT_TYPE dequantize1(uint ib, uint iqs, uint a_offset) {
|
||||
return data_a[a_offset + ib];
|
||||
}
|
||||
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||
return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]);
|
||||
}
|
||||
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
||||
return vec4(data_a[a_offset + ib ], data_a[a_offset + ib + 1],
|
||||
data_a[a_offset + ib + 2], data_a[a_offset + ib + 3]);
|
||||
}
|
||||
vec4 dequantize4_2aligned(uint ib, uint iqs, uint a_offset) {
|
||||
return vec4(data_a[a_offset + ib ], data_a[a_offset + ib + 1],
|
||||
data_a[a_offset + ib + 2], data_a[a_offset + ib + 3]);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_F16)
|
||||
FLOAT_TYPE dequantize1(uint ib, uint iqs, uint a_offset) {
|
||||
return data_a[a_offset + ib];
|
||||
}
|
||||
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||
return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]);
|
||||
}
|
||||
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
||||
return vec4(data_a[a_offset + ib ], data_a[a_offset + ib + 1],
|
||||
data_a[a_offset + ib + 2], data_a[a_offset + ib + 3]);
|
||||
}
|
||||
vec4 dequantize4_2aligned(uint ib, uint iqs, uint a_offset) {
|
||||
const vec2 a = data_a_packed32[(a_offset + ib)/2];
|
||||
const vec2 b = data_a_packed32[(a_offset + ib)/2 + 1];
|
||||
return vec4(a, b);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_BF16)
|
||||
FLOAT_TYPE dequantize1(uint ib, uint iqs, uint a_offset) {
|
||||
return bf16_to_fp32(data_a[a_offset + ib]);
|
||||
}
|
||||
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
|
||||
return vec2(bf16_to_fp32(data_a[a_offset + ib]), bf16_to_fp32(data_a[a_offset + ib + 1]));
|
||||
}
|
||||
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
|
||||
return vec4(bf16_to_fp32(data_a[a_offset + ib ]), bf16_to_fp32(data_a[a_offset + ib + 1]),
|
||||
bf16_to_fp32(data_a[a_offset + ib + 2]), bf16_to_fp32(data_a[a_offset + ib + 3]));
|
||||
}
|
||||
vec4 dequantize4_2aligned(uint ib, uint iqs, uint a_offset) {
|
||||
const uint a = data_a_packed32[(a_offset + ib)/2];
|
||||
const uint b = data_a_packed32[(a_offset + ib)/2 + 1];
|
||||
return vec4(uintBitsToFloat((a & 0x0000ffff) << 16),
|
||||
uintBitsToFloat( a & 0xffff0000),
|
||||
uintBitsToFloat((b & 0x0000ffff) << 16),
|
||||
uintBitsToFloat( b & 0xffff0000));
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q4_0)
|
||||
|
||||
@@ -1,4 +1,12 @@
|
||||
|
||||
// Each format defines a scalar dequantFunc<T> plus a V=4 dequantFunc<T>_v
|
||||
// passed as the optional vector decoder to coopMatLoadTensorNV via
|
||||
// GL_NV_cooperative_matrix_decode_vector. When the driver doesn't support
|
||||
// the extension, ggml-vulkan.cpp strips it from the compiled SPIR-V.
|
||||
#ifdef GL_NV_cooperative_matrix_decode_vector
|
||||
#extension GL_NV_cooperative_matrix_decode_vector : enable
|
||||
#endif
|
||||
|
||||
#include "types.glsl"
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufF32 {
|
||||
@@ -25,6 +33,19 @@ float16_t dequantFuncQ1_0(const in decodeBufQ1_0 bl, const in uint blockCoords[2
|
||||
return bit != 0u ? d : -d;
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncQ1_0_v(const in decodeBufQ1_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const float16_t d = bl.block.d;
|
||||
const float16_t md = -d;
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint qs_nib = uint(bl.block.qs[idx >> 3]) >> (idx & 0x4u);
|
||||
return f16vec4(
|
||||
(qs_nib & 1u) != 0u ? d : md,
|
||||
(qs_nib & 2u) != 0u ? d : md,
|
||||
(qs_nib & 4u) != 0u ? d : md,
|
||||
(qs_nib & 8u) != 0u ? d : md);
|
||||
}
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ4_0 {
|
||||
block_q4_0_packed16 block;
|
||||
};
|
||||
@@ -42,10 +63,28 @@ float16_t dequantFuncQ4_0(const in decodeBufQ4_0 bl, const in uint blockCoords[2
|
||||
return ret;
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncQ4_0_v(const in decodeBufQ4_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const float16_t d = bl.block.d;
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint shift = (idx & 0x10) >> 2; // 0 or 4
|
||||
const uint qs_i = (idx & 0xE) >> 1; // even, in {0,2,4,6}
|
||||
const uint qsw = uint32_t(bl.block.qs[qs_i ])
|
||||
| (uint32_t(bl.block.qs[qs_i + 1u]) << 16);
|
||||
// shift in {0,4}: per-byte mask 0x0F isolates the wanted nibble in each byte.
|
||||
const uint q4 = (qsw >> shift) & 0x0F0F0F0Fu;
|
||||
const u8vec4 q = unpack8(q4);
|
||||
return f16vec4((vec4(q) - vec4(8.0)) * vec4(float(d)));
|
||||
}
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ4_1 {
|
||||
block_q4_1 block;
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ4_1_packed32 {
|
||||
block_q4_1_packed32 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncQ4_1(const in decodeBufQ4_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const float16_t d = bl.block.d;
|
||||
@@ -60,10 +99,27 @@ float16_t dequantFuncQ4_1(const in decodeBufQ4_1 bl, const in uint blockCoords[2
|
||||
return ret;
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncQ4_1_v(const in decodeBufQ4_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufQ4_1_packed32 bl32 = decodeBufQ4_1_packed32(bl);
|
||||
const float16_t d = bl.block.d;
|
||||
const float16_t m = bl.block.m;
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint shift = (idx & 0x10) >> 2; // 0 or 4
|
||||
const uint qs_w = (idx & 0xC) >> 2; // iqs / 4 in [0,4)
|
||||
const uint qsw = uint32_t(bl32.block.qs[qs_w]);
|
||||
const u8vec4 q = unpack8((qsw >> shift) & 0x0F0F0F0Fu);
|
||||
return f16vec4(vec4(q) * vec4(float(d)) + vec4(float(m)));
|
||||
}
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ5_0 {
|
||||
block_q5_0 block;
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ5_0_packed16 {
|
||||
block_q5_0_packed16 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncQ5_0(const in decodeBufQ5_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const float16_t d = bl.block.d;
|
||||
@@ -82,10 +138,32 @@ float16_t dequantFuncQ5_0(const in decodeBufQ5_0 bl, const in uint blockCoords[2
|
||||
return ret;
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncQ5_0_v(const in decodeBufQ5_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufQ5_0_packed16 bl16 = decodeBufQ5_0_packed16(bl);
|
||||
const float16_t d = bl.block.d;
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint shift = (idx & 0x10) >> 2; // 0 or 4
|
||||
const uint qs_i = (idx & 0xC) >> 1; // packed16 word index, in {0,2,4,6}
|
||||
const uint qsw = uint32_t(bl16.block.qs[qs_i ])
|
||||
| (uint32_t(bl16.block.qs[qs_i + 1u]) << 16);
|
||||
const u8vec4 ql = unpack8((qsw >> shift) & 0x0F0F0F0Fu);
|
||||
|
||||
const uint uint_qh = uint(bl16.block.qh[1]) << 16 | uint(bl16.block.qh[0]);
|
||||
const uint qh_pack = uint_qh >> idx; // bits 0..3 = element idx..idx+3 high bits
|
||||
const uvec4 qh_high = (uvec4(qh_pack, qh_pack >> 1u, qh_pack >> 2u, qh_pack >> 3u) & uvec4(0x01u)) << 4u;
|
||||
|
||||
return f16vec4((vec4(ql) + vec4(qh_high) - vec4(16.0)) * vec4(float(d)));
|
||||
}
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 8) buffer decodeBufQ5_1 {
|
||||
block_q5_1 block;
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 8) buffer decodeBufQ5_1_packed32 {
|
||||
block_q5_1_packed32 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncQ5_1(const in decodeBufQ5_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const float16_t d = bl.block.d;
|
||||
@@ -105,6 +183,23 @@ float16_t dequantFuncQ5_1(const in decodeBufQ5_1 bl, const in uint blockCoords[2
|
||||
return ret;
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncQ5_1_v(const in decodeBufQ5_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufQ5_1_packed32 bl32 = decodeBufQ5_1_packed32(bl);
|
||||
const float16_t d = bl.block.d;
|
||||
const float16_t m = bl.block.m;
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint shift = (idx & 0x10) >> 2; // 0 or 4
|
||||
const uint qs_w = (idx & 0xC) >> 2; // iqs / 4 in [0,4)
|
||||
const uint qsw = uint32_t(bl32.block.qs[qs_w]);
|
||||
const u8vec4 ql = unpack8((qsw >> shift) & 0x0F0F0F0Fu);
|
||||
|
||||
const uint qh_pack = bl.block.qh >> idx; // bits 0..3 = element idx..idx+3 high bits
|
||||
const uvec4 qh_high = (uvec4(qh_pack, qh_pack >> 1u, qh_pack >> 2u, qh_pack >> 3u) & uvec4(0x01u)) << 4u;
|
||||
|
||||
return f16vec4((vec4(ql) + vec4(qh_high)) * vec4(float(d)) + vec4(float(m)));
|
||||
}
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ8_0 {
|
||||
block_q8_0_packed16 block;
|
||||
};
|
||||
@@ -121,6 +216,17 @@ float16_t dequantFuncQ8_0(const in decodeBufQ8_0 bl, const in uint blockCoords[2
|
||||
return ret;
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncQ8_0_v(const in decodeBufQ8_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const float16_t d = bl.block.d;
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint base = idx >> 1u;
|
||||
const uint w = uint(uint16_t(bl.block.qs[base]))
|
||||
| (uint(uint16_t(bl.block.qs[base + 1u])) << 16u);
|
||||
const i8vec4 qi = unpack8(int32_t(w));
|
||||
return f16vec4(vec4(qi) * vec4(float(d)));
|
||||
}
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ2_K {
|
||||
block_q2_K block;
|
||||
};
|
||||
@@ -129,6 +235,10 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ2
|
||||
block_q2_K_packed16 block;
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ2_K_packed32 {
|
||||
block_q2_K_packed32 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufQ2_K_packed16 bl16 = decodeBufQ2_K_packed16(bl);
|
||||
@@ -147,10 +257,36 @@ float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2
|
||||
return ret;
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncQ2_K_v(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufQ2_K_packed32 bl32 = decodeBufQ2_K_packed32(bl);
|
||||
const f16vec2 dm = bl.block.dm;
|
||||
const uint idx = coordInBlock[1];
|
||||
|
||||
const uint scalesi = idx >> 4; // 0..15
|
||||
const uint qsshift = (idx & 0x60) >> 4; // 0,2,4,6
|
||||
|
||||
// qs_i (packed16) = ((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1) is even for idx % 4 == 0,
|
||||
// so qs_w (packed32) = qs_i / 2 = ((idx & 0x80) >> 4) + ((idx & 0x1Cu) >> 2).
|
||||
const uint qs_w = ((idx & 0x80) >> 4) + ((idx & 0x1Cu) >> 2);
|
||||
const uint qsw = uint32_t(bl32.block.qs[qs_w]);
|
||||
const uint qs4 = (qsw >> qsshift) & 0x03030303u;
|
||||
const u8vec4 qi = unpack8(qs4);
|
||||
|
||||
const uint scales = bl.block.scales[scalesi];
|
||||
const float16_t d_sub = dm.x * float16_t(scales & 0xF);
|
||||
const float16_t m_sub = dm.y * float16_t(scales >> 4);
|
||||
return f16vec4(vec4(qi) * vec4(float(d_sub)) - vec4(float(m_sub)));
|
||||
}
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ3_K {
|
||||
block_q3_K block;
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ3_K_packed16 {
|
||||
block_q3_K_packed16 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncQ3_K(const in decodeBufQ3_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const uint idx = coordInBlock[1];
|
||||
@@ -179,6 +315,47 @@ float16_t dequantFuncQ3_K(const in decodeBufQ3_K bl, const in uint blockCoords[2
|
||||
return ret;
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncQ3_K_v(const in decodeBufQ3_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufQ3_K_packed16 bl16 = decodeBufQ3_K_packed16(bl);
|
||||
const uint idx = coordInBlock[1];
|
||||
|
||||
const uint n = idx >> 7; // 0,1
|
||||
const uint is = idx >> 4; // 0..15
|
||||
const uint halfsplit = (idx & 0x60) >> 5; // 0,1,2,3
|
||||
const uint qsshift = halfsplit << 1; // 0,2,4,6
|
||||
const uint hbit = (n << 2) + halfsplit; // 0..7 (bit position in hmask byte)
|
||||
|
||||
uint32_t scaleidx0 = (is < 8) ? is : (is - 8);
|
||||
uint32_t scaleidx0shift = (is < 8) ? 0u : 4u;
|
||||
uint32_t scaleidx1 = is + 8 - (is / 4) * 4;
|
||||
uint32_t scaleidx1shift = (is / 4) * 2;
|
||||
|
||||
const int8_t us = int8_t(
|
||||
((bl.block.scales[scaleidx0] >> scaleidx0shift) & 0xF) |
|
||||
(((bl.block.scales[scaleidx1] >> scaleidx1shift) & 3) << 4));
|
||||
const float16_t dl = bl.block.d * float16_t(int(us) - 32);
|
||||
|
||||
// For idx % 4 == 0: (idx & 0x1F) == (idx & 0x1C) is a multiple of 4.
|
||||
const uint qsi = (n << 5) + (idx & 0x1Cu);
|
||||
const uint hmi = (idx & 0x1Cu);
|
||||
|
||||
// Two adjacent uint16 packed16 reads, combined into a uint32 in registers.
|
||||
// After this: byte j of qsw / hmw holds the data for element idx+j.
|
||||
const uint qsw = uint32_t(bl16.block.qs[qsi >> 1])
|
||||
| (uint32_t(bl16.block.qs[(qsi >> 1) + 1u]) << 16);
|
||||
const uint hmw = uint32_t(bl16.block.hmask[hmi >> 1])
|
||||
| (uint32_t(bl16.block.hmask[(hmi >> 1) + 1u]) << 16);
|
||||
|
||||
// qsshift in {0,2,4,6} and hbit in {0..7}: per-byte masks isolate the wanted bits
|
||||
// with no inter-byte leakage.
|
||||
const uint ql4 = (qsw >> qsshift) & 0x03030303u;
|
||||
const uint qh4 = (hmw >> hbit) & 0x01010101u;
|
||||
|
||||
const ivec4 q = ivec4(unpack8(ql4 | (qh4 << 2))) - ivec4(4);
|
||||
return f16vec4(vec4(q) * vec4(float(dl)));
|
||||
}
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K {
|
||||
block_q4_K block;
|
||||
};
|
||||
@@ -187,6 +364,10 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4
|
||||
block_q4_K_packed16 block;
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed32 {
|
||||
block_q4_K_packed32 block;
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed128 {
|
||||
block_q4_K_packed128 block;
|
||||
};
|
||||
@@ -334,6 +515,55 @@ float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2
|
||||
return float16_t(ret);
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncQ4_K_v(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufQ4_K_packed32 bl32 = decodeBufQ4_K_packed32(bl);
|
||||
decodeBufQ4_K_packed128 bl128 = decodeBufQ4_K_packed128(bl);
|
||||
const uint idx = coordInBlock[1];
|
||||
|
||||
const uint is = idx >> 5; // 0..7
|
||||
|
||||
#if defined(IS_MUL_MM2) && defined(DATA_A_Q4_K)
|
||||
vec2 v = shAscales[is * shAscales_stride + (blockCoords[0] % BM)];
|
||||
float d = v.x;
|
||||
float m = v.y;
|
||||
#else
|
||||
uvec4 v = bl128.block.q4k[0];
|
||||
const vec2 loadd = vec2(unpackFloat2x16(v.x));
|
||||
|
||||
uint32_t sc;
|
||||
uint32_t mbyte;
|
||||
|
||||
uint32_t scale0 = v.y;
|
||||
uint32_t scale4 = v.z;
|
||||
uint32_t scale8 = v.w;
|
||||
|
||||
uint32_t sc_lo = scale0;
|
||||
uint32_t mb_lo = scale4;
|
||||
uint32_t sc_hi = (scale8 & 0x0F0F0F0F) | ((scale0 & 0xC0C0C0C0) >> 2);
|
||||
uint32_t mb_hi = ((scale8 & 0xF0F0F0F0) >> 4) | ((scale4 & 0xC0C0C0C0) >> 2);
|
||||
|
||||
sc = is < 4 ? sc_lo : sc_hi;
|
||||
mbyte = is < 4 ? mb_lo : mb_hi;
|
||||
sc = sc >> (8 * (is & 3));
|
||||
mbyte = mbyte >> (8 * (is & 3));
|
||||
sc &= 0x3F;
|
||||
mbyte &= 0x3F;
|
||||
|
||||
const float d = loadd.x * float(sc);
|
||||
const float m = loadd.y * float(mbyte);
|
||||
#endif
|
||||
|
||||
// idx in [0,256); vector decode uses idx a multiple of 4. packed32 word index:
|
||||
// (qs_i >> 1) == (idx >> 6) * 8 + ((idx & 0x1E) >> 2). sh is 0 or 4 only, so a
|
||||
// single (w >> sh) & 0x0F0F0F0F isolates all four nibbles without inter-byte leakage.
|
||||
const uint sh = (idx & 0x20u) >> 3u;
|
||||
const uint w = uint32_t(bl32.block.qs[(idx >> 6) * 8u + ((idx & 0x1Eu) >> 2)]);
|
||||
const u8vec4 q = unpack8((w >> sh) & 0x0F0F0F0Fu);
|
||||
|
||||
return f16vec4(vec4(d) * vec4(q) - vec4(m));
|
||||
}
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K {
|
||||
block_q5_K block;
|
||||
};
|
||||
@@ -346,6 +576,10 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5
|
||||
block_q5_K_packed128 block;
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K_packed32 {
|
||||
block_q5_K_packed32 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufQ5_K_packed16 bl16 = decodeBufQ5_K_packed16(bl);
|
||||
@@ -399,6 +633,58 @@ float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2
|
||||
return float16_t(ret);
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncQ5_K_v(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufQ5_K_packed32 bl32 = decodeBufQ5_K_packed32(bl);
|
||||
decodeBufQ5_K_packed128 bl128 = decodeBufQ5_K_packed128(bl);
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint is = idx >> 5;
|
||||
|
||||
#if defined(IS_MUL_MM2) && defined(DATA_A_Q5_K)
|
||||
vec2 v = shAscales[is * shAscales_stride + (blockCoords[0] % BM)];
|
||||
float d = v.x;
|
||||
float m = v.y;
|
||||
#else
|
||||
uvec4 v = bl128.block.q5k[0];
|
||||
|
||||
const f16vec2 loadd = unpackFloat2x16(v.x);
|
||||
|
||||
uint32_t sc;
|
||||
uint32_t mbyte;
|
||||
|
||||
uint32_t scale0 = v.y;
|
||||
uint32_t scale4 = v.z;
|
||||
uint32_t scale8 = v.w;
|
||||
|
||||
uint32_t sc_lo = scale0;
|
||||
uint32_t mb_lo = scale4;
|
||||
uint32_t sc_hi = (scale8 & 0x0F0F0F0F) | ((scale0 & 0xC0C0C0C0) >> 2);
|
||||
uint32_t mb_hi = ((scale8 & 0xF0F0F0F0) >> 4) | ((scale4 & 0xC0C0C0C0) >> 2);
|
||||
|
||||
sc = is < 4 ? sc_lo : sc_hi;
|
||||
mbyte = is < 4 ? mb_lo : mb_hi;
|
||||
sc = sc >> (8 * (is & 3));
|
||||
mbyte = mbyte >> (8 * (is & 3));
|
||||
sc &= 0x3F;
|
||||
mbyte &= 0x3F;
|
||||
|
||||
const float16_t d = loadd.x * float16_t(sc);
|
||||
const float16_t m = loadd.y * float16_t(mbyte);
|
||||
#endif
|
||||
|
||||
// sh is 0 or 4; mask 0x0F0F0F0F covers the four nibbles regardless (no inter-byte leakage).
|
||||
const uint sh = (idx & 0x20u) >> 3u;
|
||||
const uint qs_w = (idx >> 6) * 8u + ((idx & 0x1Eu) >> 2);
|
||||
const uint qh_w = (idx & 0x1Eu) >> 2;
|
||||
|
||||
const uint ql4 = (uint32_t(bl32.block.qs[qs_w]) >> sh) & 0x0F0F0F0Fu;
|
||||
// qh stores bit `is` per element across 4 consecutive bytes; one shift+mask handles all 4.
|
||||
const uint qh4 = ((uint32_t(bl32.block.qh[qh_w]) >> is) & 0x01010101u) << 4u;
|
||||
|
||||
const u8vec4 qi = unpack8(ql4 | qh4);
|
||||
return f16vec4(vec4(qi) * vec4(d) - vec4(m));
|
||||
}
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ6_K {
|
||||
block_q6_K block;
|
||||
};
|
||||
@@ -431,6 +717,35 @@ float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2
|
||||
return ret;
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncQ6_K_v(const in decodeBufQ6_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufQ6_K_packed16 bl16 = decodeBufQ6_K_packed16(bl);
|
||||
const uint idx = coordInBlock[1];
|
||||
|
||||
const uint b = (idx & 0x40) >> 6;
|
||||
const uint qhshift = (idx & 0x60) >> 4; // 0,2,4,6
|
||||
const uint is = idx >> 4;
|
||||
const uint sh = b * 4; // 0 or 4
|
||||
|
||||
const float16_t dscale = bl.block.d * float16_t(bl.block.scales[is]);
|
||||
|
||||
const uint ql_i = ((idx & 0x80) >> 2) + ((idx & 0x3E) >> 1);
|
||||
const uint qh_i = ((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1);
|
||||
|
||||
// Two adjacent uint16 packed16 reads, combined into a uint32 in registers.
|
||||
// After this: byte j of qlw / qhw holds the data for element idx+j.
|
||||
const uint qlw = uint32_t(bl16.block.ql[ql_i ]) | (uint32_t(bl16.block.ql[ql_i + 1]) << 16);
|
||||
const uint qhw = uint32_t(bl16.block.qh[qh_i ]) | (uint32_t(bl16.block.qh[qh_i + 1]) << 16);
|
||||
|
||||
// sh in {0,4} and qhshift in {0,2,4,6}: per-byte masks 0x0F / 0x03 keep only the
|
||||
// wanted bits with no inter-byte leakage; place qh's 2 bits at nibble high position.
|
||||
const uint ql4 = (qlw >> sh) & 0x0F0F0F0Fu;
|
||||
const uint qh4 = ((qhw >> qhshift) & 0x03030303u) << 4u;
|
||||
|
||||
const ivec4 qi = ivec4(unpack8(ql4 | qh4));
|
||||
return f16vec4((vec4(qi) - vec4(32.0f)) * vec4(float(dscale)));
|
||||
}
|
||||
|
||||
#if defined(DATA_A_IQ1_S)
|
||||
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ1_S {
|
||||
block_iq1_s block;
|
||||
@@ -453,6 +768,29 @@ float16_t dequantFuncIQ1_S(const in decodeBufIQ1_S bl, const in uint blockCoords
|
||||
float16_t ret = float16_t(dl) * (float16_t(bitfieldExtract(int(grid), 2 * int(idx % 8), 2)) + float16_t(delta));
|
||||
return ret;
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncIQ1_S_v(const in decodeBufIQ1_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const float16_t d = bl.block.d;
|
||||
const uint idx = coordInBlock[1];
|
||||
|
||||
const uint ib32 = idx >> 5;
|
||||
const uint ib8 = idx >> 3;
|
||||
const int i8b = int(idx & 4); // 0 or 4
|
||||
|
||||
const uint qh = bl.block.qh[ib32];
|
||||
const uint qs = bl.block.qs[ib8];
|
||||
const float dl = float(d) * float(2 * bitfieldExtract(qh, 12, 3) + 1);
|
||||
const float delta = ((qh & 0x8000u) != 0u) ? -IQ1S_DELTA : IQ1S_DELTA;
|
||||
const uint grid = iq1s_grid[qs | (bitfieldExtract(qh, 3 * int(ib8 & 3), 3) << 8)];
|
||||
|
||||
const ivec4 q = ivec4(
|
||||
bitfieldExtract(int(grid), 2 * (i8b + 0), 2),
|
||||
bitfieldExtract(int(grid), 2 * (i8b + 1), 2),
|
||||
bitfieldExtract(int(grid), 2 * (i8b + 2), 2),
|
||||
bitfieldExtract(int(grid), 2 * (i8b + 3), 2));
|
||||
return f16vec4((vec4(q) + vec4(delta)) * dl);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_IQ1_M)
|
||||
@@ -485,6 +823,33 @@ float16_t dequantFuncIQ1_M(const in decodeBufIQ1_M bl, const in uint blockCoords
|
||||
float16_t ret = d * float16_t(dl) * (float16_t(bitfieldExtract(int(grid), 2 * i8, 2)) + float16_t(delta));
|
||||
return ret;
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncIQ1_M_v(const in decodeBufIQ1_M bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufIQ1_M_packed64 bl64 = decodeBufIQ1_M_packed64(bl);
|
||||
const uint idx = coordInBlock[1];
|
||||
|
||||
uvec2 scales = unpack32(bl64.block.scales);
|
||||
const float16_t d = uint16BitsToHalf(uint16_t(((scales.x & 0xF000) >> 12) | ((scales.x & 0xF0000000) >> 24) | ((scales.y & 0xF000) >> 4) | ((scales.y & 0xF0000000) >> 16)));
|
||||
|
||||
const uint ib8 = idx >> 3;
|
||||
const uint ib16 = idx >> 4;
|
||||
const int i8b = int(idx & 4); // 0 or 4 -- i8 base for the V=4 group
|
||||
|
||||
const uint sc = bl.block.scales[ib8 / 8];
|
||||
const uint qs = bl.block.qs[ib8];
|
||||
const uint qh = bl.block.qh[ib16] >> (4 * (ib8 & 1));
|
||||
const float dl = 2.0 * float(bitfieldExtract(sc, 3 * int(ib16 & 3), 3)) + 1.0;
|
||||
const float delta = ((qh & 8u) != 0u) ? -IQ1S_DELTA : IQ1S_DELTA;
|
||||
const uint grid = iq1s_grid[qs | ((qh & 7u) << 8)];
|
||||
|
||||
const ivec4 q = ivec4(
|
||||
bitfieldExtract(int(grid), 2 * (i8b + 0), 2),
|
||||
bitfieldExtract(int(grid), 2 * (i8b + 1), 2),
|
||||
bitfieldExtract(int(grid), 2 * (i8b + 2), 2),
|
||||
bitfieldExtract(int(grid), 2 * (i8b + 3), 2));
|
||||
return f16vec4((vec4(q) + vec4(delta)) * (float(d) * dl));
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_IQ2_XXS)
|
||||
@@ -520,6 +885,33 @@ float16_t dequantFuncIQ2_XXS(const in decodeBufIQ2_XXS bl, const in uint blockCo
|
||||
vec2 ret = dscale * g * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);
|
||||
return float16_t(ret[idx & 1]);
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncIQ2_XXS_v(const in decodeBufIQ2_XXS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufIQ2_XXS_packed16 bl16 = decodeBufIQ2_XXS_packed16(bl);
|
||||
const uint idx = coordInBlock[1];
|
||||
|
||||
const uint ib32 = idx >> 5;
|
||||
const uint ib8 = (idx & 0x18) >> 3;
|
||||
const uint iqs = 8 * ib32 + ib8;
|
||||
|
||||
const uint qs = bl.block.qs[iqs];
|
||||
const uint signscale = pack32(u16vec2(bl16.block.qs[4*ib32+2], bl16.block.qs[4*ib32+3]));
|
||||
const float dscale = float(bl.block.d) * 0.25 * (0.5 + float(signscale >> 28));
|
||||
|
||||
uint sign = bitfieldExtract(signscale, 7 * int(ib8), 7);
|
||||
sign |= bitCount(sign) << 7;
|
||||
const uint sb = sign >> (idx & 7u);
|
||||
|
||||
const uint g2 = iq2xxs_grid[qs][(idx & 4) >> 2];
|
||||
const u8vec4 g = unpack8(g2);
|
||||
|
||||
return f16vec4(
|
||||
dscale * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0),
|
||||
dscale * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0),
|
||||
dscale * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0),
|
||||
dscale * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0));
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_IQ2_XS)
|
||||
@@ -548,6 +940,31 @@ float16_t dequantFuncIQ2_XS(const in decodeBufIQ2_XS bl, const in uint blockCoor
|
||||
vec2 ret = dscale * g * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf);
|
||||
return float16_t(ret[idx & 1]);
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncIQ2_XS_v(const in decodeBufIQ2_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const uint idx = coordInBlock[1];
|
||||
|
||||
const uint is = idx >> 5;
|
||||
const uint sshift = (idx & 0x10) >> 2;
|
||||
const uint iqs = idx >> 3;
|
||||
|
||||
const uint16_t qs = bl.block.qs[iqs];
|
||||
const float dscale = float(bl.block.d) * 0.25 * (0.5 + float((bl.block.scales[is] >> sshift) & 0xF));
|
||||
|
||||
uint sign = uint(qs >> 9);
|
||||
sign |= bitCount(sign) << 7;
|
||||
const uint sb = sign >> (idx & 7u);
|
||||
|
||||
const uint g2 = iq2xs_grid[qs & 0x1FF][(idx & 4) >> 2];
|
||||
const u8vec4 g = unpack8(g2);
|
||||
|
||||
return f16vec4(
|
||||
dscale * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0),
|
||||
dscale * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0),
|
||||
dscale * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0),
|
||||
dscale * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0));
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_IQ2_S)
|
||||
@@ -576,6 +993,32 @@ float16_t dequantFuncIQ2_S(const in decodeBufIQ2_S bl, const in uint blockCoords
|
||||
const vec2 v = db * vec2(sign01) * vec2(unpack8(g2));
|
||||
return float16_t(v[idx & 1]);
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncIQ2_S_v(const in decodeBufIQ2_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const uint idx = coordInBlock[1];
|
||||
|
||||
const uint ib32 = idx >> 5;
|
||||
const uint ib8 = idx >> 3;
|
||||
const uint qhshift = 2 * (ib8 % 4);
|
||||
|
||||
const uint scale = (bl.block.scales[ib32] >> ((idx & 0x10) >> 2)) & 0xf;
|
||||
const uint qs = bl.block.qs[ib8];
|
||||
const uint qh = bl.block.qh[ib32];
|
||||
const uint sb = uint(bl.block.qs[QUANT_K / 8 + ib8]) >> (idx & 0x6u);
|
||||
|
||||
const float d = float(bl.block.d);
|
||||
const float db = d * 0.25 * (0.5 + scale);
|
||||
|
||||
const uint g2 = iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)][(idx & 4) >> 2];
|
||||
const u8vec4 g = unpack8(g2);
|
||||
|
||||
return f16vec4(
|
||||
db * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0),
|
||||
db * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0),
|
||||
db * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0),
|
||||
db * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0));
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_IQ3_XXS)
|
||||
@@ -609,6 +1052,32 @@ float16_t dequantFuncIQ3_XXS(const in decodeBufIQ3_XXS bl, const in uint blockCo
|
||||
const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy);
|
||||
return float16_t(v[idx & 1]);
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncIQ3_XXS_v(const in decodeBufIQ3_XXS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufIQ3_XXS_packed16 bl16 = decodeBufIQ3_XXS_packed16(bl);
|
||||
const uint idx = coordInBlock[1];
|
||||
|
||||
const uint iqs = idx >> 2;
|
||||
const uint is = QUANT_K / 4 + ((idx & 0xE0) >> 3);
|
||||
|
||||
const float d = float(bl.block.d);
|
||||
const uint qs = bl.block.qs[iqs];
|
||||
const uint signs = pack32(u16vec2(bl16.block.qs[is/2+0], bl16.block.qs[is/2+1]));
|
||||
const float db = d * 0.5 * (0.5 + (signs >> 28));
|
||||
|
||||
const uint sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7);
|
||||
const uint sb = (sign7 | (bitCount(sign7) << 7)) >> (idx & 0x6u);
|
||||
|
||||
const uint grid = iq3xxs_grid[qs];
|
||||
const u8vec4 g = unpack8(grid);
|
||||
|
||||
return f16vec4(
|
||||
db * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0),
|
||||
db * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0),
|
||||
db * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0),
|
||||
db * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0));
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_IQ3_S)
|
||||
@@ -635,6 +1104,30 @@ float16_t dequantFuncIQ3_S(const in decodeBufIQ3_S bl, const in uint blockCoords
|
||||
|
||||
return float16_t(v[idx & 1]);
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncIQ3_S_v(const in decodeBufIQ3_S bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const uint idx = coordInBlock[1];
|
||||
|
||||
const uint iqs = idx >> 2;
|
||||
const uint iqh = idx >> 5;
|
||||
|
||||
const float d = float(bl.block.d);
|
||||
const uint qs = bl.block.qs[iqs];
|
||||
const uint qh = bl.block.qh[iqh];
|
||||
const uint sb = uint(bl.block.signs[iqs / 2]) >> (idx & 0x6u);
|
||||
const uint scale = bl.block.scales[iqs / 16];
|
||||
const float db = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf));
|
||||
|
||||
const uint grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)];
|
||||
const u8vec4 g = unpack8(grid);
|
||||
|
||||
return f16vec4(
|
||||
db * float(g.x) * ((sb & 1u) != 0u ? -1.0 : 1.0),
|
||||
db * float(g.y) * ((sb & 2u) != 0u ? -1.0 : 1.0),
|
||||
db * float(g.z) * ((sb & 4u) != 0u ? -1.0 : 1.0),
|
||||
db * float(g.w) * ((sb & 8u) != 0u ? -1.0 : 1.0));
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_IQ4_XS)
|
||||
@@ -642,6 +1135,10 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4
|
||||
block_iq4_xs block;
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufIQ4_XS_packed32 {
|
||||
block_iq4_xs_packed32 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncIQ4_XS(const in decodeBufIQ4_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const float16_t d = bl.block.d;
|
||||
@@ -657,6 +1154,30 @@ float16_t dequantFuncIQ4_XS(const in decodeBufIQ4_XS bl, const in uint blockCoor
|
||||
float16_t ret = d * float16_t(int(sl | (sh << 4)) - 32) * float16_t(kvalues_iq4nl[q]);
|
||||
return ret;
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncIQ4_XS_v(const in decodeBufIQ4_XS bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufIQ4_XS_packed32 bl32 = decodeBufIQ4_XS_packed32(bl);
|
||||
const float16_t d = bl.block.d;
|
||||
const uint idx = coordInBlock[1];
|
||||
|
||||
const uint ib32 = idx >> 5; // 0..7
|
||||
const uint sl = (bl32.block.scales_l >> (4 * ib32)) & 0xF;
|
||||
const uint sh = (uint(bl32.block.scales_h) >> (2 * ib32)) & 0x3;
|
||||
const uint qshift = (idx & 0x10) >> 2; // {0, 4}
|
||||
const uint qs_w = 4 * ib32 + ((idx & 0xC) >> 2); // iqs / 4, in [0,32)
|
||||
|
||||
const float16_t dl = d * float16_t(int(sl | (sh << 4)) - 32);
|
||||
|
||||
const uint qsw = bl32.block.qs[qs_w];
|
||||
const u8vec4 qv = unpack8((qsw >> qshift) & 0x0F0F0F0Fu);
|
||||
const vec4 ret = vec4(
|
||||
float(kvalues_iq4nl[qv.x]),
|
||||
float(kvalues_iq4nl[qv.y]),
|
||||
float(kvalues_iq4nl[qv.z]),
|
||||
float(kvalues_iq4nl[qv.w])) * float(dl);
|
||||
return f16vec4(ret);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_IQ4_NL)
|
||||
@@ -664,6 +1185,10 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4
|
||||
block_iq4_nl block;
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_NL_packed16 {
|
||||
block_iq4_nl_packed16 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const float16_t d = bl.block.d;
|
||||
@@ -676,6 +1201,24 @@ float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoor
|
||||
float16_t ret = float16_t(kvalues_iq4nl[qs]) * d;
|
||||
return ret;
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncIQ4_NL_v(const in decodeBufIQ4_NL bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufIQ4_NL_packed16 bl16 = decodeBufIQ4_NL_packed16(bl);
|
||||
const float16_t d = bl.block.d;
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint shift = (idx & 0x10) >> 2; // 0 or 4
|
||||
const uint qs_i = (idx & 0xC) >> 1; // packed16 word index, in {0,2,4,6}
|
||||
const uint qsw = uint32_t(bl16.block.qs[qs_i ])
|
||||
| (uint32_t(bl16.block.qs[qs_i + 1u]) << 16);
|
||||
// shift in {0,4}: per-byte mask 0x0F isolates the wanted nibble in each byte.
|
||||
const u8vec4 q = unpack8((qsw >> shift) & 0x0F0F0F0Fu);
|
||||
return f16vec4(
|
||||
float(d) * float(kvalues_iq4nl[q.x]),
|
||||
float(d) * float(kvalues_iq4nl[q.y]),
|
||||
float(d) * float(kvalues_iq4nl[q.z]),
|
||||
float(d) * float(kvalues_iq4nl[q.w]));
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_MXFP4)
|
||||
@@ -695,6 +1238,26 @@ float16_t dequantFuncMXFP4(const in decodeBufMXFP4 bl, const in uint blockCoords
|
||||
float16_t ret = float16_t(kvalues_mxfp4[qs] * d * 0.5);
|
||||
return ret;
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncMXFP4_v(const in decodeBufMXFP4 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const float d = e8m0_to_fp32(bl.block.e);
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint iqs = idx & 0xF;
|
||||
const uint shift = (idx & 0x10) >> 2;
|
||||
uvec4 qv = uvec4(
|
||||
uint(bl.block.qs[iqs]),
|
||||
uint(bl.block.qs[iqs + 1u]),
|
||||
uint(bl.block.qs[iqs + 2u]),
|
||||
uint(bl.block.qs[iqs + 3u]));
|
||||
qv = (qv >> shift) & 0xFu;
|
||||
const vec4 ret = vec4(
|
||||
float(kvalues_mxfp4[qv.x]),
|
||||
float(kvalues_mxfp4[qv.y]),
|
||||
float(kvalues_mxfp4[qv.z]),
|
||||
float(kvalues_mxfp4[qv.w])) * d * 0.5f;
|
||||
return f16vec4(ret);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_NVFP4)
|
||||
@@ -702,6 +1265,10 @@ layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufNVF
|
||||
block_nvfp4 block;
|
||||
};
|
||||
|
||||
layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufNVFP4_packed32 {
|
||||
block_nvfp4_packed32 block;
|
||||
};
|
||||
|
||||
float16_t dequantFuncNVFP4(const in decodeBufNVFP4 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
const uint idx = coordInBlock[1];
|
||||
@@ -713,56 +1280,97 @@ float16_t dequantFuncNVFP4(const in decodeBufNVFP4 bl, const in uint blockCoords
|
||||
qs = (qs >> shift) & 0xF;
|
||||
return float16_t(kvalues_mxfp4[qs] * d * 0.5);
|
||||
}
|
||||
|
||||
f16vec4 dequantFuncNVFP4_v(const in decodeBufNVFP4 bl, const in uint blockCoords[2], const in uint coordInBlock[2])
|
||||
{
|
||||
decodeBufNVFP4_packed32 bl32 = decodeBufNVFP4_packed32(bl);
|
||||
const uint idx = coordInBlock[1];
|
||||
const uint sub = idx >> 4;
|
||||
const uint qs_w = ((idx & 0x30) >> 3) + ((idx & 0x4u) >> 2); // iqs / 4, in [0,8)
|
||||
const uint shift = (idx & 0x8) >> 1;
|
||||
const float d = ue4m3_to_fp32(bl.block.d[sub]);
|
||||
|
||||
const uint qsw = uint32_t(bl32.block.qs[qs_w]);
|
||||
const u8vec4 qv = unpack8((qsw >> shift) & 0x0F0F0F0Fu);
|
||||
const vec4 ret = vec4(
|
||||
float(kvalues_mxfp4[qv.x]),
|
||||
float(kvalues_mxfp4[qv.y]),
|
||||
float(kvalues_mxfp4[qv.z]),
|
||||
float(kvalues_mxfp4[qv.w])) * d * 0.5f;
|
||||
return f16vec4(ret);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q1_0)
|
||||
#define dequantFuncA dequantFuncQ1_0
|
||||
#define dequantFuncA_v dequantFuncQ1_0_v
|
||||
#elif defined(DATA_A_Q4_0)
|
||||
#define dequantFuncA dequantFuncQ4_0
|
||||
#define dequantFuncA_v dequantFuncQ4_0_v
|
||||
#elif defined(DATA_A_Q4_1)
|
||||
#define dequantFuncA dequantFuncQ4_1
|
||||
#define dequantFuncA_v dequantFuncQ4_1_v
|
||||
#elif defined(DATA_A_Q5_0)
|
||||
#define dequantFuncA dequantFuncQ5_0
|
||||
#define dequantFuncA_v dequantFuncQ5_0_v
|
||||
#elif defined(DATA_A_Q5_1)
|
||||
#define dequantFuncA dequantFuncQ5_1
|
||||
#define dequantFuncA_v dequantFuncQ5_1_v
|
||||
#elif defined(DATA_A_Q8_0)
|
||||
#define dequantFuncA dequantFuncQ8_0
|
||||
#define dequantFuncA_v dequantFuncQ8_0_v
|
||||
#elif defined(DATA_A_Q2_K)
|
||||
#define dequantFuncA dequantFuncQ2_K
|
||||
#define dequantFuncA_v dequantFuncQ2_K_v
|
||||
#elif defined(DATA_A_Q3_K)
|
||||
#define dequantFuncA dequantFuncQ3_K
|
||||
#define dequantFuncA_v dequantFuncQ3_K_v
|
||||
#elif defined(DATA_A_Q4_K)
|
||||
#define dequantFuncA dequantFuncQ4_K
|
||||
#define dequantFuncA_v dequantFuncQ4_K_v
|
||||
#define fetch_scales fetch_scalesQ4_K
|
||||
#define store_scales store_scalesQ4_K
|
||||
#elif defined(DATA_A_Q5_K)
|
||||
#define dequantFuncA dequantFuncQ5_K
|
||||
#define dequantFuncA_v dequantFuncQ5_K_v
|
||||
#define fetch_scales fetch_scalesQ5_K
|
||||
#define store_scales store_scalesQ4_K
|
||||
#elif defined(DATA_A_Q6_K)
|
||||
#define dequantFuncA dequantFuncQ6_K
|
||||
#define dequantFuncA_v dequantFuncQ6_K_v
|
||||
#elif defined(DATA_A_IQ1_S)
|
||||
#define dequantFuncA dequantFuncIQ1_S
|
||||
#define dequantFuncA_v dequantFuncIQ1_S_v
|
||||
#elif defined(DATA_A_IQ1_M)
|
||||
#define dequantFuncA dequantFuncIQ1_M
|
||||
#define dequantFuncA_v dequantFuncIQ1_M_v
|
||||
#elif defined(DATA_A_IQ2_XXS)
|
||||
#define dequantFuncA dequantFuncIQ2_XXS
|
||||
#define dequantFuncA_v dequantFuncIQ2_XXS_v
|
||||
#elif defined(DATA_A_IQ2_XS)
|
||||
#define dequantFuncA dequantFuncIQ2_XS
|
||||
#define dequantFuncA_v dequantFuncIQ2_XS_v
|
||||
#elif defined(DATA_A_IQ2_S)
|
||||
#define dequantFuncA dequantFuncIQ2_S
|
||||
#define dequantFuncA_v dequantFuncIQ2_S_v
|
||||
#elif defined(DATA_A_IQ3_XXS)
|
||||
#define dequantFuncA dequantFuncIQ3_XXS
|
||||
#define dequantFuncA_v dequantFuncIQ3_XXS_v
|
||||
#elif defined(DATA_A_IQ3_S)
|
||||
#define dequantFuncA dequantFuncIQ3_S
|
||||
#define dequantFuncA_v dequantFuncIQ3_S_v
|
||||
#elif defined(DATA_A_IQ4_XS)
|
||||
#define dequantFuncA dequantFuncIQ4_XS
|
||||
#define dequantFuncA_v dequantFuncIQ4_XS_v
|
||||
#elif defined(DATA_A_IQ4_NL)
|
||||
#define dequantFuncA dequantFuncIQ4_NL
|
||||
#define dequantFuncA_v dequantFuncIQ4_NL_v
|
||||
#elif defined(DATA_A_MXFP4)
|
||||
#define dequantFuncA dequantFuncMXFP4
|
||||
#define dequantFuncA_v dequantFuncMXFP4_v
|
||||
#elif defined(DATA_A_NVFP4)
|
||||
#define dequantFuncA dequantFuncNVFP4
|
||||
#define dequantFuncA_v dequantFuncNVFP4_v
|
||||
#elif defined(DATA_A_F32)
|
||||
#define dequantFuncA dequantFuncF32
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
#version 460
|
||||
|
||||
#extension GL_NV_cooperative_matrix_decode_vector : require
|
||||
|
||||
void main()
|
||||
{
|
||||
}
|
||||
@@ -11,6 +11,9 @@
|
||||
#extension GL_KHR_memory_scope_semantics : enable
|
||||
#extension GL_KHR_cooperative_matrix : enable
|
||||
#extension GL_NV_cooperative_matrix2 : enable
|
||||
#ifdef GL_NV_cooperative_matrix_decode_vector
|
||||
#extension GL_NV_cooperative_matrix_decode_vector : enable
|
||||
#endif
|
||||
#extension GL_EXT_buffer_reference : enable
|
||||
#extension GL_KHR_shader_subgroup_ballot : enable
|
||||
#extension GL_KHR_shader_subgroup_vote : enable
|
||||
@@ -54,6 +57,41 @@ float16_t faDecodeV(const decodeBufFA_V bl_in, const uint blockCoords[2], const
|
||||
}
|
||||
}
|
||||
|
||||
// V=4 vector decode for K/V; dispatches to per-format _v decoders.
|
||||
f16vec4 faDecodeKVector(const decodeBufFA_K bl_in, const uint blockCoords[2], const uint coordInBlock[2]) {
|
||||
switch (FaTypeK) {
|
||||
case 0u: return f16vec4(decodeBufF32(bl_in).block);
|
||||
case 2u: return dequantFuncQ4_0_v(decodeBufQ4_0(bl_in), blockCoords, coordInBlock);
|
||||
case 3u: return dequantFuncQ4_1_v(decodeBufQ4_1(bl_in), blockCoords, coordInBlock);
|
||||
case 6u: return dequantFuncQ5_0_v(decodeBufQ5_0(bl_in), blockCoords, coordInBlock);
|
||||
case 7u: return dequantFuncQ5_1_v(decodeBufQ5_1(bl_in), blockCoords, coordInBlock);
|
||||
case 8u: return dequantFuncQ8_0_v(decodeBufQ8_0(bl_in), blockCoords, coordInBlock);
|
||||
case 41u: return dequantFuncQ1_0_v(decodeBufQ1_0(bl_in), blockCoords, coordInBlock);
|
||||
default: return f16vec4(0);
|
||||
}
|
||||
}
|
||||
|
||||
f16vec4 faDecodeVVector(const decodeBufFA_V bl_in, const uint blockCoords[2], const uint coordInBlock[2]) {
|
||||
switch (FaTypeV) {
|
||||
case 0u: return f16vec4(decodeBufF32(bl_in).block);
|
||||
case 2u: return dequantFuncQ4_0_v(decodeBufQ4_0(bl_in), blockCoords, coordInBlock);
|
||||
case 3u: return dequantFuncQ4_1_v(decodeBufQ4_1(bl_in), blockCoords, coordInBlock);
|
||||
case 6u: return dequantFuncQ5_0_v(decodeBufQ5_0(bl_in), blockCoords, coordInBlock);
|
||||
case 7u: return dequantFuncQ5_1_v(decodeBufQ5_1(bl_in), blockCoords, coordInBlock);
|
||||
case 8u: return dequantFuncQ8_0_v(decodeBufQ8_0(bl_in), blockCoords, coordInBlock);
|
||||
case 41u: return dequantFuncQ1_0_v(decodeBufQ1_0(bl_in), blockCoords, coordInBlock);
|
||||
default: return f16vec4(0);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GL_NV_cooperative_matrix_decode_vector
|
||||
#define FADECODEK , faDecodeK, faDecodeKVector
|
||||
#define FADECODEV , faDecodeV, faDecodeVVector
|
||||
#else
|
||||
#define FADECODEK , faDecodeK
|
||||
#define FADECODEV , faDecodeV
|
||||
#endif
|
||||
|
||||
layout (binding = 0) readonly buffer Q {uint8_t data_q[];};
|
||||
layout (binding = 1) readonly buffer K {uint8_t data_k[];};
|
||||
layout (binding = 2) readonly buffer V {uint8_t data_v[];};
|
||||
@@ -259,7 +297,7 @@ void main() {
|
||||
// F16: bs_k==1 (direct load). F32: bs_k==4 (vec4 / dequantFuncF32). Q4/Q8 family: bs_k==32. Q1_0: bs_k==128.
|
||||
const bool k_use_decode = (bs_k > 1u);
|
||||
if (k_use_decode) {
|
||||
coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose, faDecodeK);
|
||||
coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose FADECODEK);
|
||||
} else {
|
||||
coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose);
|
||||
}
|
||||
@@ -325,7 +363,7 @@ void main() {
|
||||
uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23;
|
||||
const bool v_use_decode = (bs_v > 1u);
|
||||
if (v_use_decode) {
|
||||
coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad), faDecodeV);
|
||||
coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad) FADECODEV);
|
||||
} else {
|
||||
coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad));
|
||||
}
|
||||
|
||||
69
ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp
Normal file
69
ggml/src/ggml-vulkan/vulkan-shaders/fwht.comp
Normal file
@@ -0,0 +1,69 @@
|
||||
#version 450
|
||||
|
||||
#extension GL_EXT_control_flow_attributes : require
|
||||
#extension GL_KHR_shader_subgroup_basic : enable
|
||||
#extension GL_KHR_shader_subgroup_shuffle : enable
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
|
||||
|
||||
layout(constant_id = 0) const uint WARP_SIZE = 32;
|
||||
layout(constant_id = 1) const uint N = 128;
|
||||
|
||||
layout(push_constant) uniform parameter
|
||||
{
|
||||
uint n_rows;
|
||||
uint src_offset;
|
||||
uint dst_offset;
|
||||
float scale;
|
||||
};
|
||||
|
||||
layout(binding = 0, std430) readonly buffer A { float data_a[]; };
|
||||
layout(binding = 1, std430) writeonly buffer D { float data_d[]; };
|
||||
|
||||
const uint EL_W = N / WARP_SIZE;
|
||||
|
||||
void main() {
|
||||
const uint lane = gl_SubgroupInvocationID;
|
||||
for (uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_SubgroupID;
|
||||
row < n_rows;
|
||||
row += gl_NumWorkGroups.x * gl_WorkGroupSize.y) {
|
||||
const uint row_offset = row * N;
|
||||
|
||||
float reg[EL_W];
|
||||
|
||||
[[unroll]]
|
||||
for (uint i = 0; i < EL_W; ++i) {
|
||||
reg[i] = data_a[src_offset + row_offset + i * WARP_SIZE + lane] * scale;
|
||||
}
|
||||
|
||||
[[unroll]]
|
||||
for (uint h = 1; h < WARP_SIZE; h <<= 1) {
|
||||
[[unroll]]
|
||||
for (uint j = 0; j < EL_W; ++j) {
|
||||
const float val = reg[j];
|
||||
const float val2 = subgroupShuffleXor(val, h);
|
||||
reg[j] = (lane & h) == 0 ? val + val2 : val2 - val;
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]]
|
||||
for (uint h = WARP_SIZE; h < N; h <<= 1) {
|
||||
const uint step = h / WARP_SIZE;
|
||||
[[unroll]]
|
||||
for (uint j = 0; j < EL_W; j += 2 * step) {
|
||||
[[unroll]]
|
||||
for (uint k = 0; k < step; ++k) {
|
||||
const float x = reg[j + k];
|
||||
const float y = reg[j + k + step];
|
||||
reg[j + k] = x + y;
|
||||
reg[j + k + step] = x - y;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]]
|
||||
for (uint i = 0; i < EL_W; ++i) {
|
||||
data_d[dst_offset + row_offset + i * WARP_SIZE + lane] = reg[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -10,12 +10,38 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
#if !defined(DATA_A_F32) && !defined(DATA_A_F16) && !defined(DATA_A_BF16)
|
||||
#define K_PER_ITER 8
|
||||
#else
|
||||
#define K_PER_ITER 2
|
||||
#define K_PER_ITER 4
|
||||
#endif
|
||||
|
||||
|
||||
uint a_offset, b_offset, d_offset, y_offset;
|
||||
|
||||
vec4 load_b(const uint j, const uint iybs, const uint iqs, const bool lastiter, out bool OOB_y, out bool OOB_z, out bool OOB_w) {
|
||||
// Check if the latter elements are OOB, and don't fetch B or accumulate it.
|
||||
OOB_y = lastiter && (iybs + iqs + y_offset >= p.ncols);
|
||||
OOB_z = lastiter && (iybs + iqs + y_offset*2 >= p.ncols);
|
||||
OOB_w = lastiter && (iybs + iqs + y_offset*3 >= p.ncols);
|
||||
|
||||
if (!OOB_w) {
|
||||
return vec4(FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]),
|
||||
FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]),
|
||||
FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset*2]),
|
||||
FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset*3]));
|
||||
} else if (!OOB_z) {
|
||||
return vec4(FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]),
|
||||
FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]),
|
||||
FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset*2]),
|
||||
0);
|
||||
} else if (!OOB_y) {
|
||||
return vec4(FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]),
|
||||
FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]),
|
||||
0, 0);
|
||||
} else {
|
||||
return vec4(FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]),
|
||||
0, 0, 0);
|
||||
}
|
||||
}
|
||||
|
||||
void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter)
|
||||
{
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
@@ -25,6 +51,8 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const
|
||||
|
||||
#if K_PER_ITER == 8
|
||||
#if QUANT_R == 2
|
||||
// Note that we end up fetching bogus elements here, but its fine as they'll be
|
||||
// within an accessible block.
|
||||
const vec4 bv02 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4]);
|
||||
const vec4 bv13 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs + y_offset) / 4]);
|
||||
const vec4 bv0 = vec4(bv02.x, bv13.x, bv02.y, bv13.y);
|
||||
@@ -34,18 +62,11 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const
|
||||
const vec4 bv1 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4 + 1]);
|
||||
#endif
|
||||
#else
|
||||
// Check if the second of the pair of elements is OOB, and don't fetch B or
|
||||
// accumulate it. We still fetch a pair of elements for A, which is fine for
|
||||
// quantized formats since they'll be within the same block. We should
|
||||
// probably skip fetching the second element for F16/F32, but as of now we
|
||||
// still do.
|
||||
const bool OOB = lastiter && (iybs + iqs + y_offset >= p.ncols);
|
||||
bool OOB_y;
|
||||
bool OOB_z;
|
||||
bool OOB_w;
|
||||
|
||||
FLOAT_TYPE b0 = 0, b1 = 0;
|
||||
b0 = FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]);
|
||||
if (!OOB) {
|
||||
b1 = FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]);
|
||||
}
|
||||
const vec4 b = load_b(j, iybs, iqs, lastiter, OOB_y, OOB_z, OOB_w);
|
||||
#endif
|
||||
uint ibi = first_row*p.ncols;
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
@@ -71,22 +92,60 @@ void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const
|
||||
|
||||
temp[j][n] += rowtmp;
|
||||
#else
|
||||
const vec2 v = dequantize(ib, iqs, a_offset);
|
||||
|
||||
// matrix multiplication
|
||||
temp[j][n] = fma(FLOAT_TYPE(v.x), b0, temp[j][n]);
|
||||
if (!OOB) {
|
||||
temp[j][n] = fma(FLOAT_TYPE(v.y), b1, temp[j][n]);
|
||||
if (!OOB_w) {
|
||||
const vec4 v = dequantize4(ib, iqs, a_offset);
|
||||
temp[j][n] += dot(v, b);
|
||||
} else if (!OOB_z) {
|
||||
const vec2 v0 = dequantize(ib, iqs, a_offset);
|
||||
const FLOAT_TYPE v1 = dequantize1(ib + 2/QUANT_R, iqs, a_offset);
|
||||
const vec3 v = vec3(v0.x, v0.y, v1);
|
||||
const vec3 b0 = vec3(b.x, b.y, b.z);
|
||||
temp[j][n] += dot(v, b0);
|
||||
} else if (!OOB_y) {
|
||||
const vec2 v0 = dequantize(ib, iqs, a_offset);
|
||||
const vec2 b0 = vec2(b.x, b.y);
|
||||
temp[j][n] += dot(v0, b0);
|
||||
} else {
|
||||
const FLOAT_TYPE v = dequantize1(ib, iqs, a_offset);
|
||||
temp[j][n] = fma(v, b.x, temp[j][n]);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(DATA_A_F32) || defined(DATA_A_F16) || defined(DATA_A_BF16)
|
||||
void iter_aligned_nonquant(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i)
|
||||
{
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
const uint col = i*BLOCK_SIZE + K_PER_ITER*tid;
|
||||
const uint iqs = 0; // quant index
|
||||
const uint iybs = col; // y block start index
|
||||
|
||||
const vec4 b = data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4];
|
||||
|
||||
uint ibi = first_row*p.ncols;
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib = (ibi + col)/QUANT_K; // block index
|
||||
ibi += p.ncols;
|
||||
|
||||
const vec4 v = dequantize4_2aligned(ib, iqs, a_offset);
|
||||
|
||||
// matrix multiplication
|
||||
temp[j][n] += dot(v, b);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
const bool is_aligned_nonquant =
|
||||
p.batch_stride_b % 4 == 0 && b_offset % 4 == 0 &&
|
||||
p.ncols % 4 == 0 && BLOCK_SIZE % 4 == 0 &&
|
||||
K_PER_ITER == 4;
|
||||
|
||||
y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
|
||||
|
||||
@@ -105,17 +164,26 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
int unroll_count = 4;
|
||||
uint unrolled_iters = num_iters & ~(unroll_count - 1);
|
||||
|
||||
#if K_PER_ITER == 2
|
||||
uint i = 0;
|
||||
|
||||
#if K_PER_ITER == 4
|
||||
// If the K dimension is odd, we need lastiter==true on the last iteration
|
||||
// so OOB is computed correctly. Skip some unrolling to make that happen.
|
||||
if ((p.ncols & 1) != 0 &&
|
||||
if ((p.ncols & 3) != 0 &&
|
||||
unrolled_iters == num_iters &&
|
||||
unrolled_iters > 0) {
|
||||
unrolled_iters -= unroll_count;
|
||||
}
|
||||
if (is_aligned_nonquant) {
|
||||
while (i < unrolled_iters) {
|
||||
// Manually partially unroll the loop
|
||||
[[unroll]] for (uint k = 0; k < unroll_count; ++k) {
|
||||
iter_aligned_nonquant(temp, first_row, num_rows, tid, i*K_PER_ITER);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#endif
|
||||
|
||||
uint i = 0;
|
||||
while (i < unrolled_iters) {
|
||||
// Manually partially unroll the loop
|
||||
[[unroll]] for (uint k = 0; k < unroll_count; ++k) {
|
||||
@@ -123,18 +191,30 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
i++;
|
||||
}
|
||||
}
|
||||
#if K_PER_ITER == 4
|
||||
}
|
||||
#endif
|
||||
|
||||
unroll_count = 2;
|
||||
unrolled_iters = num_iters & ~(unroll_count - 1);
|
||||
|
||||
#if K_PER_ITER == 2
|
||||
if ((p.ncols & 1) != 0 &&
|
||||
#if K_PER_ITER == 4
|
||||
if ((p.ncols & 3) != 0 &&
|
||||
unrolled_iters == num_iters &&
|
||||
unrolled_iters > 0) {
|
||||
unrolled_iters -= unroll_count;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (is_aligned_nonquant) {
|
||||
while (i < unrolled_iters && is_aligned_nonquant) {
|
||||
// Manually partially unroll the loop
|
||||
[[unroll]] for (uint k = 0; k < unroll_count; ++k) {
|
||||
iter_aligned_nonquant(temp, first_row, num_rows, tid, i*K_PER_ITER);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
#endif
|
||||
while (i < unrolled_iters) {
|
||||
// Manually partially unroll the loop
|
||||
[[unroll]] for (uint k = 0; k < unroll_count; ++k) {
|
||||
@@ -142,10 +222,25 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
i++;
|
||||
}
|
||||
}
|
||||
#if K_PER_ITER == 4
|
||||
}
|
||||
#endif
|
||||
|
||||
#if K_PER_ITER == 4
|
||||
if (is_aligned_nonquant) {
|
||||
while (i < num_iters) {
|
||||
iter_aligned_nonquant(temp, first_row, num_rows, tid, i*K_PER_ITER);
|
||||
i++;
|
||||
}
|
||||
} else {
|
||||
#endif
|
||||
while (i < num_iters) {
|
||||
iter(temp, first_row, num_rows, tid, i*K_PER_ITER, true);
|
||||
i++;
|
||||
}
|
||||
#if K_PER_ITER == 4
|
||||
}
|
||||
#endif
|
||||
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
@@ -164,6 +259,6 @@ void main() {
|
||||
if (first_row >= p.stride_d) {
|
||||
return;
|
||||
}
|
||||
compute_outputs(first_row, p.stride_d - first_row);
|
||||
compute_outputs(first_row, min(NUM_ROWS, p.stride_d - first_row));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -71,10 +71,12 @@ layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
|
||||
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
#if QUANT_K > 1
|
||||
#define DECODEFUNCA , dequantFuncA
|
||||
|
||||
#include "dequant_funcs_cm2.glsl"
|
||||
|
||||
#if defined(dequantFuncA_v) && defined(GL_NV_cooperative_matrix_decode_vector)
|
||||
#define DECODEFUNCA , dequantFuncA, dequantFuncA_v
|
||||
#else
|
||||
#define DECODEFUNCA , dequantFuncA
|
||||
#endif
|
||||
#else
|
||||
#define DECODEFUNCA
|
||||
#endif
|
||||
|
||||
@@ -31,6 +31,7 @@
|
||||
#else
|
||||
#define A_TYPE float16_t
|
||||
#endif
|
||||
#define A_TYPE_PACKED32 f16vec2
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_BF16)
|
||||
@@ -44,6 +45,7 @@
|
||||
#else
|
||||
#define A_TYPE uint16_t
|
||||
#endif
|
||||
#define A_TYPE_PACKED32 uint32_t
|
||||
#endif
|
||||
|
||||
#define QUANT_K_Q4_0 32
|
||||
@@ -1722,11 +1724,18 @@ struct block_nvfp4
|
||||
uint8_t qs[QUANT_K_NVFP4 / 2];
|
||||
};
|
||||
|
||||
struct block_nvfp4_packed32
|
||||
{
|
||||
uint32_t d[QUANT_K_NVFP4 / 16 / 4];
|
||||
uint32_t qs[QUANT_K_NVFP4 / 2 / 4];
|
||||
};
|
||||
|
||||
#if defined(DATA_A_NVFP4)
|
||||
#define QUANT_K QUANT_K_NVFP4
|
||||
#define QUANT_R QUANT_R_NVFP4
|
||||
#define QUANT_AUXF 1
|
||||
#define A_TYPE block_nvfp4
|
||||
#define A_TYPE_PACKED32 block_nvfp4_packed32
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_IQ4_NL) || defined(DATA_A_IQ4_XS)
|
||||
|
||||
@@ -798,9 +798,11 @@ void process_shaders() {
|
||||
|
||||
string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
string_to_spv("repeat_f32", "repeat.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("repeat_i32", "repeat.comp", {{"A_TYPE", "int32_t"}, {"D_TYPE", "int32_t"}});
|
||||
string_to_spv("repeat_back_f32", "repeat_back.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("repeat_i16", "repeat.comp", {{"A_TYPE", "int16_t"}, {"D_TYPE", "int16_t"}});
|
||||
|
||||
string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
|
||||
string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||
@@ -932,6 +934,7 @@ void process_shaders() {
|
||||
|
||||
string_to_spv("argmax_f32", "argmax.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "int"}}));
|
||||
string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("fwht_f32", "fwht.comp", {});
|
||||
string_to_spv("count_equal_i32", "count_equal.comp", merge_maps(base_dict, {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}}));
|
||||
string_to_spv("cumsum_f32", "cumsum.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("cumsum_multipass1_f32", "cumsum_multipass1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
@@ -984,8 +987,16 @@ void process_shaders() {
|
||||
string_to_spv(name + (unroll ? "_unroll" : ""), "conv2d_mm.comp", defines);
|
||||
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
if (unroll) {
|
||||
defines["COOPMAT2"] = "1";
|
||||
string_to_spv(name, "conv2d_mm.comp", defines, true, false, true);
|
||||
auto cm2_defines = defines;
|
||||
cm2_defines["COOPMAT2"] = "1";
|
||||
string_to_spv(name, "conv2d_mm.comp", cm2_defines, true, false, true);
|
||||
}
|
||||
#endif
|
||||
#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
if (unroll) {
|
||||
auto cm1_defines = defines;
|
||||
cm1_defines["COOPMAT"] = "1";
|
||||
string_to_spv(name, "conv2d_mm.comp", cm1_defines, true, true, false);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -52,7 +52,7 @@
|
||||
#define WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG 4
|
||||
#define WEBGPU_MUL_MAT_VEC_K_Q_OUTPUTS_PER_WG 4
|
||||
|
||||
// default size for legacy matrix multiplication
|
||||
// default size for reg-tile matrix multiplication
|
||||
#define WEBGPU_MUL_MAT_WG_SIZE 256
|
||||
|
||||
// Same hash combine function as in boost
|
||||
@@ -93,6 +93,8 @@ struct ggml_webgpu_shader_lib_context {
|
||||
uint32_t sg_mat_k = 0;
|
||||
uint32_t min_subgroup_size = 0;
|
||||
uint32_t max_subgroup_size = 0;
|
||||
bool supports_dot_product = false;
|
||||
std::string vendor;
|
||||
};
|
||||
|
||||
struct webgpu_pipeline {
|
||||
@@ -850,31 +852,15 @@ inline ggml_webgpu_flash_attn_decisions ggml_webgpu_flash_attn_get_decisions(
|
||||
|
||||
/** Matrix Multiplication **/
|
||||
|
||||
struct ggml_webgpu_legacy_mul_mat_pipeline_key {
|
||||
ggml_type src0_type;
|
||||
ggml_type src1_type;
|
||||
|
||||
bool operator==(const ggml_webgpu_legacy_mul_mat_pipeline_key & other) const {
|
||||
return src0_type == other.src0_type && src1_type == other.src1_type;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_webgpu_legacy_mul_mat_pipeline_key_hash {
|
||||
size_t operator()(const ggml_webgpu_legacy_mul_mat_pipeline_key & key) const {
|
||||
size_t seed = 0;
|
||||
ggml_webgpu_hash_combine(seed, key.src0_type);
|
||||
ggml_webgpu_hash_combine(seed, key.src1_type);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_webgpu_mul_mat_vec_pipeline_key {
|
||||
ggml_type src0_type;
|
||||
ggml_type src1_type;
|
||||
int vectorized;
|
||||
bool use_mmvq;
|
||||
|
||||
bool operator==(const ggml_webgpu_mul_mat_vec_pipeline_key & other) const {
|
||||
return src0_type == other.src0_type && src1_type == other.src1_type && vectorized == other.vectorized;
|
||||
return src0_type == other.src0_type && src1_type == other.src1_type && vectorized == other.vectorized &&
|
||||
use_mmvq == other.use_mmvq;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -884,6 +870,7 @@ struct ggml_webgpu_mul_mat_vec_pipeline_key_hash {
|
||||
ggml_webgpu_hash_combine(seed, key.src0_type);
|
||||
ggml_webgpu_hash_combine(seed, key.src1_type);
|
||||
ggml_webgpu_hash_combine(seed, key.vectorized);
|
||||
ggml_webgpu_hash_combine(seed, key.use_mmvq);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
@@ -894,6 +881,20 @@ struct ggml_webgpu_mul_mat_vec_shader_decisions {
|
||||
uint32_t vec_size;
|
||||
};
|
||||
|
||||
struct ggml_webgpu_quantize_q8_pipeline_key {
|
||||
ggml_type src0_type;
|
||||
|
||||
bool operator==(const ggml_webgpu_quantize_q8_pipeline_key & other) const { return src0_type == other.src0_type; }
|
||||
};
|
||||
|
||||
struct ggml_webgpu_quantize_q8_pipeline_key_hash {
|
||||
size_t operator()(const ggml_webgpu_quantize_q8_pipeline_key & key) const {
|
||||
size_t seed = 0;
|
||||
ggml_webgpu_hash_combine(seed, key.src0_type);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_webgpu_mul_mat_pipeline_key {
|
||||
ggml_type src0_type;
|
||||
ggml_type src1_type;
|
||||
@@ -1051,6 +1052,36 @@ struct ggml_webgpu_soft_max_pipeline_key_hash {
|
||||
}
|
||||
};
|
||||
|
||||
/** MMVQ **/
|
||||
|
||||
inline bool ggml_webgpu_can_use_mmvq(const ggml_tensor * src0,
|
||||
const ggml_tensor * src1,
|
||||
bool supports_dot_product,
|
||||
const std::string & vendor) {
|
||||
if (src1->ne[1] == 1) {
|
||||
bool supports_dp4a = vendor == "amd" || vendor == "intel" || vendor == "nvidia";
|
||||
if (supports_dp4a && supports_dot_product) {
|
||||
switch (src1->type) {
|
||||
case GGML_TYPE_F32:
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
return src0->ne[0] % 4 == 0;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
class ggml_webgpu_shader_lib {
|
||||
wgpu::Device device;
|
||||
pre_wgsl::Preprocessor preprocessor;
|
||||
@@ -1099,14 +1130,12 @@ class ggml_webgpu_shader_lib {
|
||||
webgpu_pipeline,
|
||||
ggml_webgpu_flash_attn_blk_pipeline_key_hash>
|
||||
flash_attn_blk_pipelines;
|
||||
std::unordered_map<ggml_webgpu_legacy_mul_mat_pipeline_key,
|
||||
webgpu_pipeline,
|
||||
ggml_webgpu_legacy_mul_mat_pipeline_key_hash>
|
||||
mul_mat_legacy_pipelines; // legacy mul_mat (non-subgroup/non-regtile/non-vec)
|
||||
std::unordered_map<ggml_webgpu_mul_mat_vec_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_vec_pipeline_key_hash>
|
||||
mul_mat_vec_pipelines; // fast mat-vec (n==1)
|
||||
std::unordered_map<ggml_webgpu_mul_mat_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_pipeline_key_hash>
|
||||
mul_mat_fast_pipelines; // fast mat-mat (reg-tile or subgroup)
|
||||
std::unordered_map<ggml_webgpu_quantize_q8_pipeline_key, webgpu_pipeline, ggml_webgpu_quantize_q8_pipeline_key_hash>
|
||||
quantize_q8_pipelines;
|
||||
std::unordered_map<int, webgpu_pipeline> mul_mat_id_gather_pipelines; // key is fixed
|
||||
std::unordered_map<ggml_webgpu_mul_mat_id_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_id_pipeline_key_hash>
|
||||
mul_mat_id_pipelines; // src0_type/src1_type
|
||||
@@ -1631,7 +1660,7 @@ class ggml_webgpu_shader_lib {
|
||||
key.type = context.dst->type;
|
||||
key.d_state = (int) context.src0->ne[0];
|
||||
key.xbc_overlap = ggml_webgpu_tensor_overlap(context.src1, context.src4) &&
|
||||
ggml_webgpu_tensor_overlap(context.src1, context.src5);
|
||||
ggml_webgpu_tensor_overlap(context.src1, context.src5);
|
||||
|
||||
auto it = ssm_scan_pipelines.find(key);
|
||||
if (it != ssm_scan_pipelines.end()) {
|
||||
@@ -1744,6 +1773,44 @@ class ggml_webgpu_shader_lib {
|
||||
return pad_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_quantize_q8_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_quantize_q8_pipeline_key key = {};
|
||||
key.src0_type = context.src0->type;
|
||||
|
||||
auto it = quantize_q8_pipelines.find(key);
|
||||
if (it != quantize_q8_pipelines.end()) {
|
||||
return it->second;
|
||||
}
|
||||
const char * shader_src = wgsl_quantize_q8;
|
||||
std::vector<std::string> defines;
|
||||
std::string variant = "quantize_q8";
|
||||
|
||||
uint32_t wg_size = WEBGPU_MUL_MAT_VEC_WG_SIZE;
|
||||
|
||||
defines.push_back("SRC1_INNER_TYPE=f32");
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
|
||||
|
||||
const struct ggml_type_traits * src0_traits = ggml_get_type_traits(context.src0->type);
|
||||
std::string src0_name = src0_traits->type_name;
|
||||
std::string type_upper = src0_name;
|
||||
variant += "_" + src0_name;
|
||||
std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper);
|
||||
|
||||
defines.push_back("MUL_ACC_" + type_upper);
|
||||
defines.push_back("Q8_1_T");
|
||||
|
||||
defines.push_back(context.supports_subgroups ? "USE_SUBGROUP_REDUCTION" : "USE_WORKGROUP_REDUCTION");
|
||||
variant += context.supports_subgroups ? "_sg_reduce" : "_wg_reduce";
|
||||
|
||||
auto processed = preprocessor.preprocess(shader_src, defines);
|
||||
auto decisions = std::make_shared<ggml_webgpu_generic_shader_decisions>();
|
||||
decisions->wg_size = wg_size;
|
||||
webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant);
|
||||
pipeline.context = decisions;
|
||||
quantize_q8_pipelines[key] = pipeline;
|
||||
return quantize_q8_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_mul_mat_vec_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_mul_mat_vec_pipeline_key key = {};
|
||||
key.src0_type = context.src0->type;
|
||||
@@ -1752,6 +1819,8 @@ class ggml_webgpu_shader_lib {
|
||||
(context.src0->type == GGML_TYPE_F32 || context.src0->type == GGML_TYPE_F16)) ?
|
||||
1 :
|
||||
0;
|
||||
key.use_mmvq =
|
||||
ggml_webgpu_can_use_mmvq(context.src0, context.src1, context.supports_dot_product, context.vendor);
|
||||
|
||||
auto it = mul_mat_vec_pipelines.find(key);
|
||||
if (it != mul_mat_vec_pipelines.end()) {
|
||||
@@ -1788,6 +1857,19 @@ class ggml_webgpu_shader_lib {
|
||||
defines.push_back("U32_DEQUANT_HELPERS");
|
||||
defines.push_back("SRC0_INNER_TYPE=u32");
|
||||
switch (context.src0->type) {
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
if (key.use_mmvq) {
|
||||
defines.push_back("LEGACY_QUANTS");
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
if (key.use_mmvq) {
|
||||
defines.push_back("K_QUANTS");
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ1_M:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
@@ -1840,6 +1922,11 @@ class ggml_webgpu_shader_lib {
|
||||
outputs_per_wg = WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG;
|
||||
}
|
||||
|
||||
if (key.use_mmvq) {
|
||||
defines.push_back("MMVQ");
|
||||
defines.push_back("Q8_1_T");
|
||||
}
|
||||
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
|
||||
defines.push_back(std::string("OUTPUTS_PER_WG=") + std::to_string(outputs_per_wg));
|
||||
defines.push_back(context.supports_subgroups ? "USE_SUBGROUP_REDUCTION" : "USE_WORKGROUP_REDUCTION");
|
||||
@@ -2018,100 +2105,6 @@ class ggml_webgpu_shader_lib {
|
||||
return mul_mat_fast_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_mul_mat_legacy_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_legacy_mul_mat_pipeline_key key = {};
|
||||
key.src0_type = context.src0->type;
|
||||
key.src1_type = context.src1->type;
|
||||
|
||||
auto it = mul_mat_legacy_pipelines.find(key);
|
||||
if (it != mul_mat_legacy_pipelines.end()) {
|
||||
return it->second;
|
||||
}
|
||||
|
||||
std::vector<std::string> defines;
|
||||
std::string variant = "mul_mat";
|
||||
|
||||
switch (context.src1->type) {
|
||||
case GGML_TYPE_F32:
|
||||
defines.push_back("SRC1_TYPE=f32");
|
||||
variant += "_f32";
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
defines.push_back("SRC1_TYPE=f16");
|
||||
variant += "_f16";
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("Unsupported src1 type for mul_mat legacy shader");
|
||||
}
|
||||
|
||||
const struct ggml_type_traits * src0_traits = ggml_get_type_traits(context.src0->type);
|
||||
const char * src0_name = src0_traits->type_name;
|
||||
|
||||
switch (context.src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
defines.push_back("SRC0_TYPE=f32");
|
||||
defines.push_back("FLOAT");
|
||||
variant += "_f32";
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
defines.push_back("SRC0_TYPE=f16");
|
||||
defines.push_back("FLOAT");
|
||||
variant += "_f16";
|
||||
break;
|
||||
default:
|
||||
{
|
||||
std::string type_upper = src0_name;
|
||||
std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper);
|
||||
|
||||
switch (context.src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_MXFP4:
|
||||
{
|
||||
// Quantized types using u32 buffers for portability.
|
||||
defines.push_back("SRC0_TYPE=u32");
|
||||
defines.push_back("U32_DEQUANT_HELPERS");
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
defines.push_back(std::string("SRC0_TYPE=") + src0_name);
|
||||
}
|
||||
}
|
||||
|
||||
defines.push_back("BYTE_HELPERS");
|
||||
defines.push_back(type_upper + "_T");
|
||||
defines.push_back(type_upper);
|
||||
defines.push_back(type_upper + "_SCALE_MIN");
|
||||
defines.push_back(type_upper + "_TABLES");
|
||||
defines.push_back(type_upper + "_GRID");
|
||||
|
||||
variant += std::string("_") + src0_name;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
auto processed = preprocessor.preprocess(wgsl_mul_mat, defines);
|
||||
|
||||
auto decisions = std::make_shared<ggml_webgpu_generic_shader_decisions>();
|
||||
decisions->wg_size = WEBGPU_MUL_MAT_WG_SIZE;
|
||||
|
||||
webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant);
|
||||
pipeline.context = decisions;
|
||||
mul_mat_legacy_pipelines[key] = pipeline;
|
||||
return mul_mat_legacy_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_mul_mat_id_gather_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
auto it = mul_mat_id_gather_pipelines.find(1);
|
||||
if (it != mul_mat_id_gather_pipelines.end()) {
|
||||
|
||||
@@ -94,14 +94,6 @@ static inline uint32_t ggml_webgpu_u32_from_f32(float value) {
|
||||
#define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4
|
||||
#define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4
|
||||
|
||||
// For operations which process a row in parallel, this seems like a reasonable
|
||||
// default
|
||||
#define WEBGPU_ROW_SPLIT_WG_SIZE 64
|
||||
|
||||
// Track https://github.com/gpuweb/gpuweb/issues/5315 for fixes to
|
||||
// implementations so this can be removed, necessary only for get_rows right now
|
||||
#define WEBGPU_MAX_WG_SIZE 288
|
||||
|
||||
/* End Constants */
|
||||
|
||||
// This is a "fake" base pointer, since WebGPU buffers do not have pointers to
|
||||
@@ -181,6 +173,7 @@ struct webgpu_capabilities {
|
||||
wgpu::Limits limits;
|
||||
bool supports_subgroups = false;
|
||||
bool supports_subgroup_matrix = false;
|
||||
bool supports_dot_product = false;
|
||||
|
||||
uint32_t sg_mat_m = 0;
|
||||
uint32_t sg_mat_n = 0;
|
||||
@@ -210,6 +203,8 @@ struct webgpu_global_context_struct {
|
||||
wgpu::Buffer memset_params_buf;
|
||||
webgpu_pipeline memset_pipeline;
|
||||
|
||||
std::string vendor;
|
||||
|
||||
// TODO: We should rework the CPU profiling time handling to make it more useful. ref: https://github.com/ggml-org/llama.cpp/pull/22050
|
||||
#ifdef GGML_WEBGPU_CPU_PROFILE
|
||||
// Profiling: labeled CPU time in ms (total)
|
||||
@@ -259,6 +254,7 @@ struct webgpu_context_struct {
|
||||
wgpu::Buffer set_rows_host_error_buf;
|
||||
wgpu::CommandEncoder active_command_encoder;
|
||||
wgpu::ComputePassEncoder active_compute_pass;
|
||||
bool batch_compute_passes = true;
|
||||
|
||||
size_t memset_bytes_per_thread;
|
||||
|
||||
@@ -590,9 +586,18 @@ static webgpu_encoded_op ggml_backend_webgpu_build_multi(webgpu_context &
|
||||
}
|
||||
#else
|
||||
for (size_t i = 0; i < dispatches.size(); i++) {
|
||||
ctx->active_compute_pass.SetPipeline(dispatches[i].pipeline.pipeline);
|
||||
ctx->active_compute_pass.SetBindGroup(0, bind_groups[i]);
|
||||
ctx->active_compute_pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second, 1);
|
||||
if (ctx->batch_compute_passes) {
|
||||
ctx->active_compute_pass.SetPipeline(dispatches[i].pipeline.pipeline);
|
||||
ctx->active_compute_pass.SetBindGroup(0, bind_groups[i]);
|
||||
ctx->active_compute_pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second,
|
||||
1);
|
||||
} else {
|
||||
wgpu::ComputePassEncoder pass = ctx->active_command_encoder.BeginComputePass();
|
||||
pass.SetPipeline(dispatches[i].pipeline.pipeline);
|
||||
pass.SetBindGroup(0, bind_groups[i]);
|
||||
pass.DispatchWorkgroups(dispatches[i].workgroups.first, dispatches[i].workgroups.second, 1);
|
||||
pass.End();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -618,7 +623,7 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_global_context & ctx,
|
||||
size_t size) {
|
||||
std::vector<uint32_t> params = { (uint32_t) offset, (uint32_t) size, value };
|
||||
std::vector<wgpu::BindGroupEntry> entries = { ggml_webgpu_make_bind_group_entry(0, buf, 0, buf.GetSize()) };
|
||||
size_t bytes_per_wg = WEBGPU_MAX_WG_SIZE * ctx->capabilities.memset_bytes_per_thread;
|
||||
size_t bytes_per_wg = ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup * ctx->capabilities.memset_bytes_per_thread;
|
||||
uint32_t wg_x = CEIL_DIV(size + 3, bytes_per_wg);
|
||||
|
||||
ctx->queue.WriteBuffer(ctx->memset_params_buf, 0, params.data(), params.size() * sizeof(uint32_t));
|
||||
@@ -736,8 +741,11 @@ static webgpu_encoded_op ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src
|
||||
ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, dst),
|
||||
};
|
||||
|
||||
uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
|
||||
uint32_t wg_x;
|
||||
uint32_t wg_y;
|
||||
uint32_t total_wg = CEIL_DIV(ne, decisions->wg_size);
|
||||
compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y);
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
|
||||
}
|
||||
|
||||
static webgpu_encoded_op ggml_webgpu_set(webgpu_context & ctx,
|
||||
@@ -961,9 +969,10 @@ static webgpu_encoded_op ggml_webgpu_conv_2d(webgpu_context & ctx,
|
||||
|
||||
auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
|
||||
|
||||
uint32_t wg_x;
|
||||
uint32_t wg_y;
|
||||
uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
|
||||
uint32_t wg_x = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg);
|
||||
uint32_t wg_y = CEIL_DIV(total_wg, wg_x);
|
||||
compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y);
|
||||
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
|
||||
}
|
||||
@@ -1051,9 +1060,10 @@ static webgpu_encoded_op ggml_webgpu_im2col(webgpu_context & ctx,
|
||||
|
||||
auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
|
||||
|
||||
uint32_t wg_x;
|
||||
uint32_t wg_y;
|
||||
uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
|
||||
uint32_t wg_x = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg);
|
||||
uint32_t wg_y = CEIL_DIV(total_wg, wg_x);
|
||||
compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y);
|
||||
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
|
||||
}
|
||||
@@ -1348,7 +1358,7 @@ static webgpu_encoded_op ggml_webgpu_get_rows(webgpu_context & ctx,
|
||||
shader_lib_ctx.src0 = src;
|
||||
shader_lib_ctx.src1 = nullptr;
|
||||
shader_lib_ctx.dst = dst;
|
||||
shader_lib_ctx.max_wg_size = WEBGPU_MAX_WG_SIZE;
|
||||
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
|
||||
|
||||
webgpu_pipeline pipeline = ctx->shader_lib->get_get_rows_pipeline(shader_lib_ctx);
|
||||
auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
|
||||
@@ -1384,6 +1394,58 @@ static webgpu_encoded_op ggml_webgpu_get_rows(webgpu_context & ctx,
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
|
||||
}
|
||||
|
||||
static void ggml_webgpu_quantize_q8_dispatch(webgpu_context & ctx,
|
||||
ggml_tensor * src0,
|
||||
ggml_tensor * src1,
|
||||
ggml_tensor * dst,
|
||||
std::vector<webgpu_dispatch_desc> & dispatches) {
|
||||
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
|
||||
|
||||
shader_lib_ctx.src0 = src0;
|
||||
shader_lib_ctx.src1 = src1;
|
||||
shader_lib_ctx.dst = dst;
|
||||
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
|
||||
shader_lib_ctx.supports_subgroups = ctx->global_ctx->capabilities.supports_subgroups;
|
||||
|
||||
webgpu_pipeline qq8_pipeline = ctx->shader_lib->get_quantize_q8_pipeline(shader_lib_ctx);
|
||||
|
||||
// quantize_q8 pipeline
|
||||
const size_t dst_offset = ggml_webgpu_tensor_offset(dst);
|
||||
const size_t q8_src1_align_offset = ROUNDUP_POW2(
|
||||
dst_offset + ggml_nbytes(dst), ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment);
|
||||
const size_t q8_src1_binding_size =
|
||||
ROUNDUP_POW2(src1->ne[3] * src1->ne[2] * (36 /* sizeof(q8_1) */ * (src1->ne[0] / /* block_size */ 32)),
|
||||
WEBGPU_STORAGE_BUF_BINDING_MULT);
|
||||
|
||||
std::vector<uint32_t> q8_params = {
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
|
||||
(uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
|
||||
(uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),
|
||||
(uint32_t) src1->ne[0],
|
||||
(uint32_t) src1->ne[2],
|
||||
(uint32_t) src1->ne[3],
|
||||
};
|
||||
|
||||
std::vector<wgpu::BindGroupEntry> q8_entries = {
|
||||
ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src1),
|
||||
ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(dst), q8_src1_align_offset, q8_src1_binding_size)
|
||||
};
|
||||
|
||||
auto q8_decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(qq8_pipeline.context.get());
|
||||
|
||||
uint32_t q8_wg_size = q8_decisions->wg_size;
|
||||
uint32_t q8_wg_x = 1;
|
||||
uint32_t q8_wg_y = 1;
|
||||
const uint32_t wg_per_vec = (src0->ne[0] / 4 + (q8_wg_size - 1)) / q8_wg_size;
|
||||
const uint32_t q8_total_wg = src1->ne[2] * src1->ne[3] * wg_per_vec;
|
||||
const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
|
||||
compute_2d_workgroups(q8_total_wg, max_wg_per_dim, q8_wg_x, q8_wg_y);
|
||||
|
||||
dispatches.push_back({
|
||||
qq8_pipeline, std::move(q8_params), std::move(q8_entries), { q8_wg_x, q8_wg_y }
|
||||
});
|
||||
}
|
||||
|
||||
static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
|
||||
ggml_tensor * src0,
|
||||
ggml_tensor * src1,
|
||||
@@ -1391,47 +1453,9 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
|
||||
// Determine if this is a mat-vec operation
|
||||
bool is_vec = (dst->ne[1] == 1);
|
||||
|
||||
// Determine if we should use fast path
|
||||
bool use_fast = false;
|
||||
switch (src1->type) {
|
||||
case GGML_TYPE_F16:
|
||||
use_fast = (src0->type == GGML_TYPE_F16);
|
||||
break;
|
||||
case GGML_TYPE_F32:
|
||||
// TODO: implement better mat-mat for k-quants, mat-vec for all k-quants except q6_K
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ1_M:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_MXFP4:
|
||||
use_fast = true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
// use MMVQ path for mat-vec
|
||||
bool use_mmvq = ggml_webgpu_can_use_mmvq(src0, src1, ctx->global_ctx->capabilities.supports_dot_product,
|
||||
ctx->global_ctx->vendor);
|
||||
|
||||
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
|
||||
|
||||
@@ -1446,16 +1470,20 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
|
||||
shader_lib_ctx.sg_mat_k = ctx->global_ctx->capabilities.sg_mat_k;
|
||||
shader_lib_ctx.min_subgroup_size = ctx->global_ctx->capabilities.min_subgroup_size;
|
||||
shader_lib_ctx.max_subgroup_size = ctx->global_ctx->capabilities.max_subgroup_size;
|
||||
shader_lib_ctx.supports_dot_product = ctx->global_ctx->capabilities.supports_dot_product;
|
||||
shader_lib_ctx.vendor = ctx->global_ctx->vendor;
|
||||
|
||||
// Get or create pipeline
|
||||
webgpu_pipeline pipeline;
|
||||
webgpu_pipeline pipeline;
|
||||
std::vector<webgpu_dispatch_desc> dispatches;
|
||||
|
||||
if (use_fast && is_vec) {
|
||||
if (is_vec) {
|
||||
if (use_mmvq) {
|
||||
ggml_webgpu_quantize_q8_dispatch(ctx, src0, src1, dst, dispatches);
|
||||
}
|
||||
pipeline = ctx->shader_lib->get_mul_mat_vec_pipeline(shader_lib_ctx);
|
||||
} else if (use_fast) {
|
||||
pipeline = ctx->shader_lib->get_mul_mat_fast_pipeline(shader_lib_ctx);
|
||||
} else {
|
||||
pipeline = ctx->shader_lib->get_mul_mat_legacy_pipeline(shader_lib_ctx);
|
||||
pipeline = ctx->shader_lib->get_mul_mat_fast_pipeline(shader_lib_ctx);
|
||||
}
|
||||
|
||||
// Build params
|
||||
@@ -1479,25 +1507,31 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
|
||||
};
|
||||
|
||||
// Build bind group entries
|
||||
std::vector<wgpu::BindGroupEntry> entries = {
|
||||
ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0),
|
||||
ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1),
|
||||
ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst),
|
||||
};
|
||||
std::vector<wgpu::BindGroupEntry> entries = {};
|
||||
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0));
|
||||
if (use_mmvq) {
|
||||
auto & mmvq_qq8_entry = dispatches[0].bind_group_entries[1];
|
||||
entries.push_back(ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(dst), mmvq_qq8_entry.offset,
|
||||
mmvq_qq8_entry.size));
|
||||
} else {
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1));
|
||||
}
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst));
|
||||
|
||||
// Calculate workgroup dimensions
|
||||
uint32_t wg_x = 1;
|
||||
uint32_t wg_y = 1;
|
||||
const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
|
||||
|
||||
if (use_fast && is_vec) {
|
||||
if (is_vec) {
|
||||
auto * decisions = static_cast<ggml_webgpu_mul_mat_vec_shader_decisions *>(pipeline.context.get());
|
||||
|
||||
uint32_t batches = dst->ne[2] * dst->ne[3];
|
||||
uint32_t output_groups = CEIL_DIV(dst->ne[0], decisions->outputs_per_wg);
|
||||
uint32_t total_wg = output_groups * batches;
|
||||
compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
|
||||
} else if (use_fast) {
|
||||
} else {
|
||||
auto * decisions = static_cast<ggml_webgpu_mul_mat_shader_decisions *>(pipeline.context.get());
|
||||
|
||||
// Fast-path tiled/subgroup calculations
|
||||
@@ -1518,15 +1552,13 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
|
||||
}
|
||||
uint32_t total_wg = wg_m * wg_n * dst->ne[2] * dst->ne[3];
|
||||
compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
|
||||
|
||||
} else { // legacy
|
||||
auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
|
||||
uint32_t wg_size = decisions->wg_size;
|
||||
uint32_t total_wg = CEIL_DIV(dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3], wg_size);
|
||||
compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
|
||||
}
|
||||
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
|
||||
dispatches.push_back({
|
||||
pipeline, std::move(params), std::move(entries), { wg_x, wg_y }
|
||||
});
|
||||
|
||||
return ggml_backend_webgpu_build_multi(ctx, dispatches);
|
||||
}
|
||||
|
||||
static webgpu_encoded_op ggml_webgpu_mul_mat_id_vec(webgpu_context & ctx,
|
||||
@@ -1654,14 +1686,11 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx,
|
||||
gathered_count_ids_binding_size),
|
||||
};
|
||||
|
||||
const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
|
||||
|
||||
const uint32_t gather_total_wg = param_n_expert;
|
||||
const uint32_t gather_wg_x = std::min(gather_total_wg, max_wg_per_dim);
|
||||
const uint32_t gather_wg_y = CEIL_DIV(gather_total_wg, gather_wg_x);
|
||||
// n_expert is much less than maxComputeWorkgroupsPerDimension (e.g., n_exeprt=256 at Qwen3.5-35B-A3B)
|
||||
const uint32_t gather_wg_x = param_n_expert;
|
||||
|
||||
dispatches.push_back({
|
||||
gather_pipeline, std::move(gather_params), std::move(gather_entries), { gather_wg_x, gather_wg_y }
|
||||
gather_pipeline, std::move(gather_params), std::move(gather_entries), { gather_wg_x, 1 }
|
||||
});
|
||||
|
||||
// params for mul_mat_id.wgsl
|
||||
@@ -1713,7 +1742,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx,
|
||||
uint32_t max_wg_n = CEIL_DIV(total_gathered, tile_n_s) + max_active_experts;
|
||||
uint32_t total_wg = wg_m * max_wg_n;
|
||||
|
||||
compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
|
||||
compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y);
|
||||
|
||||
dispatches.push_back({
|
||||
main_pipeline, std::move(main_params), std::move(main_entries), { wg_x, wg_y }
|
||||
@@ -1956,10 +1985,10 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx,
|
||||
std::vector<wgpu::BindGroupEntry> reduce_entries;
|
||||
if (use_vec_reduce) {
|
||||
const uint32_t reduce_sg_size = ctx->global_ctx->capabilities.max_subgroup_size;
|
||||
const uint32_t reduce_wg_size =
|
||||
std::max(reduce_sg_size, (uint32_t) std::min<uint64_t>(
|
||||
(uint64_t) nwg * reduce_sg_size,
|
||||
ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup));
|
||||
const uint32_t reduce_wg_size = std::max(
|
||||
reduce_sg_size,
|
||||
(uint32_t) std::min<uint64_t>((uint64_t) nwg * reduce_sg_size,
|
||||
ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup));
|
||||
ggml_webgpu_shader_lib_context reduce_shader_ctx = shader_lib_ctx;
|
||||
reduce_shader_ctx.max_wg_size = reduce_wg_size;
|
||||
reduce_pipeline = ctx->shader_lib->get_flash_attn_vec_reduce_pipeline(reduce_shader_ctx);
|
||||
@@ -2736,10 +2765,12 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor *
|
||||
block_size, npr, nrows
|
||||
};
|
||||
|
||||
const uint32_t total_wg_init = npr * nrows;
|
||||
const uint32_t max_wg = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
|
||||
const uint32_t wg_x_init = std::min(total_wg_init, max_wg);
|
||||
const uint32_t wg_y_init = CEIL_DIV(total_wg_init, wg_x_init);
|
||||
uint32_t wg_x_init;
|
||||
uint32_t wg_y_init;
|
||||
const uint32_t total_wg_init = npr * nrows;
|
||||
const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
|
||||
compute_2d_workgroups(total_wg_init, max_wg_per_dim, wg_x_init, wg_y_init);
|
||||
|
||||
std::vector<wgpu::BindGroupEntry> init_entries = {
|
||||
ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src),
|
||||
ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(dst), init_align_offset, init_binding_size)
|
||||
@@ -2796,9 +2827,11 @@ static webgpu_encoded_op ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor *
|
||||
ggml_webgpu_make_bind_group_entry(2, ggml_webgpu_tensor_buf(dst), align_out, size_out)
|
||||
};
|
||||
|
||||
uint32_t wg_x_merge;
|
||||
uint32_t wg_y_merge;
|
||||
const uint32_t total_wg_merge = nm * nrows;
|
||||
const uint32_t wg_x_merge = std::min(total_wg_merge, max_wg);
|
||||
const uint32_t wg_y_merge = CEIL_DIV(total_wg_merge, wg_x_merge);
|
||||
compute_2d_workgroups(total_wg_merge, max_wg_per_dim, wg_x_merge, wg_y_merge);
|
||||
|
||||
dispatches.push_back({
|
||||
argsort_merge_pipeline, std::move(merge_params), std::move(merge_entries), { wg_x_merge, wg_y_merge }
|
||||
});
|
||||
@@ -2918,9 +2951,12 @@ static webgpu_encoded_op ggml_webgpu_upscale(webgpu_context ctx, ggml_tensor * s
|
||||
|
||||
webgpu_pipeline pipeline = ctx->shader_lib->get_upscale_pipeline(shader_lib_ctx);
|
||||
auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
|
||||
uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
|
||||
uint32_t wg_x = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg);
|
||||
uint32_t wg_y = CEIL_DIV(total_wg, wg_x);
|
||||
|
||||
uint32_t wg_x;
|
||||
uint32_t wg_y;
|
||||
uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size);
|
||||
compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y);
|
||||
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
|
||||
}
|
||||
|
||||
@@ -3110,18 +3146,16 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
|
||||
uint32_t num_batched_kernels = 0;
|
||||
uint32_t num_inflight_batches = 0;
|
||||
bool contains_set_rows = false;
|
||||
bool batch_compute_passes = true;
|
||||
int num_encoded_ops = 1;
|
||||
int node_idx = 0;
|
||||
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
ctx->profile_timestamp_query_count = 0;
|
||||
batch_compute_passes = false;
|
||||
std::vector<std::string> profile_pipeline_names;
|
||||
#endif
|
||||
|
||||
ctx->active_command_encoder = ctx->global_ctx->device.CreateCommandEncoder();
|
||||
if (batch_compute_passes) {
|
||||
if (ctx->batch_compute_passes) {
|
||||
ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass();
|
||||
}
|
||||
|
||||
@@ -3148,7 +3182,7 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
|
||||
|
||||
// reset state for next batch
|
||||
ctx->active_command_encoder = ctx->global_ctx->device.CreateCommandEncoder();
|
||||
if (batch_compute_passes) {
|
||||
if (ctx->batch_compute_passes) {
|
||||
ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass();
|
||||
}
|
||||
ctx->param_arena.reset();
|
||||
@@ -3548,8 +3582,8 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer
|
||||
const uint32_t kv_tile = decisions.kv_tile;
|
||||
|
||||
const uint32_t vec_nwg_cap = ctx->webgpu_global_ctx->capabilities.min_subgroup_size;
|
||||
uint32_t nwg = 1u;
|
||||
const uint64_t kv_span = (uint64_t) std::max(1u, kv_tile);
|
||||
uint32_t nwg = 1u;
|
||||
const uint64_t kv_span = (uint64_t) std::max(1u, kv_tile);
|
||||
while ((2u * nwg * kv_span) < (uint64_t) K->ne[1] && nwg < vec_nwg_cap) {
|
||||
nwg <<= 1;
|
||||
}
|
||||
@@ -3582,6 +3616,22 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer
|
||||
}
|
||||
}
|
||||
break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
{
|
||||
const ggml_tensor * src0 = tensor->src[0];
|
||||
const ggml_tensor * src1 = tensor->src[1];
|
||||
bool use_mmvq =
|
||||
ggml_webgpu_can_use_mmvq(src0, src1, ctx->webgpu_global_ctx->capabilities.supports_dot_product,
|
||||
ctx->webgpu_global_ctx->vendor);
|
||||
if (use_mmvq) {
|
||||
const size_t q8_src1_size =
|
||||
src1->ne[3] * src1->ne[2] * (36 /* sizeof(q8_1) */ * (src1->ne[0] / /* block_size */ 32));
|
||||
res = ROUNDUP_POW2(res + q8_src1_size +
|
||||
ctx->webgpu_global_ctx->capabilities.limits.minStorageBufferOffsetAlignment,
|
||||
WEBGPU_STORAGE_BUF_BINDING_MULT);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
{
|
||||
const ggml_tensor * src0 = tensor->src[0];
|
||||
@@ -3658,13 +3708,13 @@ static ggml_guid_t ggml_backend_webgpu_guid(void) {
|
||||
|
||||
static void ggml_webgpu_init_memset_pipeline(webgpu_global_context & ctx) {
|
||||
// we use the maximum workgroup size for the memset pipeline
|
||||
size_t max_threads = WEBGPU_MAX_WG_SIZE * ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
|
||||
size_t max_threads = ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup * ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
|
||||
// Size the bytes_per_thread so that the largest buffer size can be handled
|
||||
ctx->capabilities.memset_bytes_per_thread =
|
||||
CEIL_DIV(ctx->capabilities.limits.maxStorageBufferBindingSize, max_threads);
|
||||
std::vector<wgpu::ConstantEntry> constants(2);
|
||||
constants[0].key = "wg_size";
|
||||
constants[0].value = WEBGPU_MAX_WG_SIZE;
|
||||
constants[0].value = ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
|
||||
constants[1].key = "bytes_per_thread";
|
||||
constants[1].value = ctx->capabilities.memset_bytes_per_thread;
|
||||
ctx->memset_pipeline = ggml_webgpu_create_pipeline(ctx->device, wgsl_memset, "memset", constants);
|
||||
@@ -3707,12 +3757,16 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
|
||||
ctx->webgpu_global_ctx->adapter.GetInfo(&info);
|
||||
ctx->webgpu_global_ctx->command_submit_batch_size = ggml_backend_webgpu_get_command_submit_batch_size();
|
||||
ctx->webgpu_global_ctx->max_inflight_batches = ggml_backend_webgpu_get_max_inflight_batches();
|
||||
ctx->webgpu_global_ctx->vendor = info.vendor;
|
||||
wgpu::SupportedFeatures features;
|
||||
ctx->webgpu_global_ctx->adapter.GetFeatures(&features);
|
||||
// we require f16 support
|
||||
GGML_ASSERT(ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16));
|
||||
ctx->webgpu_global_ctx->capabilities.supports_subgroups =
|
||||
ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::Subgroups);
|
||||
// for dot4I8packed
|
||||
ctx->webgpu_global_ctx->capabilities.supports_dot_product = ctx->webgpu_global_ctx->instance.HasWGSLLanguageFeature(
|
||||
wgpu::WGSLLanguageFeatureName::Packed4x8IntegerDotProduct);
|
||||
|
||||
bool valid_subgroup_matrix_config = false;
|
||||
#ifndef __EMSCRIPTEN__
|
||||
@@ -3839,6 +3893,7 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) {
|
||||
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "set_rows_host_error_buf");
|
||||
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
webgpu_ctx->batch_compute_passes = false;
|
||||
ggml_webgpu_create_buffer(
|
||||
webgpu_ctx->global_ctx->device, webgpu_ctx->profile_timestamp_dev_buf, WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES,
|
||||
wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc, "profile_timestamp_dev_buf");
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user