mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-02-12 14:03:20 +02:00)

Compare commits: 148 commits
| SHA1 |
|---|
| 093e3f1feb |
| 663445b0de |
| 7675c555a1 |
| 5e1c3aed40 |
| c496fe0b1d |
| e57bb87ced |
| f3a4b1659c |
| 108009f5c7 |
| d337252acf |
| af6f91db47 |
| a7b8d35f78 |
| 6eba72b71c |
| fedf034a98 |
| 8726392d3d |
| c04621711a |
| 0fc16b42e8 |
| 053b1539c0 |
| b3a89c3d9e |
| e15898d1c7 |
| 803f8baf4f |
| 3600cc2886 |
| c7e0a2054b |
| 3f55f781f1 |
| 51fa76f172 |
| 12d0188c0d |
| eb3949938e |
| e562eece7c |
| b47ab7b8e9 |
| dd665cc9d4 |
| df0c0c7d02 |
| b49a8ff96b |
| 53f925074d |
| db38704f01 |
| 07e4351ce6 |
| 291f2b6913 |
| 2c90da4c7e |
| ec9e0301fe |
| e83ba3e460 |
| 2b131621e6 |
| 54a2c7a8cd |
| 21fcc21ad5 |
| dd8ba93416 |
| 66c92061f5 |
| 5ca82fc1d7 |
| 6385b843a8 |
| 1b8fb8152d |
| 53ae30640e |
| 763d06edb7 |
| 10961339b2 |
| d98f2a35fc |
| e0e3aa231d |
| aa6dff05be |
| c962ae3382 |
| a3938fb53d |
| f7873fc698 |
| a68247439b |
| 26b79b6cb3 |
| 1e8659e65a |
| a3c30846e4 |
| 1701d4c54f |
| bef8176387 |
| 34b7c0439e |
| f3101a8cc6 |
| 1c49c70d07 |
| a8ea03d8ad |
| 05f6ac6283 |
| bc583e3c63 |
| 72b090da2c |
| 7fe03e7446 |
| 952f3953c1 |
| 81713121ee |
| f9cd68398b |
| 4f81b33e32 |
| cdf94a1802 |
| a26c4cc11e |
| 4265a87b59 |
| 6f180b915c |
| 03f582ae8f |
| 88c125f2ac |
| d74e94c1b3 |
| f13847cfb5 |
| 79c137f776 |
| 22229314fc |
| 9012eb9b45 |
| fef693dc6b |
| 2d38b6e400 |
| e121edc432 |
| 2f099b510f |
| aa50ba462f |
| de2ef53a4b |
| c508256db2 |
| 40aaa8a403 |
| a08c1d2845 |
| d785f9c1fd |
| 4032ca4066 |
| 515fdbf7ed |
| f5cd27b71d |
| a2d02d5793 |
| 17fc817b58 |
| 2bd1b30f69 |
| 259469c4b5 |
| 4c32832c59 |
| c3a2624339 |
| ffd0eae60b |
| b775345d78 |
| a70a8a69c2 |
| d13d0f6135 |
| 8a2afb7520 |
| 9ecf3e66a3 |
| faaaff5f94 |
| e16c4731c7 |
| 1dcd01960c |
| c10ed6cbcc |
| a127ff1780 |
| 3079e9ac8e |
| 8a1d206f1d |
| 797990c4bc |
| ab86335760 |
| cc74d5be99 |
| 5be24af73d |
| d394a9aedc |
| 6b56a64690 |
| a4e8912dfd |
| edbf42edfd |
| d643bb2c79 |
| 8e186ef0e7 |
| 5fbfe384d4 |
| c76532e7ba |
| 2aa777d86d |
| eb0f5c28d3 |
| cf4cb59e64 |
| 0d5c742161 |
| 42158ae2e8 |
| 797f2ac062 |
| b44890df2e |
| 33983057d0 |
| fb1cab201c |
| b7a17463ec |
| be0239693c |
| a4090d1174 |
| b69f1647f9 |
| 759e37b0d8 |
| 4245e622e0 |
| c9c64dee57 |
| c00a2634be |
| e298d2fbd0 |
| f0adb80bf7 |
| f7c9429c85 |
```diff
@@ -1,10 +1,10 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc3.1.1
+ARG MUSA_VERSION=rc4.0.1
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}

-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}

 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
@@ -21,21 +21,14 @@ RUN apt-get update && \
         libcurl4-openssl-dev \
         libgomp1

-COPY requirements.txt requirements.txt
-COPY requirements requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
 WORKDIR /app

 COPY . .

 # Use the default MUSA archs if not specified
 RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
         export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
     fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
```

```diff
@@ -48,3 +48,7 @@ end_of_line = unset
 charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset
+
+[vendor/miniaudio/miniaudio.h]
+trim_trailing_whitespace = unset
+insert_final_newline = unset
```
**.github/workflows/build-linux-cross.yml** (vendored): 30 lines changed

```diff
@@ -26,12 +26,12 @@ jobs:
           sudo apt-get install -y --no-install-recommends \
                   build-essential \
                   gcc-14-riscv64-linux-gnu \
-                  g++-14-riscv64-linux-gnu \
-                  libcurl4-openssl-dev:riscv64
+                  g++-14-riscv64-linux-gnu

       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+                -DCMAKE_BUILD_TYPE=Release \
                 -DGGML_OPENMP=OFF \
                 -DLLAMA_BUILD_EXAMPLES=ON \
                 -DLLAMA_BUILD_TOOLS=ON \
@@ -72,12 +72,12 @@ jobs:
                   glslc \
                   gcc-14-riscv64-linux-gnu \
                   g++-14-riscv64-linux-gnu \
-                  libvulkan-dev:riscv64 \
-                  libcurl4-openssl-dev:riscv64
+                  libvulkan-dev:riscv64

       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+                -DCMAKE_BUILD_TYPE=Release \
                 -DGGML_VULKAN=ON \
                 -DGGML_OPENMP=OFF \
                 -DLLAMA_BUILD_EXAMPLES=ON \
@@ -118,12 +118,12 @@ jobs:
                   build-essential \
                   glslc \
                   crossbuild-essential-arm64 \
-                  libvulkan-dev:arm64 \
-                  libcurl4-openssl-dev:arm64
+                  libvulkan-dev:arm64

       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+                -DCMAKE_BUILD_TYPE=Release \
                 -DGGML_VULKAN=ON \
                 -DGGML_OPENMP=OFF \
                 -DLLAMA_BUILD_EXAMPLES=ON \
@@ -163,12 +163,12 @@ jobs:
           sudo apt-get install -y --no-install-recommends \
                   build-essential \
                   gcc-14-powerpc64le-linux-gnu \
-                  g++-14-powerpc64le-linux-gnu \
-                  libcurl4-openssl-dev:ppc64el
+                  g++-14-powerpc64le-linux-gnu

       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+                -DCMAKE_BUILD_TYPE=Release \
                 -DGGML_OPENMP=OFF \
                 -DLLAMA_BUILD_EXAMPLES=ON \
                 -DLLAMA_BUILD_TOOLS=ON \
@@ -209,12 +209,12 @@ jobs:
                   glslc \
                   gcc-14-powerpc64le-linux-gnu \
                   g++-14-powerpc64le-linux-gnu \
-                  libvulkan-dev:ppc64el \
-                  libcurl4-openssl-dev:ppc64el
+                  libvulkan-dev:ppc64el

       - name: Build
         run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+          cmake -B build -DLLAMA_CURL=OFF \
+                -DCMAKE_BUILD_TYPE=Release \
                 -DGGML_VULKAN=ON \
                 -DGGML_OPENMP=OFF \
                 -DLLAMA_BUILD_EXAMPLES=ON \
```
**.github/workflows/build.yml** (vendored): 2 lines changed

```diff
@@ -351,7 +351,7 @@ jobs:

   ubuntu-22-cmake-musa:
     runs-on: ubuntu-22.04
-    container: mthreads/musa:rc3.1.1-devel-ubuntu22.04
+    container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04

     steps:
       - name: Clone
```
**.github/workflows/release.yml** (vendored): 282 lines changed

```diff
@@ -1,4 +1,4 @@
-name: Create Release
+name: Release

 on:
   workflow_dispatch: # allows manual triggering
@@ -227,6 +227,69 @@ jobs:
           path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
           name: llama-bin-ubuntu-vulkan-x64.zip

+  windows-cpu:
+    runs-on: windows-latest
+
+    strategy:
+      matrix:
+        include:
+          - arch: 'x64'
+          - arch: 'arm64'
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-cpu-${{ matrix.arch }}
+          variant: ccache
+          evict-old-files: 1d
+
+      - name: Install Ninja
+        run: |
+          choco install ninja
+
+      - name: libCURL
+        id: get_libcurl
+        uses: ./.github/actions/windows-setup-curl
+        with:
+          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
+
+      - name: Build
+        shell: cmd
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
+        run: |
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch }}
+          cmake -S . -B build -G "Ninja Multi-Config" ^
+            -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
+            -DGGML_NATIVE=OFF ^
+            -DGGML_BACKEND_DL=ON ^
+            -DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^
+            -DGGML_OPENMP=ON ^
+            -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
+            ${{ env.CMAKE_ARGS }}
+          cmake --build build --config Release
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
+        run: |
+          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
+          Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.42.34433\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
+          7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-bin-win-cpu-${{ matrix.arch }}.zip
+          name: llama-bin-win-cpu-${{ matrix.arch }}.zip
+
   windows:
     runs-on: windows-latest
```
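(In the `libCURL` step above, `${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}` is the usual GitHub Actions workaround for a missing ternary operator: as long as the middle operand is truthy, `cond && a || b` evaluates to `a` when the condition holds and to `b` otherwise.)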
```diff
@@ -237,52 +300,30 @@
     strategy:
       matrix:
         include:
-          - build: 'cpu-x64'
+          - backend: 'vulkan'
             arch: 'x64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
-          #- build: 'openblas-x64'
-          #  arch: 'x64'
-          #  defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
-          - build: 'vulkan-x64'
-            arch: 'x64'
-            defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
-          - build: 'cpu-arm64'
-            arch: 'arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF'
-          - build: 'opencl-adreno-arm64'
+            defines: '-DGGML_VULKAN=ON'
+            target: 'ggml-vulkan'
+          - backend: 'opencl-adreno'
             arch: 'arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
+            target: 'ggml-opencl'

     steps:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
         with:
           fetch-depth: 0

       - name: ccache
         uses: hendrikmuhs/ccache-action@v1.2.16
         with:
-          key: windows-latest-cmake-${{ matrix.build }}
+          key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
           variant: ccache
           evict-old-files: 1d

-      - name: Download OpenBLAS
-        id: get_openblas
-        if: ${{ matrix.build == 'openblas-x64' }}
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
-          curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
-          mkdir $env:RUNNER_TEMP/openblas
-          tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
-          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
-          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
-          $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
-          & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
-
       - name: Install Vulkan SDK
         id: get_vulkan
-        if: ${{ matrix.build == 'vulkan-x64' }}
+        if: ${{ matrix.backend == 'vulkan' }}
         run: |
           curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
           & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
@@ -296,7 +337,7 @@

       - name: Install OpenCL Headers and Libs
         id: install_opencl
-        if: ${{ matrix.build == 'opencl-adreno-arm64' }}
+        if: ${{ matrix.backend == 'opencl-adreno' && matrix.arch == 'arm64' }}
         run: |
           git clone https://github.com/KhronosGroup/OpenCL-Headers
           cd OpenCL-Headers
@@ -314,46 +355,22 @@
             -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
           cmake --build build-arm64-release --target install --config release

-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-        with:
-          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
-
       - name: Build
         id: cmake_build
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
         run: |
-          cmake -S . -B build ${{ matrix.defines }} `
-            -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
-
-      - name: Add libopenblas.dll
-        id: add_libopenblas_dll
-        if: ${{ matrix.build == 'openblas-x64' }}
-        run: |
-          cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
-          cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
+          cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF
+          cmake --build build --config Release --target ${{ matrix.target }}

       - name: Pack artifacts
         id: pack_artifacts
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
         run: |
-          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
+          7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll

       - name: Upload artifacts
         uses: actions/upload-artifact@v4
         with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
-          name: llama-bin-win-${{ matrix.build }}.zip
+          path: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
+          name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip

   windows-cuda:
     runs-on: windows-2019
@@ -366,8 +383,6 @@
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
-        with:
-          fetch-depth: 0

       - name: Install ccache
         uses: hendrikmuhs/ccache-action@v1.2.16
```
```diff
@@ -386,45 +401,30 @@
         run: |
           choco install ninja

-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-
       - name: Build
         id: cmake_build
         shell: cmd
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
         run: |
           call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
           cmake -S . -B build -G "Ninja Multi-Config" ^
-            -DGGML_NATIVE=OFF ^
             -DGGML_BACKEND_DL=ON ^
-            -DGGML_CPU_ALL_VARIANTS=ON ^
+            -DGGML_NATIVE=OFF ^
+            -DGGML_CPU=OFF ^
             -DGGML_CUDA=ON ^
-            -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
-            ${{ env.CMAKE_ARGS }}
+            -DLLAMA_CURL=OFF
           set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
-          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
-          cmake --build build --config Release
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
+          cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda

       - name: Pack artifacts
         id: pack_artifacts
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
         run: |
-          cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
+          7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll

       - name: Upload artifacts
         uses: actions/upload-artifact@v4
         with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip
-          name: llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
+          path: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
+          name: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip

       - name: Copy and pack Cuda runtime
         run: |
@@ -432,13 +432,13 @@
           $dst='.\build\bin\cudart\'
-          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
-          7z a cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip $dst\*
+          7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\*

       - name: Upload Cuda runtime
         uses: actions/upload-artifact@v4
         with:
-          path: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
-          name: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
+          path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
+          name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip

   windows-sycl:
     runs-on: windows-latest
@@ -451,12 +451,11 @@
       WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
       WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"

     steps:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
-        with:
-          fetch-depth: 0

       - name: ccache
         uses: hendrikmuhs/ccache-action@v1.2.16
@@ -469,15 +468,18 @@
         run: |
           scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL

-      # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
-
       - name: Build
         id: cmake_build
-        run: examples/sycl/win-build-sycl.bat
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
+        shell: cmd
+        run: |
+          call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+          cmake -G "Ninja" -B build ^
+            -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
+            -DCMAKE_BUILD_TYPE=Release ^
+            -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
+            -DGGML_CPU=OFF -DGGML_SYCL=ON ^
+            -DLLAMA_CURL=OFF
+          cmake --build build --target ggml-sycl -j

       - name: Build the release package
         id: pack_artifacts
@@ -502,12 +504,12 @@
           cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin

           echo "cp oneAPI running time dll files to ./build/bin done"
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
+          7z a llama-bin-win-sycl-x64.zip ./build/bin/*

       - name: Upload the release package
         uses: actions/upload-artifact@v4
         with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
+          path: llama-bin-win-sycl-x64.zip
           name: llama-bin-win-sycl-x64.zip

   windows-hip:
@@ -515,14 +517,14 @@

     strategy:
       matrix:
-        gpu_target: [gfx1100, gfx1101, gfx1030]
+        include:
+          - name: "radeon"
+            gpu_targets: "gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"

     steps:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
-        with:
-          fetch-depth: 0

       - name: Clone rocWMMA repository
         id: clone_rocwmma
@@ -532,7 +534,7 @@
       - name: ccache
         uses: hendrikmuhs/ccache-action@v1.2.16
         with:
-          key: windows-latest-cmake-hip-release
+          key: windows-latest-cmake-hip-${{ matrix.name }}-x64
           evict-old-files: 1d

       - name: Install
@@ -550,50 +552,39 @@
         run: |
           & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version

-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-
       - name: Build
         id: cmake_build
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
         run: |
           $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
           $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
           cmake -G "Unix Makefiles" -B build -S . `
             -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
             -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
+            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
             -DCMAKE_BUILD_TYPE=Release `
-            -DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
+            -DGGML_BACKEND_DL=ON `
+            -DGGML_NATIVE=OFF `
+            -DGGML_CPU=OFF `
+            -DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" `
             -DGGML_HIP_ROCWMMA_FATTN=ON `
             -DGGML_HIP=ON `
-            -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
+            -DLLAMA_CURL=OFF
+          cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
+          md "build\bin\rocblas\library\"
+          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"

-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
       - name: Pack artifacts
         id: pack_artifacts
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
         run: |
-          cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
+          7z a llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*

       - name: Upload artifacts
         uses: actions/upload-artifact@v4
         with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
-          name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
+          path: llama-bin-win-hip-${{ matrix.name }}-x64.zip
+          name: llama-bin-win-hip-${{ matrix.name }}-x64.zip

   ios-xcode-build:
     runs-on: macos-latest
```
```diff
@@ -655,14 +646,16 @@
     runs-on: ubuntu-latest

     needs:
-      - ubuntu-22-cpu
-      - ubuntu-22-vulkan
       - windows
+      - windows-cpu
       - windows-cuda
       - windows-sycl
       - windows-hip
+      - ubuntu-22-cpu
+      - ubuntu-22-vulkan
       - macOS-arm64
       - macOS-x64
+      - ios-xcode-build

     steps:
       - name: Clone
```
```diff
@@ -680,10 +673,43 @@
         uses: actions/download-artifact@v4
         with:
           path: ./artifact
+          merge-multiple: true

       - name: Move artifacts
         id: move_artifacts
-        run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
+        run: |
+          mkdir -p release
+
+          echo "Adding CPU backend files to existing zips..."
+          for arch in x64 arm64; do
+            cpu_zip="artifact/llama-bin-win-cpu-${arch}.zip"
+            temp_dir=$(mktemp -d)
+            echo "Extracting CPU backend for $arch..."
+            unzip "$cpu_zip" -d "$temp_dir"
+
+            echo "Adding CPU files to $arch zips..."
+            for target_zip in artifact/llama-bin-win-*-${arch}.zip; do
+              if [[ "$target_zip" == "$cpu_zip" ]]; then
+                continue
+              fi
+              echo "Adding CPU backend to $(basename "$target_zip")"
+              realpath_target_zip=$(realpath "$target_zip")
+              (cd "$temp_dir" && zip -r "$realpath_target_zip" .)
+            done
+
+            rm -rf "$temp_dir"
+          done
+
+          echo "Renaming and moving zips to release..."
+          for zip_file in artifact/llama-bin-win-*.zip; do
+            base_name=$(basename "$zip_file" .zip)
+            zip_name="llama-${{ steps.tag.outputs.name }}-${base_name#llama-}.zip"
+            echo "Moving $zip_file to release/$zip_name"
+            mv "$zip_file" "release/$zip_name"
+          done
+
+          echo "Moving other artifacts..."
+          mv -v artifact/*.zip release

       - name: Create release
         id: create_release
@@ -702,7 +728,7 @@
             const path = require('path');
             const fs = require('fs');
             const release_id = '${{ steps.create_release.outputs.id }}';
-            for (let file of await fs.readdirSync('./artifact/release')) {
+            for (let file of await fs.readdirSync('./release')) {
               if (path.extname(file) === '.zip') {
                 console.log('uploadReleaseAsset', file);
                 await github.repos.uploadReleaseAsset({
@@ -710,7 +736,7 @@
                   repo: context.repo.repo,
                   release_id: release_id,
                   name: file,
-                  data: await fs.readFileSync(`./artifact/release/${file}`)
+                  data: await fs.readFileSync(`./release/${file}`)
                 });
               }
             }
```
**.github/workflows/winget.yml** (vendored, new file): 42 lines added

```diff
@@ -0,0 +1,42 @@
+name: Update Winget Package
+
+on:
+  workflow_dispatch: # allows manual triggering
+  schedule:
+    - cron: '28 5 * * *' # Update every day at 5:28 UTC
+
+jobs:
+  update:
+    name: Update Winget Package
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Install cargo binstall
+        uses: cargo-bins/cargo-binstall@268643a6b5ea099f5718ee5cd3ff7dc89a5eb49b
+
+      - name: Install komac
+        run: |
+          cargo binstall komac@2.11.2 -y
+
+      - name: Find latest release
+        id: find_latest_release
+        uses: actions/github-script@v6
+        with:
+          script: |
+            const { data: releases } = await github.rest.repos.listReleases({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+            });
+            console.log("Latest release:", releases[0].tag_name);
+            return releases[0].tag_name;
+
+      - name: Update manifest
+        env:
+          VERSION: ${{ steps.find_latest_release.outputs.result }}
+        run: |
+          echo "Updating manifest..."
+          komac update --version ${{ env.VERSION }} \
+            --urls "https://github.com/ggml-org/llama.cpp/releases/download/${{ env.VERSION }}/llama-${{ env.VERSION }}-bin-win-vulkan-x64.zip" \
+            --token ${{ secrets.WINGET_GITHUB_TOKEN }} \
+            --submit \
+            ggml.llamacpp
```
```diff
@@ -37,7 +37,7 @@ range of hardware - locally and in the cloud.
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2, AVX512 and AMX support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
-- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
+- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
@@ -130,6 +130,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 <details>
 <summary>Bindings</summary>

+- Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama)
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
@@ -237,7 +238,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [BLAS](docs/build.md#blas-build) | All |
 | [BLIS](docs/backend/BLIS.md) | All |
 | [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
-| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
+| [MUSA](docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
 | [HIP](docs/build.md#hip) | AMD GPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |
@@ -580,3 +581,4 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
 - [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
 - [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
 - [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
+- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
```
````diff
@@ -54,7 +54,7 @@ docker run --privileged -it \
     -v $HOME/llama.cpp/ci-cache:/ci-cache \
     -v $HOME/llama.cpp/ci-results:/ci-results \
     -v $PWD:/ws -w /ws \
-    mthreads/musa:rc3.1.1-devel-ubuntu22.04
+    mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
 ```

 Inside the container, execute the following commands:
````
```diff
@@ -58,19 +58,20 @@ add_library(${TARGET} STATIC
     arg.cpp
     arg.h
     base64.hpp
+    chat-parser.cpp
+    chat-parser.h
     chat.cpp
     chat.h
     common.cpp
     common.h
     console.cpp
     console.h
+    json-partial.cpp
+    json-partial.h
     json-schema-to-grammar.cpp
-    json.hpp
     llguidance.cpp
     log.cpp
     log.h
-    minja/chat-template.hpp
-    minja/minja.hpp
     ngram-cache.cpp
     ngram-cache.h
     regex-partial.cpp
@@ -143,7 +144,7 @@ if (LLAMA_LLGUIDANCE)
     set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
 endif ()

-target_include_directories(${TARGET} PUBLIC .)
+target_include_directories(${TARGET} PUBLIC . ../vendor)
 target_compile_features   (${TARGET} PUBLIC cxx_std_17)
 target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
```

**common/arg.cpp**: 292 lines changed

```diff
@@ -1,10 +1,11 @@
-#include "gguf.h" // for reading GGUF splits
 #include "arg.h"

+#include "chat.h"
 #include "common.h"
+#include "gguf.h" // for reading GGUF splits
+#include "json-schema-to-grammar.h"
 #include "log.h"
 #include "sampling.h"
-#include "chat.h"

 // fix problem with std::min and std::max
 #if defined(_WIN32)
@@ -15,6 +16,9 @@
 #include <windows.h>
 #endif

+#define JSON_ASSERT GGML_ASSERT
+#include <nlohmann/json.hpp>
+
 #include <algorithm>
 #include <climits>
 #include <cstdarg>
@@ -34,12 +38,10 @@
 #include <future>
 #endif

-#include "json-schema-to-grammar.h"
-
 using json = nlohmann::ordered_json;

 std::initializer_list<enum llama_example> mmproj_examples = {
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_SERVER,
 };
```
```diff
@@ -242,7 +244,56 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
 }

 // download one single file from remote URL to local path
-static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
+static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
+    // Check if the file already exists locally
+    auto file_exists = std::filesystem::exists(path);
+
+    // If the file exists, check its JSON metadata companion file.
+    std::string metadata_path = path + ".json";
+    nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
+    std::string etag;
+    std::string last_modified;
+
+    if (file_exists) {
+        if (offline) {
+            LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
+            return true; // skip verification/downloading
+        }
+        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
+        std::ifstream metadata_in(metadata_path);
+        if (metadata_in.good()) {
+            try {
+                metadata_in >> metadata;
+                LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
+                    etag = metadata.at("etag");
+                }
+                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
+                    last_modified = metadata.at("lastModified");
+                }
+            } catch (const nlohmann::json::exception & e) {
+                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+            }
+        }
+        // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
+    } else {
+        if (offline) {
+            LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
+            return false;
+        }
+        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
+    }
+
+    // Send a HEAD request to retrieve the etag and last-modified headers
+    struct common_load_model_from_url_headers {
+        std::string etag;
+        std::string last_modified;
+    };
+
+    common_load_model_from_url_headers headers;
+    bool head_request_ok = false;
+    bool should_download = !file_exists; // by default, we should download if the file does not exist
+
     // Initialize libcurl
     curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
     curl_slist_ptr http_headers;
```
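The crux of the new `offline` parameter is the early exit at the top of the function: when a cached copy exists it is trusted as-is (no HEAD request, no download), and when nothing is cached the function fails immediately instead of touching the network. A minimal standalone sketch of just that decision, with a hypothetical helper name (`resolve_offline`) standing in for the real function:

```cpp
#include <cstdio>
#include <filesystem>
#include <string>

// Sketch of the offline short-circuit added to common_download_file_single():
// in offline mode the local cache is authoritative.
static bool resolve_offline(const std::string & path, bool offline) {
    const bool file_exists = std::filesystem::exists(path);
    if (!offline) {
        return true; // online: fall through to the usual ETag/Last-Modified checks
    }
    if (file_exists) {
        std::printf("using cached file (offline mode): %s\n", path.c_str());
        return true; // skip verification/downloading entirely
    }
    std::printf("required file is not available in cache (offline mode): %s\n", path.c_str());
    return false; // nothing cached and no network allowed
}

int main() {
    return resolve_offline("model.gguf", /*offline=*/true) ? 0 : 1;
}
```

The flag itself is registered further down in this file as `--offline` (environment variable `LLAMA_OFFLINE`), so a previously downloaded model keeps working with no network access at all.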
```diff
@@ -269,91 +320,47 @@ static bool common_download_file_single(const std::string & url, const std::stri
     curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
 #endif

-    // Check if the file already exists locally
-    auto file_exists = std::filesystem::exists(path);
+    typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
+    auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
+        common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;

-    // If the file exists, check its JSON metadata companion file.
-    std::string metadata_path = path + ".json";
-    nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
-    std::string etag;
-    std::string last_modified;
+        static std::regex header_regex("([^:]+): (.*)\r\n");
+        static std::regex etag_regex("ETag", std::regex_constants::icase);
+        static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);

-    if (file_exists) {
-        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
-        std::ifstream metadata_in(metadata_path);
-        if (metadata_in.good()) {
-            try {
-                metadata_in >> metadata;
-                LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
-                    etag = metadata.at("etag");
-                }
-                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
-                    last_modified = metadata.at("lastModified");
-                }
-            } catch (const nlohmann::json::exception & e) {
-                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+        std::string header(buffer, n_items);
+        std::smatch match;
+        if (std::regex_match(header, match, header_regex)) {
+            const std::string & key = match[1];
+            const std::string & value = match[2];
+            if (std::regex_match(key, match, etag_regex)) {
+                headers->etag = value;
+            } else if (std::regex_match(key, match, last_modified_regex)) {
+                headers->last_modified = value;
             }
         }
-        // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
-    } else {
-        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
-    }
+        return n_items;
+    };

-    // Send a HEAD request to retrieve the etag and last-modified headers
-    struct common_load_model_from_url_headers {
-        std::string etag;
-        std::string last_modified;
-    };
+    curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
+    curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);

-    common_load_model_from_url_headers headers;
-    bool head_request_ok = false;
-    bool should_download = !file_exists; // by default, we should download if the file does not exist
+    // we only allow retrying once for HEAD requests
+    // this is for the use case of using running offline (no internet), retrying can be annoying
+    bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
+    if (!was_perform_successful) {
+        head_request_ok = false;
+    }

-    // get ETag to see if the remote file has changed
-    {
-        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
-        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
-
-            static std::regex header_regex("([^:]+): (.*)\r\n");
-            static std::regex etag_regex("ETag", std::regex_constants::icase);
-            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
-
-            std::string header(buffer, n_items);
-            std::smatch match;
-            if (std::regex_match(header, match, header_regex)) {
-                const std::string & key = match[1];
-                const std::string & value = match[2];
-                if (std::regex_match(key, match, etag_regex)) {
-                    headers->etag = value;
-                } else if (std::regex_match(key, match, last_modified_regex)) {
-                    headers->last_modified = value;
-                }
-            }
-            return n_items;
-        };
-
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
-
-        // we only allow retrying once for HEAD requests
-        // this is for the use case of using running offline (no internet), retrying can be annoying
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD");
-        if (!was_perform_successful) {
-            head_request_ok = false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code == 200) {
-            head_request_ok = true;
-        } else {
-            LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
-            head_request_ok = false;
-        }
+    long http_code = 0;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+    if (http_code == 200) {
+        head_request_ok = true;
+    } else {
+        LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+        head_request_ok = false;
     }

     // if head_request_ok is false, we don't have the etag or last-modified headers
```
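The relocated `header_callback` extracts the ETag and Last-Modified values with three regexes: `header_regex` splits each raw `Key: Value\r\n` line, while the other two match the header names case-insensitively with a full-string `std::regex_match`. A small self-contained demo of the same parsing, using the identical patterns (the surrounding test harness is made up):

```cpp
#include <iostream>
#include <regex>
#include <string>

int main() {
    // Same patterns as the header_callback in common/arg.cpp above.
    static const std::regex header_regex("([^:]+): (.*)\r\n");
    static const std::regex etag_regex("ETag", std::regex_constants::icase);
    static const std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);

    std::string etag, last_modified;
    for (const std::string & header : { std::string("etag: \"abc123\"\r\n"),
                                        std::string("last-modified: Tue, 01 Apr 2025 00:00:00 GMT\r\n") }) {
        std::smatch match;
        if (std::regex_match(header, match, header_regex)) {
            const std::string key   = match[1];
            const std::string value = match[2];
            std::smatch key_match;
            if (std::regex_match(key, key_match, etag_regex)) {
                etag = value;                 // header name matched case-insensitively
            } else if (std::regex_match(key, key_match, last_modified_regex)) {
                last_modified = value;
            }
        }
    }
    std::cout << "etag=" << etag << " last-modified=" << last_modified << "\n";
}
```

Note that `std::regex_match` requires the whole string to match, which is why `etag_regex` is just the header name: it is matched against the already-isolated key, not the full header line.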
```diff
@@ -460,12 +467,12 @@ static bool common_download_file_single(const std::string & url, const std::stri

 // download multiple files from remote URLs to local paths
 // the input is a vector of pairs <url, path>
-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
     // Prepare download in parallel
     std::vector<std::future<bool>> futures_download;
     for (auto const & item : urls) {
-        futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
-            return common_download_file_single(it.first, it.second, bearer_token);
+        futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
+            return common_download_file_single(it.first, it.second, bearer_token, offline);
         }, item));
     }

@@ -481,14 +488,15 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::strin

 static bool common_download_model(
         const common_params_model & model,
-        const std::string & bearer_token) {
+        const std::string & bearer_token,
+        bool offline) {
     // Basic validation of the model.url
     if (model.url.empty()) {
         LOG_ERR("%s: invalid model url\n", __func__);
         return false;
     }

-    if (!common_download_file_single(model.url, model.path, bearer_token)) {
+    if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
         return false;
     }

@@ -547,7 +555,7 @@
         }

         // Download in parallel
-        common_download_file_multiple(urls, bearer_token);
+        common_download_file_multiple(urls, bearer_token, offline);
     }

     return true;
@@ -608,7 +616,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
  *
  * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
  */
-static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
+static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
     auto parts = string_split<std::string>(hf_repo_with_tag, ':');
     std::string tag = parts.size() > 1 ? parts.back() : "latest";
     std::string hf_repo = parts[0];
```
```diff
@@ -638,20 +646,25 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
     long res_code = 0;
     std::string res_str;
     bool use_cache = false;
-    try {
-        auto res = common_remote_get_content(url, params);
-        res_code = res.first;
-        res_str = std::string(res.second.data(), res.second.size());
-    } catch (const std::exception & e) {
-        LOG_WRN("error: failed to get manifest: %s\n", e.what());
-        LOG_WRN("try reading from cache\n");
-        // try to read from cache
+    if (!offline) {
         try {
+            auto res = common_remote_get_content(url, params);
+            res_code = res.first;
+            res_str = std::string(res.second.data(), res.second.size());
+        } catch (const std::exception & e) {
+            LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what());
+        }
+    }
+    if (res_code == 0) {
+        if (std::filesystem::exists(cached_response_path)) {
+            LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str());
             res_str = read_file(cached_response_path);
             res_code = 200;
             use_cache = true;
-        } catch (const std::exception & e) {
-            throw std::runtime_error("error: failed to get manifest (check your internet connection)");
+        } else {
+            throw std::runtime_error(
+                offline ? "error: failed to get manifest (offline mode)"
+                        : "error: failed to get manifest (check your internet connection)");
         }
     }
     std::string ggufFile;
```
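The resulting lookup order for the HF manifest is: hit the network unless `offline` is set; if that fails (or was skipped) fall back to the cached manifest on disk; and only if neither source is available throw, with an error message that distinguishes offline mode from a plain connectivity problem.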
```diff
@@ -698,24 +711,25 @@ bool common_has_curl() {
     return false;
 }

-static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
+static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from internet\n");
     return false;
 }

-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return false;
 }

 static bool common_download_model(
         const common_params_model &,
-        const std::string &) {
+        const std::string &,
+        bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return false;
 }

-static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
+static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return {};
 }
@@ -742,7 +756,8 @@ struct handle_model_result {
 static handle_model_result common_params_handle_model(
         struct common_params_model & model,
         const std::string & bearer_token,
-        const std::string & model_path_default) {
+        const std::string & model_path_default,
+        bool offline) {
     handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {
@@ -750,7 +765,7 @@ static handle_model_result common_params_handle_model(
             // short-hand to avoid specifying --hf-file -> default it to --model
             if (model.hf_file.empty()) {
                 if (model.path.empty()) {
-                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
+                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
                     if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
                         exit(1); // built without CURL, error message already printed
                     }
@@ -791,7 +806,7 @@

     // then, download it if needed
     if (!model.url.empty()) {
-        bool ok = common_download_model(model, bearer_token);
+        bool ok = common_download_model(model, bearer_token, offline);
         if (!ok) {
             LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
             exit(1);
@@ -934,7 +949,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context

     // handle model and download
     {
-        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
+        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
         if (params.no_mmproj) {
             params.mmproj = {};
         } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@@ -944,12 +959,12 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         // only download mmproj if the current example is using it
         for (auto & ex : mmproj_examples) {
             if (ctx_arg.ex == ex) {
-                common_params_handle_model(params.mmproj, params.hf_token, "");
+                common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
                 break;
             }
         }
-        common_params_handle_model(params.speculative.model, params.hf_token, "");
-        common_params_handle_model(params.vocoder.model, params.hf_token, "");
+        common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
+        common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
     }

     if (params.escape) {
@@ -1333,9 +1348,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--prio"}, "N",
-        string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
+        string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
         [](common_params & params, int prio) {
-            if (prio < 0 || prio > 3) {
+            if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
                 throw std::invalid_argument("invalid value");
             }
             params.cpuparams.priority = (enum ggml_sched_priority) prio;
@@ -1445,6 +1460,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_keep = value;
         }
     ));
+    add_opt(common_arg(
+        {"--swa-full"},
+        string_format("use full-size SWA cache (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
+        [](common_params & params) {
+            params.swa_full = true;
+        }
+    ).set_env("LLAMA_ARG_SWA_FULL"));
     add_opt(common_arg(
         {"--no-context-shift"},
         string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -1670,7 +1693,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
@@ -2057,13 +2080,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.grp_attn_w = value;
         }
     ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
-    add_opt(common_arg(
-        {"-dkvc", "--dump-kv-cache"},
-        "verbose print of the KV cache",
-        [](common_params & params) {
-            params.dump_kv_cache = true;
-        }
-    ));
     add_opt(common_arg(
         {"-nkvo", "--no-kv-offload"},
         "disable KV offload",
@@ -2232,12 +2248,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
-        {"--image"}, "FILE",
-        "path to an image file. use with multimodal models. Specify multiple times for batching",
+        {"--image", "--audio"}, "FILE",
+        "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
         [](common_params & params, const std::string & value) {
             params.image.emplace_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    ).set_examples({LLAMA_EXAMPLE_MTMD}));
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",
```
```diff
@@ -2847,15 +2863,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
-        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
-        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
-        "only supported for non-streamed responses",
+        "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
+        "- none: leaves thoughts unparsed in `message.content`\n"
+        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+        "(default: deepseek)",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
             else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
-            else { std::invalid_argument("invalid value"); }
+            else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
+    add_opt(common_arg(
+        {"--reasoning-budget"}, "N",
+        "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
+        [](common_params & params, int value) {
+            if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
+            params.reasoning_budget = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
@@ -2867,7 +2892,7 @@
         [](common_params & params, const std::string & value) {
             params.chat_template = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
     add_opt(common_arg(
         {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
         string_format(
@@ -2954,7 +2979,7 @@
         [](common_params & params, const std::string & value) {
             /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
             else if (value == "md") { params.batched_bench_output_jsonl = false; }
-            else { std::invalid_argument("invalid value"); }
+            else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
     add_opt(common_arg(
```
@@ -2986,6 +3011,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
common_log_set_verbosity_thold(INT_MAX);
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--offline"},
|
||||
"Offline mode: forces use of cache, prevents network access",
|
||||
[](common_params & params) {
|
||||
params.offline = true;
|
||||
}
|
||||
).set_env("LLAMA_OFFLINE"));
|
||||
add_opt(common_arg(
|
||||
{"-lv", "--verbosity", "--log-verbosity"}, "N",
|
||||
"Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
|
||||
|
||||
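Note on the two `throw std::invalid_argument` hunks above: the pre-change code evaluated `std::invalid_argument("invalid value");` as a bare expression, which constructs and immediately destroys the exception object without throwing it, so unknown values were silently accepted. A minimal standalone sketch of this bug class (illustrative only, not llama.cpp code):

    #include <cstdio>
    #include <stdexcept>
    #include <string>

    // Hypothetical validator mirroring the fixed pattern above.
    static void set_reasoning_format(const std::string & value) {
        if (value != "deepseek" && value != "none") {
            // std::invalid_argument("invalid value");    // BUG: constructs the exception, never throws
            throw std::invalid_argument("invalid value"); // FIX: actually signals the error
        }
    }

    int main() {
        try {
            set_reasoning_format("bogus");
        } catch (const std::invalid_argument & e) {
            std::printf("rejected: %s\n", e.what()); // reached only with the `throw`
        }
        return 0;
    }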
380	common/chat-parser.cpp	Normal file
@@ -0,0 +1,380 @@
#include "chat-parser.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "regex-partial.h"
|
||||
|
||||
#include <optional>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax)
|
||||
: input_(input), is_partial_(is_partial), syntax_(syntax)
|
||||
{
|
||||
result_.role = "assistant";
|
||||
|
||||
while (true) {
|
||||
std::string id = std::to_string(std::rand());
|
||||
if (input.find(id) == std::string::npos) {
|
||||
healing_marker_ = id;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::string common_chat_msg_parser::str(const common_string_range & rng) const {
|
||||
GGML_ASSERT(rng.begin <= rng.end);
|
||||
return input_.substr(rng.begin, rng.end - rng.begin);
|
||||
}
|
||||
|
||||
void common_chat_msg_parser::add_content(const std::string &content) {
|
||||
result_.content += content;
|
||||
}
|
||||
|
||||
void common_chat_msg_parser::add_reasoning_content(const std::string &reasoning_content) {
|
||||
result_.reasoning_content += reasoning_content;
|
||||
}
|
||||
|
||||
bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::string & id, const std::string & arguments) {
|
||||
if (name.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
common_chat_tool_call tool_call;
|
||||
tool_call.name = name;
|
||||
tool_call.arguments = arguments;
|
||||
tool_call.id = id;
|
||||
|
||||
// LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
|
||||
result_.tool_calls.emplace_back(tool_call);
|
||||
return true;
|
||||
}
|
||||
bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
|
||||
std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
|
||||
std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
|
||||
std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
|
||||
return add_tool_call(name, id, arguments);
|
||||
}
|
||||
|
||||
bool common_chat_msg_parser::add_tool_calls(const json & arr) {
|
||||
for (const auto & item : arr) {
|
||||
if (!add_tool_call(item)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
void common_chat_msg_parser::finish() {
|
||||
if (!is_partial_ && pos_ != input_.size()) {
|
||||
throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
|
||||
}
|
||||
}
|
||||
|
||||
bool common_chat_msg_parser::consume_spaces() {
|
||||
const auto length = input_.size();
|
||||
auto consumed = false;
|
||||
while (pos_ < length && std::isspace(input_[pos_])) {
|
||||
++pos_;
|
||||
consumed = true;
|
||||
}
|
||||
return consumed;
|
||||
}
|
||||
|
||||
bool common_chat_msg_parser::try_consume_literal(const std::string & literal) {
|
||||
auto pos = pos_;
|
||||
for (auto i = 0u; i < literal.size(); ++i) {
|
||||
if (pos >= input_.size()) {
|
||||
return false;
|
||||
}
|
||||
if (input_[pos] != literal[i]) {
|
||||
return false;
|
||||
}
|
||||
++pos;
|
||||
}
|
||||
pos_ = pos;
|
||||
return true;
|
||||
}
|
||||
|
||||
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_literal(const std::string & literal) {
|
||||
auto idx = input_.find(literal, pos_);
|
||||
if (idx != std::string::npos) {
|
||||
find_regex_result res;
|
||||
res.prelude = input_.substr(pos_, idx - pos_);
|
||||
auto end = idx + literal.size();
|
||||
res.groups.emplace_back(common_string_range{idx, end});
|
||||
move_to(end);
|
||||
return res;
|
||||
}
|
||||
if (is_partial_) {
|
||||
idx = string_find_partial_stop(input_, literal);
|
||||
if (idx != std::string::npos && idx >= pos_) {
|
||||
find_regex_result res;
|
||||
res.prelude = input_.substr(pos_, idx - pos_);
|
||||
auto end = input_.size();
|
||||
res.groups.emplace_back(common_string_range{idx, end});
|
||||
move_to(end);
|
||||
return res;
|
||||
}
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
void common_chat_msg_parser::consume_literal(const std::string & literal) {
|
||||
if (!try_consume_literal(literal)) {
|
||||
throw common_chat_msg_partial_exception(literal);
|
||||
}
|
||||
}
|
||||
|
||||
bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
|
||||
auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
|
||||
auto stripped_reasoning = string_strip(reasoning);
|
||||
if (stripped_reasoning.empty()) {
|
||||
return;
|
||||
}
|
||||
if (syntax_.reasoning_in_content) {
|
||||
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "<think>" : start_think);
|
||||
add_content(stripped_reasoning);
|
||||
if (closed) {
|
||||
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
|
||||
}
|
||||
} else {
|
||||
add_reasoning_content(stripped_reasoning);
|
||||
}
|
||||
};
|
||||
if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
|
||||
if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
|
||||
if (auto res = try_find_literal(end_think)) {
|
||||
handle_reasoning(res->prelude, /* closed */ true);
|
||||
consume_spaces();
|
||||
return true;
|
||||
}
|
||||
auto rest = consume_rest();
|
||||
if (!rest.empty()) {
|
||||
handle_reasoning(rest, /* closed */ !is_partial());
|
||||
}
|
||||
// Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
|
||||
// if (!syntax_.thinking_forced_open) {
|
||||
// throw common_chat_msg_partial_exception(end_think);
|
||||
// }
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string common_chat_msg_parser::consume_rest() {
|
||||
auto rest = input_.substr(pos_);
|
||||
pos_ = input_.size();
|
||||
return rest;
|
||||
}
|
||||
|
||||
// Tries to find the regex, consumes it (pos right after it) and gives the prelude (right before it) and the groups to the callback.
|
||||
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from, bool add_prelude_to_content) {
|
||||
auto m = regex.search(input_, from == std::string::npos ? pos_ : from);
|
||||
if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
|
||||
return std::nullopt;
|
||||
}
|
||||
auto prelude = input_.substr(pos_, m.groups[0].begin - pos_);
|
||||
pos_ = m.groups[0].end;
|
||||
|
||||
if (add_prelude_to_content) {
|
||||
add_content(prelude);
|
||||
}
|
||||
if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
|
||||
if (is_partial()) {
|
||||
throw common_chat_msg_partial_exception(regex.str());
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
return find_regex_result{prelude, m.groups};
|
||||
}
|
||||
|
||||
common_chat_msg_parser::find_regex_result common_chat_msg_parser::consume_regex(const common_regex & regex) {
|
||||
if (auto result = try_consume_regex(regex)) {
|
||||
return *result;
|
||||
}
|
||||
throw common_chat_msg_partial_exception(regex.str());
|
||||
}
|
||||
|
||||
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_consume_regex(const common_regex & regex) {
|
||||
auto m = regex.search(input_, pos_);
|
||||
if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
|
||||
return std::nullopt;
|
||||
}
|
||||
if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
|
||||
if (is_partial()) {
|
||||
throw common_chat_msg_partial_exception(regex.str());
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
if (m.groups[0].begin != pos_) {
|
||||
// Didn't match at the current position.
|
||||
return std::nullopt;
|
||||
}
|
||||
pos_ = m.groups[0].end;
|
||||
|
||||
return find_regex_result {
|
||||
/* .prelude = */ "",
|
||||
m.groups,
|
||||
};
|
||||
}
|
||||
|
||||
std::optional<common_json> common_chat_msg_parser::try_consume_json() {
|
||||
auto it = input_.cbegin() + pos_;
|
||||
const auto end = input_.cend();
|
||||
common_json result;
|
||||
if (!common_json_parse(it, end, healing_marker_, result)) {
|
||||
return std::nullopt;
|
||||
}
|
||||
pos_ = std::distance(input_.cbegin(), it);
|
||||
if (result.healing_marker.marker.empty()) {
|
||||
// No healing marker, just return the parsed json
|
||||
return result;
|
||||
}
|
||||
if (!is_partial()) {
|
||||
throw common_chat_msg_partial_exception("JSON");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
common_json common_chat_msg_parser::consume_json() {
|
||||
if (auto result = try_consume_json()) {
|
||||
return *result;
|
||||
}
|
||||
throw common_chat_msg_partial_exception("JSON");
|
||||
}
|
||||
|
||||
common_chat_msg_parser::consume_json_result common_chat_msg_parser::consume_json_with_dumped_args(
|
||||
const std::vector<std::vector<std::string>> & args_paths,
|
||||
const std::vector<std::vector<std::string>> & content_paths
|
||||
) {
|
||||
if (auto result = try_consume_json_with_dumped_args(args_paths, content_paths)) {
|
||||
return *result;
|
||||
}
|
||||
throw common_chat_msg_partial_exception("JSON");
|
||||
}
|
||||
|
||||
std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parser::try_consume_json_with_dumped_args(
|
||||
const std::vector<std::vector<std::string>> & args_paths,
|
||||
const std::vector<std::vector<std::string>> & content_paths
|
||||
) {
|
||||
auto partial = try_consume_json();
|
||||
if (!partial) {
|
||||
return std::nullopt;
|
||||
}
|
||||
auto is_arguments_path = [&](const std::vector<std::string> & path) {
|
||||
return std::find(args_paths.begin(), args_paths.end(), path) != args_paths.end();
|
||||
};
|
||||
auto is_content_path = [&](const std::vector<std::string> & path) {
|
||||
return std::find(content_paths.begin(), content_paths.end(), path) != content_paths.end();
|
||||
};
|
||||
|
||||
if (partial->healing_marker.marker.empty()) {
|
||||
if (args_paths.empty()) {
|
||||
// No arguments to dump, and JSON was parsed fully.
|
||||
return consume_json_result {
|
||||
partial->json,
|
||||
/* .is_partial = */ false,
|
||||
};
|
||||
}
|
||||
if (is_arguments_path({})) {
|
||||
// Entire JSON is the arguments and was parsed fully.
|
||||
return consume_json_result {
|
||||
partial->json.dump(),
|
||||
/* .is_partial = */ false,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
LOG_DBG("Parsed partial JSON: %s (json_healing_marker: %s)\n", partial->json.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
|
||||
|
||||
auto found_healing_marker = false;
|
||||
std::vector<std::string> path;
|
||||
std::function<json(const json &)> remove_unsupported_healings_and_dump_args = [&](const json & j) -> json {
|
||||
if (is_arguments_path(path)) {
|
||||
auto arguments = j.dump();
|
||||
if (is_partial() && !partial->healing_marker.marker.empty()) {
|
||||
auto idx = arguments.find(partial->healing_marker.json_dump_marker);
|
||||
if (idx != std::string::npos) {
|
||||
arguments.resize(idx);
|
||||
found_healing_marker = true;
|
||||
}
|
||||
if (arguments == "\"") {
|
||||
// This happens because of completing `:"$magic` after `"arguments"`
|
||||
arguments = "";
|
||||
}
|
||||
}
|
||||
return arguments;
|
||||
}
|
||||
if (is_content_path(path)) {
|
||||
if (!j.is_string()) {
|
||||
throw std::runtime_error("Content path must be a string");
|
||||
}
|
||||
std::string str = j;
|
||||
auto idx = str.find(partial->healing_marker.marker); // not using json_dump_marker as we're inside a string
|
||||
if (idx != std::string::npos) {
|
||||
str.resize(idx);
|
||||
found_healing_marker = true;
|
||||
}
|
||||
return str;
|
||||
}
|
||||
if (j.is_object()) {
|
||||
auto obj = json::object();
|
||||
for (const auto & p : j.items()) {
|
||||
const auto & key = p.key();
|
||||
const auto & value = p.value();
|
||||
const std::string key_str = key; // NOLINT
|
||||
auto idx = key_str.find(healing_marker_);
|
||||
if (idx != std::string::npos) {
|
||||
found_healing_marker = true;
|
||||
break;
|
||||
}
|
||||
path.push_back(key_str);
|
||||
if (value.is_string()) {
|
||||
const std::string value_str = value;
|
||||
if (value_str.find(healing_marker_) != std::string::npos) {
|
||||
found_healing_marker = true;
|
||||
if (is_content_path(path)) {
|
||||
if (partial->healing_marker.marker == partial->healing_marker.json_dump_marker) {
|
||||
// The healing occurred inside the string: good. Otherwise we just ditch the entire key/value pair.
|
||||
obj[key] = remove_unsupported_healings_and_dump_args(value);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
obj[key] = value;
|
||||
} else {
|
||||
obj[key] = remove_unsupported_healings_and_dump_args(value);
|
||||
}
|
||||
path.pop_back();
|
||||
}
|
||||
return obj;
|
||||
}
|
||||
if (j.is_array()) {
|
||||
auto arr = json::array();
|
||||
for (const auto & value : j) {
|
||||
if (value.is_string()) {
|
||||
std::string str = value;
|
||||
auto idx = str.find(healing_marker_);
|
||||
if (idx != std::string::npos) {
|
||||
// Don't heal array values that aren't in the arguments.
|
||||
found_healing_marker = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
arr.push_back(remove_unsupported_healings_and_dump_args(value));
|
||||
}
|
||||
return arr;
|
||||
}
|
||||
return j;
|
||||
};
|
||||
|
||||
auto cleaned = remove_unsupported_healings_and_dump_args(partial->json);
|
||||
LOG_DBG("Cleaned up JSON %s to %s (json_healing_marker : '%s')\n", partial->json.dump().c_str(), cleaned.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
|
||||
return consume_json_result {
|
||||
cleaned,
|
||||
/* .is_partial = */ found_healing_marker,
|
||||
};
|
||||
}
|
||||
118	common/chat-parser.h	Normal file
@@ -0,0 +1,118 @@
#pragma once

#include "chat.h"
#include "json-partial.h"
#include "regex-partial.h"

#include <nlohmann/json.hpp>

#include <optional>
#include <string>
#include <vector>

class common_chat_msg_partial_exception : public std::runtime_error {
  public:
    common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}
};

class common_chat_msg_parser {
    std::string input_;
    bool is_partial_;
    common_chat_syntax syntax_;
    std::string healing_marker_;

    size_t pos_ = 0;
    common_chat_msg result_;

  public:
    common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
    const std::string & input() const { return input_; }
    size_t pos() const { return pos_; }
    const std::string & healing_marker() const { return healing_marker_; }
    const bool & is_partial() const { return is_partial_; }
    const common_chat_msg & result() const { return result_; }
    const common_chat_syntax & syntax() const { return syntax_; }

    void move_to(size_t pos) {
        if (pos > input_.size()) {
            throw std::runtime_error("Invalid position!");
        }
        pos_ = pos;
    }
    void move_back(size_t n) {
        if (pos_ < n) {
            throw std::runtime_error("Can't move back that far!");
        }
        pos_ -= n;
    }

    // Get the substring of the input at the given range
    std::string str(const common_string_range & rng) const;

    // Appends to the result.content field
    void add_content(const std::string & content);

    // Appends to the result.reasoning_content field
    void add_reasoning_content(const std::string & reasoning_content);

    // Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.
    bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);

    // Adds a tool call using the "name", "id" and "arguments" fields of the json object
    bool add_tool_call(const nlohmann::ordered_json & tool_call);

    // Adds an array of tool calls using their "name", "id" and "arguments" fields.
    bool add_tool_calls(const nlohmann::ordered_json & arr);

    void finish();

    bool consume_spaces();

    void consume_literal(const std::string & literal);

    bool try_parse_reasoning(const std::string & start_think, const std::string & end_think);

    std::string consume_rest();

    struct find_regex_result {
        std::string prelude;
        std::vector<common_string_range> groups;
    };

    std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true);

    bool try_consume_literal(const std::string & literal);

    std::optional<find_regex_result> try_find_literal(const std::string & literal);

    find_regex_result consume_regex(const common_regex & regex);

    std::optional<find_regex_result> try_consume_regex(const common_regex & regex);

    std::optional<common_json> try_consume_json();
    common_json consume_json();

    struct consume_json_result {
        nlohmann::ordered_json value;
        bool is_partial;
    };

    /*
        Consume (possibly partial) json and converts specific subtrees to (possibly truncated) JSON strings.

        By default, object keys can't be truncated, nor can string values (their corresponding key is removed,
        e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}`

        But one can allow subpaths to be kept truncated, and possibly json-dumped to truncated json strings
        - with `content_paths={{"foo"}}` -> `{"foo": "b` -> {"foo": "b"}`
        - with `args_paths={{"foo"}}` -> `{"foo": {"b` -> `{"foo": "{b"}`
    */
    consume_json_result consume_json_with_dumped_args(
        const std::vector<std::vector<std::string>> & args_paths = {},
        const std::vector<std::vector<std::string>> & content_paths = {}
    );
    std::optional<consume_json_result> try_consume_json_with_dumped_args(
        const std::vector<std::vector<std::string>> & args_paths = {},
        const std::vector<std::vector<std::string>> & content_paths = {}
    );
};
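A hedged sketch of how the parser above is meant to be driven; the field values and input text are illustrative, only the types and methods come from the two files in this diff:

    // Illustrative usage only (not part of this diff): extract a <think> block,
    // then treat the remainder as plain assistant content.
    common_chat_syntax syntax;
    syntax.format           = COMMON_CHAT_FORMAT_CONTENT_ONLY;
    syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;

    // is_partial = false: this is the final, complete generation.
    common_chat_msg_parser parser("<think>plan steps</think>Hello!", /* is_partial */ false, syntax);
    parser.try_parse_reasoning("<think>", "</think>");
    parser.add_content(parser.consume_rest());
    parser.finish(); // throws if unconsumed input remains on a non-partial parse

    const common_chat_msg & msg = parser.result();
    // msg.reasoning_content == "plan steps", msg.content == "Hello!"

With is_partial = true the same calls tolerate a truncated input (e.g. an unclosed think tag), which is what makes incremental re-parsing during streaming possible.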
1338	common/chat.cpp
File diff suppressed because it is too large
@@ -3,6 +3,7 @@
 #pragma once

 #include "common.h"
+#include <functional>
 #include <chrono>
 #include <string>
 #include <vector>
@@ -13,11 +14,19 @@ struct common_chat_tool_call {
     std::string name;
     std::string arguments;
     std::string id;
+
+    bool operator==(const common_chat_tool_call & other) const {
+        return name == other.name && arguments == other.arguments && id == other.id;
+    }
 };

 struct common_chat_msg_content_part {
     std::string type;
     std::string text;
+
+    bool operator==(const common_chat_msg_content_part & other) const {
+        return type == other.type && text == other.text;
+    }
 };

 struct common_chat_msg {
@@ -28,6 +37,51 @@ struct common_chat_msg {
     std::string reasoning_content;
     std::string tool_name;
     std::string tool_call_id;
+
+    template <class T> T to_json_oaicompat() const;
+
+    bool empty() const {
+        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
+    }
+    void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
+        for (auto i = 0u; i < tool_calls.size(); i++) {
+            if (ids_cache.size() <= i) {
+                auto id = tool_calls[i].id;
+                if (id.empty()) {
+                    id = gen_tool_call_id();
+                }
+                ids_cache.push_back(id);
+            }
+            tool_calls[i].id = ids_cache[i];
+        }
+    }
+    bool operator==(const common_chat_msg & other) const {
+        return role == other.role
+            && content == other.content
+            && content_parts == other.content_parts
+            && tool_calls == other.tool_calls
+            && reasoning_content == other.reasoning_content
+            && tool_name == other.tool_name
+            && tool_call_id == other.tool_call_id;
+    }
+    bool operator!=(const common_chat_msg & other) const {
+        return !(*this == other);
+    }
+};
+
+struct common_chat_msg_diff {
+    // std::string reasoning_content_delta;
+    std::string content_delta;
+    size_t tool_call_index = std::string::npos;
+    common_chat_tool_call tool_call_delta;
+
+    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
+
+    bool operator==(const common_chat_msg_diff & other) const {
+        return content_delta == other.content_delta
+            && tool_call_index == other.tool_call_index
+            && tool_call_delta == other.tool_call_delta;
+    }
+};

 struct common_chat_tool {
@@ -49,14 +103,11 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_LLAMA_3_X,
     COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
     COMMON_CHAT_FORMAT_DEEPSEEK_R1,
-    COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
-    COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
-    COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,

     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@@ -71,7 +122,8 @@ struct common_chat_templates_inputs {
     std::vector<common_chat_tool> tools;
     common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
     bool parallel_tool_calls = false;
-    bool extract_reasoning = true;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
 };

@@ -80,11 +132,21 @@ struct common_chat_params {
     std::string prompt;
     std::string grammar;
     bool grammar_lazy = false;
+    bool thinking_forced_open = false;
     std::vector<common_grammar_trigger> grammar_triggers;
    std::vector<std::string> preserved_tokens;
     std::vector<std::string> additional_stops;
 };

+struct common_chat_syntax {
+    common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
+    bool reasoning_in_content = false;
+    bool thinking_forced_open = false;
+    bool parse_tool_calls = true;
+};
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);

@@ -121,8 +183,9 @@ std::string common_chat_format_example(
     const struct common_chat_templates * tmpls,
     bool use_jinja);

-std::string common_chat_format_name(common_chat_format format);
-common_chat_msg common_chat_parse( const std::string & input, common_chat_format format);
+const char* common_chat_format_name(common_chat_format format);
+const char* common_reasoning_format_name(common_reasoning_format format);
+common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);

@@ -135,3 +198,5 @@ template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common
 // T can be std::string containing JSON or nlohmann::ordered_json
 template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
 template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
+
+template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
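The `common_chat_msg_diff` struct added above is what enables streaming of tool calls: the server re-parses the partial generation as it grows and emits only the deltas. A sketch of the intended flow, inferred from the declarations alone (the two sink functions are hypothetical):

    // Illustrative only: compute deltas between two successive partial parses.
    common_chat_msg prev = common_chat_parse(text_so_far,  /* is_partial */ true, syntax);
    common_chat_msg curr = common_chat_parse(text_updated, /* is_partial */ true, syntax);

    for (const auto & diff : common_chat_msg_diff::compute_diffs(prev, curr)) {
        if (!diff.content_delta.empty()) {
            send_content_chunk(diff.content_delta);                           // hypothetical sink
        }
        if (diff.tool_call_index != std::string::npos) {
            send_tool_call_chunk(diff.tool_call_index, diff.tool_call_delta); // hypothetical sink
        }
    }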
@@ -203,6 +203,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {

     DWORD p = NORMAL_PRIORITY_CLASS;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW:      p = BELOW_NORMAL_PRIORITY_CLASS; break;
         case GGML_SCHED_PRIO_NORMAL:   p = NORMAL_PRIORITY_CLASS; break;
         case GGML_SCHED_PRIO_MEDIUM:   p = ABOVE_NORMAL_PRIORITY_CLASS; break;
         case GGML_SCHED_PRIO_HIGH:     p = HIGH_PRIORITY_CLASS; break;
@@ -228,6 +229,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {

     int p = 0;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW:      p = 5; break;
         case GGML_SCHED_PRIO_NORMAL:   p = 0; break;
         case GGML_SCHED_PRIO_MEDIUM:   p = -5; break;
         case GGML_SCHED_PRIO_HIGH:     p = -10; break;
@@ -849,7 +851,7 @@ std::string fs_get_cache_directory() {
     if (getenv("LLAMA_CACHE")) {
         cache_directory = std::getenv("LLAMA_CACHE");
     } else {
-#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
+#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
         if (std::getenv("XDG_CACHE_HOME")) {
             cache_directory = std::getenv("XDG_CACHE_HOME");
         } else {
@@ -903,13 +905,16 @@ struct common_init_result common_init_from_params(common_params & params) {
         ok = false;
     }

-    if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
-        ok = false;
-    }
+    bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+    bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;

-    if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+    if (!has_eos && !has_sep) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+        ok = false;
+    } else if (!has_eos) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+    } else if (!has_sep) {
+        LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
         ok = false;
     }
@@ -1136,6 +1141,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
+    cparams.swa_full = params.swa_full;

     if (params.reranking) {
         cparams.embeddings = true;
@@ -1328,81 +1334,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
     return text;
 }

-//
-// KV cache utils
-//
-
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        int seq_count = 0;
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) { seq_count++; }
-        }
-        putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    std::unordered_map<llama_seq_id, size_t> seqs;
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] < 0) { continue; }
-            if (seqs.find(cs_curr[j]) == seqs.end()) {
-                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                const size_t sz = seqs.size();
-                seqs[cs_curr[j]] = sz;
-            }
-        }
-        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-    }
-
-    printf("=== Sequence legend: ");
-    for (const auto & it : seqs) {
-        printf("%zu=%d, ", it.second, it.first);
-    }
-    printf("'+'=other sequence ids");
-
-    c_curr = view.cells;
-    cs_curr = view.cells_sequences;
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) {
-                const auto & it = seqs.find(cs_curr[j]);
-                putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
-            } else {
-                putchar('.');
-            }
-        }
-        putchar(' ');
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
 //
 // Embedding utils
 //

@@ -76,7 +76,7 @@ enum llama_example {
     LLAMA_EXAMPLE_SERVER,
     LLAMA_EXAMPLE_CVECTOR_GENERATOR,
     LLAMA_EXAMPLE_EXPORT_LORA,
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
@@ -115,7 +115,7 @@ enum common_grammar_trigger_type {
     COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
     COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
     COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
-    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
+    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
 };

 struct common_grammar_trigger {
@@ -291,6 +291,7 @@ struct common_params {
     int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end = -1; // layer range for control vector
+    bool offline = false;

     int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -323,13 +324,13 @@ struct common_params {
     bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = true; // context shift on inifinite text generation
+    bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)

     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
-    bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
@@ -368,6 +369,7 @@ struct common_params {
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response

     std::vector<std::string> api_keys;
@@ -621,16 +623,6 @@ std::string common_detokenize(
     const std::vector<llama_token> & tokens,
     bool special = true);

-//
-// KV cache utils
-//
-
-// Dump the KV cache view with the number of sequences per cell.
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
-
-// Dump the KV cache view showing individual sequences in each cell (long output).
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
-
 //
 // Embedding utils
 //
256	common/json-partial.cpp	Normal file
@@ -0,0 +1,256 @@
#include "json-partial.h"
|
||||
|
||||
#include "log.h"
|
||||
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
#include <string>
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
enum common_json_stack_element_type {
|
||||
COMMON_JSON_STACK_ELEMENT_OBJECT,
|
||||
COMMON_JSON_STACK_ELEMENT_KEY,
|
||||
COMMON_JSON_STACK_ELEMENT_ARRAY,
|
||||
};
|
||||
|
||||
struct common_json_stack_element {
|
||||
common_json_stack_element_type type;
|
||||
std::string key;
|
||||
};
|
||||
|
||||
bool common_json_parse(
|
||||
const std::string & input,
|
||||
const std::string & healing_marker,
|
||||
common_json & out)
|
||||
{
|
||||
std::string::const_iterator it = input.begin();
|
||||
const auto end = input.end();
|
||||
return common_json_parse(it, end, healing_marker, out);
|
||||
}
|
||||
|
||||
bool common_json_parse(
|
||||
std::string::const_iterator & it,
|
||||
const std::string::const_iterator & end,
|
||||
const std::string & healing_marker,
|
||||
common_json & out)
|
||||
{
|
||||
// // https://json.nlohmann.me/features/parsing/sax_interface/
|
||||
struct json_error_locator : public nlohmann::json_sax<json> {
|
||||
std::size_t position;
|
||||
bool found_error;
|
||||
std::string last_token;
|
||||
std::string exception_message;
|
||||
std::vector<common_json_stack_element> stack;
|
||||
|
||||
json_error_locator() : position(0), found_error(false) {}
|
||||
|
||||
bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT
|
||||
this->position = position - 1;
|
||||
this->found_error = true;
|
||||
this->last_token = last_token;
|
||||
this->exception_message = ex.what();
|
||||
return false;
|
||||
}
|
||||
void close_value() {
|
||||
if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) {
|
||||
stack.pop_back();
|
||||
}
|
||||
}
|
||||
bool null() override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool boolean(bool) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool number_integer(number_integer_t) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool number_unsigned(number_unsigned_t) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool number_float(number_float_t, const string_t &) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool string(string_t &) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool binary(binary_t &) override { // NOLINT
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool start_object(std::size_t) override { // NOLINT
|
||||
stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""});
|
||||
return true;
|
||||
}
|
||||
bool end_object() override {
|
||||
GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT);
|
||||
stack.pop_back();
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
bool key(string_t & key) override { // NOLINT
|
||||
stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key});
|
||||
return true;
|
||||
}
|
||||
bool start_array(std::size_t) override { // NOLINT
|
||||
stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""});
|
||||
return true;
|
||||
}
|
||||
bool end_array() override {
|
||||
GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY);
|
||||
stack.pop_back();
|
||||
close_value();
|
||||
return true;
|
||||
}
|
||||
};
|
||||
json_error_locator err_loc;
|
||||
auto start = it;
|
||||
json::sax_parse(it, end, &err_loc);
|
||||
|
||||
if (err_loc.found_error) {
|
||||
it = start;
|
||||
auto temptative_end = it + err_loc.position;
|
||||
// LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str());
|
||||
|
||||
auto input = std::string(it, temptative_end);
|
||||
try {
|
||||
out.json = json::parse(input);
|
||||
// out.json = json::parse(it, temptative_end);
|
||||
it = temptative_end;
|
||||
return true;
|
||||
} catch (const std::exception & ex) {
|
||||
// No, needs healing.
|
||||
LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str());
|
||||
}
|
||||
auto can_parse = [](const std::string & str) {
|
||||
try {
|
||||
auto _ = json::parse(str); // NOLINT
|
||||
return true;
|
||||
} catch (const std::exception &) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
if (!healing_marker.empty() && !err_loc.stack.empty()) {
|
||||
std::string str(it, temptative_end);
|
||||
auto last_non_sp_pos = str.find_last_not_of(" \n\r\t");
|
||||
if (last_non_sp_pos == std::string::npos) {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
|
||||
}
|
||||
auto last_non_sp_char = str[last_non_sp_pos];
|
||||
// Used to detect stops on a number, which may not be complete.
|
||||
auto was_maybe_number = [&]() {
|
||||
if (!str.empty() && std::isspace(str.back())) {
|
||||
return false;
|
||||
}
|
||||
return std::isdigit(last_non_sp_char) ||
|
||||
last_non_sp_char == '.' ||
|
||||
last_non_sp_char == 'e' ||
|
||||
last_non_sp_char == 'E' ||
|
||||
last_non_sp_char == '-';
|
||||
};
|
||||
|
||||
std::string closing;
|
||||
for (size_t i = err_loc.stack.size(); i > 0; i--) {
|
||||
auto & el = err_loc.stack[i - 1];
|
||||
if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
|
||||
closing += "}";
|
||||
} else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
|
||||
closing += "]";
|
||||
} else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) {
|
||||
throw std::runtime_error("Unexpected stack element type");
|
||||
}
|
||||
}
|
||||
|
||||
const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
|
||||
|
||||
if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
|
||||
// We're inside an object value
|
||||
if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) {
|
||||
// Was about to create an object value
|
||||
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
} else if (can_parse(str + ": 1" + closing)) {
|
||||
str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing;
|
||||
} else if (last_non_sp_char == '{' && can_parse(str + closing)) {
|
||||
// Was about to create an object
|
||||
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
|
||||
} else if (can_parse(str + "\"" + closing)) {
|
||||
// Was inside an object value string
|
||||
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
|
||||
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
|
||||
// Was inside an object value string after an escape
|
||||
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
|
||||
} else {
|
||||
// find last :
|
||||
auto last_pos = str.find_last_of(':');
|
||||
if (last_pos == std::string::npos) {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
|
||||
}
|
||||
// Cutting back to opening : for object value
|
||||
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
}
|
||||
} else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
|
||||
if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) {
|
||||
// Was about to create an array value
|
||||
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
} else if (can_parse(str + "\"" + closing)) {
|
||||
// Was inside an array value string
|
||||
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
|
||||
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
|
||||
// Was inside an array value string after an escape
|
||||
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
|
||||
} else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
|
||||
// Had just finished a value
|
||||
str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
|
||||
} else {
|
||||
auto last_pos = str.find_last_of("[,");
|
||||
if (last_pos == std::string::npos) {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location");
|
||||
}
|
||||
// Cutting back to last [ or , for array value
|
||||
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
}
|
||||
} else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
|
||||
if ((last_non_sp_char == '{' && can_parse(str + closing)) ||
|
||||
(last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) {
|
||||
// Was about to create an object key+value
|
||||
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
|
||||
} else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) {
|
||||
// Was about to create an object key+value
|
||||
str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing;
|
||||
} else if (can_parse(str + "\": 1" + closing)) {
|
||||
// Was inside an object key string
|
||||
str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing;
|
||||
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
|
||||
// Was inside an object key string after an escape
|
||||
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
|
||||
} else {
|
||||
auto last_pos = str.find_last_of(':');
|
||||
if (last_pos == std::string::npos) {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
|
||||
}
|
||||
// fprintf(stderr, "Cutting back to last : for object key+value\n");
|
||||
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
|
||||
}
|
||||
} else {
|
||||
throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
|
||||
}
|
||||
// fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str());
|
||||
out.json = json::parse(str);
|
||||
it = temptative_end;
|
||||
return true;
|
||||
}
|
||||
// TODO: handle unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
|
||||
// fprintf(stderr, "Closing: TODO\n");
|
||||
return false;
|
||||
}
|
||||
out.json = json::parse(it, end);
|
||||
it = end;
|
||||
return true;
|
||||
}
|
||||
38	common/json-partial.h	Normal file
@@ -0,0 +1,38 @@
#pragma once

#include <nlohmann/json.hpp>

// Healing marker (empty if the JSON was fully parsed / wasn't healed).
struct common_healing_marker {
    // Raw marker.
    std::string marker;

    // Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo spaces / if it had the same dump format).
    std::string json_dump_marker;
};

// Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string)
struct common_json {
    nlohmann::ordered_json json;

    common_healing_marker healing_marker;
};

// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty.
//
// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON.
// This allows to parse the resulting healed JSON string, yet be able to cut it again if needed at the healing marker.
// (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format).
//
// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again).
bool common_json_parse(
    const std::string & input,
    const std::string & healing_marker,
    common_json & out);

// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds.
bool common_json_parse(
    std::string::const_iterator & it,
    const std::string::const_iterator & end,
    const std::string & healing_marker,
    common_json & out);
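To make the healing round-trip concrete, here is a hedged sketch based on the contract documented above (the marker "XXX" is an arbitrary string assumed not to occur in the input):

    // Illustrative only: heal a truncated object, then cut the dump back
    // to recover the original partial prefix.
    common_json out;
    if (common_json_parse("{\"name\": \"To", "XXX", out)) {
        std::string dump = out.json.dump();                    // {"name":"ToXXX"}
        auto cut = dump.find(out.healing_marker.json_dump_marker);
        if (cut != std::string::npos) {
            dump.resize(cut);                                  // back to {"name":"To
        }
    }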
@@ -1,8 +1,9 @@
 #include "json-schema-to-grammar.h"
 #include "common.h"

+#include <nlohmann/json.hpp>
+
 #include <algorithm>
 #include <fstream>
 #include <map>
 #include <regex>
 #include <sstream>

@@ -1,9 +1,9 @@
 #pragma once

-#include "ggml.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"
+#include <nlohmann/json_fwd.hpp>

 #include <functional>
 #include <string>

 std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
     bool force_gbnf = false);
@@ -161,7 +161,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
     } else {
-        std::vector<std::string> patterns_at_start;
+        std::vector<std::string> trigger_patterns;
         std::vector<std::string> patterns_anywhere;
         std::vector<llama_token> trigger_tokens;
         for (const auto & trigger : params.grammar_triggers) {
@@ -173,10 +173,13 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                     break;
                 }
                 case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
-                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
                 {
-                    const auto & pattern = trigger.value;
-                    (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
+                    patterns_anywhere.push_back(trigger.value);
                     break;
                 }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
+                {
+                    trigger_patterns.push_back(trigger.value);
+                    break;
+                }
                 case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
@@ -190,10 +193,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             }
         }

-        std::vector<std::string> trigger_patterns;
-        if (!patterns_at_start.empty()) {
-            trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
-        }
         if (!patterns_anywhere.empty()) {
             trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
         }
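The surviving push_back above folds every "anywhere" trigger word into one full-match pattern over the whole sampled text. A small illustration of the resulting regex shape (the trigger words here are made up; real triggers come from the chat format):

    #include <regex>
    #include <string>

    // Illustrative only: shape of the combined "anywhere" trigger pattern.
    bool grammar_would_trigger(const std::string & text) {
        static const std::regex re("^[\\s\\S]*?(<tool_call>|\\{\"name\")[\\s\\S]*");
        return std::regex_match(text, re);
    }
    // grammar_would_trigger("sure: <tool_call>{...") == true
    // grammar_would_trigger("plain prose")           == false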
@@ -45,7 +45,7 @@ class SentencePieceTokenTypes(IntEnum):

 class ModelType(IntEnum):
     TEXT = 1
-    VISION = 2
+    MMPROJ = 2


 AnyModel = TypeVar("AnyModel", bound="type[ModelBase]")
@@ -54,7 +54,7 @@ AnyModel = TypeVar("AnyModel", bound="type[ModelBase]")
 class ModelBase:
     _model_classes: dict[ModelType, dict[str, type[ModelBase]]] = {
         ModelType.TEXT: {},
-        ModelType.VISION: {},
+        ModelType.MMPROJ: {},
     }

     dir_model: Path
@@ -88,7 +88,7 @@ class ModelBase:
                  small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None):
         if type(self) is ModelBase or \
                 type(self) is TextModel or \
-                type(self) is VisionModel:
+                type(self) is MmprojModel:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")

         self.dir_model = dir_model
@@ -309,6 +309,7 @@ class ModelBase:
                         gguf.MODEL_TENSOR.POSNET_NORM1,
                         gguf.MODEL_TENSOR.POSNET_NORM2,
                         gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
+                        gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
                     )
                 )
                 or not new_name.endswith(".weight")
@@ -422,23 +423,26 @@ class ModelBase:
         try:
             # for security reason, we don't allow loading remote code by default
             # if a model need remote code, we will fallback to config.json
-            return AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
+            config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
         except Exception as e:
             logger.warning(f"Failed to load model config from {dir_model}: {e}")
             logger.warning("Trying to load config.json instead")
             with open(dir_model / "config.json", "r", encoding="utf-8") as f:
                 config = json.load(f)
-            if "llm_config" in config:
-                # rename for InternVL
-                config["text_config"] = config["llm_config"]
-            return config
+        if "llm_config" in config:
+            # rename for InternVL
+            config["text_config"] = config["llm_config"]
+        if "thinker_config" in config:
+            # rename for Qwen2.5-Omni
+            config["text_config"] = config["thinker_config"]["text_config"]
+        return config

     @classmethod
     def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
         assert names

         def func(modelcls: AnyModel) -> AnyModel:
-            model_type = ModelType.VISION if modelcls.model_arch == gguf.MODEL_ARCH.CLIP_VISION else ModelType.TEXT
+            model_type = ModelType.MMPROJ if modelcls.model_arch == gguf.MODEL_ARCH.MMPROJ else ModelType.TEXT
             for name in names:
                 cls._model_classes[model_type][name] = modelcls
             return modelcls
@@ -519,15 +523,15 @@ class TextModel(ModelBase):
         self.gguf_writer.add_context_length(n_ctx)
         logger.info(f"gguf: context length = {n_ctx}")

-        if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
+        if (n_embd := self.find_hparam(["hidden_size", "n_embd", "dim"], optional=True)) is not None:
             self.gguf_writer.add_embedding_length(n_embd)
             logger.info(f"gguf: embedding length = {n_embd}")

-        if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
+        if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
             self.gguf_writer.add_feed_forward_length(n_ff)
             logger.info(f"gguf: feed forward length = {n_ff}")

-        if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
+        if (n_head := self.find_hparam(["num_attention_heads", "n_head", "n_heads"], optional=True)) is not None:
             self.gguf_writer.add_head_count(n_head)
             logger.info(f"gguf: head count = {n_head}")

@@ -670,12 +674,12 @@ class TextModel(ModelBase):
         if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
-        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
-            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
-            res = "falcon3"
         if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
             # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
             res = "bert-bge-large"
@@ -727,9 +731,6 @@ class TextModel(ModelBase):
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
-        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
-            res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
             # ref: https://huggingface.co/LumiOpen/Viking-7B
             res = "viking"
@@ -760,9 +761,6 @@ class TextModel(ModelBase):
         if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
             # ref: https://huggingface.co/facebook/chameleon-7b
             res = "chameleon"
-        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
-            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
-            res = "minerva-7b"
         if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
             # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
             res = "roberta-bpe"
@@ -793,15 +791,24 @@ class TextModel(ModelBase):
         if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
             # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
             res = "llama4"
-        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
-            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
-            res = "glm4"
         if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":
             # ref: https://huggingface.co/mistral-community/pixtral-12b
             res = "pixtral"
         if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
             # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
             res = "seed-coder"
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
+        if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-hf
+            res = "glm4"
+        if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+            # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+            res = "minerva-7b"

         if res is None:
             logger.warning("\n")
@@ -1040,6 +1047,10 @@ class TextModel(ModelBase):
             special_vocab.chat_template = "rwkv-world"
             # hack: Add '\n\n' as the EOT token to make it chat normally
             special_vocab._set_special_token("eot", 261)
+            # hack: Override these as they have already been set (incorrectly)
+            special_vocab.special_token_ids["bos"] = 0
+            special_vocab.special_token_ids["eos"] = 0

         special_vocab.add_to_gguf(self.gguf_writer)

     def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
@@ -1114,60 +1125,116 @@ class TextModel(ModelBase):
         self.gguf_writer.add_pooling_type(pooling_type)


-class VisionModel(ModelBase):
-    model_type = ModelType.VISION
-    model_arch = gguf.MODEL_ARCH.CLIP_VISION
+class MmprojModel(ModelBase):
+    model_type = ModelType.MMPROJ
+    model_arch = gguf.MODEL_ARCH.MMPROJ
     preprocessor_config: dict[str, Any]
     global_config: dict[str, Any]

     n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]

+    has_vision_encoder: bool = True  # by default
+    has_audio_encoder: bool = False
+
+    # for models having multiple encoders, we need to separate their hparams
+    hparams_vision: dict[str, Any] | None = None
+    hparams_audio: dict[str, Any] | None = None
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-        if self.model_arch != gguf.MODEL_ARCH.CLIP_VISION:
-            raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION")
+        if self.model_arch != gguf.MODEL_ARCH.MMPROJ:
+            raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")
|
||||
|
||||
# get n_embd of the text model
|
||||
if "text_config" not in self.hparams:
|
||||
self.hparams["text_config"] = {}
|
||||
if "audio_config" not in self.hparams:
|
||||
self.hparams["audio_config"] = {}
|
||||
text_config = {**self.hparams, **self.hparams["text_config"]}
|
||||
self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
|
||||
assert self.n_embd_text > 0, "n_embd not found in hparams"
|
||||
|
||||
if "vision_config" not in self.hparams:
|
||||
raise ValueError("vision_config not found in hparams")
|
||||
# move vision config to the top level, while preserving the original hparams in global_config
|
||||
self.global_config = self.hparams
|
||||
self.hparams = self.hparams["vision_config"]
|
||||
import copy
|
||||
self.global_config = copy.deepcopy(self.hparams)
|
||||
self.hparams_vision = self.get_vision_config()
|
||||
self.hparams_audio = self.get_audio_config()
|
||||
|
||||
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"])
|
||||
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, self.block_count)
|
||||
if self.hparams_vision is None and self.hparams_audio is None:
|
||||
raise ValueError("vision_config / audio_config not found in hparams")
|
||||
|
||||
# for compat with vision-only models
|
||||
self.hparams = self.hparams_vision or self.hparams_audio or self.hparams
|
||||
|
||||
# TODO @ngxson : this is a hack to support both vision and audio encoders
|
||||
have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder
|
||||
self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True)
|
||||
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
|
||||
|
||||
# load preprocessor config
|
||||
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
|
||||
self.preprocessor_config = json.load(f)
|
||||
|
||||
def get_vision_config(self) -> dict[str, Any] | None:
|
||||
return self.global_config.get("vision_config")
|
||||
|
||||
def get_audio_config(self) -> dict[str, Any] | None:
|
||||
return self.global_config.get("audio_config")
|
||||
|
||||
def set_type(self):
|
||||
self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION)
|
||||
self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
|
||||
self.gguf_writer.add_vision_has_vision_encoder(True)
|
||||
|
||||
# vision config
|
||||
self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
|
||||
self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
|
||||
self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
|
||||
self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
|
||||
self.gguf_writer.add_vision_block_count(self.block_count)
|
||||
self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
|
||||
if self.has_vision_encoder:
|
||||
self.gguf_writer.add_clip_has_vision_encoder(True)
|
||||
self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
|
||||
|
||||
# preprocessor config
|
||||
self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
|
||||
self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
|
||||
# vision config
|
||||
self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"]))
|
||||
self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
|
||||
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
|
||||
self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
|
||||
self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
|
||||
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
|
||||
|
||||
# preprocessor config
|
||||
self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
|
||||
self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
|
||||
|
||||
if self.has_audio_encoder:
|
||||
self.gguf_writer.add_clip_has_audio_encoder(True)
|
||||
self.gguf_writer.add_audio_projection_dim(self.n_embd_text)
|
||||
|
||||
# audio config
|
||||
self.gguf_writer.add_audio_embedding_length(self.find_aparam(["hidden_size"]))
|
||||
self.gguf_writer.add_audio_feed_forward_length(self.find_aparam(["intermediate_size"]))
|
||||
self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
|
||||
self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
|
||||
|
||||
if not self.has_vision_encoder and not self.has_audio_encoder:
|
||||
raise ValueError("MmprojModel must have either vision or audio encoder")
|
||||
|
||||
def write_vocab(self):
|
||||
raise ValueError("VisionModel does not support vocab writing")
|
||||
raise ValueError("MmprojModel does not support vocab writing")
|
||||
|
||||
def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any:
|
||||
assert self.hparams_vision is not None
|
||||
return self._find_param(self.hparams_vision, keys, optional)
|
||||
|
||||
def find_aparam(self, keys: Iterable[str], optional: bool = False) -> Any:
|
||||
assert self.hparams_audio is not None
|
||||
return self._find_param(self.hparams_audio, keys, optional)
|
||||
|
||||
def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any:
|
||||
key = next((k for k in keys if k in obj), None)
|
||||
if key is not None:
|
||||
return obj[key]
|
||||
if optional:
|
||||
return None
|
||||
raise KeyError(f"could not find any of: {keys}")
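    # Usage sketch (illustrative): find_vparam/find_aparam wrap _find_param over the
    # respective sub-config, e.g.
    #   self.find_vparam(["image_size"])                  -> hparams_vision["image_size"] (KeyError if absent)
    #   self.find_aparam(["hidden_size"], optional=True)  -> hparams_audio["hidden_size"], or None when missing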


@ModelBase.register("GPTNeoXForCausalLM")
@@ -1781,7 +1848,8 @@ class StableLMModel(TextModel):
                          "MistralForCausalLM",
                          "MixtralForCausalLM",
                          "VLlama3ForCausalLM",
                          "LlavaForConditionalGeneration")
                          "LlavaForConditionalGeneration",
                          "LlamaModel")
class LlamaModel(TextModel):
    model_arch = gguf.MODEL_ARCH.LLAMA
    undo_permute = True
@@ -1861,6 +1929,8 @@ class LlamaModel(TextModel):

        if is_vision_tensor:
            return []  # skip vision tensors
        elif self.hf_arch == "LlamaModel":
            name = "model." + name
        elif name.startswith("model.text_model"):
            name = name.replace("text_model.", "")  # for SmolVLM
        elif name.startswith("language_model."):
@@ -1951,7 +2021,7 @@ class LlamaModel(TextModel):
    "LlavaForConditionalGeneration",  # pixtral
    "Mistral3ForConditionalGeneration",  # mistral small 3.1
)
class LlavaVisionModel(VisionModel):
class LlavaVisionModel(MmprojModel):
    img_break_tok_id = -1

    def __init__(self, *args, **kwargs):
@@ -1977,7 +2047,7 @@ class LlavaVisionModel(VisionModel):
        super().set_gguf_parameters()
        hparams = self.hparams
        if hparams["model_type"] == "pixtral":
            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL)
            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL)
            self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])

            # hidden_act
@@ -2016,7 +2086,7 @@ class LlavaVisionModel(VisionModel):


@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
class SmolVLMModel(VisionModel):
class SmolVLMModel(MmprojModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.hparams["model_type"] == "smolvlm_vision":
@@ -2028,7 +2098,7 @@ class SmolVLMModel(VisionModel):

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3)
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.IDEFICS3)
        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
        self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
        self.gguf_writer.add_vision_use_gelu(True)
@@ -2094,10 +2164,10 @@ class Llama4Model(LlamaModel):


@ModelBase.register("Llama4ForConditionalGeneration")
class Llama4VisionModel(VisionModel):
class Llama4VisionModel(MmprojModel):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.LLAMA4)
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LLAMA4)
        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"])
        self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"]))
        assert self.hparams["hidden_act"] == "gelu"
@@ -2109,6 +2179,9 @@ class Llama4VisionModel(VisionModel):
            # process vision tensors
            if "positional_embedding_vlm" in name and ".weight" not in name:
                name += ".weight"
            if "multi_modal_projector.linear_1" in name:
                # despite the name with number postfix, this is a single fully connected layer
                return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
            return [(self.map_tensor_name(name), data_torch)]
        return []

@@ -2615,7 +2688,7 @@ class QwenModel(TextModel):
        self.gguf_writer.add_file_type(self.ftype)


@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM")
@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration")
class Qwen2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.QWEN2

@@ -2639,13 +2712,19 @@ class Qwen2Model(TextModel):
            name = f"model.{name}"  # map to Qwen2ForCausalLM tensors
        if "language_model." in name:
            name = name.replace("language_model.", "")  # for InternVL
        if name.startswith("mlp") or name.startswith("vision_model"):
            # skip visual tensors
        if name.startswith("mlp") or name.startswith("multi_modal_projector") \
                or name.startswith("vision_model") or name.startswith("audio_tower"):
            # skip vision and audio tensors
            return []
        yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
@ModelBase.register(
    "Qwen2VLModel",
    "Qwen2VLForConditionalGeneration",
    "Qwen2_5_VLForConditionalGeneration",
    "Qwen2_5OmniModel",
)
class Qwen2VLModel(TextModel):
    model_arch = gguf.MODEL_ARCH.QWEN2VL

@@ -2663,31 +2742,40 @@ class Qwen2VLModel(TextModel):

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused
        if name.startswith("visual."):
            # skip visual tensors
        if name.startswith("thinker."):
            name = name.replace("thinker.", "")
        if name.startswith("visual") or name.startswith("audio") or \
                name.startswith("talker") or name.startswith("token2wav"):
            # skip multimodal tensors
            return []
        return [(self.map_tensor_name(name), data_torch)]


@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
class Qwen2VLVisionModel(VisionModel):
@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
class Qwen2VLVisionModel(MmprojModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.hparams["image_size"] = self.hparams.get("image_size", 560)
        assert self.hparams_vision is not None
        self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
        # rename config.json values
        self.hparams["num_attention_heads"] = self.hparams.get("num_heads")
        self.hparams["num_hidden_layers"] = self.hparams.get("depth")
        if "embed_dim" in self.hparams:  # qwen2vl
            self.hparams["intermediate_size"] = self.hparams.get("hidden_size")
            self.hparams["hidden_size"] = self.hparams.get("embed_dim")
        self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
        self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
        if "embed_dim" in self.hparams_vision:  # qwen2vl
            self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size")
            self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim")

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
        if self.global_config['model_type'] == 'qwen2_vl':
            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN2VL)
        elif self.global_config['model_type'] == 'qwen2_5_vl':
            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN25VL)
        assert self.hparams_vision is not None
        hparams = self.hparams_vision
        model_type = self.global_config['model_type']
        if model_type == 'qwen2_vl':
            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
        elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni':
            if model_type == 'qwen2_5_omni':
                self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O)
            else:
                self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
            self.gguf_writer.add_vision_use_silu(True)
            # find n_wa_pattern (window attention pattern)
            fullatt_block_indexes = hparams.get("fullatt_block_indexes")
@@ -2745,12 +2833,72 @@ class Qwen2VLVisionModel(VisionModel):
        return []  # skip other tensors


@ModelBase.register("Qwen2_5OmniModel")
class Qwen25OmniModel(Qwen2VLVisionModel):
    has_vision_encoder = True
    has_audio_encoder = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hparams_audio is not None
        self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
        self.hparams_audio["intermediate_size"] = self.hparams_audio["encoder_ffn_dim"]
        self.hparams_audio["num_attention_heads"] = self.hparams_audio["encoder_attention_heads"]

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        assert self.hparams_audio is not None
        self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5))

    def get_vision_config(self) -> dict[str, Any] | None:
        return self.global_config["thinker_config"].get("vision_config")

    def get_audio_config(self) -> dict[str, Any] | None:
        return self.global_config["thinker_config"].get("audio_config")

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        # SinusoidsPositionEmbedding
        assert self.hparams_audio is not None
        max_timescale = 10000
        length = 1500
        channels = self.hparams_audio["hidden_size"]
        log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
        inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
        scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
        pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32)
        yield ("audio_tower.embed_positions.weight", pos_embd)

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        del bid, new_name, n_dims  # unused
        if ".conv" in name and ".weight" in name:
            return gguf.GGMLQuantizationType.F16
        return False

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        if name.startswith("thinker."):
            name = name.replace("thinker.", "")

        if name.startswith("audio_tower"):
            # process audio tensors
            if "conv1.bias" in name or "conv2.bias" in name:
                # transpose conv1 and conv2 bias
                data_torch = data_torch.unsqueeze(-1)
            if "audio_bos_eos_token" in name:
                # this tensor is left unused in transformers code
                # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
                return []
            return [(self.map_tensor_name(name), data_torch)]

        return super().modify_tensors(data_torch, name, bid)


@ModelBase.register("InternVisionModel")
class InternVisionModel(VisionModel):
class InternVisionModel(MmprojModel):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.INTERNVL)
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL)
        self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
        # hidden_act
        if hparams["hidden_act"] == "silu":
@@ -3541,7 +3689,7 @@ class InternLM3Model(TextModel):
        return [(self.map_tensor_name(name), data_torch)]


@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel")
@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification")
class BertModel(TextModel):
    model_arch = gguf.MODEL_ARCH.BERT

@@ -3549,11 +3697,21 @@ class BertModel(TextModel):
        super().__init__(*args, **kwargs)
        self.vocab_size = None

        if cls_out_labels := self.hparams.get("id2label"):
            if len(cls_out_labels) == 2 and cls_out_labels[0] == "LABEL_0":
                # Remove dummy labels added by AutoConfig
                cls_out_labels = None
        self.cls_out_labels = cls_out_labels

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_causal_attention(False)
        self._try_set_pooling_type()

        if self.cls_out_labels:
            key_name = gguf.Keys.Classifier.OUTPUT_LABELS.format(arch = gguf.MODEL_ARCH_NAMES[self.model_arch])
            self.gguf_writer.add_array(key_name, [v for k, v in sorted(self.cls_out_labels.items())])

    def set_vocab(self):
        tokens, toktypes, tokpre = self.get_vocab_base()
        self.vocab_size = len(tokens)
@@ -3604,6 +3762,14 @@ class BertModel(TextModel):
        if name.startswith("cls.seq_relationship"):
            return []

        if self.cls_out_labels:
            # For BertForSequenceClassification (direct projection layer)
            if name == "classifier.weight":
                name = "classifier.out_proj.weight"

            if name == "classifier.bias":
                name = "classifier.out_proj.bias"

        return [(self.map_tensor_name(name), data_torch)]

    def _xlmroberta_tokenizer_init(self) -> None:
@@ -3623,62 +3789,111 @@ class BertModel(TextModel):
        from sentencepiece import sentencepiece_model_pb2 as model

        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'

        tokenizer_json = {}
        tokenizer_config_json = {}
        if not tokenizer_path.is_file():
            raise FileNotFoundError(f"File not found: {tokenizer_path}")
            tokenizer_path = self.dir_model / 'tokenizer.json'
            tokenizer_config_path = self.dir_model / 'tokenizer_config.json'

        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
            if not tokenizer_path.is_file():
                raise FileNotFoundError(f"File not found: {tokenizer_path}")

        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
            from base64 import b64decode
            from transformers import AutoTokenizer
            tokenizer = AutoTokenizer.from_pretrained(self.dir_model)

        tokenizer = SentencePieceProcessor()
        tokenizer.LoadFromFile(str(tokenizer_path))
            with open(tokenizer_path, "r", encoding="utf-8") as fp:
                tokenizer_json = json.load(fp)

        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
            if tokenizer_config_path.is_file():
                with open(tokenizer_config_path, "r", encoding="utf-8") as fp:
                    tokenizer_config_json = json.load(fp)

            add_prefix = tokenizer.add_prefix_space
            remove_whitespaces = tokenizer.clean_up_tokenization_spaces
            precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])

            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
        else:
            sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
            sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
            assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM

            add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
            remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
            precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap

            tokenizer = SentencePieceProcessor()
            tokenizer.LoadFromFile(str(tokenizer_path))

            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size())

        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
        scores: list[float] = [-10000.0] * vocab_size
        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

        for token_id in range(tokenizer.vocab_size()):
            piece = tokenizer.IdToPiece(token_id)
            text = piece.encode("utf-8")
            score = tokenizer.GetScore(token_id)
        if isinstance(tokenizer, SentencePieceProcessor):
            for token_id in range(tokenizer.vocab_size()):
                piece = tokenizer.IdToPiece(token_id)
                text = piece.encode("utf-8")
                score = tokenizer.GetScore(token_id)

                toktype = SentencePieceTokenTypes.NORMAL
                if tokenizer.IsUnknown(token_id):
                    toktype = SentencePieceTokenTypes.UNKNOWN
                elif tokenizer.IsControl(token_id):
                    toktype = SentencePieceTokenTypes.CONTROL
                elif tokenizer.IsUnused(token_id):
                    toktype = SentencePieceTokenTypes.UNUSED
                elif tokenizer.IsByte(token_id):
                    toktype = SentencePieceTokenTypes.BYTE
            toktype = SentencePieceTokenTypes.NORMAL
            if tokenizer.IsUnknown(token_id):
                toktype = SentencePieceTokenTypes.UNKNOWN
            elif tokenizer.IsControl(token_id):
                toktype = SentencePieceTokenTypes.CONTROL
            elif tokenizer.IsUnused(token_id):
                toktype = SentencePieceTokenTypes.UNUSED
            elif tokenizer.IsByte(token_id):
                toktype = SentencePieceTokenTypes.BYTE

                tokens[token_id] = text
                scores[token_id] = score
                toktypes[token_id] = toktype
            tokens[token_id] = text
            scores[token_id] = score
            toktypes[token_id] = toktype
        else:
            added_vocab = tokenizer.get_added_vocab()
            unk_token = tokenizer_config_json.get("unk_token")
            unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))

        if vocab_size > len(tokens):
            pad_count = vocab_size - len(tokens)
            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
            for i in range(1, pad_count + 1):
                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
                scores.append(-1000.0)
                toktypes.append(SentencePieceTokenTypes.UNUSED)
            for token_id in range(tokenizer.vocab_size):
                piece = tokenizer._convert_id_to_token(token_id)
                if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
                    text = piece.encode("utf-8")
                    score = tokenizer_json["model"]["vocab"][token_id][1]

        # realign tokens (see HF tokenizer code)
        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
        scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
        toktypes = [
            SentencePieceTokenTypes.CONTROL,
            SentencePieceTokenTypes.CONTROL,
            SentencePieceTokenTypes.CONTROL,
            SentencePieceTokenTypes.UNKNOWN,
        ] + toktypes[3:-1]
                    toktype = SentencePieceTokenTypes.NORMAL
                    if token_id == unk_token_id:
                        toktype = SentencePieceTokenTypes.UNKNOWN
                    elif token_id in tokenizer.all_special_ids:
                        toktype = SentencePieceTokenTypes.CONTROL
                    elif token_id in added_vocab.values():
                        toktype = SentencePieceTokenTypes.USER_DEFINED
                    # No reliable way to detect this, but jina doesn't have any
                    # elif tokenizer.IsByte(token_id):
                    #     toktype = SentencePieceTokenTypes.BYTE

                    tokens[token_id] = text
                    scores[token_id] = score
                    toktypes[token_id] = toktype

        if isinstance(tokenizer, SentencePieceProcessor):
            # realign tokens (see HF tokenizer code)
            tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
            scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
            toktypes = [
                SentencePieceTokenTypes.CONTROL,
                SentencePieceTokenTypes.CONTROL,
                SentencePieceTokenTypes.CONTROL,
                SentencePieceTokenTypes.UNKNOWN,
            ] + toktypes[3:-1]

        if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
            # Add mask token missing from sentencepiece.bpe.model
            tokens[250001] = b'<mask>'
            scores[250001] = 0.0
            toktypes[250001] = SentencePieceTokenTypes.CONTROL

        self.gguf_writer.add_tokenizer_model("t5")
        self.gguf_writer.add_tokenizer_pre("default")
@@ -3698,7 +3913,27 @@ class BertModel(TextModel):
        self.gguf_writer.add_add_eos_token(True)


@ModelBase.register("RobertaModel")
@ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
class DistilBertModel(BertModel):
    model_arch = gguf.MODEL_ARCH.BERT

    def set_gguf_parameters(self):
        self.gguf_writer.add_layer_norm_eps(1e-12)
        logger.info("gguf: layer norm epsilon = 1e-12")
        super().set_gguf_parameters()

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        if name.startswith("distilbert."):
            name = name[11:]

        # These layers act as MLM head, so we don't need them
        if name.startswith("vocab_"):
            return []

        return super().modify_tensors(data_torch, name, bid)


@ModelBase.register("RobertaModel", "RobertaForSequenceClassification")
class RobertaModel(BertModel):
    model_arch = gguf.MODEL_ARCH.BERT

@@ -4008,11 +4243,11 @@ class Gemma3Model(TextModel):


@ModelBase.register("Gemma3ForConditionalGeneration")
class Gemma3VisionModel(VisionModel):
class Gemma3VisionModel(MmprojModel):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.GEMMA3)
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3)
        # default values below are taken from HF tranformers code
        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
        self.gguf_writer.add_vision_use_gelu(True)
@@ -5959,6 +6194,65 @@ class ChameleonModel(TextModel):
        return data_torch


@ModelBase.register("UltravoxModel")
class UltravoxModel(TextModel):
    model_arch = gguf.MODEL_ARCH.LLAMA  # dummy

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument")


@ModelBase.register("Qwen2AudioForConditionalGeneration")
class WhisperEncoderModel(MmprojModel):
    has_vision_encoder = False  # no vision encoder
    has_audio_encoder = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.hparams["hidden_size"] = self.hparams["d_model"]
        self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
        self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2A)
        self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        del bid, new_name, n_dims  # unused
        if ".conv" in name and ".weight" in name:
            return gguf.GGMLQuantizationType.F16
        return False

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused

        if name.startswith("language_model."):
            # skip language model tensors
            return []

        # prevent clash naming with vision tensors
        if name.startswith("multi_modal_projector"):
            name = "audio." + name

        if "conv1.bias" in name or "conv2.bias" in name:
            # transpose conv1 and conv2 bias
            data_torch = data_torch.unsqueeze(-1)

        return [(self.map_tensor_name(name), data_torch)]


@ModelBase.register("UltravoxModel")
class UltravoxWhisperEncoderModel(WhisperEncoderModel):
    has_vision_encoder = False  # no vision encoder
    has_audio_encoder = True

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])


###### CONVERSION LOGIC ######


@@ -6134,13 +6428,15 @@ def split_str_to_n_bytes(split_str: str) -> int:


def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
    # TODO @ngxson : this won't work correctly if the model has both audio & vision encoders
    # maybe we should fallback to text model's arch in that case, since not many models have both
    text_config = hparams.get("text_config", {})
    vision_config = hparams.get("vision_config", {})
    arch = hparams["architectures"][0]
    # if "architectures" is found in the sub-config, use that instead
    if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
        arch = text_config["architectures"][0]
    elif model_type == ModelType.VISION and vision_config.get("architectures") is not None:
    elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
        arch = vision_config["architectures"][0]
    return arch

@@ -6203,7 +6499,7 @@ def main() -> None:

    with torch.inference_mode():
        output_type = ftype_map[args.outtype]
        model_type = ModelType.VISION if args.mmproj else ModelType.TEXT
        model_type = ModelType.MMPROJ if args.mmproj else ModelType.TEXT
        hparams = ModelBase.load_hparams(dir_model)
        model_architecture = get_model_architecture(hparams, model_type)
        logger.info(f"Model architecture: {model_architecture}")

convert_hf_to_gguf_update.py

@@ -1,28 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# This script downloads the tokenizer models of the specified models from Huggingface and
# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
#
# This is necessary in order to analyze the type of pre-tokenizer used by the model and
# provide the necessary information to llama.cpp via the GGUF header in order to implement
# the same pre-tokenizer.
#
# ref: https://github.com/ggml-org/llama.cpp/pull/6920
#
# Instructions:
#
# - Add a new model to the "models" list
# - Run the script with your huggingface token:
#
#   python3 convert_hf_to_gguf_update.py <huggingface_token>
#
# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
# - Update llama.cpp with the new pre-tokenizer if necessary
#
# TODO: generate tokenizer tests for llama.cpp
#

import logging
import os
import pathlib
@@ -32,6 +10,7 @@ import requests
import sys
import json
import shutil
import argparse

from hashlib import sha256
from enum import IntEnum, auto
@@ -41,6 +20,11 @@ logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert_hf_to_gguf_update")
sess = requests.Session()

convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
convert_py = convert_py_pth.read_text(encoding="utf-8")
hf_token_pth = pathlib.Path.home() / ".cache" / "huggingface" / "token"
hf_token = hf_token_pth.read_text(encoding="utf-8").strip() if hf_token_pth.exists() else None


class TOKENIZER_TYPE(IntEnum):
    SPM = auto()
@@ -49,20 +33,49 @@ class TOKENIZER_TYPE(IntEnum):
    UGM = auto()


DOC_STRING = """
This script downloads the tokenizer models of the specified models from Huggingface and
generates the get_vocab_base_pre() function for convert_hf_to_gguf.py

/!\\ It is intended to be used by contributors and is not meant to be run by end users

This is necessary in order to analyze the type of pre-tokenizer used by the model and
provide the necessary information to llama.cpp via the GGUF header in order to implement
the same pre-tokenizer.

ref: https://github.com/ggml-org/llama.cpp/pull/6920

Instructions:

- Add a new model to the "models" list
- Run the script with your huggingface token
    By default, token will be read from ~/.cache/huggingface/token
- The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
- Update llama.cpp with the new pre-tokenizer if necessary
"""
# TODO: generate tokenizer tests for llama.cpp

parser = argparse.ArgumentParser(description=DOC_STRING, formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument(
    "--full", action="store_true",
    help="download full list of models - make sure you have access to all of them",
)
parser.add_argument(
    "hf_token",
    help="optional HF token",
    nargs="?",
)
args = parser.parse_args()
hf_token = args.hf_token if args.hf_token is not None else hf_token

if hf_token is None:
    logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token")
    sys.exit(1)
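# Typical invocations under the new argument parsing above (a sketch; the token value
# is a placeholder, not a real credential):
#   python3 convert_hf_to_gguf_update.py                # token read from ~/.cache/huggingface/token
#   python3 convert_hf_to_gguf_update.py hf_xxxxxxxx    # explicit token
#   python3 convert_hf_to_gguf_update.py --full         # download the full model list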

# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

if len(sys.argv) == 2:
    token = sys.argv[1]
    if not token.startswith("hf_"):
        logger.info("Huggingface token seems invalid")
        logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
        sys.exit(1)
else:
    logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
    sys.exit(1)

# TODO: add models here, base models preferred
models = [
    {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
@@ -103,7 +116,6 @@ models = [
    {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
    {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
    {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
    {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
    {"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
    {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
@@ -114,11 +126,19 @@ models = [
    {"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
    {"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
    {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
    {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
    {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
]

# some models are known to be broken upstream, so we will skip them as exceptions
pre_computed_hashes = [
    # chatglm-bpe has 2 hashes, why?
    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b"},
    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
]


def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
@@ -169,9 +189,29 @@ def download_model(model):
        if os.path.isfile(save_path):
            logger.info(f"{name}: File {save_path} already exists - skipping")
            continue
        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
        download_file_with_auth(f"{repo}/resolve/main/{file}", hf_token, save_path)


# get list of existing models and chkhsh from the convert_hf_to_gguf.py file
# returns mapping res --> chkhsh
def get_existing_models(convert_py):
    pattern = r'if chkhsh == "([a-f0-9]{64})":\s*\n\s*.*\s*res = "([^"]+)"'
    matches = re.findall(pattern, convert_py)
    output = {}
    for chkhsh, res in matches:
        output[res] = chkhsh
    return output
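# Illustrative sketch of the mapping produced above: given a convert_hf_to_gguf.py that
# contains
#     if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
#         # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
#         res = "bert-bge"
# get_existing_models(convert_py) returns {"bert-bge": "0876d13b5074..."}.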


existing_models = {}
all_models = models.copy()
if not args.full:
    # Filter out models that already exist in convert_hf_to_gguf.py
    existing_models = get_existing_models(convert_py)
    all_models = models.copy()
    models = [model for model in all_models if model["name"] not in existing_models]

logging.info(f"Downloading {len(models)} models...")
for model in models:
    try:
        download_model(model)
@@ -182,9 +222,10 @@ for model in models:
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:

src_ifs = ""
for model in models:
for model in [*all_models, *pre_computed_hashes]:
    name = model["name"]
    tokt = model["tokt"]
    chkhsh = model.get("chkhsh")

    if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
        continue
@@ -195,35 +236,44 @@ for model in models:
        continue

    # create the tokenizer
    try:
        if name == "t5":
            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
        else:
            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
    except OSError as e:
        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
        continue  # Skip to the next model if the tokenizer can't be loaded
    if chkhsh is not None:
        # if the model has a pre-computed hash, use it
        logger.info(f"Using pre-computed hash for model {name}: {chkhsh}")
    elif name in existing_models:
        # if the model already exists in convert_hf_to_gguf.py, skip compute hash
        chkhsh = existing_models[name]
    else:
        # otherwise, compute the hash of the tokenizer
        try:
            logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
            if name == "t5":
                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
            else:
                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
        except OSError as e:
            logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
            continue  # Skip to the next model if the tokenizer can't be loaded

    chktok = tokenizer.encode(CHK_TXT)
    chkhsh = sha256(str(chktok).encode()).hexdigest()
        chktok = tokenizer.encode(CHK_TXT)
        chkhsh = sha256(str(chktok).encode()).hexdigest()

    logger.info(f"model: {name}")
    logger.info(f"tokt: {tokt}")
    logger.info(f"repo: {model['repo']}")
    logger.info(f"chktok: {chktok}")
    logger.info(f"chkhsh: {chkhsh}")
        logger.info(f"model: {name}")
        logger.info(f"tokt: {tokt}")
        logger.info(f"repo: {model['repo']}")
        logger.info(f"chktok: {chktok}")
        logger.info(f"chkhsh: {chkhsh}")

    # print the "pre_tokenizer" content from the tokenizer.json
    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
        cfg = json.load(f)
        normalizer = cfg["normalizer"]
        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
        pre_tokenizer = cfg["pre_tokenizer"]
        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
        if "ignore_merges" in cfg["model"]:
            logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
        # print the "pre_tokenizer" content from the tokenizer.json
        with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
            cfg = json.load(f)
            normalizer = cfg["normalizer"]
            logger.info("normalizer: " + json.dumps(normalizer, indent=4))
            pre_tokenizer = cfg["pre_tokenizer"]
            logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
            if "ignore_merges" in cfg["model"]:
                logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))

    logger.info("")
        logger.info("")

    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
    src_ifs += f"            # ref: {model['repo']}\n"
@@ -271,8 +321,6 @@ src_func = f"""
    return res
"""

convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
convert_py = convert_py_pth.read_text(encoding="utf-8")
convert_py = re.sub(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
    lambda m: m.group(1) + src_func + m.group(3),
@@ -288,7 +336,7 @@ logger.info("+++ convert_hf_to_gguf.py was updated")

tests = [
    "ied 4 ½ months",
    "Führer",
    "Äpfel",
    "",
    " ",
    "  ",
@@ -367,6 +415,10 @@ for model in models:
        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
        continue  # Skip this model and continue with the next one in the loop

    if not os.path.exists(f"models/ggml-vocab-{name}.gguf"):
        logger.info(f"Skip vocab files for model {name}, no GGUF file found")
        continue

    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
        for text in tests:
            f.write(f"{text}")

docs/backend/CANN.md (Normal file → Executable file)
@@ -56,60 +56,82 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi

## Model Supports

| Model Name | FP16 | Q8_0 | Q4_0 |
| Model Name | FP16 | Q4_0 | Q8_0 |
|:----------------------------|:-----:|:----:|:----:|
| AquilaChat2-7B | √ | √ | √ |
| Baichuan-7b | √ | √ | √ |
| Baichuan2-7B-Chat | √ | √ | √ |
| bitnet_b1_58-large | √ | √ | √ |
| bloom-560m | √ | x | √ |
| bloomz-alpaca-560m | √ | x | √ |
| c4ai-command-r-35B-v01 | x | x | x |
| chatglm3-6B | x | x | x |
| chinese-alpaca-2-1.3b | √ | √ | √ |
| CodeShell-7B | √ | √ | √ |
| deepseek-ai_deepseek-coder-1.3B-base | x | x | x |
| deepseek-ai_DeepSeek-V2-Lite | x | x | x |
| deepseek-coder-6.7B-instruct | x | x | x |
| DeepSeek-V2-Lite-64x1.5B | x | x | x |
| falcon-7b-instruct | √ | √ | √ |
| flan-t5-large | √ | √ | √ |
| gemma-2-9b-it | √ | √ | √ |
| glm-4-9B | x | x | x |
| gpt2 | √ | √ | √ |
| Gpt2-163M | √ | √ | √ |
| granite-3B-code-instruct | √ | √ | √ |
| Llama-2 | √ | √ | √ |
| Llama-3 | √ | √ | √ |
| Mistral-7B | √ | √ | √ |
| Mistral MOE | √ | √ | √ |
| DBRX | - | - | - |
| Falcon | √ | √ | √ |
| Chinese LLaMA/Alpaca | √ | √ | √ |
| Vigogne(French) | √ | √ | √ |
| BERT | x | x | x |
| Koala | √ | √ | √ |
| Baichuan | √ | √ | √ |
| Aquila 1 & 2 | √ | √ | √ |
| Starcoder models | √ | √ | √ |
| Refact | √ | √ | √ |
| MPT | √ | √ | √ |
| Bloom | √ | √ | √ |
| Yi models | √ | √ | √ |
| stablelm models | √ | √ | √ |
| DeepSeek models | x | x | x |
| Qwen models | √ | √ | √ |
| PLaMo-13B | √ | √ | √ |
| Phi models | √ | √ | √ |
| PhiMoE | √ | √ | √ |
| GPT-2 | √ | √ | √ |
| Orion | √ | √ | √ |
| InternlLM2 | √ | √ | √ |
| CodeShell | √ | √ | √ |
| Gemma | √ | √ | √ |
| Mamba | √ | √ | √ |
| Xverse | √ | √ | √ |
| command-r models | √ | √ | √ |
| Grok-1 | - | - | - |
| SEA-LION | √ | √ | √ |
| GritLM-7B | √ | √ | √ |
| internlm2_5-7b-chat | √ | √ | √ |
| koala-7B-HF | √ | √ | √ |
| Llama-2-7b-chat-hf | √ | √ | √ |
| Llama-3-Smaug-8B | √ | √ | √ |
| Llama2-Chinese-7b-Chat | √ | √ | √ |
| Llama3-8B | √ | √ | √ |
| Llama3-8b-chinese | √ | √ | √ |
| mamba-130m-hf | √ | √ | √ |
| Mistral-7B-Instruct-v0.2 | √ | √ | √ |
| Mixtral-8x7B-Instruct-v0.1 | x | √ | √ |
| mpt-7B | √ | √ | √ |
| OLMo-1B-hf | √ | √ | √ |
| OpenELM-3B-Instruct | √ | √ | √ |
| Orion-14b-base | √ | √ | √ |
| phi1 | x | x | x |
| phi2 | x | x | x |
| Phi-3-mini-4k-instruct | √ | √ | √ |
| plamo-13b | √ | √ | √ |
| pythia-70M | x | x | x |
| Qwen-7B | √ | √ | √ |
| Qwen2-1.5B-Instruct | √ | x | √ |
| Refact-1_6B-fim | √ | √ | √ |
| SmolLM-135M | √ | √ | √ |
| stablelm-zephyr | x | x | x |
| stablelm-2-zephyr-1_6b | x | x | x |
| starcoderbase-1b | √ | √ | √ |
| starcoder2-3b | √ | √ | √ |
| vigogne-7b-chat | √ | √ | √ |
| xverse-7b-chat | √ | √ | √ |
| Yi-6b-Chat | √ | √ | √ |
| OLMo | √ | √ | √ |
| OLMo 2 | √ | √ | √ |
| OLMoE | √ | √ | √ |
| Granite models | √ | √ | √ |
| GPT-NeoX | √ | √ | √ |
| Pythia | √ | √ | √ |
| Snowflake-Arctic MoE | - | - | - |
| Smaug | √ | √ | √ |
| Poro 34B | √ | √ | √ |
| Bitnet b1.58 models | √ | x | x |
| Flan-T5 | √ | √ | √ |
| Open Elm models | x | √ | √ |
| chatGLM3-6B + ChatGLM4-9b + GLMEdge-1.5b + GLMEdge-4b | √ | √ | √ |
| GLM-4-0414 | √ | √ | √ |
| SmolLM | √ | √ | √ |
| EXAONE-3.0-7.8B-Instruct | √ | √ | √ |
| FalconMamba Models | √ | √ | √ |
| Jais Models | - | x | x |
| Bielik-11B-v2.3 | √ | √ | √ |
| RWKV-6 | - | √ | √ |
| QRWKV-6 | √ | √ | √ |
| GigaChat-20B-A3B | x | x | x |
| Trillion-7B-preview | √ | √ | √ |
| Ling models | √ | √ | √ |


**Multimodal**
| Model Name | FP16 | Q4_0 | Q8_0 |
|:----------------------------|:-----:|:----:|:----:|
| LLaVA 1.5 models, LLaVA 1.6 models | x | x | x |
| BakLLaVA | √ | √ | √ |
| Obsidian | √ | - | - |
| ShareGPT4V | x | - | - |
| MobileVLM 1.7B/3B models | - | - | - |
| Yi-VL | - | - | - |
| Mini CPM | √ | √ | √ |
| Moondream | √ | √ | √ |
| Bunny | √ | - | - |
| GLM-EDGE | √ | √ | √ |
| Qwen2-VL | √ | √ | √ |


@@ -258,6 +280,15 @@ cmake --build build --config release
### **GitHub contribution**:
Please add the **[CANN]** prefix/tag in issues/PRs titles to help the CANN-team check/address them without delay.

## Updates
### Basic Flash Attention Support
The basic FA kernel with aclnnops has been added in aclnn_ops.cpp.
Currently, the FA only supports the cases with FP16 KV tensors and NO logit softcap.
Since the aclnn interface for flash attention cannot support the logit softcap, we will only update the quantized version in the future.
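As a hedged illustration (these are the generic llama.cpp CLI flags, not CANN-specific options), the FA path can be exercised by enabling flash attention while keeping the default FP16 KV cache:

```bash
# -fa enables flash attention; leave the KV cache types (-ctk/-ctv) at their f16
# defaults, since the current CANN FA kernel only supports FP16 KV tensors.
./build/bin/llama-cli -m /path/to/model.gguf -ngl 32 -fa -p "hello"
```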

Authors from Peking University: Bizhao Shi (bshi@pku.edu.cn), Yuxin Yang (yxyang@pku.edu.cn), Ruiyang Ma (ruiyang@stu.pku.edu.cn), and Guojie Luo (gluo@pku.edu.cn).

We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers from Huawei Technologies Co., Ltd for their help during the code development and pull request.

## TODO
- Support more models and data types.

@@ -63,6 +63,7 @@ cmake --build build --config Release
    cmake --preset x64-windows-llvm-release
    cmake --build build-x64-windows-llvm-release
    ```
- Curl usage is enabled by default and can be turned off with `-DLLAMA_CURL=OFF`. Otherwise you need to install development libraries for libcurl.
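For instance, a minimal configure-and-build with curl disabled looks like this (standard CMake usage; nothing beyond the flag above is taken from this diff):

```bash
cmake -B build -DLLAMA_CURL=OFF
cmake --build build --config Release
```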

## BLAS Build


@@ -107,7 +107,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment

The defaults are:

- `MUSA_VERSION` set to `rc3.1.1`
- `MUSA_VERSION` set to `rc4.0.1`

The resulting images, are essentially the same as the non-MUSA images:
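As a hedged example (the Dockerfile path and image tag below are assumptions for illustration, not taken from this diff), the default can be overridden at build time:

```bash
docker build -t local/llama.cpp:full-musa \
  --build-arg MUSA_VERSION=rc4.0.1 \
  -f .devops/musa.Dockerfile .
```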
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||
[chat.h](../common/chat.h) (https://github.com/ggml-org/llama.cpp/pull/9639) adds support for [OpenAI-style function calling](https://platform.openai.com/docs/guides/function-calling) and is used in:
|
||||
- `llama-server` when started w/ `--jinja` flag
|
||||
- `llama-cli` (WIP: https://github.com/ggml-org/llama.cpp/pull/11556)
|
||||
|
||||
## Universal support w/ Native & Generic handlers
|
||||
|
||||
@@ -325,36 +324,65 @@ To get the official template from original HuggingFace repos, you can use [scrip

> [!TIP]
> If there is no official `tool_use` Jinja template, you may want to set `--chat-template chatml` to use a default that works with many models (YMMV!), or write your own (e.g. we provide a custom [llama-cpp-deepseek-r1.jinja](../models/templates/llama-cpp-deepseek-r1.jinja) for DeepSeek R1 distills)

> [!CAUTION]
> Beware of extreme KV quantizations (e.g. `-ctk q4_0`), they can substantially degrade the model's tool calling performance.

Test in CLI (or with any library / software that can use OpenAI-compatible API backends):

```bash
curl http://localhost:8080/v1/chat/completions -d '{
  "model": "gpt-3.5-turbo",
  "tools": [
    {
      "type": "function",
      "function": {
        "name": "python",
        "description": "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
        "parameters": {
          "type": "object",
          "properties": {
            "code": {
              "type": "string",
              "description": "The code to run in the ipython interpreter."
            }
          },
          "required": ["code"]
        }
      }
    }
  ],
  "messages": [
    {
      "role": "user",
      "content": "Print a hello world message with python."
    }
  ]
}'

curl http://localhost:8080/v1/chat/completions -d '{
  "model": "gpt-3.5-turbo",
  "messages": [
    {"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things."},
    {"role": "user", "content": "What is the weather in Istanbul?"}
  ],
  "tools": [{
    "type": "function",
    "function": {
      "name": "get_current_weather",
      "description": "Get the current weather in a given location",
      "parameters": {
        "type": "object",
        "properties": {
          "location": {
            "type": "string",
            "description": "The city and country/state, e.g. `San Francisco, CA`, or `Paris, France`"
          }
        },
        "required": ["location"]
      }
    }
  }]
}'
```
@@ -4,7 +4,9 @@ llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools
- [llama-mtmd-cli](../tools/mtmd/README.md)
- [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API

To enable it, can use use one of the 2 methods below:
Currently, we support **image** and **audio** input. Audio is highly experimental and may have reduced quality.

To enable it, you can use one of the 2 methods below:

- Use the `-hf` option with a supported model (see the list of pre-quantized models below)
- To load a model using `-hf` while disabling multimodal, use `--no-mmproj`
@@ -31,12 +33,14 @@ llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload

## Pre-quantized models

These are ready-to-use models, most of them come with `Q4_K_M` quantization by default. They can be found at the Hugging Face page of the ggml-org: https://huggingface.co/ggml-org
These are ready-to-use models, most of them come with `Q4_K_M` quantization by default. They can be found at the Hugging Face page of the ggml-org: https://huggingface.co/collections/ggml-org/multimodal-ggufs-68244e01ff1f39e5bebeeedc

Replace `(tool_name)` with the name of the binary you want to use, for example `llama-mtmd-cli` or `llama-server`.

NOTE: some models may require a large context window, for example: `-c 8192`

**Vision models**:

```sh
# Gemma 3
(tool_name) -hf ggml-org/gemma-3-4b-it-GGUF
@@ -77,4 +81,29 @@ NOTE: some models may require large context window, for example: `-c 8192`

# Llama 4 Scout
(tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF

# Moondream2 20250414 version
(tool_name) -hf ggml-org/moondream2-20250414-GGUF
```

**Audio models**:

```sh
# Ultravox 0.5
(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF
(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF

# Qwen2-Audio and SeaLLM-Audio
# note: no pre-quantized GGUF for these models, as they give very poor results
# ref: https://github.com/ggml-org/llama.cpp/pull/13760
```

**Mixed modalities**:

```sh
# Qwen2.5 Omni
# Capabilities: audio input, vision input
(tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF
(tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF
```
@@ -41,8 +41,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu

    // run model
    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
    if (llama_encode(ctx, batch) < 0) {
        LOG_ERR("%s : failed to encode\n", __func__);
    if (llama_decode(ctx, batch) < 0) {
        LOG_ERR("%s : failed to process\n", __func__);
    }

    for (int i = 0; i < batch.n_tokens; i++) {
@@ -50,8 +50,6 @@ int main(int argc, char ** argv) {
    const int N = 5; // n-gram size
    const int G = 15; // max verification n-grams

    const bool dump_kv_cache = params.dump_kv_cache;

    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -152,9 +150,6 @@ int main(int argc, char ** argv) {
    // here we keep adding new n-grams as we go
    ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G);

    // debug
    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);

    const auto t_dec_start = ggml_time_us();

    // sample first token
@@ -172,12 +167,6 @@ int main(int argc, char ** argv) {
    }

    while (true) {
        // debug
        if (dump_kv_cache) {
            llama_kv_cache_view_update(ctx, &kvc_view);
            common_kv_cache_dump_view_seqs(kvc_view, 40);
        }

        // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
        //
        // Example for W = 5, N = 4, G = 2:
@@ -473,8 +462,6 @@ int main(int argc, char ** argv) {

    common_sampler_free(smpl);

    llama_kv_cache_view_free(&kvc_view);

    llama_batch_free(batch);

    llama_backend_free();

@@ -24,8 +24,6 @@ int main(int argc, char ** argv){
    // max. number of additional tokens to draft if match is found
    const int n_draft = params.speculative.n_max;

    const bool dump_kv_cache = params.dump_kv_cache;

    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -110,18 +108,9 @@ int main(int argc, char ** argv){

    llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);

    // debug
    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);

    const auto t_dec_start = ggml_time_us();

    while (true) {
        // debug
        if (dump_kv_cache) {
            llama_kv_cache_view_update(ctx, &kvc_view);
            common_kv_cache_dump_view_seqs(kvc_view, 40);
        }

        // print current draft sequence
        LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
@@ -4,7 +4,7 @@ Simplified simulation of serving incoming requests in parallel

## Example

Generate 128 client requests (`-ns 128`), simulating 8 concurrent clients (`-np 8`). The system prompt is shared (`-pps`), meaning that it is computed once at the start. The client requests consist of 10 junk questions (`-j 10`) followed by the actual question.
Generate 128 client requests (`-ns 128`), simulating 8 concurrent clients (`-np 8`). The system prompt is shared (`-pps`), meaning that it is computed once at the start. The client requests consist of up to 10 junk questions (`--junk 10`) followed by the actual question.

```bash
llama-parallel -m model.gguf -np 8 -ns 128 --top-k 1 -pps --junk 10 -c 16384
```
@@ -158,7 +158,7 @@ int main(int argc, char ** argv) {
    common_params params;

    params.n_predict = 128;
    params.n_junk    = 0;
    params.n_junk    = 1;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
        return 1;
@@ -178,13 +178,11 @@ int main(int argc, char ** argv) {
    // insert new requests as soon as the previous one is done
    const bool cont_batching = params.cont_batching;

    const bool dump_kv_cache = params.dump_kv_cache;

    // is the system prompt shared in the cache
    const bool is_sp_shared = params.is_pp_shared;

    // extra text to insert in each client's prompt in order to make it larger
    const int32_t n_junk = params.n_junk;
    const int32_t n_junk = std::max(1, params.n_junk);

    // init llama.cpp
    llama_backend_init();
@@ -241,8 +239,6 @@ int main(int argc, char ** argv) {
    int32_t n_total_gen  = 0;
    int32_t n_cache_miss = 0;

    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);

    const auto t_main_start = ggml_time_us();

    LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
@@ -272,11 +268,6 @@ int main(int argc, char ** argv) {
    LOG_INF("Processing requests ...\n\n");

    while (true) {
        if (dump_kv_cache) {
            llama_kv_cache_view_update(ctx, &kvc_view);
            common_kv_cache_dump_view_seqs(kvc_view, 40);
        }

        common_batch_clear(batch);

        // decode any currently ongoing sequences
@@ -324,7 +315,10 @@ int main(int argc, char ** argv) {
                } else {
                    client.prompt += k_system;
                }
                for (int i = 0; i < n_junk; ++i) {

                const int n_junk_cur = rand() % n_junk;

                for (int i = 0; i < n_junk_cur; ++i) {
                    const int r = rand() % k_questions.size();
                    client.prompt += "User:\n" + k_questions[r] + "\nAssistant:\n " + k_answers[r] + "\n";
                }
@@ -349,7 +343,7 @@ int main(int argc, char ** argv) {
                client.n_decoded = 0;
                client.i_batch   = batch.n_tokens - 1;

                LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
                LOG_INF("\033[31mClient %3d, seq %4d, junk = %4d, started decoding ...\033[0m\n", client.id, client.seq_id, n_junk_cur);

                g_seq_id += 1;

@@ -368,7 +362,9 @@ int main(int argc, char ** argv) {
        // process in chunks of params.n_batch
        int32_t n_batch = params.n_batch;

        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
        int32_t i_next = 0;

        for (int32_t i = 0; i < batch.n_tokens; i = i_next) {
            // experiment: process in powers of 2
            //if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) {
            //    n_batch /= 2;
@@ -376,7 +372,7 @@ int main(int argc, char ** argv) {
            //    continue;
            //}

            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
            const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);

            llama_batch batch_view = {
                n_tokens,
@@ -396,19 +392,24 @@ int main(int argc, char ** argv) {
                return 1;
            }

            LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
            LOG_WRN("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);

            n_cache_miss += 1;

            // retry with half the batch size to try to find a free slot in the KV cache
            n_batch /= 2;
            i -= n_batch;

            continue;
            }

            LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);

            // move the head of the batch forward with the number of tokens we just processed
            i_next = i + n_tokens;

            // on successful decode, restore the original batch size
            n_batch = params.n_batch;

            for (auto & client : clients) {
                if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
                    continue;
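The `i_next` rework above is the interesting part of this hunk: on a failed decode the loop no longer rewinds `i`; it halves `n_batch` and retries the same offset. A self-contained sketch of the pattern, with a hypothetical `decode_fn` standing in for `llama_decode()` on a batch view:

```cpp
#include <algorithm>
#include <functional>

// Minimal sketch of the chunked-decode-with-retry loop used above.
// decode_fn is a stand-in for llama_decode() on a batch view; it returns
// 0 on success and non-zero when no KV-cache slot was found.
static int process_in_chunks(int n_tokens_total, int n_batch_max,
                             const std::function<int(int /*i*/, int /*n*/)> & decode_fn) {
    int n_batch = n_batch_max;
    int i_next  = 0;

    for (int i = 0; i < n_tokens_total; i = i_next) {
        const int n_tokens = std::min(n_batch, n_tokens_total - i);

        if (decode_fn(i, n_tokens) != 0) {
            if (n_batch == 1) {
                return 1; // cannot split the batch any further
            }
            // retry the same offset with half the batch size to try to
            // find a free slot in the KV cache
            n_batch /= 2;
            continue;
        }

        // advance past the tokens we just processed and restore the batch size
        i_next  = i + n_tokens;
        n_batch = n_batch_max;
    }
    return 0;
}
```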
@@ -133,9 +133,8 @@ int main(int argc, char ** argv) {
            const int ib = i/n_batch - 1;
            const int bd = n_batch_grp*(n_grp - 1);

            llama_kv_self_seq_add (ctx, 0, n_past - n_batch,         n_past,         ib*bd);
            llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
            llama_kv_self_update (ctx);
            llama_kv_self_seq_add(ctx, 0, n_past - n_batch,         n_past,         ib*bd);
            llama_kv_self_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);

            n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
        }
@@ -169,8 +168,6 @@ int main(int argc, char ** argv) {

            llama_kv_self_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
            llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
            //llama_kv_self_defrag (ctx);
            llama_kv_self_update (ctx);

            n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;

@@ -200,8 +197,6 @@ int main(int argc, char ** argv) {

        llama_kv_self_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
        llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
        //llama_kv_self_defrag (ctx);
        llama_kv_self_update (ctx);

        n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
    }
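As a side note, the `seq_rm`/`seq_add` pair in the last two hunks implements the usual sliding-window context shift. Its effect on a cache cell at position p can be sketched as follows (single sequence assumed):

```latex
p' =
\begin{cases}
p & p < n_\mathrm{keep} \\
\text{removed} & n_\mathrm{keep} \le p < n_\mathrm{keep} + n_\mathrm{discard} \\
p - n_\mathrm{discard} & n_\mathrm{keep} + n_\mathrm{discard} \le p < n_\mathrm{ctx}
\end{cases}
```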
@@ -81,14 +81,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
    }
}

static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
static void batch_process(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
    // clear previous kv_cache values (irrelevant for embeddings)
    llama_kv_self_clear(ctx);

    // run model
    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
    if (llama_decode(ctx, batch) < 0) {
        LOG_ERR("%s : failed to decode\n", __func__);
        LOG_ERR("%s : failed to process\n", __func__);
    }

    for (int i = 0; i < batch.n_tokens; i++) {
@@ -233,7 +233,7 @@ int main(int argc, char ** argv) {
        // encode if at capacity
        if (batch.n_tokens + n_toks > n_batch) {
            float * out = emb + p * n_embd;
            batch_decode(ctx, batch, out, s, n_embd);
            batch_process(ctx, batch, out, s, n_embd);
            common_batch_clear(batch);
            p += s;
            s = 0;
@@ -246,7 +246,7 @@ int main(int argc, char ** argv) {

    // final batch
    float * out = emb + p * n_embd;
    batch_decode(ctx, batch, out, s, n_embd);
    batch_process(ctx, batch, out, s, n_embd);

    // save embeddings to chunks
    for (int i = 0; i < n_chunks; i++) {
@@ -267,7 +267,7 @@ int main(int argc, char ** argv) {
    batch_add_seq(query_batch, query_tokens, 0);

    std::vector<float> query_emb(n_embd, 0);
    batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
    batch_process(ctx, query_batch, query_emb.data(), 1, n_embd);

    common_batch_clear(query_batch);
@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
    auto generate = [&](const std::string & prompt) {
        std::string response;

        const bool is_first = llama_kv_self_used_cells(ctx) == 0;
        const bool is_first = llama_kv_self_seq_pos_max(ctx, 0) == 0;

        // tokenize the prompt
        const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
@@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
        while (true) {
            // check if we have enough space in the context to evaluate this batch
            int n_ctx = llama_n_ctx(ctx);
            int n_ctx_used = llama_kv_self_used_cells(ctx);
            int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0);
            if (n_ctx_used + batch.n_tokens > n_ctx) {
                printf("\033[0m\n");
                fprintf(stderr, "context size exceeded\n");
@@ -10,8 +10,8 @@ Proof of concept:

``` sh
export model_name=llama_3.2-1b && export quantization=f32
./build/bin/finetune --file wikitext-2-raw/wiki.test.raw -ngl 999 --model models/${model_name}-${quantization}.gguf -c 512 -b 512 -ub 512
./build/bin/perplexity --file wikitext-2-raw/wiki.test.raw -ngl 999 --model finetuned-model.gguf
./build/bin/llama-finetune --file wikitext-2-raw/wiki.test.raw -ngl 999 --model models/${model_name}-${quantization}.gguf -c 512 -b 512 -ub 512
./build/bin/llama-perplexity --file wikitext-2-raw/wiki.test.raw -ngl 999 --model finetuned-model.gguf
```

The perplexity value of the finetuned model should be lower after training on the test set for 2 epochs.
@@ -129,6 +129,7 @@ option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX          "ggml: enable lsx"          ON)
option(GGML_RVV          "ggml: enable rvv"          ON)
option(GGML_RV_ZFH       "ggml: enable riscv zfh"    OFF)
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
option(GGML_VXE          "ggml: enable vxe"          ON)

option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
@@ -176,7 +177,6 @@ option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks"
option(GGML_VULKAN_DEBUG             "ggml: enable Vulkan debug output"        OFF)
option(GGML_VULKAN_MEMORY_DEBUG      "ggml: enable Vulkan memory debug output" OFF)
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info"   OFF)
option(GGML_VULKAN_PERF              "ggml: enable Vulkan perf output"         OFF)
option(GGML_VULKAN_VALIDATE          "ggml: enable Vulkan validation"          OFF)
option(GGML_VULKAN_RUN_TESTS         "ggml: run Vulkan tests"                  OFF)
option(GGML_KOMPUTE                  "ggml: use Kompute"                       OFF)
@@ -24,3 +24,28 @@ function(ggml_get_flags CCID CCVER)
    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
endfunction()

function(ggml_get_system_arch)
    if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
        CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
            CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
        set(GGML_SYSTEM_ARCH "ARM" PARENT_SCOPE)
    elseif (CMAKE_OSX_ARCHITECTURES      STREQUAL "x86_64" OR
            CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
            (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
                CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
        set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
    elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR
            "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
        set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
        set(GGML_SYSTEM_ARCH "loongarch64" PARENT_SCOPE)
    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
        set(GGML_SYSTEM_ARCH "riscv64" PARENT_SCOPE)
    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
        set(GGML_SYSTEM_ARCH "s390x" PARENT_SCOPE)
    else()
        set(GGML_SYSTEM_ARCH "UNKNOWN" PARENT_SCOPE)
    endif()
endfunction()
@@ -536,6 +536,7 @@ extern "C" {
        GGML_UNARY_OP_HARDSWISH,
        GGML_UNARY_OP_HARDSIGMOID,
        GGML_UNARY_OP_EXP,
        GGML_UNARY_OP_GELU_ERF,

        GGML_UNARY_OP_COUNT,
    };
@@ -934,6 +935,15 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

    // repeat a to the specified shape
    GGML_API struct ggml_tensor * ggml_repeat_4d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int64_t               ne0,
            int64_t               ne1,
            int64_t               ne2,
            int64_t               ne3);

    // sums repetitions in a into shape of b
    GGML_API struct ggml_tensor * ggml_repeat_back(
            struct ggml_context * ctx,
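Unlike `ggml_repeat`, which takes a template tensor `b`, the new `ggml_repeat_4d` takes the target shape directly. A minimal usage sketch (standard ggml context boilerplate assumed; sizes are arbitrary):

```cpp
#include "ggml.h"

int main(void) {
    // small scratch context, just for this sketch
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16u * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // a is a single row of 3 elements: ne = [3, 1, 1, 1]
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3, 1);

    // tile it 4 times along ne1 without allocating a template tensor
    struct ggml_tensor * r = ggml_repeat_4d(ctx, a, 3, 4, 1, 1);

    // ... add r to a graph and compute it as usual ...

    ggml_free(ctx);
    return 0;
}
```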
@@ -1024,6 +1034,16 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

    // GELU using erf (error function) when possible
    // some backends may fall back to an approximation based on the Abramowitz and Stegun formula
    GGML_API struct ggml_tensor * ggml_gelu_erf(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

    GGML_API struct ggml_tensor * ggml_gelu_erf_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

    GGML_API struct ggml_tensor * ggml_gelu_quick(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
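For reference, this is the exact erf-based GELU that the op computes, together with the tanh-style approximation (based on the Abramowitz and Stegun formula) that the comment says some backends may fall back to:

```latex
\mathrm{GELU}(x) = \frac{x}{2}\left(1 + \operatorname{erf}\!\left(\tfrac{x}{\sqrt{2}}\right)\right)
\;\approx\; \frac{x}{2}\left(1 + \tanh\!\left(\sqrt{\tfrac{2}{\pi}}\,\bigl(x + 0.044715\,x^{3}\bigr)\right)\right)
```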
@@ -2075,9 +2095,6 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_graph_get_grad    (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
    GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);

    GGML_API void                 ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
    GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);

    // print info and performance information for the graph
    GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);

@@ -2161,6 +2178,7 @@ extern "C" {

    // scheduling priorities
    enum ggml_sched_priority {
        GGML_SCHED_PRIO_LOW = -1,
        GGML_SCHED_PRIO_NORMAL,
        GGML_SCHED_PRIO_MEDIUM,
        GGML_SCHED_PRIO_HIGH,
@@ -109,6 +109,8 @@ if (MSVC)
else ()
    set(CMAKE_GENERATOR_PLATFORM_LWR "")
endif ()
ggml_get_system_arch()
message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}")

if (NOT MSVC)
    if (GGML_STATIC)
@@ -194,6 +196,7 @@ add_library(ggml-base
    ../include/ggml-opt.h
    ../include/gguf.h
    ggml.c
    ggml.cpp
    ggml-alloc.c
    ggml-backend.cpp
    ggml-opt.cpp
@@ -224,6 +227,7 @@ function(ggml_add_backend_library backend)
        set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
        target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
        add_dependencies(ggml ${backend})
        install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
    else()
        add_library(${backend} ${ARGN})
        target_link_libraries(ggml PUBLIC ${backend})
@@ -287,16 +291,20 @@ if (GGML_CPU_ALL_VARIANTS)
    if (NOT GGML_BACKEND_DL)
        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
    endif()
    ggml_add_cpu_backend_variant(x64)
    ggml_add_cpu_backend_variant(sse42       SSE42)
    ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
    ggml_add_cpu_backend_variant(haswell     SSE42 AVX F16C AVX2 BMI2 FMA)
    ggml_add_cpu_backend_variant(skylakex    SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
    ggml_add_cpu_backend_variant(icelake     SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
    ggml_add_cpu_backend_variant(alderlake   SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
    if (NOT MSVC)
        # MSVC doesn't support AMX
        ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
    if (GGML_SYSTEM_ARCH STREQUAL "x86")
        ggml_add_cpu_backend_variant(x64)
        ggml_add_cpu_backend_variant(sse42       SSE42)
        ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
        ggml_add_cpu_backend_variant(haswell     SSE42 AVX F16C AVX2 BMI2 FMA)
        ggml_add_cpu_backend_variant(skylakex    SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
        ggml_add_cpu_backend_variant(icelake     SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
        ggml_add_cpu_backend_variant(alderlake   SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
        if (NOT MSVC)
            # MSVC doesn't support AMX
            ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
        endif()
    else()
        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported on ${GGML_SYSTEM_ARCH}")
    endif()
elseif (GGML_CPU)
    ggml_add_cpu_backend_variant_impl("")
@@ -1340,7 +1340,10 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
    // allocate graph
    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
        // the re-allocation may cause the split inputs to be moved to a different address
        ggml_backend_sched_synchronize(sched);
        // synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
        for (int i = 0; i < sched->n_backends; i++) {
            ggml_backend_synchronize(sched->backends[i]);
        }
#ifndef NDEBUG
        GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
#endif
@@ -1564,7 +1567,6 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgra

    ggml_backend_sched_split_graph(sched, graph);

    if (!ggml_backend_sched_alloc_splits(sched)) {
        return false;
    }
@@ -1598,6 +1600,12 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
    for (int i = 0; i < sched->n_backends; i++) {
        ggml_backend_synchronize(sched->backends[i]);
    }
    if (!sched->is_alloc) {
        // if the graph is not already allocated, always use copy 0 after a synchronization
        // this ensures that during generation the same copy is used every time,
        // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
        sched->cur_copy = 0;
    }
}

void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
@@ -81,7 +81,7 @@ if (BLAS_FOUND)
        target_link_libraries     (ggml-blas PRIVATE ${BLAS_LIBRARIES})
        target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
    else()
        message(ERROR "BLAS not found, please refer to "
                      "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
                      " to set correct GGML_BLAS_VENDOR")
        message(FATAL_ERROR "BLAS not found, please refer to "
                            "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
                            " to set correct GGML_BLAS_VENDOR")
    endif()
ggml/src/ggml-cann/CMakeLists.txt (1 change; Normal file → Executable file)
@@ -30,6 +30,7 @@ string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION needs to be lowercase
string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
message(STATUS "CANN: SOC_VERSION = ${SOC_VERSION}")

if (CANN_INSTALL_DIR)
    # Only Support Linux.
ggml/src/ggml-cann/Doxyfile (0 changes; Normal file → Executable file)
ggml/src/ggml-cann/acl_tensor.cpp (2 changes; Normal file → Executable file)
@@ -31,6 +31,8 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
            return ACL_FLOAT;
        case GGML_TYPE_F16:
            return ACL_FLOAT16;
        case GGML_TYPE_BF16:
            return ACL_BF16;
        case GGML_TYPE_I8:
            return ACL_INT8;
        case GGML_TYPE_I16:
ggml/src/ggml-cann/acl_tensor.h (0 changes; Normal file → Executable file)
ggml/src/ggml-cann/aclnn_ops.cpp (469 changes; Normal file → Executable file)
@@ -66,6 +66,7 @@
#include <aclnnop/aclnn_gt_scalar.h>
#include <aclnnop/aclnn_pow.h>
#include <aclnnop/aclnn_grouped_matmul_v2.h>
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
#include <float.h>

#include <cmath>
@@ -74,11 +75,13 @@
#include <vector>

#include "ggml-impl.h"
#include "ggml.h"

#define GGML_COMMON_DECL_C

#include "../ggml-common.h"

void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclTensor ** acl_src0,
                 aclTensor ** acl_src1, aclTensor ** acl_dst) {
    GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_can_repeat(src1, src0));
@@ -2697,14 +2700,10 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
        }
    }

    // GroupedMatmulV2 requires tensor_list.size < 128
    size_t GROUP_SIZE = 128;
    std::vector<std::vector<aclTensor*>> src0_tensor_vec_vec;
    std::vector<std::vector<aclTensor*>> src1_tensor_vec_vec;
    std::vector<std::vector<aclTensor*>> dst_tensor_vec_vec;

    // split and call GroupedMatmulV2
    // GroupedMatmulV2 requires tensor_list.size < 128
    for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
        // split and call GroupedMatmulV2
        size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
        std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
        std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
@@ -2722,6 +2721,133 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
    return;
}

/**
 * @brief Performs expert-specific matrix multiplication (MoE) with
 * quantized precision using the CANN backend.
 *
 * This function executes a matrix multiplication operation tailored for
 * Mixture of Experts (MoE) models, where the input tensor is multiplied
 * with expert-specific quantized weight matrices. It leverages the CANN
 * backend to perform efficient low-precision computations and stores the
 * quantized result in the destination tensor `dst`.
 *
 * Quantization techniques reduce memory footprint and improve performance
 * by using lower-bit representations (e.g., int8) instead of floating-point.
 * This function is designed to work with such formats and may incorporate
 * optimizations like identity-based fast paths or routing masks for sparse
 * expert selection.
 *
 * @param ctx The context for executing CANN backend operations.
 * @param dst The destination tensor where the quantized MoE multiplication result
 * will be stored.
 *
 * @note This function assumes quantized data types and is designed for
 * MoE architectures with potential sparse expert routing.
 */
static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    // TODO: Use aclnnGroupedMatMul
    //dst   [M, K, N, 1]
    ggml_tensor * src0 = dst->src[0];  //src0  [D, M, A, 1]
    ggml_tensor * src1 = dst->src[1];  //src1  [D, B, N, 1], B = K or B = 1
    ggml_tensor * ids  = dst->src[2];  //ids   [K, N]

    GGML_TENSOR_BINARY_OP_LOCALS

    // copy index from npu to cpu
    int64_t n_as  = ne02;       // A
    int64_t n_ids = ids->ne[0]; // K

    std::vector<char> ids_host(ggml_nbytes(ids));
    ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
        ACL_MEMCPY_DEVICE_TO_HOST);
    ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));

    char * src0_original = (char *) src0->data;
    char * src1_original = (char *) src1->data;
    char * dst_original  = (char *) dst->data;

    ggml_tensor src0_row = *src0;
    ggml_tensor src1_row = *src1;
    ggml_tensor dst_row  = *dst;

    const enum ggml_type type = dst->src[0]->type;
    float weight_elem_size;
    if (type == GGML_TYPE_Q4_0) {
        weight_elem_size = float(sizeof(uint8_t)) / 2;
    } else if (type == GGML_TYPE_Q8_0) {
        weight_elem_size = float(sizeof(uint8_t));
    } else {
        GGML_ABORT("MUL_MAT_ID only supports quant types Q4_0 and Q8_0");
    }

    // src0_row [D, M, 1, 1] weight without permute
    src0_row.ne[2] = 1;
    src0_row.ne[3] = 1;
    src0_row.nb[0] = weight_elem_size;
    src0_row.nb[1] = weight_elem_size * ne00;
    src0_row.nb[2] = weight_elem_size * ne00;
    src0_row.nb[3] = weight_elem_size * ne00;
    size_t weight_stride = ne00 * ne01 * weight_elem_size;
    size_t weight_size   = weight_stride * ne02 * ne03;

    // scale [D, M, 1, 1] -> scale && permute
    size_t scale_elem_size = sizeof(uint16_t);
    size_t scale_stride    = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;

    // src1_row [D, 1, 1, 1] -> input
    src1_row.ne[1] = 1;
    src1_row.ne[2] = 1;
    src1_row.ne[3] = 1;
    src1_row.nb[2] = nb11;
    src1_row.nb[3] = nb11;

    // dst_row [M, 1, 1, 1] -> out
    dst_row.ne[1] = 1;
    dst_row.ne[2] = 1;
    dst_row.ne[3] = 1;
    dst_row.nb[2] = nb1;
    dst_row.nb[3] = nb1;

    //create weight for one row
    ggml_cann_pool_alloc weight_allocator(ctx.pool());
    void* weight_buffer = weight_allocator.alloc(nb02);
    for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
        for (int64_t id = 0; id < n_ids; id++) {
            // expert index
            int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
            GGML_ASSERT(i02 >= 0 && i02 < n_as);

            // If B = 1 (broadcast), always use 0; otherwise, use id.
            int64_t i11 = (ne11 == 1 ? 0 : id);
            int64_t i12 = iid1;

            int64_t i1 = id;
            int64_t i2 = i12;

            void* src0_tmp_ptr  = src0_original + i02*weight_stride;
            void* scale_tmp_ptr = src0_original + weight_size + i02*scale_stride;
            void* src1_tmp_ptr  = src1_original + i11*nb11 + i12*nb12;
            void* dst_tmp_ptr   = dst_original  + i1*nb1   + i2*nb2;

            // mem cpy
            ggml_cann_async_memcpy(ctx, weight_buffer, src0_tmp_ptr, weight_stride,
                ACL_MEMCPY_DEVICE_TO_DEVICE);
            void* scale_buffer = (char*)weight_buffer + weight_stride;
            ggml_cann_async_memcpy(ctx, scale_buffer, scale_tmp_ptr, scale_stride,
                ACL_MEMCPY_DEVICE_TO_DEVICE);

            src0_row.data  = weight_buffer;
            src1_row.data  = src1_tmp_ptr;
            dst_row.data   = dst_tmp_ptr;
            dst_row.src[0] = &src0_row;
            dst_row.src[1] = &src1_row;

            ggml_cann_mul_mat(ctx, &dst_row);
        }
    }
    return;
}

void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    const enum ggml_type type = dst->src[0]->type;
    switch (type) {
@@ -2729,8 +2855,339 @@ void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
        case GGML_TYPE_F16:
            ggml_cann_mul_mat_id_fp(ctx, dst);
            break;
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q8_0:
            ggml_cann_mul_mat_id_quant(ctx, dst);
            break;
        default:
            GGML_ABORT("Unsupported type for mul_mat_id");
            break;
    }
}

void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){

    ggml_tensor* src0 = dst->src[0]; // q, fp32
    ggml_tensor* src1 = dst->src[1]; // k, fp16
    ggml_tensor* src2 = dst->src[2]; // v, fp16
    ggml_tensor* src3 = dst->src[3]; // mask, fp16

    float maxBias      = 0.0f;
    float scaleValue   = 1.0f;
    float logitSoftcap = 0.0f;
    memcpy(&scaleValue,   (float*)dst->op_params + 0, sizeof(float));
    memcpy(&maxBias,      (float*)dst->op_params + 1, sizeof(float));
    memcpy(&logitSoftcap, (float*)dst->op_params + 2, sizeof(float));

    if(logitSoftcap == 0.0f){
        size_t faElemSize = sizeof(uint16_t);
        auto   faDataType = ACL_FLOAT16; //ACL_BF16;

        aclTensor* acl_src0_f16_tensor = nullptr;
        aclTensor* acl_src1_f16_tensor = nullptr;
        aclTensor* acl_src2_f16_tensor = nullptr;
        aclTensor* acl_dst_f16_tensor  = nullptr;

        // Step 1: cast the src0 (Query) to fp16 if needed
        ggml_cann_pool_alloc src0_f16_allocator(ctx.pool());
        void* src0_f16_buffer = nullptr;

        if(ggml_cann_type_mapping(src0->type) != faDataType){
            aclTensor* acl_src0_f32_tensor = ggml_cann_create_tensor(src0);
            src0_f16_buffer = src0_f16_allocator.alloc(
                ggml_nelements(src0) * faElemSize);

            int64_t* src0_f16_ne = src0->ne;
            size_t   src0_f16_nb[GGML_MAX_DIMS];
            src0_f16_nb[0] = sizeof(uint16_t);
            for(int i = 1; i < GGML_MAX_DIMS; ++i){
                src0_f16_nb[i] = src0_f16_nb[i - 1] * src0_f16_ne[i - 1];
            }

            acl_src0_f16_tensor = ggml_cann_create_tensor(
                src0_f16_buffer, faDataType, faElemSize,
                src0_f16_ne, src0_f16_nb, GGML_MAX_DIMS
            );
            aclnn_cast(ctx, acl_src0_f32_tensor, acl_src0_f16_tensor, faDataType);
            ggml_cann_release_resources(ctx, acl_src0_f32_tensor);
        }else{
            acl_src0_f16_tensor = ggml_cann_create_tensor(src0);
        }

        // Step 2: create the acl tensors for src1 (Key), src2 (Value),
        //         and the direct output from FusedInferAttention

        acl_src1_f16_tensor = ggml_cann_create_tensor(src1);
        acl_src2_f16_tensor = ggml_cann_create_tensor(src2);

        ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
        void* out_f16_buffer = out_f16_allocator.alloc(
            ggml_nelements(dst) * faElemSize);

        int64_t* out_f16_ne = src0->ne;
        size_t   out_f16_nb[GGML_MAX_DIMS];
        out_f16_nb[0] = faElemSize;
        for(int i = 1; i < GGML_MAX_DIMS; ++i){
            out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
        }

        acl_dst_f16_tensor = ggml_cann_create_tensor(
            out_f16_buffer, faDataType, faElemSize,
            out_f16_ne, out_f16_nb, GGML_MAX_DIMS
        );

        // Step 3: create the PSEShift tensor if needed
        //         this tensor is considered as mask (f16) in llama.cpp

        aclTensor* bcast_pse_tensor = nullptr;
        int64_t bcast_pse_ne[GGML_MAX_DIMS];
        size_t  bcast_pse_nb[GGML_MAX_DIMS];
        ggml_cann_pool_alloc bcast_pse_allocator(ctx.pool());
        void* bcast_pse_buffer = nullptr;

        if(src3 != nullptr){
            bcast_pse_buffer = bcast_pse_allocator.alloc(
                ggml_nelements(src3) * src0->ne[2] * sizeof(uint16_t));

            if(src0->ne[1] > 1){
                // Case 1: broadcast pse for prefill stage with multiple heads
                aclTensor* acl_mask_f16_tensor = ggml_cann_create_tensor(src3);
                bcast_pse_ne[0] = src3->ne[0];
                bcast_pse_ne[1] = src3->ne[1];
                bcast_pse_ne[2] = src0->ne[2];
                bcast_pse_ne[3] = src3->ne[3];

                bcast_pse_nb[0] = sizeof(uint16_t);
                for(int i = 1; i < GGML_MAX_DIMS; ++i){
                    bcast_pse_nb[i] = bcast_pse_nb[i - 1] * bcast_pse_ne[i - 1];
                }

                bcast_pse_tensor = ggml_cann_create_tensor(
                    bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t),
                    bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS);

                int64_t repeats[] = {1, src0->ne[2], 1, 1};
                aclnn_repeat(ctx, acl_mask_f16_tensor, bcast_pse_tensor, repeats);

                ggml_cann_release_resources(ctx, acl_mask_f16_tensor);
            }else{
                // Case 2: truncate the first row and broadcast pse for decode stage with multiple heads
                int64_t trunc_pse_ne[GGML_MAX_DIMS] = {src3->ne[0], src0->ne[1], src3->ne[2], src3->ne[3]};
                size_t* trunc_pse_nb = src3->nb;

                aclTensor* acl_mask_f16_trunc_tensor = ggml_cann_create_tensor(
                    src3->data, ACL_FLOAT16, sizeof(uint16_t),
                    trunc_pse_ne, trunc_pse_nb, GGML_MAX_DIMS);

                bcast_pse_ne[0] = src3->ne[0];
                bcast_pse_ne[1] = src0->ne[1];
                bcast_pse_ne[2] = src0->ne[2];
                bcast_pse_ne[3] = src3->ne[3];

                bcast_pse_nb[0] = sizeof(uint16_t);
                for(int i = 1; i < GGML_MAX_DIMS; ++i){
                    bcast_pse_nb[i] = bcast_pse_nb[i - 1] * bcast_pse_ne[i - 1];
                }

                bcast_pse_tensor = ggml_cann_create_tensor(
                    bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t),
                    bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS);

                int64_t repeats[] = {1, src0->ne[2], 1, 1};
                aclnn_repeat(ctx, acl_mask_f16_trunc_tensor, bcast_pse_tensor, repeats);

                ggml_cann_release_resources(ctx, acl_mask_f16_trunc_tensor);
            }

            // Compute the slope if needed. Derived from ggml_cann_softmax().
            if(maxBias != 0.0f){
                // alibi
                const int64_t ne2_ne3 = src0->ne[2] * src0->ne[3];
                const int64_t n_head  = src0->ne[2];
                const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));
                float m0 = powf(2.0f, -(maxBias) / n_heads_log2_floor);
                float m1 = powf(2.0f, -(maxBias / 2.0f) / n_heads_log2_floor);
                // init arange
                ggml_cann_pool_alloc arange_allocator(ctx.pool(),
                    ne2_ne3 * faElemSize);
                void* tmp_arange_buffer = arange_allocator.get();

                // arange1: [1, ..., n_heads_log2_floor+1)
                float start = 1;
                float stop  = n_heads_log2_floor + 1;
                float step  = 1;
                int64_t n_elements_arange = n_heads_log2_floor;

                int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
                size_t  tmp_arange1_nb[] = {faElemSize};
                aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
                    tmp_arange_buffer, faDataType, faElemSize,
                    tmp_arange1_ne, tmp_arange1_nb,
                    GGML_MAX_DIMS - 3, ACL_FORMAT_ND);

                aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);

                aclTensor* tmp_arange2_tensor = nullptr;
                if (n_heads_log2_floor < ne2_ne3) {
                    // arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
                    start = 1;
                    stop  = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
                    step  = 2;
                    n_elements_arange = ne2_ne3 - n_heads_log2_floor;
                    int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
                    size_t  tmp_arange2_nb[] = {faElemSize};

                    // assign to the outer variable; re-declaring it here would
                    // shadow it and pass a null handle to the release call below
                    tmp_arange2_tensor = ggml_cann_create_tensor(
                        (char*)tmp_arange_buffer +
                            n_heads_log2_floor * faElemSize,
                        faDataType, faElemSize,
                        tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
                    aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
                        n_elements_arange);
                }

                // init mk_base
                ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
                    ne2_ne3 * faElemSize);
                void* tmp_mk_base_buffer = mk_base_allocator.get();
                int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
                size_t  tmp_mk_base1_nb[] = {faElemSize};
                aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
                    tmp_mk_base_buffer, faDataType, faElemSize,
                    tmp_mk_base1_ne, tmp_mk_base1_nb,
                    GGML_MAX_DIMS - 3, ACL_FORMAT_ND);

                aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);

                aclTensor* tmp_mk_base2_tensor = nullptr;
                if (n_heads_log2_floor < ne2_ne3) {
                    int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
                    size_t  tmp_mk_base2_nb[] = {faElemSize};
                    // same here: assign, do not shadow
                    tmp_mk_base2_tensor = ggml_cann_create_tensor(
                        (char*)tmp_mk_base_buffer +
                            n_heads_log2_floor * faElemSize,
                        faDataType, faElemSize,
                        tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
                    aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
                }

                // init mk
                int64_t tmp_mk_base_ne[] = {ne2_ne3};
                size_t  tmp_mk_base_nb[] = {faElemSize};
                aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
                    tmp_mk_base_buffer, faDataType, faElemSize,
                    tmp_mk_base_ne, tmp_mk_base_nb,
                    GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
                aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
                    tmp_arange_buffer, faDataType, faElemSize,
                    tmp_mk_base_ne, tmp_mk_base_nb,
                    GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
                aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);

                // reshape mk
                int64_t tmp_mk_ne[] = {1, 1, src0->ne[2], src0->ne[3]};
                size_t  tmp_mk_nb[GGML_MAX_DIMS];
                tmp_mk_nb[0] = faElemSize;
                for (int i = 1; i < GGML_MAX_DIMS; i++) {
                    tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
                }
                aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
                    tmp_mk_base_buffer, faDataType, faElemSize,
                    tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
                    ACL_FORMAT_ND);
                GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor, tmp_mk_tensor);

                ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor,
                    tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor,
                    tmp_arange_tensor, tmp_mk_tensor);
            }
        }

        // Step 4: set the inputs for FusedInferAttention.
        int kvTensorNum = 1;
        aclTensor* acl_q_tensor = acl_src0_f16_tensor;
        aclTensor* acl_k_tensors[] = {acl_src1_f16_tensor};
        aclTensor* acl_v_tensors[] = {acl_src2_f16_tensor};
        auto acl_k_tensor_list = aclCreateTensorList(acl_k_tensors, kvTensorNum);
        auto acl_v_tensor_list = aclCreateTensorList(acl_v_tensors, kvTensorNum);

        int64_t numHeads = src0->ne[2]; // N
        int64_t numKeyValueHeads = src1->ne[2];
        // double scaleValue = 1 / sqrt(src0->ne[0]); // 1/sqrt(d)
        int64_t preTokens = 65535;
        int64_t nextTokens = 65535;
        char layout[5] = {'B', 'N', 'S', 'D', 0};
        int64_t sparseMode = 0;
        int64_t innerPrecise = (src0->ne[1] == 1) ? 0 : 2;
        int64_t blockSize = 0;
        int64_t antiquantMode = 0;
        bool softmaxLseFlag = false;
        int64_t keyAntiquantMode = 0;
        int64_t valueAntiquantMode = 0;

        // Step 5: launch the FusedInferAttentionScoreV2 kernel.
        // Refer to https://gitee.com/ascend/cann-ops-adv/blob/master/docs/FusedInferAttentionScoreV2.md

        GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2,
            acl_q_tensor, acl_k_tensor_list, acl_v_tensor_list, // q, k, v
            bcast_pse_tensor, nullptr,                          // pse, mask
            nullptr, nullptr,                                   // actSeqLen, actSeqLenkv
            nullptr, nullptr,                                   // deqScale1, quantScale1
            nullptr, nullptr, nullptr,                          // deqScale2, quantScale2, quantOffset2
            nullptr, nullptr,                                   // antiquantScale, antiquantOffset
            nullptr,                                            // blockTable
            nullptr, nullptr,                                   // qPadSize, kvPadSize
            nullptr, nullptr,                                   // kAntiquantScale, kAntiQuantOffset
            nullptr, nullptr,                                   // vAntiquantScale, vAntiQuantOffset
            nullptr, nullptr, nullptr,                          // kSharedPrefix, vSharedPrefix, actSharedLen
            numHeads, scaleValue,                               // heads, scaleValue
            preTokens, nextTokens,                              // preTokens, nextTokens
            layout,                                             // inputLayout
            numKeyValueHeads,                                   // numKVHeads
            sparseMode, innerPrecise,                           // sparseMode, innerPrecise
            blockSize, antiquantMode,                           // blockSize, antiquantMode
            softmaxLseFlag,                                     // softmaxLseFlag
            keyAntiquantMode, valueAntiquantMode,               // keyAntiqMode, valueAntiqMode
            acl_dst_f16_tensor,                                 // attentionOut
            nullptr                                             // softmaxLse
        );

        // Step 6: post-processing, permute and cast to f32

        int64_t new_dim[] = {0, 2, 1, 3};
        aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);

        if(ggml_cann_type_mapping(dst->type) != faDataType){
            ggml_cann_pool_alloc perm_out_f16_allocator(ctx.pool());
            perm_out_f16_allocator.alloc(ggml_nelements(dst) * faElemSize);
            void* perm_out_f16_buffer = perm_out_f16_allocator.get();

            int64_t* perm_out_f16_ne = dst->ne;
            size_t   perm_out_f16_nb[GGML_MAX_DIMS];
            perm_out_f16_nb[0] = faElemSize;
            for(int i = 1; i < GGML_MAX_DIMS; ++i){
                perm_out_f16_nb[i] = perm_out_f16_nb[i - 1] * perm_out_f16_ne[i - 1];
            }
            aclTensor* acl_perm_out_f16_tensor = ggml_cann_create_tensor(
                perm_out_f16_buffer, faDataType, faElemSize,
                perm_out_f16_ne, perm_out_f16_nb, GGML_MAX_DIMS);
            aclnn_permute(ctx, acl_dst_f16_tensor, acl_perm_out_f16_tensor, new_dim, GGML_MAX_DIMS);
            aclnn_cast(ctx,
                acl_perm_out_f16_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
            ggml_cann_release_resources(ctx, acl_perm_out_f16_tensor);
        }else{
            // only need to permute
            aclnn_permute(ctx, acl_dst_f16_tensor, acl_dst_tensor, new_dim, GGML_MAX_DIMS);
        }
        ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
            acl_src1_f16_tensor,
            acl_src2_f16_tensor,
            acl_dst_f16_tensor,
            acl_dst_tensor);
        if(src3 != nullptr){
            ggml_cann_release_resources(ctx, bcast_pse_tensor);
        }
    }else{
        GGML_ABORT("Function is not implemented.");
    }
}
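A note on the slope computation in the ALiBi branch above: the two `arange`/`pow` passes reproduce the standard ALiBi head slopes. With H = `src0->ne[2]` heads and n = 2^⌊log2 H⌋, the code computes (a sketch of the math, not the exact kernel):

```latex
m_0 = 2^{-\mathrm{maxBias}/n}, \qquad
m_1 = 2^{-\mathrm{maxBias}/(2n)}, \qquad
\mathrm{slope}_i =
\begin{cases}
m_0^{\,i+1} & 0 \le i < n \\
m_1^{\,2(i-n)+1} & n \le i < H
\end{cases}
```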
ggml/src/ggml-cann/aclnn_ops.h (15 changes; Normal file → Executable file)
@@ -714,6 +714,21 @@ void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 */
void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);

/**
 * @brief Performs the Flash Attention extended operator using the CANN backend.
 *
 * @details This function implements the memory-efficient Flash Attention algorithm
 *          for computing scaled dot-product attention with hardware acceleration.
 *          The result is stored in the destination tensor `dst`.
 *
 *          This operation is accelerated using the CANN backend to improve runtime performance.
 *
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the result will be stored.
 *            dst->op is expected to be `GGML_OP_FLASH_ATTN_EXT`.
 */
void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst);

/*
 * @brief A generic wrapper for ACL resources with custom deleter support.
 */
ggml/src/ggml-cann/common.h (0 changes; Normal file → Executable file)
ggml/src/ggml-cann/ggml-cann.cpp (45 changes; Normal file → Executable file)
@@ -36,6 +36,7 @@
#include "ggml-backend-impl.h"
#include "ggml-cann/aclnn_ops.h"
#include "ggml-cann/common.h"
#include "ggml.h"

#define GGML_COMMON_DECL_C

@@ -1748,6 +1749,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
        case GGML_OP_COUNT_EQUAL:
            ggml_cann_count_equal(ctx, dst);
            break;
        case GGML_OP_FLASH_ATTN_EXT:
            ggml_cann_flash_attn_ext(ctx, dst);
            break;
        default:
            return false;
    }
@@ -2035,6 +2039,15 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                case GGML_TYPE_F16:
                case GGML_TYPE_F32:
                    return true;
                case GGML_TYPE_Q8_0:
                case GGML_TYPE_Q4_0:
#ifdef ASCEND_310P
                    // Q4 and Q8 per-group quantization is not supported on the 310P device
                    return false;
#endif
                    // only support contiguous for quantized types.
                    return ggml_is_contiguous(op->src[0]) &&
                           ggml_is_contiguous(op->src[1]);
                default:
                    return false;
            }
@@ -2168,6 +2181,38 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
        case GGML_OP_PAD_REFLECT_1D:
        case GGML_OP_COUNT_EQUAL:
            return true;
        case GGML_OP_FLASH_ATTN_EXT:{
            // derived from [ggml-cuda.cu]
            if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
                return false;
            }
            if(op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 && op->src[1]->type != GGML_TYPE_BF16){
                return false;
            }
            if(op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16){
                return false;
            }
            if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
                // different head sizes of K and V are not supported yet
                return false;
            }
            if (op->src[0]->ne[0] == 192) {
                return false;
            }
            if (op->src[0]->ne[0] == 576) {
                // DeepSeek MLA
                return false;
            }
            if (op->src[0]->ne[3] != 1) {
                return false;
            }
            float logitSoftcap = 0.0f;
            memcpy(&logitSoftcap, (float*)op->op_params + 2, sizeof(float));
            if(logitSoftcap != 0.0f) {
                return false;
            }
            return true;
        }
        default:
            return false;
    }
@@ -82,13 +82,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind)
    endif()

    if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
        CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
         CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))

    if (GGML_SYSTEM_ARCH STREQUAL "ARM")
        message(STATUS "ARM detected")

        if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
            message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
        else()
@@ -170,12 +165,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            endforeach()
        endif()
    endif()
    elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
            (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
             CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))

    elseif (GGML_SYSTEM_ARCH STREQUAL "x86")
        message(STATUS "x86 detected")

        if (MSVC)
            # instruction set detection for MSVC only
            if (GGML_NATIVE)
@@ -299,7 +290,26 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            endif()
        endif()
    endif()
    elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")

        if (GGML_BACKEND_DL)
            if (GGML_NATIVE)
                # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
                message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
            endif()

            # The feature detection code is compiled as a separate target so that
            # it can be built without the architecture flags
            # Since multiple variants of the CPU backend may be included in the same
            # build, using set_source_files_properties() to set the arch flags is not possible
            set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
            add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
            target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
            target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
            target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
            set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
            target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
        endif()
    elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
        message(STATUS "PowerPC detected")
        if (GGML_NATIVE)
            if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
@@ -308,7 +318,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
            endif()

            string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
            string(TOUPPER "${POWER10_M}" POWER10_M_UPPER)
            string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}")
            string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")

            if (EXTRACTED_NUMBER GREATER_EQUAL 10)
@@ -325,9 +336,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
            endif()
        endif()
    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
    elseif (GGML_SYSTEM_ARCH STREQUAL "loongarch64")
        message(STATUS "loongarch64 detected")

        list(APPEND ARCH_FLAGS -march=loongarch64)
        if (GGML_LASX)
            list(APPEND ARCH_FLAGS -mlasx)
@@ -335,16 +345,18 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        if (GGML_LSX)
            list(APPEND ARCH_FLAGS -mlsx)
        endif()
    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
        message(STATUS "RISC-V detected")
    elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
        message(STATUS "riscv64 detected")
        if (GGML_RVV)
            if (GGML_RV_ZFH)
                list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -DGGML_RV_ZFH -mabi=lp64d)
            if (GGML_XTHEADVECTOR)
                list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
            elseif (GGML_RV_ZFH)
                list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d)
            else()
                list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
            endif()
        endif()
    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
    elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
        message(STATUS "s390x detected")
        file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
        string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
@@ -477,25 +489,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
    target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})
    target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})

    if (GGML_BACKEND_DL)
        if (GGML_NATIVE)
            # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
            message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
        endif()

        # The feature detection code is compiled as a separate target so that
        # it can be built without the architecture flags
        # Since multiple variants of the CPU backend may be included in the same
        # build, using set_source_files_properties() to set the arch flags is not possible
        set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
        add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
        target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
        target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
        target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
        set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
        target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
    endif()

    if (EMSCRIPTEN)
        set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
    endif()

@@ -1191,7 +1191,7 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
        }
    }
    return;
#elif defined(__riscv_v_intrinsic)
#elif defined __riscv_v
    if (__riscv_vlenb() >= QK4_0) {
        const size_t vl = QK4_0;

@@ -3783,7 +3783,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
        }
        return;
    }
#elif defined(__riscv_v_intrinsic)
#elif defined __riscv_v
    if (__riscv_vlenb() >= QK4_0) {
        const size_t vl = QK4_0;

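// NOTE: the guard change in these hunks swaps __riscv_v_intrinsic (the compiler
// advertises <riscv_vector.h> intrinsics support) for __riscv_v (the V extension
// itself is enabled), presumably so the XTheadVector inline-asm paths added later
// in this commit can coexist with the standard RVV code. A sketch of the resulting
// dispatch, with macro names taken from the diff:
//
//   #if defined(__riscv_xtheadvector)
//       // T-Head "th.*" vendor vector ISA, hand-written inline assembly
//   #elif defined(__riscv_v)
//       // standard RVV path
//   #endif
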
@@ -320,21 +320,17 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)

#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#else
#endif

#ifdef __POWER9_VECTOR__
#include <altivec.h>
#else
#endif

#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
#if !defined(__riscv)
#elif defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
#include <immintrin.h>
#endif
#endif
#endif
#endif
#endif

#ifdef __riscv_v_intrinsic
#include <riscv_vector.h>

@@ -883,7 +883,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif
    }
#elif defined(__riscv_v_intrinsic)
#elif defined(__riscv_v)

    size_t vl = QK8_0;

@@ -1221,7 +1221,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif
    }
#elif defined(__riscv_v_intrinsic)
#elif defined(__riscv_v)

    size_t vl = QK8_1;

@@ -2384,7 +2384,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    }

    sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
#elif defined(__riscv_v_intrinsic)
#elif defined(__riscv_v)
    size_t vl = qk / 2;

    for (; ib < nb; ++ib) {
@@ -2774,7 +2774,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
    }

    sumf = hsum_float_8(acc) + summs;
#elif defined(__riscv_v_intrinsic)
#elif defined(__riscv_v)
    size_t vl = qk / 2;

    for (; ib < nb; ++ib) {
@@ -3121,7 +3121,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    }

    sumf = hsum_float_8(acc);
#elif defined(__riscv_v_intrinsic)
#elif defined(__riscv_v)
    size_t vl;
    size_t vlenb = __riscv_vlenb();

@@ -3460,7 +3460,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
    }

    sumf = hsum_float_8(acc) + summs;
#elif defined(__riscv_v_intrinsic)
#elif defined(__riscv_v)
    size_t vl;
    size_t vlenb = __riscv_vlenb();

@@ -3897,7 +3897,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    }

    sumf = hsum_float_8(accum);
#elif defined(__riscv_v_intrinsic)
#elif defined(__riscv_v)
    size_t vl = qk;

    for (; ib < nb; ++ib) {
@@ -5100,14 +5100,111 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

    *s = sumf;

#elif defined __riscv_v_intrinsic
#elif defined __riscv_xtheadvector

    float sumf = 0;
    uint8_t atmp[16];

    for (int i = 0; i < nb; ++i) {
        const uint8_t * q2 = x[i].qs;
        const int8_t * q8 = y[i].qs;
        const uint8_t * sc = x[i].scales;
        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
        uint8_t *patmp = atmp;
        int vsums;
        int tmp;
        __asm__ __volatile__(
            "th.vsetvli zero, %[vl16], e8, m1\n\t"
            "th.vmv.v.x v8, zero\n\t"
            "th.vlb.v v1, (%[sc])\n\t"
            "th.vand.vi v0, v1, 0xF\n\t"
            "th.vsrl.vi v1, v1, 4\n\t"
            "th.vsb.v v0, (%[scale])\n\t"
            "th.vwaddu.vx v16, v1, zero\n\t"
            "th.vsetvli zero, %[vl16], e16, m2\n\t"
            "th.vlh.v v2, (%[bsums])\n\t"
            "th.vwmul.vv v4, v16, v2\n\t"
            "th.vsetvli zero, %[vl16], e32, m4\n\t"
            "th.vredsum.vs v8, v4, v8\n\t"
            "th.vmv.x.s %[vsums], v8"
            : [tmp] "=&r" (tmp), [vsums] "=&r" (vsums)
            : [sc] "r" (sc), [scale] "r" (atmp), [bsums] "r" (y[i].bsums)
            , [vl16] "r" (16)
            : "memory"
            , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
            , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
            , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
            , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
        );
        sumf += dmin * vsums;
        int isum = 0;

        for (int j = 0; j < QK_K/128; ++j) {
            __asm__ __volatile__(
                "th.vsetvli zero, %[vl32], e8, m2\n\t"
                "th.vlb.v v0, (%[q2])\n\t"
                "th.vsrl.vi v2, v0, 2\n\t"
                "th.vsrl.vi v4, v0, 4\n\t"
                "th.vsrl.vi v6, v0, 6\n\t"
                "th.vand.vi v0, v0, 0x3\n\t"
                "th.vand.vi v2, v2, 0x3\n\t"
                "th.vand.vi v4, v4, 0x3\n\t"
                "th.vsetvli zero, %[vl128], e8, m8\n\t"
                "th.vlb.v v8, (%[q8])\n\t"
                "th.vsetvli zero, %[vl64], e8, m4\n\t"
                "th.vwmul.vv v16, v0, v8\n\t"
                "th.vwmul.vv v24, v4, v12\n\t"
                "th.vsetvli zero, %[vl16], e16, m2\n\t"
                "th.vmv.v.x v0, zero\n\t"
                "th.vwredsum.vs v10, v16, v0\n\t"
                "th.vwredsum.vs v9, v18, v0\n\t"
                "th.vwredsum.vs v8, v20, v0\n\t"
                "th.vwredsum.vs v7, v22, v0\n\t"
                "th.vwredsum.vs v11, v24, v0\n\t"
                "th.vwredsum.vs v12, v26, v0\n\t"
                "th.vwredsum.vs v13, v28, v0\n\t"
                "th.vwredsum.vs v14, v30, v0\n\t"
                "li %[tmp], 4\n\t"
                "th.vsetvli zero, %[tmp], e32, m1\n\t"
                "th.vslideup.vi v10, v9, 1\n\t"
                "th.vslideup.vi v8, v7, 1\n\t"
                "th.vslideup.vi v11, v12, 1\n\t"
                "th.vslideup.vi v13, v14, 1\n\t"
                "th.vslideup.vi v10, v8, 2\n\t"
                "th.vslideup.vi v11, v13, 2\n\t"
                "li %[tmp], 8\n\t"
                "th.vsetvli zero, %[tmp], e32, m2\n\t"
                "th.vlbu.v v12, (%[scale])\n\t"
                "th.vmul.vv v10, v10, v12\n\t"
                "th.vredsum.vs v0, v10, v0\n\t"
                "th.vmv.x.s %[tmp], v0\n\t"
                "add %[isum], %[isum], %[tmp]"
                : [tmp] "=&r" (tmp), [isum] "+&r" (isum)
                : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8)
                , [vl16] "r" (16), [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
                : "memory"
                , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
                , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
                , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
                , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
            );
            q2 += 32; q8 += 128; patmp += 8;
        }

        sumf += dall * isum;
    }

    *s = sumf;

#elif defined __riscv_v

    float sumf = 0;
    uint8_t atmp[16];

    const int vector_length = __riscv_vlenb() * 8;
    float sumf = 0;

    uint8_t temp_01[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
    uint8_t atmp[16];

    switch (vector_length) {
    case 256:
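// NOTE: a worked sketch of what the two asm blocks above accumulate, under the
// usual q2_K decoding value = d*sc*q - dmin*m (names illustrative, not from the diff):
//
//   vsums = sum_k m[k] * bsums[k]            // sub-block mins  x  q8 block sums
//   isum  = sum_j sc[j] * sum_32(q2 * q8)    // scaled 2-bit dot products
//   sumf += dall * isum + dmin * vsums       // dmin is defined above with the minus sign
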
@@ -6137,14 +6234,141 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

    *s = sumf;

#elif defined __riscv_v_intrinsic
#elif defined __riscv_xtheadvector

    uint32_t aux[3];
    uint32_t utmp[4];

    const int vector_length = __riscv_vlenb() * 8;
    float sumf = 0;

    for (int i = 0; i < nb; ++i) {
        const uint8_t * restrict q3 = x[i].qs;
        const uint8_t * restrict qh = x[i].hmask;
        const int8_t * restrict q8 = y[i].qs;

        int8_t * scale = (int8_t *)utmp;
        int tmp;
        __asm__ __volatile__(
            "li %[tmp], 12\n\t"
            "th.vsetvli zero, %[tmp], e8, m1\n\t"
            "th.vlb.v v0, (%[s6b])\n\t"
            "th.vmv.v.v v2, v0\n\t"
            "li %[tmp], 2\n\t"
            "th.vsetvli zero, %[tmp], e64, m1\n\t"
            "th.vmv.v.x v9, %[sh]\n\t"\
            "th.vslidedown.vi v1, v0, 1\n\t"
            "th.vslide1up.vx v8, v9, zero\n\t" // {0, 0, 4, 4}
            "th.vslideup.vi v0, v2, 1\n\t" // {aux[0], aux[1], aux[0], aux[1]}
            "li %[tmp], 4\n\t"
            "th.vsetvli zero, %[tmp], e32, m1\n\t"
            "th.vid.v v9\n\t"
            "th.vmv.x.s %[tmp], v1\n\t"
            "th.vsll.vi v9, v9, 1\n\t" // {0, 2, 4, 6}
            "th.vmv.v.x v1, %[tmp]\n\t" // {aux[2], aux[2], aux[2], aux[2]}
            "th.vsrl.vv v4, v1, v9\n\t"
            "th.vsrl.vv v2, v0, v8\n\t"
            "th.vand.vx v5, v4, %[kmask1]\n\t"
            "th.vand.vx v3, v2, %[kmask2]\n\t"
            "th.vsll.vi v6, v5, 4\n\t"
            "th.vor.vv v7, v6, v3\n\t"
            "li %[tmp], 16\n\t"
            "th.vsetvli zero, %[tmp], e8, m1\n\t"
            "th.vsub.vx v0, v7, %[c]\n\t"
            "th.vsb.v v0, (%[scale])"
            : [tmp] "=&r" (tmp)
            : [sh] "r" (0x0000000400000004), [s6b] "r" (x[i].scales), [c] "r" (32)
            , [scale] "r" (scale), [kmask1] "r" (kmask1), [kmask2] "r" (kmask2)
            : "memory"
            , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
            , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
            , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
            , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
        );

        uint8_t m = 1;
        int isum = 0;
        for (int j = 0; j < QK_K; j += 128) {
            __asm__ __volatile__(
                // fixme: use v0p7 mask layout directly
                "th.vsetvli zero, %[vl32], e8, m2\n\t"
                "th.vlb.v v8, (%[q3])\n\t"
                "th.vsrl.vi v10, v8, 2\n\t"
                "th.vsrl.vi v12, v8, 4\n\t"
                "th.vsrl.vi v14, v8, 6\n\t"
                "th.vand.vi v8, v8, 3\n\t"
                "th.vand.vi v10, v10, 3\n\t"
                "th.vand.vi v12, v12, 3\n\t"
                "th.vlb.v v2, (%[qh])\n\t"
                "th.vand.vx v4, v2, %[m]\n\t"
                "slli %[m], %[m], 1\n\t"
                "th.vmseq.vx v0, v4, zero\n\t"
                "th.vadd.vi v8, v8, -4, v0.t\n\t"
                "th.vand.vx v4, v2, %[m]\n\t"
                "slli %[m], %[m], 1\n\t"
                "th.vmseq.vx v0, v4, zero\n\t"
                "th.vadd.vi v10, v10, -4, v0.t\n\t"
                "th.vand.vx v4, v2, %[m]\n\t"
                "slli %[m], %[m], 1\n\t"
                "th.vmseq.vx v0, v4, zero\n\t"
                "th.vadd.vi v12, v12, -4, v0.t\n\t"
                "th.vand.vx v4, v2, %[m]\n\t"
                "slli %[m], %[m], 1\n\t"
                "th.vmseq.vx v0, v4, zero\n\t"
                "th.vadd.vi v14, v14, -4, v0.t\n\t"
                "th.vsetvli zero, %[vl128], e8, m8\n\t"
                "th.vlb.v v0, (%[q8])\n\t"
                "th.vsetvli zero, %[vl64], e8, m4\n\t"
                "th.vwmul.vv v16, v0, v8\n\t"
                "th.vwmul.vv v24, v4, v12\n\t"
                "li %[tmp], 16\n\t"
                "th.vsetvli zero, %[tmp], e16, m2\n\t"
                "th.vmv.v.x v0, zero\n\t"
                "th.vwredsum.vs v10, v16, v0\n\t"
                "th.vwredsum.vs v9, v18, v0\n\t"
                "th.vwredsum.vs v8, v20, v0\n\t"
                "th.vwredsum.vs v7, v22, v0\n\t"
                "th.vwredsum.vs v11, v24, v0\n\t"
                "th.vwredsum.vs v12, v26, v0\n\t"
                "th.vwredsum.vs v13, v28, v0\n\t"
                "th.vwredsum.vs v14, v30, v0\n\t"
                "li %[tmp], 4\n\t"
                "th.vsetvli zero, %[tmp], e32, m1\n\t"
                "th.vslideup.vi v10, v9, 1\n\t"
                "th.vslideup.vi v8, v7, 1\n\t"
                "th.vslideup.vi v11, v12, 1\n\t"
                "th.vslideup.vi v13, v14, 1\n\t"
                "th.vslideup.vi v10, v8, 2\n\t"
                "th.vslideup.vi v11, v13, 2\n\t"
                "li %[tmp], 8\n\t"
                "th.vsetvli zero, %[tmp], e32, m2\n\t"
                "th.vlb.v v12, (%[scale])\n\t"
                "th.vmul.vv v10, v10, v12\n\t"
                "th.vredsum.vs v0, v10, v0\n\t"
                "th.vmv.x.s %[tmp], v0\n\t"
                "add %[isum], %[isum], %[tmp]"
                : [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum)
                : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32)
                , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8)
                : "memory"
                , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
                , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
                , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
                , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
            );
            q3 += 32; q8 += 128; scale += 8;
        }

        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        sumf += d * isum;
    }

    *s = sumf;

#elif defined __riscv_v

    uint32_t utmp[4];
    float sumf = 0;
    uint32_t aux[3];
    const int vector_length = __riscv_vlenb() * 8;

    switch (vector_length) {
    case 256:
        for (int i = 0; i < nb; ++i) {
@@ -6331,7 +6555,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
                "vslideup.vi v13, v14, 1\n\t"
                "vslideup.vi v10, v8, 2\n\t"
                "vslideup.vi v11, v13, 2\n\t"
                "vsetivli zero, 8, e32, m2\n\t"\
                "vsetivli zero, 8, e32, m2\n\t"
                "vle8.v v15, (%[scale])\n\t"
                "vsext.vf4 v12, v15\n\t"
                "vmul.vv v10, v10, v12\n\t"
@@ -6771,7 +6995,11 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
#ifdef __ARM_FEATURE_MATMUL_INT8
    assert((nrc == 2) || (nrc == 1));
#else
    assert(nrc == 1);
#endif
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
@@ -6788,6 +7016,146 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

    uint32_t utmp[4];

#if defined(__ARM_FEATURE_MATMUL_INT8)
    if (nrc == 2) {
        const block_q4_K * GGML_RESTRICT x0 = x;
        const block_q4_K * GGML_RESTRICT x1 = (const block_q4_K *) ((const uint8_t *)vx + bx);
        const block_q8_K * GGML_RESTRICT y0 = y;
        const block_q8_K * GGML_RESTRICT y1 = (const block_q8_K *) ((const uint8_t *)vy + by);

        const uint8x16_t m4b = vdupq_n_u8(0x0f);

        float32x4_t vfsum = vdupq_n_f32(0.0f);

        for (int i = 0; i < nb; ++i, ++x0, ++x1, ++y0, ++y1) {
            const uint8_t * GGML_RESTRICT qx0 = x0->qs;
            const uint8_t * GGML_RESTRICT qx1 = x1->qs;
            const int8_t  * GGML_RESTRICT qy0 = y0->qs;
            const int8_t  * GGML_RESTRICT qy1 = y1->qs;

            // decode scales and mins
            int8_t x0_scales[8], x1_scales[8];
            int16x8_t x0_mins, x1_mins;
            {
                uint32_t scales_mins[3];
                memcpy(scales_mins, x0->scales, 12);
                const uint32_t mins_0_3 = scales_mins[1] & kmask1;
                const uint32_t mins_4_7 = ((scales_mins[2] >> 4) & kmask2) | (((scales_mins[1] >> 6) & kmask3) << 4);
                const uint32x2_t mins = {mins_0_3, mins_4_7};
                x0_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins)));
                uint32_t scales[2];
                scales[0] = scales_mins[0] & kmask1; // scales 0~3
                scales[1] = (scales_mins[2] & kmask2) | (((scales_mins[0] >> 6) & kmask3) << 4); // scales 4~7
                memcpy(x0_scales, scales, 8);
            }
            {
                uint32_t scales_mins[3];
                memcpy(scales_mins, x1->scales, 12);
                const uint32_t mins_0_3 = scales_mins[1] & kmask1;
                const uint32_t mins_4_7 = ((scales_mins[2] >> 4) & kmask2) | (((scales_mins[1] >> 6) & kmask3) << 4);
                const uint32x2_t mins = {mins_0_3, mins_4_7};
                x1_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins)));
                uint32_t scales[2];
                scales[0] = scales_mins[0] & kmask1; // scales 0~3
                scales[1] = (scales_mins[2] & kmask2) | (((scales_mins[0] >> 6) & kmask3) << 4); // scales 4~7
                memcpy(x1_scales, scales, 8);
            }

            int32x4_t visum = {0};

            // process 64 data points per iteration, totally 256 data points
            for (int j = 0; j < QK_K / 64; ++j, qx0 += 32, qx1 += 32, qy0 += 64, qy1 += 64) {
                const int8x16x4_t vy0 = vld1q_s8_x4(qy0);
                const int8x16x4_t vy1 = vld1q_s8_x4(qy1);

                int8x16_t vx0[4], vx1[4];
                {
                    const uint8x16x2_t vv = vld1q_u8_x2(qx0);
                    vx0[0] = vreinterpretq_s8_u8(vandq_u8(vv.val[0], m4b));
                    vx0[1] = vreinterpretq_s8_u8(vandq_u8(vv.val[1], m4b));
                    vx0[2] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[0], 4));
                    vx0[3] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[1], 4));
                }
                {
                    const uint8x16x2_t vv = vld1q_u8_x2(qx1);
                    vx1[0] = vreinterpretq_s8_u8(vandq_u8(vv.val[0], m4b));
                    vx1[1] = vreinterpretq_s8_u8(vandq_u8(vv.val[1], m4b));
                    vx1[2] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[0], 4));
                    vx1[3] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[1], 4));
                }

                // process 32 data points (share same block scale) per iteration
                for (int k = 0; k < 2; ++k) {
                    const int blk = j * 2 + k;
                    const int32x4_t block_scale = {
                        x0_scales[blk],
                        x0_scales[blk],
                        x1_scales[blk],
                        x1_scales[blk],
                    };

                    int32x4_t vr = {0};
                    for (int l = 0; l < 2; ++l) {
                        const int idx = k * 2 + l;
                        const int64x2_t vx0_s64 = vreinterpretq_s64_s8(vx0[idx]);
                        const int64x2_t vx1_s64 = vreinterpretq_s64_s8(vx1[idx]);
                        const int64x2_t vy0_s64 = vreinterpretq_s64_s8(vy0.val[idx]);
                        const int64x2_t vy1_s64 = vreinterpretq_s64_s8(vy1.val[idx]);
                        const int8x16_t vx_l = vreinterpretq_s8_s64(vzip1q_s64(vx0_s64, vx1_s64));
                        const int8x16_t vx_h = vreinterpretq_s8_s64(vzip2q_s64(vx0_s64, vx1_s64));
                        const int8x16_t vy_l = vreinterpretq_s8_s64(vzip1q_s64(vy0_s64, vy1_s64));
                        const int8x16_t vy_h = vreinterpretq_s8_s64(vzip2q_s64(vy0_s64, vy1_s64));
                        vr = vmmlaq_s32(vr, vx_l, vy_l);
                        vr = vmmlaq_s32(vr, vx_h, vy_h);
                    }
                    // apply block scale, will NOT overflow
                    // block_scale * sum_256(int4*int8) <= 2^(8+8+4+8) = 28 bits
                    visum = vmlaq_s32(visum, vr, block_scale);
                }
            }

            // adjust bias, apply superblock scale
            {
                int32_t bias[4];
                // no obvious uplift from sve sdot-16, just use neon mul add
                const int16x8_t y0_sums = vpaddq_s16(vld1q_s16(y0->bsums), vld1q_s16(y0->bsums+8));
                const int16x8_t y1_sums = vpaddq_s16(vld1q_s16(y1->bsums), vld1q_s16(y1->bsums+8));
                bias[0] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y0_sums), vget_low_s16(x0_mins)),
                                               vmull_s16(vget_high_s16(y0_sums), vget_high_s16(x0_mins))));
                bias[1] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x0_mins)),
                                               vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x0_mins))));
                bias[2] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y0_sums), vget_low_s16(x1_mins)),
                                               vmull_s16(vget_high_s16(y0_sums), vget_high_s16(x1_mins))));
                bias[3] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x1_mins)),
                                               vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x1_mins))));
                const float32x4_t dmins = {
                    GGML_FP16_TO_FP32(x0->dmin) * y0->d,
                    GGML_FP16_TO_FP32(x0->dmin) * y1->d,
                    GGML_FP16_TO_FP32(x1->dmin) * y0->d,
                    GGML_FP16_TO_FP32(x1->dmin) * y1->d,
                };
                vfsum = vmlsq_f32(vfsum, vcvtq_f32_s32(vld1q_s32(bias)), dmins);

                const float32x4_t superblock_scale = {
                    GGML_FP16_TO_FP32(x0->d) * y0->d,
                    GGML_FP16_TO_FP32(x0->d) * y1->d,
                    GGML_FP16_TO_FP32(x1->d) * y0->d,
                    GGML_FP16_TO_FP32(x1->d) * y1->d,
                };
                vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale);
            }
        }

        // vfsum = ABCD -> ACBD
        // AC -> s, BD -> (s+bs)
        vfsum = vzip1q_f32(vfsum, vextq_f32(vfsum, vfsum, 2));
        vst1_f32(s,      vget_low_f32 (vfsum));
        vst1_f32(s + bs, vget_high_f32(vfsum));

        return;
    }
#endif

#ifdef __ARM_FEATURE_SVE
    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
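// NOTE: a minimal model of the SMMLA tiling used in the nrc == 2 path above,
// assuming standard Arm i8mm semantics for vmmlaq_s32: each call treats its two
// operands as 2x8 int8 matrices and accumulates their 2x2 int32 product, so after
// the vzip1q_s64/vzip2q_s64 packing the accumulator holds
//
//   visum = { x0.y0, x0.y1, x1.y0, x1.y1 }
//
// which is why the final vzip1q_f32/vextq_f32 shuffle can split the four dot
// products into the two output rows s and s + bs.
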
@@ -7180,14 +7548,130 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

    *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);

#elif defined __riscv_v_intrinsic
#elif defined __riscv_xtheadvector

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins   = (const uint8_t*)&utmp[2];

    const int vector_length = __riscv_vlenb() * 8;
    float sumf = 0;

    for (int i = 0; i < nb; ++i) {
        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

        int tmp, tmp2, sumi;
        __asm__ __volatile__(
            "li %[t1], 12\n\t"
            "th.vsetvli zero, %[t1], e8, m1\n\t"
            "th.vlb.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]}
            "li %[t1], 4\n\t"
            "th.vsetvli zero, %[t1], e32, m1\n\t"
            "th.vslidedown.vi v2, v1, 2\n\t"
            "th.vmv.v.v v3, v2\n\t"
            "th.vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]}
            "li %[t1], 2\n\t"
            "th.vsetvli zero, %[t1], e32, m1\n\t"
            "th.vmv.v.i v4, 4\n\t"
            "th.vand.vx v8, v1, %[kmask1]\n\t"
            "th.vslide1up.vx v5, v4, zero\n\t" // {0, 4}
            "th.vsrl.vi v6, v1, 6\n\t"
            "th.vsrl.vv v7, v2, v5\n\t"
            "th.vand.vx v0, v6, %[kmask3]\n\t"
            "th.vand.vx v2, v7, %[kmask2]\n\t"
            "th.vsll.vi v6, v0, 4\n\t"
            "li %[t2], 8\n\t"
            "addi %[t1], %[utmp], 4\n\t"
            "th.vor.vv v1, v6, v2\n\t"
            "th.vssw.v v8, (%[utmp]), %[t2]\n\t"
            "th.vssw.v v1, (%[t1]), %[t2]\n\t"
            "th.vsetvli zero, zero, e32, m2\n\t" // vl == 8
            "th.vlw.v v2, (%[bsums])\n\t"
            "th.vsetvli zero, %[t2], e16, m1\n\t"
            "th.vnsrl.vi v0, v2, 0\n\t"
            "th.vnsrl.vi v1, v2, 16\n\t"
            "th.vadd.vv v2, v0, v1\n\t"
            "th.vlbu.v v4, (%[mins])\n\t"
            "th.vwmul.vv v6, v4, v2\n\t"
            "th.vmv.v.x v0, zero\n\t"
            "th.vsetvli zero, %[t2], e32, m2\n\t"
            "th.vredsum.vs v0, v6, v0\n\t"
            "th.vmv.x.s %[sumi], v0"
            : [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi)
            : [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
            , [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1)
            , [kmask2] "r" (kmask2), [kmask3] "r" (kmask3)
            : "memory"
            , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
            , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
            , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
            , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
        );
        sumf -= dmin * sumi;

        const uint8_t * restrict q4 = x[i].qs;
        const int8_t  * restrict q8 = y[i].qs;

        sumi = 0;
        const uint8_t * scale = scales;

        for (int j = 0; j < QK_K/128; ++j) {
            int vl128 = 128, vl64 = 64, vl32 = 32;
            __asm__ __volatile__(
                "th.vsetvli zero, %[vl128], e8, m8\n\t"
                "th.vlb.v v8, (%[q8])\n\t"
                "th.vsetvli zero, %[vl64], e8, m4\n\t"
                "th.vlb.v v0, (%[q4])\n\t"
                "th.vsrl.vi v4, v0, 4\n\t"
                "th.vand.vi v0, v0, 0xF\n\t"
                "th.vsetvli zero, %[vl32], e8, m2\n\t"
                "th.vwmul.vv v28, v6, v14\n\t"
                "th.vwmul.vv v20, v4, v10\n\t"
                "th.vwmul.vv v24, v2, v12\n\t"
                "th.vwmul.vv v16, v0, v8\n\t"
                "li %[tmp], 4\n\t"
                "th.vsetvli zero, %[tmp], e32, m1\n\t"
                "th.vlbu.v v1, (%[scale])\n\t"
                "th.vmv.v.x v0, zero\n\t"
                "th.vsetvli zero, %[vl32], e16, m4\n\t"
                "th.vwredsum.vs v6, v24, v0\n\t"
                "th.vwredsum.vs v7, v28, v0\n\t"
                "th.vwredsum.vs v4, v16, v0\n\t"
                "th.vwredsum.vs v5, v20, v0\n\t"
                "th.vsetvli zero, %[tmp], e32, m1\n\t"
                "th.vslideup.vi v6, v7, 1\n\t"
                "th.vslideup.vi v4, v5, 1\n\t"
                "th.vslideup.vi v4, v6, 2\n\t"
                "th.vmul.vv v8, v4, v1\n\t"
                "th.vredsum.vs v0, v8, v0\n\t"
                "th.vmv.x.s %[tmp], v0\n\t"
                "add %[sumi], %[sumi], %[tmp]"
                : [tmp] "=&r" (tmp), [sumi] "+&r" (sumi)
                : [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32)
                , [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale)
                : "memory"
                , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
                , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
                , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
                , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
            );

            q4 += 64; q8 += 128; scale += 4;
        }

        sumf += d * sumi;

    }

    *s = sumf;

#elif defined __riscv_v

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins   = (const uint8_t*)&utmp[2];

    float sumf = 0;
    const int vector_length = __riscv_vlenb() * 8;

    switch (vector_length) {
    case 256:
        for (int i = 0; i < nb; ++i) {
@@ -8074,7 +8558,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

    *s = sumf;

#elif defined __riscv_v_intrinsic
#elif defined __riscv_v

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins   = (const uint8_t*)&utmp[2];
@@ -9232,11 +9716,92 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    }
    *s = sumf;

#elif defined __riscv_v_intrinsic
#elif defined __riscv_xtheadvector

    const int vector_length = __riscv_vlenb() * 8;
    float sumf = 0;

    for (int i = 0; i < nb; ++i) {

        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;

        const uint8_t * restrict q6 = x[i].ql;
        const uint8_t * restrict qh = x[i].qh;
        const int8_t  * restrict q8 = y[i].qs;

        const int8_t * restrict scale = x[i].scales;

        int sum_t = 0;
        int t0;

        for (int j = 0; j < QK_K/128; ++j) {
            __asm__ __volatile__(
                "th.vsetvli zero, %[vl32], e8, m2\n\t" // vl == 32
                "th.vlb.v v4, (%[qh])\n\t"
                "th.vsll.vi v0, v4, 4\n\t"
                "th.vsll.vi v2, v4, 2\n\t"
                "th.vsrl.vi v6, v4, 2\n\t"
                "th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64
                "th.vlb.v v8, (%[q6])\n\t"
                "th.vsrl.vi v12, v8, 4\n\t"
                "th.vand.vi v8, v8, 0xF\n\t"
                "th.vsetvli zero, %[vl128], e8, m8\n\t" // vl == 128
                "th.vand.vx v0, v0, %[mask]\n\t"
                "th.vor.vv v8, v8, v0\n\t"
                "th.vlb.v v0, (%[q8])\n\t"
                "th.vsub.vx v8, v8, %[vl32]\n\t"
                "th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64
                "th.vwmul.vv v16, v0, v8\n\t"
                "th.vwmul.vv v24, v4, v12\n\t"
                "li %[t0], 16\n\t"
                "th.vsetvli zero, %[t0], e16, m2\n\t" // vl == 16
                "th.vmv.v.x v0, zero\n\t"
                "th.vwredsum.vs v10, v16, v0\n\t"
                "th.vwredsum.vs v9, v18, v0\n\t"
                "th.vwredsum.vs v8, v20, v0\n\t"
                "th.vwredsum.vs v7, v22, v0\n\t"
                "th.vwredsum.vs v11, v24, v0\n\t"
                "th.vwredsum.vs v12, v26, v0\n\t"
                "th.vwredsum.vs v13, v28, v0\n\t"
                "th.vwredsum.vs v14, v30, v0\n\t"
                "li %[t0], 4\n\t"
                "th.vsetvli zero, %[t0], e32, m1\n\t" // vl == 4
                "th.vslideup.vi v10, v9, 1\n\t"
                "th.vslideup.vi v8, v7, 1\n\t"
                "th.vslideup.vi v11, v12, 1\n\t"
                "th.vslideup.vi v13, v14, 1\n\t"
                "th.vslideup.vi v10, v8, 2\n\t"
                "th.vslideup.vi v11, v13, 2\n\t"
                "li %[t0], 8\n\t"
                "th.vsetvli zero, %[t0], e32, m2\n\t" // vl == 8
                "th.vlb.v v4, (%[scale])\n\t"
                "th.vmul.vv v2, v4, v10\n\t"
                "th.vredsum.vs v0, v2, v0\n\t"
                "th.vmv.x.s %[t0], v0\n\t"
                "add %[sumi], %[sumi], %[t0]"
                : [sumi] "+&r" (sum_t), [t0] "=&r" (t0)
                : [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale)
                , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
                , [mask] "r" (0x30)
                : "memory"
                , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
                , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
                , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
                , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
            );
            q6 += 64; qh += 32; q8 += 128; scale += 8;
        }

        sumf += d * sum_t;

    }

    *s = sumf;

#elif defined __riscv_v

    float sumf = 0;
    const int vector_length = __riscv_vlenb() * 8;

    switch (vector_length) {
    case 256:
        for (int i = 0; i < nb; ++i) {

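// NOTE: how the q6_K lanes are reassembled in the asm above: each 6-bit weight is
// the low nibble of ql plus two bits of qh shifted into bits 4..5 (the 0x30 mask),
// then re-centered by subtracting 32 -- the code reuses the %[vl32] operand, whose
// value happens to be 32, as that bias constant. Roughly, in scalar form:
//
//   int8_t q = (int8_t)((ql & 0xF) | (((qh >> (2*group)) & 0x3) << 4)) - 32;
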
@@ -270,7 +270,11 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
        .from_float   = quantize_row_q4_K,
        .vec_dot      = ggml_vec_dot_q4_K_q8_K,
        .vec_dot_type = GGML_TYPE_Q8_K,
#if defined (__ARM_FEATURE_MATMUL_INT8)
        .nrows        = 2,
#else
        .nrows        = 1,
#endif
    },
    [GGML_TYPE_Q5_K] = {
        .from_float   = quantize_row_q5_K,
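// NOTE: .nrows = 2 advertises that this vec_dot kernel can produce two output rows
// per call (the __ARM_FEATURE_MATMUL_INT8 path shown earlier), which is presumably
// how the matmul driver decides to hand it two rows at once via the bx/by/bs strides.
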
@@ -2202,6 +2206,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                } break;

            case GGML_UNARY_OP_GELU:
            case GGML_UNARY_OP_GELU_ERF:
            case GGML_UNARY_OP_GELU_QUICK:
            case GGML_UNARY_OP_SILU:
                {
@@ -2413,12 +2418,32 @@ static bool ggml_thread_apply_priority(int32_t prio) {
    // This is up to the applications.
    DWORD p = THREAD_PRIORITY_NORMAL;
    switch (prio) {
        case GGML_SCHED_PRIO_LOW:      p = THREAD_PRIORITY_BELOW_NORMAL;  break;
        case GGML_SCHED_PRIO_NORMAL:   p = THREAD_PRIORITY_NORMAL;        break;
        case GGML_SCHED_PRIO_MEDIUM:   p = THREAD_PRIORITY_ABOVE_NORMAL;  break;
        case GGML_SCHED_PRIO_HIGH:     p = THREAD_PRIORITY_HIGHEST;       break;
        case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
    }

    if (prio != GGML_SCHED_PRIO_LOW) {
        // Tell Windows that this thread should not be throttled (needs its own CPU core).
        // Newer Windows 11 versions aggressively park (offline) CPU cores and often place
        // all our threads onto the first 4 cores which results in terrible performance with
        // n_threads > 4
#if _WIN32_WINNT >= 0x0602
        THREAD_POWER_THROTTLING_STATE t;
        ZeroMemory(&t, sizeof(t));
        t.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION;
        t.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED;
        t.StateMask = 0;

        if (!SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &t, sizeof(t))) {
            GGML_LOG_DEBUG("failed to disable thread power throttling %d : (%d)\n", prio, (int) GetLastError());
            return false;
        }
#endif
    }

    if (prio == GGML_SCHED_PRIO_NORMAL) {
        // Keep inherited policy/priority
        return true;
@@ -2446,6 +2471,8 @@ static bool ggml_thread_apply_priority(int32_t prio) {
    struct sched_param p;
    int32_t policy = SCHED_OTHER;
    switch (prio) {
        // TODO: there seems to be no way to set lower prio on Apple platforms
        case GGML_SCHED_PRIO_LOW:      policy = SCHED_OTHER; p.sched_priority = 0;  break;
        case GGML_SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority = 0;  break;
        case GGML_SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
        case GGML_SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;
@@ -2502,6 +2529,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
    struct sched_param p;
    int32_t policy = SCHED_OTHER;
    switch (prio) {
        case GGML_SCHED_PRIO_LOW:      policy = SCHED_BATCH; p.sched_priority = 0;  break;
        case GGML_SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority = 0;  break;
        case GGML_SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
        case GGML_SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;
@@ -3483,6 +3511,19 @@ void ggml_cpu_init(void) {
        const uint64_t t_end = ggml_time_us(); UNUSED(t_end);

        GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);

#ifdef GGML_USE_OPENMP
        //if (!getenv("OMP_WAIT_POLICY")) {
        //    // set the wait policy to active, so that OpenMP threads don't sleep
        //    putenv("OMP_WAIT_POLICY=active");
        //}

        if (!getenv("KMP_BLOCKTIME")) {
            // set the time to wait before sleeping a thread
            // this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases
            putenv("KMP_BLOCKTIME=200"); // 200ms
        }
#endif
    }

#if defined(__ARM_ARCH)

@@ -2691,6 +2691,109 @@ static void ggml_compute_forward_gelu(
    }
}

// ggml_compute_forward_gelu_erf

static void ggml_compute_forward_gelu_erf_f32(
        const ggml_compute_params * params,
        ggml_tensor * dst) {

    const ggml_tensor * src0 = dst->src[0];

    assert(ggml_is_contiguous_1(src0));
    assert(ggml_is_contiguous_1(dst));
    assert(ggml_are_same_shape(src0, dst));

    const int ith = params->ith;
    const int nth = params->nth;

    const int nc = src0->ne[0];
    const int nr = ggml_nrows(src0);

    // rows per thread
    const int dr = (nr + nth - 1)/nth;

    // row range for this thread
    const int ir0 = dr*ith;
    const int ir1 = MIN(ir0 + dr, nr);

    for (int i1 = ir0; i1 < ir1; i1++) {
        ggml_vec_gelu_erf_f32(nc,
                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
                (float *) ((char *) src0->data + i1*(src0->nb[1])));

#ifndef NDEBUG
        for (int k = 0; k < nc; k++) {
            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
            GGML_UNUSED(x);
            assert(!isnan(x));
            assert(!isinf(x));
        }
#endif
    }
}

static void ggml_compute_forward_gelu_erf_f16(
        const ggml_compute_params * params,
        ggml_tensor * dst) {

    const ggml_tensor * src0 = dst->src[0];

    assert(ggml_is_contiguous_1(src0));
    assert(ggml_is_contiguous_1(dst));
    assert(ggml_are_same_shape(src0, dst));

    const int ith = params->ith;
    const int nth = params->nth;

    const int nc = src0->ne[0];
    const int nr = ggml_nrows(src0);

    // rows per thread
    const int dr = (nr + nth - 1)/nth;

    // row range for this thread
    const int ir0 = dr*ith;
    const int ir1 = MIN(ir0 + dr, nr);

    for (int i1 = ir0; i1 < ir1; i1++) {
        ggml_vec_gelu_erf_f16(nc,
                (ggml_fp16_t *) ((char *) dst->data  + i1*( dst->nb[1])),
                (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));

#ifndef NDEBUG
        for (int k = 0; k < nc; k++) {
            const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
            const float v = GGML_FP16_TO_FP32(x);
            GGML_UNUSED(v);
            assert(!isnan(v));
            assert(!isinf(v));
        }
#endif
    }
}

static void ggml_compute_forward_gelu_erf(
        const ggml_compute_params * params,
        ggml_tensor * dst) {

    const ggml_tensor * src0 = dst->src[0];

    switch (src0->type) {
        case GGML_TYPE_F32:
            {
                ggml_compute_forward_gelu_erf_f32(params, dst);
            } break;
        case GGML_TYPE_F16:
            {
                ggml_compute_forward_gelu_erf_f16(params, dst);
            } break;
        default:
            {
                GGML_ABORT("fatal error");
            }
    }
}

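// NOTE: for readers of this hunk, the exact ("erf") GELU that
// ggml_vec_gelu_erf_f32/_f16 evaluate per element is the standard definition
//
//   GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
//
// as opposed to the tanh approximation behind GGML_UNARY_OP_GELU; the formula is
// quoted here for context rather than taken from the diff itself.
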
// ggml_compute_forward_gelu_quick

static void ggml_compute_forward_gelu_quick_f32(
@@ -7530,39 +7633,83 @@ static void ggml_compute_forward_ssm_scan_f32(
    const int ir1 = MIN(ir0 + dr, nr);
    const int ir  = ir1 - ir0;

    for (int i3 = 0; i3 < n_s; ++i3) {
        for (int i2 = 0; i2 < n_t; ++i2) {
            const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
            const float * x  = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
            const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
            const float * A  = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
            const float * B  = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
            const float * C  = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
            float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
            float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}
#ifdef __ARM_FEATURE_SVE
    for (int i3 = 0; i3 < n_s; ++i3) {
        for (int i2 = 0; i2 < n_t; ++i2) {
            const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
            const float * x  = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
            const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
            const float * A  = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
            const float * B  = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
            const float * C  = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
            float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
            float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}

            // use the output as the source for the next token-wise iterations
            if (i2 > 0) { s0 = s; }
            // use the output as the source for the next token-wise iterations
            if (i2 > 0) { s0 = s; }

            // d_inner
            for (int i1 = 0; i1 < ir; ++i1) {
                // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
                float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
                float x_dt = x[i1] * dt_soft_plus;
                float sumf = 0.0f;
                // d_state
                for (int i0 = 0; i0 < nc; ++i0) {
                    int i = i0 + i1*nc;
                    // state = prev_state * dA + dB * x
                    float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
                    // y = rowwise_dotprod(state, C)
                    sumf += state * C[i0];
                    s[i] = state;
            // d_inner
            for (int i1 = 0; i1 < ir; ++i1) {
                float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
                float x_dt = x[i1] * dt_soft_plus;
                svfloat32_t vx_dt = GGML_F32_VEC_SET1(x_dt);
                svfloat32_t vdt_soft_plus = GGML_F32_VEC_SET1(dt_soft_plus);
                svfloat32_t r1_vector = GGML_F32_VEC_ZERO;

                for (int64_t k = 0; k < nc; k += svcntw()) {
                    svfloat32_t vA  = GGML_F32_VEC_LOAD(&A[i1*nc + k]);
                    svfloat32_t vB  = GGML_F32_VEC_LOAD(&B[k]);
                    svfloat32_t vC  = GGML_F32_VEC_LOAD(&C[k]);
                    svfloat32_t vs0 = GGML_F32_VEC_LOAD(&s0[i1*nc + k]);

                    svfloat32_t t1 = GGML_F32_VEC_MUL(vdt_soft_plus, vA);
                    t1 = exp_ps_sve(svptrue_b32(), t1);
                    svfloat32_t t2 = GGML_F32_VEC_MUL(vx_dt, vB);

                    vs0 = GGML_F32_VEC_FMA(vs0, t1, t2);
                    r1_vector = GGML_F32_VEC_ADD(GGML_F32_VEC_MUL(vs0, vC), r1_vector);

                    GGML_F32_VEC_STORE(&s[i1*nc + k], vs0);
                }
                y[i1] = GGML_F32xt_REDUCE_ONE(r1_vector);
            }
                y[i1] = sumf;
            }
        }
    }
#else
    for (int i3 = 0; i3 < n_s; ++i3) {
        for (int i2 = 0; i2 < n_t; ++i2) {
            const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
            const float * x  = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
            const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
            const float * A  = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
            const float * B  = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
            const float * C  = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
            float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
            float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}

            // use the output as the source for the next token-wise iterations
            if (i2 > 0) { s0 = s; }

            // d_inner
            for (int i1 = 0; i1 < ir; ++i1) {
                // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
                float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
                float x_dt = x[i1] * dt_soft_plus;
                float sumf = 0.0f;
                // d_state
                for (int i0 = 0; i0 < nc; ++i0) {
                    int i = i0 + i1*nc;
                    // state = prev_state * dA + dB * x
                    float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
                    // y = rowwise_dotprod(state, C)
                    sumf += state * C[i0];
                    s[i] = state;
                }
                y[i1] = sumf;
            }
        }
    }
#endif
}

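// NOTE: on the dt_soft_plus line above: softplus(x) = log(1 + e^x) is evaluated as
// log1pf(expf(x)) only for x <= 20 because beyond that the result equals x to within
// float precision (e^-20 ~ 2e-9) while expf(x) would quickly overflow; the cutoff
// mirrors the referenced Mamba selective_state_update kernel.
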
void ggml_compute_forward_ssm_scan(
@@ -7749,6 +7896,10 @@ void ggml_compute_forward_unary(
            {
                ggml_compute_forward_gelu(params, dst);
            } break;
        case GGML_UNARY_OP_GELU_ERF:
            {
                ggml_compute_forward_gelu_erf(params, dst);
            } break;
        case GGML_UNARY_OP_GELU_QUICK:
            {
                ggml_compute_forward_gelu_quick(params, dst);
@@ -7963,6 +8114,14 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
    #define GGML_F32X_MUL GGML_F32x16_MUL
    #define GGML_F32X_FMA GGML_F32x16_FMA
    #define WKV_VECTOR_SIZE 16
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
    #define GGML_F32X GGML_F32xt
    #define GGML_F32X_SET1 GGML_F32xt_SET1
    #define GGML_F32X_LOAD GGML_F32xt_LOAD
    #define GGML_F32X_STORE GGML_F32xt_STORE
    #define GGML_F32X_MUL GGML_F32xt_MUL
    #define GGML_F32X_FMA GGML_F32xt_FMA
    #define WKV_VECTOR_SIZE 8
#elif defined(__ARM_NEON) && defined(__aarch64__)
    #define GGML_F32X GGML_F32x4
    #define GGML_F32X_SET1 GGML_F32x4_SET1
@@ -7973,8 +8132,14 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
    #define WKV_VECTOR_SIZE 4
#endif

    int wkv_vector_size;
#ifdef WKV_VECTOR_SIZE
    const int64_t vec_count = head_size / WKV_VECTOR_SIZE;
    #if defined(__ARM_FEATURE_SVE)
        wkv_vector_size = svcntw();
    #else
        wkv_vector_size = WKV_VECTOR_SIZE;
    #endif
    const int64_t vec_count = head_size / wkv_vector_size;

    for (int64_t t = 0; t < T; t++) {
        size_t t_offset = t * t_stride;
@@ -8004,7 +8169,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
            GGML_F32X time_decay_vec = GGML_F32X_SET1(time_decay_val);

            for (int64_t j = 0; j < vec_count; j++) {
                size_t base_j = j * WKV_VECTOR_SIZE;
                size_t base_j = j * wkv_vector_size;
                size_t t_h_j_offset = t_h_offset + base_j;
                size_t h_2d_i_j_offset = h_2d_i_offset + base_j;

@@ -8029,7 +8194,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
            }

            // Handle remaining elements, this will not be used.
            for (int64_t j = vec_count * WKV_VECTOR_SIZE; j < head_size; j++) {
            for (int64_t j = vec_count * wkv_vector_size; j < head_size; j++) {
                size_t t_h_j_offset = t_h_offset + j;
                size_t h_2d_i_j_offset = h_2d_i_offset + j;
                float v_val = v[t_h_j_offset];
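// NOTE: the WKV_VECTOR_SIZE -> wkv_vector_size change above is needed because SVE
// register width is an implementation choice known only at run time; svcntw()
// returns the number of 32-bit lanes actually available (e.g. 4 on a 128-bit part),
// so the loop bound can no longer be a compile-time constant on that path.
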
@@ -8165,6 +8330,14 @@ static void ggml_compute_forward_gla_f32(
    #define GGML_F32X_MUL GGML_F32x16_MUL
    #define GGML_F32X_FMA GGML_F32x16_FMA
    #define GLA_VECTOR_SIZE 16
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
    #define GGML_F32X GGML_F32xt
    #define GGML_F32X_SET1 GGML_F32xt_SET1
    #define GGML_F32X_LOAD GGML_F32xt_LOAD
    #define GGML_F32X_STORE GGML_F32xt_STORE
    #define GGML_F32X_MUL GGML_F32xt_MUL
    #define GGML_F32X_FMA GGML_F32xt_FMA
    #define GLA_VECTOR_SIZE 8
#elif defined(__ARM_NEON) && defined(__aarch64__)
    #define GGML_F32X GGML_F32x4
    #define GGML_F32X_SET1 GGML_F32x4_SET1
@@ -8175,8 +8348,14 @@ static void ggml_compute_forward_gla_f32(
    #define GLA_VECTOR_SIZE 4
#endif

    int gla_vector_size;
#ifdef GLA_VECTOR_SIZE
    const int64_t vec_count = head_size / GLA_VECTOR_SIZE;
    #if defined(__ARM_FEATURE_SVE)
        gla_vector_size = svcntw();
    #else
        gla_vector_size = GLA_VECTOR_SIZE;
    #endif
    const int64_t vec_count = head_size / gla_vector_size;

    for (int64_t t = 0; t < T; t++) {
        size_t t_offset = t * t_stride;
@@ -8203,7 +8382,7 @@ static void ggml_compute_forward_gla_f32(
            GGML_F32X g_vec = GGML_F32X_SET1(g_val);

            for (int64_t j = 0; j < vec_count; j++) {
                size_t base_j = j * GLA_VECTOR_SIZE;
                size_t base_j = j * gla_vector_size;
                size_t t_h_j_offset = t_h_offset + base_j;
                size_t h_2d_i_j_offset = h_2d_i_offset + base_j;

@@ -8227,7 +8406,7 @@ static void ggml_compute_forward_gla_f32(
            }

            // Handle remaining elements, this will not be used.
            for (int64_t j = vec_count * GLA_VECTOR_SIZE; j < head_size; j++) {
            for (int64_t j = vec_count * gla_vector_size; j < head_size; j++) {
                size_t t_h_j_offset = t_h_offset + j;
                size_t h_2d_i_j_offset = h_2d_i_offset + j;
                float v_val = v[t_h_j_offset];
@@ -8336,83 +8515,126 @@ static void ggml_compute_forward_rwkv_wkv7_f32(
|
||||
int64_t h_stride_2d = head_size * head_size;
|
||||
|
||||
#if defined(GGML_SIMD)
|
||||
for (int64_t t = 0; t < T; t++) {
|
||||
int64_t t_offset = t * t_stride;
|
||||
int64_t state_offset = head_size * C * (t / (T / n_seqs));
|
||||
float * state_cur = state + state_offset;
|
||||
float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
// scalar Route to scalar implementation //TODO: Write SVE code
|
||||
for (int64_t t = 0; t < T; t++) {
|
||||
int64_t t_offset = t * t_stride;
|
||||
int64_t state_offset = head_size * C * (t / (T / n_seqs));
|
||||
float * state_cur = state + state_offset;
|
||||
float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
|
||||
|
||||
for (int64_t h = h_start; h < h_end; h++) {
|
||||
int64_t h_offset = h * h_stride;
|
||||
int64_t t_h_offset = t_offset + h_offset;
|
||||
int64_t h_2d_offset = h * h_stride_2d;
|
||||
for (int64_t h = h_start; h < h_end; h++) {
|
||||
int64_t h_offset = h * h_stride;
|
||||
int64_t t_h_offset = t_offset + h_offset;
|
||||
int64_t h_2d_offset = h * h_stride_2d;
|
||||
|
||||
for (int64_t ii = 0; ii < head_size; ii++) {
|
||||
int64_t t_h_i_offset = t_h_offset + ii;
|
||||
int64_t h_2d_i_offset = h_2d_offset + ii * h_stride;
|
||||
for (int64_t i = 0; i < head_size; i++) {
|
||||
int64_t t_h_i_offset = t_h_offset + i;
|
||||
int64_t h_2d_i_offset = h_2d_offset + i * h_stride;
|
||||
|
||||
GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]);
|
||||
float v_val = v[t_h_i_offset];
|
||||
|
||||
float sa = 0;
|
||||
{
|
||||
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
|
||||
GGML_F32_VEC ax[GGML_F32_ARR];
|
||||
GGML_F32_VEC ay[GGML_F32_ARR];
|
||||
for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) {
|
||||
for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
|
||||
ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]);
|
||||
ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]);
|
||||
sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]);
|
||||
}
|
||||
float sa = 0, result = 0;
|
||||
for (int64_t j = 0; j < head_size; j++) {
|
||||
sa += a[t_h_offset + j] * state_prev[h_2d_i_offset + j];
|
||||
}
|
||||
GGML_F32_VEC_REDUCE(sa, sum);
|
||||
}
|
||||
|
||||
GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa);
|
||||
for (int64_t j = 0; j < head_size; j++) {
|
||||
int64_t t_h_j_offset = t_h_offset + j;
|
||||
int64_t h_2d_i_j_offset = h_2d_i_offset + j;
|
||||
|
||||
int64_t j = 0;
|
||||
GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
|
||||
for (; j < head_size; j += GGML_F32_STEP) {
|
||||
for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
|
||||
int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR;
|
||||
int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR;
|
||||
|
||||
GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]);
|
||||
GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]);
|
||||
GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]);
|
||||
GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]);
|
||||
|
||||
k_vec = GGML_F32_VEC_MUL(v_vec, k_vec);
|
||||
|
||||
GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]);
|
||||
// kv + s * decay + sa * b
|
||||
state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec);
|
||||
state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec);
|
||||
GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec);
|
||||
|
||||
result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec);
|
||||
float r_val = r[t_h_j_offset];
|
||||
float w_val = w[t_h_j_offset];
|
||||
float k_val = k[t_h_j_offset];
|
||||
float b_val = b[t_h_j_offset];
|
||||
float kv_val = v_val * k_val;
|
||||
float prev_state_val = state_prev[h_2d_i_j_offset];
|
||||
state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
|
||||
result += state_cur[h_2d_i_j_offset] * r_val;
|
||||
}
|
||||
}
|
||||
GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec);
|
||||
|
||||
// There shouldn't be left-overs though.
|
||||
for (; j < head_size; j++) {
|
||||
int64_t t_h_j_offset = t_h_offset + j;
|
||||
int64_t h_2d_i_j_offset = h_2d_i_offset + j;
|
||||
|
||||
float r_val = r[t_h_j_offset];
|
||||
float w_val = w[t_h_j_offset];
|
||||
float k_val = k[t_h_j_offset];
|
||||
float b_val = b[t_h_j_offset];
|
||||
float kv_val = v[t_h_i_offset] * k_val;
|
||||
|
||||
float prev_state_val = state_prev[h_2d_i_j_offset];
|
||||
state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
|
||||
dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val;
|
||||
dst_data[t_h_i_offset] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
for (int64_t t = 0; t < T; t++) {
    int64_t t_offset = t * t_stride;
    int64_t state_offset = head_size * C * (t / (T / n_seqs));
    float * state_cur = state + state_offset;
    float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;

    for (int64_t h = h_start; h < h_end; h++) {
        int64_t h_offset = h * h_stride;
        int64_t t_h_offset = t_offset + h_offset;
        int64_t h_2d_offset = h * h_stride_2d;

        for (int64_t ii = 0; ii < head_size; ii++) {
            int64_t t_h_i_offset = t_h_offset + ii;
            int64_t h_2d_i_offset = h_2d_offset + ii * h_stride;

            GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]);

            float sa = 0;
            {
                GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
                GGML_F32_VEC ax[GGML_F32_ARR];
                GGML_F32_VEC ay[GGML_F32_ARR];
                for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) {
                    for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
                        ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]);
                        ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]);
                        sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]);
                    }
                }
                GGML_F32_VEC_REDUCE(sa, sum);
            }

            GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa);

            int64_t j = 0;
            GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
            for (; j < head_size; j += GGML_F32_STEP) {
                for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
                    int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR;
                    int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR;

                    GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]);
                    GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]);
                    GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]);
                    GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]);

                    k_vec = GGML_F32_VEC_MUL(v_vec, k_vec);

                    GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]);
                    // kv + s * decay + sa * b
                    state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec);
                    state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec);
                    GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec);

                    result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec);
                }
            }
            GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec);

            // There shouldn't be left-overs though.
            for (; j < head_size; j++) {
                int64_t t_h_j_offset = t_h_offset + j;
                int64_t h_2d_i_j_offset = h_2d_i_offset + j;

                float r_val = r[t_h_j_offset];
                float w_val = w[t_h_j_offset];
                float k_val = k[t_h_j_offset];
                float b_val = b[t_h_j_offset];
                float kv_val = v[t_h_i_offset] * k_val;

                float prev_state_val = state_prev[h_2d_i_j_offset];
                state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
                dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val;
            }
        }
    }
}
#endif
#else
for (int64_t t = 0; t < T; t++) {
    int64_t t_offset = t * t_stride;

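The vector code above is easier to audit against the scalar recurrence it implements. Below is a minimal standalone C++ sketch of one (token, head, row) step, using the same names as the kernel; the free function itself is illustrative and not part of ggml:

#include <cstdint>

// One (token, head, row) step of the wkv7 recurrence, scalar form:
//   sa           = dot(a, state_prev_row)            (computed by the caller)
//   state_cur[j] = state_prev[j]*w[j] + v*k[j] + sa*b[j]
//   out         += dot(state_cur_row, r)
static float wkv7_row_step(int64_t head_size, float v_val, float sa,
                           const float * r, const float * w, const float * k, const float * b,
                           const float * state_prev, float * state_cur) {
    float out = 0.0f;
    for (int64_t j = 0; j < head_size; j++) {
        float kv_val = v_val * k[j];                               // outer-product term v*k
        float s = state_prev[j] * w[j] + kv_val + sa * b[j];       // kv + s*decay + sa*b
        state_cur[j] = s;
        out += s * r[j];                                           // project back through r
    }
    return out;
}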
@@ -17,7 +17,123 @@

// number of elements to fit in a single register
//

#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA)

#define GGML_SIMD

// F32 SVE
#define GGML_F32_EPR 8
#define DEFAULT_PG svptrue_b32()

#define GGML_F32xt                        svfloat32_t
#define GGML_F32xt_ZERO                   svdup_n_f32(0.0f)
#define GGML_F32xt_SET1(x)                svdup_n_f32(x)
#define GGML_F32xt_LOAD_IMPL(pg, a, ...)  svld1_f32(pg, a)
#define GGML_F32xt_LOAD(...)              GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_STORE_IMPL(pg,a,b)     svst1_f32(pg, a, b)
#define GGML_F32xt_STORE(...)             GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_FMA_IMPL(pg, a, b, c)  svmad_f32_m(pg, a, b, c)
#define GGML_F32xt_FMA(...)               GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_ADD_IMPL(pg, a, b)     svadd_f32_m(pg, a, b)
#define GGML_F32xt_ADD(...)               GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_MUL_IMPL(pg, a, b)     svmul_f32_m(pg, a, b)
#define GGML_F32xt_MUL(...)               GGML_F32xt_MUL_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
#define GGML_F32xt_REDUCE_ONE(...)        GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
{                                                      \
    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2);        \
    sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4);        \
    sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6);        \
    sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8);        \
    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3);        \
    sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7);        \
    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5);        \
    (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1);  \
}
#define GGML_F32xt_REDUCE(...) GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, __VA_ARGS__)

#define GGML_F32_VEC        GGML_F32xt
#define GGML_F32_VEC_ZERO   GGML_F32xt_ZERO
#define GGML_F32_VEC_SET1   GGML_F32xt_SET1
#define GGML_F32_VEC_LOAD   GGML_F32xt_LOAD
#define GGML_F32_VEC_STORE  GGML_F32xt_STORE
#define GGML_F32_VEC_FMA    GGML_F32xt_FMA
#define GGML_F32_VEC_ADD    GGML_F32xt_ADD
#define GGML_F32_VEC_MUL    GGML_F32xt_MUL
#define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE

// F16 NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#define GGML_F16_STEP 32
#define GGML_F16_EPR  8

#define GGML_F16x8              float16x8_t
#define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
#define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
#define GGML_F16x8_LOAD(x)      vld1q_f16((const __fp16 *)(x))
#define GGML_F16x8_STORE        vst1q_f16
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
#define GGML_F16x8_ADD          vaddq_f16
#define GGML_F16x8_MUL          vmulq_f16
#define GGML_F16x8_REDUCE(res, x)                                 \
do {                                                              \
    int offset = GGML_F16_ARR >> 1;                               \
    for (int i = 0; i < offset; ++i) {                            \
        (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);                \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);                \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);                \
    }                                                             \
    const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0]));   \
    const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0]));   \
    (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));           \
} while (0)

#define GGML_F16_VEC                GGML_F16x8
#define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
#define GGML_F16_VEC_SET1           GGML_F16x8_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
#define GGML_F16_VEC_FMA            GGML_F16x8_FMA
#define GGML_F16_VEC_ADD            GGML_F16x8_ADD
#define GGML_F16_VEC_MUL            GGML_F16x8_MUL
#define GGML_F16_VEC_REDUCE         GGML_F16x8_REDUCE
#else
// if FP16 vector arithmetic is not supported, we use FP32 instead
// and take advantage of the vcvt_ functions to convert to/from FP16

#define GGML_F16_STEP 16
#define GGML_F16_EPR  4

#define GGML_F32Cx4             float32x4_t
#define GGML_F32Cx4_ZERO        vdupq_n_f32(0.0f)
#define GGML_F32Cx4_SET1(x)     vdupq_n_f32(x)
#define GGML_F32Cx4_LOAD(x)     vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
#define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32Cx4_ADD         vaddq_f32
#define GGML_F32Cx4_MUL         vmulq_f32
#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE

#define GGML_F16_VEC                GGML_F32Cx4
#define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
#endif

#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)

#define GGML_SIMD

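These GGML_F32_VEC_* aliases are the whole point of simd-mappings.h: kernels are written once against a fixed vocabulary (load, store, FMA, reduce), and each architecture block redefines that vocabulary. A toy C++ sketch of the pattern, with a hypothetical 4-wide "vector" type so it stays portable (the TOY_* names are not real ggml macros):

// Toy illustration of the simd-mappings pattern: one loop body, several ISAs.
struct vec4 { float v[4]; };

#define TOY_F32_EPR  4
#define TOY_F32_VEC  vec4
#define TOY_F32_ZERO (vec4{{0, 0, 0, 0}})

static inline vec4 toy_load(const float * p) { return vec4{{p[0], p[1], p[2], p[3]}}; }
static inline vec4 toy_fma(vec4 acc, vec4 a, vec4 b) {
    for (int i = 0; i < 4; i++) acc.v[i] += a.v[i] * b.v[i];
    return acc;
}
static inline float toy_reduce(vec4 a) { return a.v[0] + a.v[1] + a.v[2] + a.v[3]; }

// The kernel is written once against the macro names; swapping the macro
// definitions (as simd-mappings.h does per architecture) retargets it.
float toy_dot(int n, const float * x, const float * y) {
    TOY_F32_VEC sum = TOY_F32_ZERO;
    int i = 0;
    for (; i + TOY_F32_EPR <= n; i += TOY_F32_EPR) {
        sum = toy_fma(sum, toy_load(x + i), toy_load(y + i));
    }
    float s = toy_reduce(sum);
    for (; i < n; i++) s += x[i] * y[i]; // scalar leftovers
    return s;
}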
@@ -17,29 +17,98 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G

#if defined(GGML_SIMD)
    float sumf = 0.0f;
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
#if defined(__ARM_FEATURE_SVE)
    const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
    const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
    const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers

    GGML_F32_VEC ax[GGML_F32_ARR];
    GGML_F32_VEC ay[GGML_F32_ARR];
    const int np = (n & ~(ggml_f32_step - 1));
    svfloat32_t sum1 = svdup_n_f32(0.0f);
    svfloat32_t sum2 = svdup_n_f32(0.0f);
    svfloat32_t sum3 = svdup_n_f32(0.0f);
    svfloat32_t sum4 = svdup_n_f32(0.0f);
    svfloat32_t sum5 = svdup_n_f32(0.0f);
    svfloat32_t sum6 = svdup_n_f32(0.0f);
    svfloat32_t sum7 = svdup_n_f32(0.0f);
    svfloat32_t sum8 = svdup_n_f32(0.0f);
    svfloat32_t ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8;
    svfloat32_t ay1,ay2,ay3,ay4,ay5,ay6,ay7,ay8;
    for (int i = 0; i < np; i += ggml_f32_step) {
        ax1 = GGML_F32_VEC_LOAD(x + i);
        ay1 = GGML_F32_VEC_LOAD(y + i);
        sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
        ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
        ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
        sum2 = GGML_F32_VEC_FMA(ax2, ay2, sum2);

            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
        ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
        ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
        sum3 = GGML_F32_VEC_FMA(ax3, ay3, sum3);

        ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
        ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
        sum4 = GGML_F32_VEC_FMA(ax4, ay4, sum4);

        ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
        ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
        sum5 = GGML_F32_VEC_FMA(ax5, ay5, sum5);

        ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
        ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
        sum6 = GGML_F32_VEC_FMA(ax6, ay6, sum6);

        ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
        ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
        sum7 = GGML_F32_VEC_FMA(ax7, ay7, sum7);

        ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
        ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
        sum8 = GGML_F32_VEC_FMA(ax8, ay8, sum8);
        }
    }
    // leftovers
    // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
    const int np2 = (n & ~(ggml_f32_epr - 1));
    for (int i = np; i < np2; i += ggml_f32_epr) {
        ax1 = GGML_F32_VEC_LOAD(x + i);
        ay1 = GGML_F32_VEC_LOAD(y + i);
        sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
    }
    // maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmad on available elements only
    if (np2 < n) {
        svbool_t pg = svwhilelt_b32(np2, n);
        ax1 = svld1_f32(pg, x + np2);
        ay1 = svld1_f32(pg, y + np2);
        sum1 = svmad_f32_m(pg, ax1, ay1, sum1);
    }
    // reduce sum1..sum8 to sum1
    GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
#else
    const int np = (n & ~(GGML_F32_STEP - 1));

    // reduce sum0..sum3 to sum0
    GGML_F32_VEC_REDUCE(sumf, sum);
    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };

    // leftovers
    for (int i = np; i < n; ++i) {
        sumf += x[i]*y[i];
    }
    GGML_F32_VEC ax[GGML_F32_ARR];
    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);

            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
        }
    }

    // reduce sum0..sum3 to sum0
    GGML_F32_VEC_REDUCE(sumf, sum);

    // leftovers
    for (int i = np; i < n; ++i) {
        sumf += x[i]*y[i];
    }
#endif
#else
    // scalar
    ggml_float sumf = 0.0;

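The predicated tail above is the idiomatic SVE alternative to a scalar cleanup loop: svwhilelt_b32 builds a lane mask for the remaining elements, and the masked load and FMA touch only active lanes. A minimal dot-product sketch in plain ACLE intrinsics, assuming SVE is enabled at compile time (illustrative, not repository code):

#include <arm_sve.h>

// Predicated dot product: processes n elements with no scalar cleanup.
// Each iteration is svcntw() lanes wide; the predicate masks lanes >= n.
float sve_dot(int n, const float * x, const float * y) {
    svfloat32_t acc = svdup_n_f32(0.0f);
    for (int i = 0; i < n; i += (int) svcntw()) {
        svbool_t    pg = svwhilelt_b32(i, n);     // lanes i..n-1 active
        svfloat32_t ax = svld1_f32(pg, x + i);    // masked load
        svfloat32_t ay = svld1_f32(pg, y + i);
        acc = svmla_f32_m(pg, acc, ax, ay);       // acc += ax*ay on active lanes
    }
    return svaddv_f32(svptrue_b32(), acc);        // horizontal sum
}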
@@ -5,6 +5,7 @@

#include "ggml-impl.h"
#include "simd-mappings.h"
#include "ggml.h"
#include "ggml-cpu.h"

#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>
@@ -148,27 +149,108 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG

inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD)
    const int np = (n & ~(GGML_F32_STEP - 1));
#if defined(__ARM_FEATURE_SVE)

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
    const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
    const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
    const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    GGML_F32_VEC ax[GGML_F32_ARR];
    GGML_F32_VEC ay[GGML_F32_ARR];
    const int np = (n & ~(ggml_f32_step - 1));
    svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
    svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
    for (int i = 0; i < np; i += ggml_f32_step) {

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
        ax1 = GGML_F32_VEC_LOAD(x + i);
        ay1 = GGML_F32_VEC_LOAD(y + i);
        ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        GGML_F32_VEC_STORE(y + i, ay1);

        ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
        ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
        ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2);

        GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);

        ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
        ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
        ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3);

        GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);

        ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
        ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
        ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4);

        GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);

        ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
        ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
        ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5);

        GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);

        ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
        ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
        ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6);

        GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);

        ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
        ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
        ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7);

        GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);

        ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
        ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
        ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8);

        GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
        }
    }
    // leftovers
    // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
    const int np2 = (n & ~(ggml_f32_epr - 1));
    for (int i = np; i < np2; i += ggml_f32_epr) {
        ax1 = GGML_F32_VEC_LOAD(x + i);
        ay1 = GGML_F32_VEC_LOAD(y + i);
        ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] += x[i]*v;
    }
        GGML_F32_VEC_STORE(y + i, ay1);
    }
    // maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmad on available elements only
    if (np2 < n) {
        svbool_t pg = svwhilelt_b32(np2, n);
        ax1 = svld1_f32(pg, x + np2);
        ay1 = svld1_f32(pg, y + np2);
        ay1 = svmad_f32_m(pg, ax1, vx, ay1);

        svst1_f32(pg, y + np2, ay1);
    }
#else
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    GGML_F32_VEC ax[GGML_F32_ARR];
    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] += x[i]*v;
    }
#endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
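Whichever branch is compiled, ggml_vec_mad_f32 must preserve the axpy contract y[i] += x[i]*v. A hypothetical check harness (not part of the repo) that uses an odd length so the unrolled body, the per-register loop, and the predicated tail all execute:

#include <cmath>
#include <cstdio>

// Scalar reference for ggml_vec_mad_f32: y[i] += x[i] * v.
static void vec_mad_ref(int n, float * y, const float * x, float v) {
    for (int i = 0; i < n; i++) {
        y[i] += x[i] * v;
    }
}

int main() {
    // n = 67 is deliberately awkward: with an 8-register unroll it exercises
    // the main loop, the per-register leftover loop, and the predicated tail.
    enum { N = 67 };
    float x[N], y[N], y_ref[N];
    for (int i = 0; i < N; i++) {
        x[i] = 0.25f * i;
        y[i] = y_ref[i] = 1.0f - 0.1f * i;
    }
    vec_mad_ref(N, y_ref, x, 3.0f);
    vec_mad_ref(N, y, x, 3.0f);   // swap in ggml_vec_mad_f32 here to test the SIMD path
    for (int i = 0; i < N; i++) {
        if (std::fabs(y[i] - y_ref[i]) > 1e-5f) {
            std::printf("mismatch at %d\n", i);
            return 1;
        }
    }
    std::printf("ok\n");
    return 0;
}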
@@ -220,36 +302,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
}

#if defined(GGML_SIMD)
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];

    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        vx[k] = GGML_F32_VEC_SET1(v[k][0]);
    }

    GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);

            for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
                ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
#if defined(__ARM_FEATURE_SVE)
    // route to scalar implementation   // TODO: write SVE code
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        for (int i = 0; i < n; ++i) {
            y[i] += x[k][i]*v[k][0];
        }

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }
#else
    const int np = (n & ~(GGML_F32_STEP - 1));

    // leftovers
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        for (int i = np; i < n; ++i) {
            y[i] += x[k][i]*v[k][0];
    GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];

    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        vx[k] = GGML_F32_VEC_SET1(v[k][0]);
        }
    }

    GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);

            for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
                ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
            }

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        for (int i = np; i < n; ++i) {
            y[i] += x[k][i]*v[k][0];
        }
    }
#endif
#else
    // scalar
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
@@ -265,25 +356,53 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmul(y, 1, &v, y, 1, n);
#elif defined(GGML_SIMD)
    const int np = (n & ~(GGML_F32_STEP - 1));
#if defined(__ARM_FEATURE_SVE)
    const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
    const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
    const int ggml_f32_step = 2 * ggml_f32_epr;

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
    const int np = (n & ~(ggml_f32_step - 1));
    svfloat32_t ay1;
    svfloat32_t ay2;
    for (int i = 0; i < np; i += ggml_f32_step) {
        ay1 = GGML_F32_VEC_LOAD(y + i);
        ay1 = GGML_F32_VEC_MUL(ay1, vx);
        GGML_F32_VEC_STORE(y + i, ay1);

    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
        ay2 = GGML_F32_VEC_MUL(ay2, vx);
        GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
        }
    }
    // leftovers
    // maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmul on available elements only
    if (np < n) {
        svbool_t pg = svwhilelt_b32(np, n);
        ay1 = svld1_f32(pg, y + np);
        ay1 = svmul_f32_m(pg, ay1, vx);
        svst1_f32(pg, y + np, ay1);
    }
#else
    const int np = (n & ~(GGML_F32_STEP - 1));

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] *= v;
    }
    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] *= v;
    }
#endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
@@ -428,6 +547,7 @@ inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp

static const float GELU_COEF_A     = 0.044715f;
static const float GELU_QUICK_COEF = -1.702f;
static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
static const float SQRT_2_INV      = 0.70710678118654752440084436210484f;

inline static float ggml_gelu_f32(float x) {
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));

@@ -440,6 +560,14 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
    }
}

inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float xi = GGML_FP16_TO_FP32(x[i]);
        float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
        y[i] = GGML_FP32_TO_FP16(res);
    }
}

#ifdef GGML_GELU_FP16
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    uint16_t t;

@@ -463,6 +591,13 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
}
#endif

inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        float xi = x[i];
        y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
    }
}

inline static float ggml_gelu_quick_f32(float x) {
    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
}

@@ -512,6 +647,42 @@ inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
#endif

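GELU_ERF is the exact formulation 0.5*x*(1 + erf(x/sqrt(2))); the tanh expression with GELU_COEF_A above is an approximation of it. A small comparison using only standard C math and the same constants as the code above (a sketch, not repository code):

#include <math.h>
#include <stdio.h>

int main(void) {
    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
    const float SQRT_2_INV     = 0.70710678118654752440084436210484f;
    const float GELU_COEF_A    = 0.044715f;

    for (float x = -3.0f; x <= 3.0f; x += 1.5f) {
        float exact  = 0.5f*x*(1.0f + erff(x*SQRT_2_INV));                                   // GELU_ERF
        float approx = 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));     // tanh GELU
        printf("x=% .1f  erf=% .6f  tanh=% .6f  diff=% .2e\n", x, exact, approx, exact - approx);
    }
    return 0;
}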
/* Below function was borrowed from the GitHub repository:
   https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
    // Constants
    const svfloat32_t log2_e      = svdup_n_f32(1.4426950409f);
    const svfloat32_t ln2         = svdup_n_f32(0.6931473921f);
    const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
    const svuint32_t  not_mask17  = svdup_n_u32(~((1u << 17) - 1));
    const svfloat32_t one         = svdup_n_f32(1.0f);
    const svfloat32_t inactive1   = svdup_n_f32(0.0f);
    const svint32_t   inactive2   = svdup_n_s32(0);

    // Algorithm starts here
    svfloat32_t t0 = svmul_f32_m(pg, src, log2_e);      // y = x * log2(e)
    svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0);  // round to int (float)
    svint32_t   t2 = svcvt_s32_f32_m(inactive2, pg, t1); // n

    t1 = svsub_f32_m(pg, t0, t1);  // a = y - floor(y)
    t1 = svadd_f32_m(pg, t1, one); // b = a + 1

    svuint32_t  t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17); // v = b >> 17 (u32)
    svfloat32_t t4 = svexpa_f32(t3);    // c = fexpa(v)
    t4 = svscale_f32_m(pg, t4, t2);     // fexpa(v) * 2^(n)

    // and_(t2.d, t1.d, not_mask17.d)
    svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
    t5 = svsub_f32_m(pg, t1, t5);               // z
    t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq); // ln2 + half_ln2_sq * z
    t0 = svmla_f32_m(pg, one, t5, t0);          // 1 + (ln2 * z) + (half_ln2_sq * z * z)
    t0 = svmul_f32_m(pg, t0, t4);               // Final result

    return t0;
}
#endif
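The routine is easier to follow next to its scalar skeleton: scale x by log2(e), split into an integer part n and a fraction a, recover 2^a (which the SVE fexpa instruction does from the top mantissa bits), then scale by 2^n with a small polynomial correction in ln 2. A rough scalar rendering of the same decomposition, illustrative only and not bit-exact with the SVE code:

#include <math.h>

// Scalar skeleton of exp_ps_sve: exp(x) = 2^n * 2^a with 0 <= a < 1,
// where the SVE path recovers 2^a via svexpa plus a quadratic
// correction (the ln2 + half_ln2_sq * z terms above).
static float exp_decomposed(float x) {
    float y = x * 1.4426950409f;     // x * log2(e)
    float n = floorf(y);             // integer part
    float a = y - n;                 // fractional part in [0,1)
    float two_a = expf(a * 0.6931473921f); // 2^a = exp(a*ln2); fexpa+poly on SVE
    return ldexpf(two_a, (int) n);   // scale by 2^n
}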

#if defined(__ARM_NEON) && defined(__aarch64__)

// adapted from arm limited optimized routine

@@ -168,7 +168,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in

#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)

#if !defined(GGML_USE_HIP)
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
static const char * cu_get_error_str(CUresult err) {
    const char * err_str;
    cuGetErrorString(err, &err_str);

@@ -635,6 +635,7 @@ struct ggml_cuda_device_info {
    int     nsm;                // number of streaming multiprocessors
    size_t  smpb;               // max. shared memory per block
    size_t  smpbo;              // max. shared memory per block (with opt-in)
    bool    integrated;         // Device is integrated as opposed to discrete
    bool    vmm;                // virtual memory support
    size_t  vmm_granularity;    // granularity of virtual memory
    size_t  total_vram;

@@ -1,5 +1,8 @@
#include "cpy.cuh"
#include "dequantize.cuh"
#ifdef GGML_USE_MUSA
#include "ggml-musa/mudnn.cuh"
#endif // GGML_USE_MUSA

typedef void (*cpy_kernel_t)(const char * cx, char * cdst);

@@ -597,7 +600,14 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
#endif
    if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
        GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
        CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
#ifdef GGML_USE_MUSA
        if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
            CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
        } else
#endif // GGML_USE_MUSA
        {
            CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
        }
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {

@@ -623,8 +623,8 @@ static __global__ void flash_attn_combine_results(
    __builtin_assume(tid < D);

    extern __shared__ float2 meta[];
    if (tid < 2*parallel_blocks) {
        ((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + tid];
    for (int i = tid; i < 2*parallel_blocks; i += D) {
        ((float *) meta)[i] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + i];
    }

    __syncthreads();

@@ -772,7 +772,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
    GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
    GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B);
    GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
    GGML_UNUSED(kb0);
    GGML_UNUSED(kb0); GGML_UNUSED(tile_Q);
    NO_DEVICE_CODE;
#endif // NEW_MMA_AVAILABLE
}

@@ -1246,7 +1246,7 @@ static __global__ void flash_attn_ext_f16(
        NO_DEVICE_CODE;
        return;
    }
#endif __CUDA_ARCH__ == GGML_CUDA_CC_TURING
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_TURING

    static_assert(!mla || DKQ >= DV, "MLA needs DKQ >= DV");

@@ -2,9 +2,9 @@
#include "fattn-common.cuh"

template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#ifndef GGML_USE_HIP
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#endif // GGML_USE_HIP
static __global__ void flash_attn_vec_ext_f16(
        const char * __restrict__ Q,
        const char * __restrict__ K,

@@ -48,6 +48,12 @@ static __global__ void flash_attn_vec_ext_f16(
        NO_DEVICE_CODE;
        return;
    }
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
    if (ncols > 1) {
        NO_DEVICE_CODE;
        return;
    }
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)

    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

@@ -91,6 +97,13 @@ static __global__ void flash_attn_vec_ext_f16(
            kqsum_shared[j][threadIdx.x] = 0.0f;
        }
    }

    __shared__ half maskh_shared[ncols*D];
#pragma unroll
    for (int j = 0; j < ncols; ++j) {
        maskh_shared[j*D + tid] = 0.0f;
    }

    __syncthreads();

    // Convert Q to half2 (f16 K) or q8_1 (quantized K) and store in registers:

@@ -175,6 +188,36 @@ static __global__ void flash_attn_vec_ext_f16(
    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:

        if (mask) {
#pragma unroll
            for (int j = 0; j < ncols; ++j) {
                maskh_shared[j*D + tid] = slopeh*maskh[j*ne11 + k_VKQ_0 + tid];
            }

            __syncthreads();

            // When using multiple parallel sequences in llama.cpp, some KV slices can be fully masked out.
            // In such cases, skip the KV slice.
            // On AMD __all_sync would not work correctly because it assumes a warp size of 64.
#ifndef GGML_USE_HIP
            bool skip = true;
#pragma unroll
            for (int j = 0; j < ncols; ++j) {
#pragma unroll
                for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
                    const int i = i0 + threadIdx.x;

                    const float2 tmp = __half22float2(((const half2 *) maskh_shared)[j*(D/2) + i]);
                    skip = skip && isinf(tmp.x) && isinf(tmp.y);
                }
            }
            if (__all_sync(0xFFFFFFFF, skip)) {
                __syncthreads();
                continue;
            }
#endif // GGML_USE_HIP
        }

        // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
        // see https://github.com/ggerganov/llama.cpp/pull/7061 .
        // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).

@@ -202,7 +245,7 @@ static __global__ void flash_attn_vec_ext_f16(
                sum = logit_softcap*tanhf(sum);
            }

            sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
            sum += maskh_shared[j*D + i_KQ];

            if (ncols == 1) {
                kqmax_new = ggml_cuda_hmax(kqmax_new, sum);

@@ -335,7 +378,9 @@ void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml
    float logit_softcap;
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));

    if (Q->ne[1] == 1) {
    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;

    if (Q->ne[1] == 1 || GGML_CUDA_CC_IS_NVIDIA(cc)) {
        constexpr int cols_per_block = 1;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;

@@ -2,9 +2,9 @@
#include "fattn-common.cuh"

template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#ifndef GGML_USE_HIP
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
#endif // GGML_USE_HIP
static __global__ void flash_attn_vec_ext_f32(
        const char * __restrict__ Q,
        const char * __restrict__ K,

@@ -60,6 +60,12 @@ static __global__ void flash_attn_vec_ext_f32(
        NO_DEVICE_CODE;
        return;
    }
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
    if (ncols > 1) {
        NO_DEVICE_CODE;
        return;
    }
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)

    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

@@ -104,6 +110,13 @@ static __global__ void flash_attn_vec_ext_f32(
            kqsum_shared[j][threadIdx.x] = 0.0f;
        }
    }

    __shared__ float maskf_shared[ncols*D];
#pragma unroll
    for (int j = 0; j < ncols; ++j) {
        maskf_shared[j*D + tid] = 0.0f;
    }

    __syncthreads();

    // Convert Q to float2 (f16 K) or q8_1 (quantized K) and store in registers:

@@ -181,6 +194,35 @@ static __global__ void flash_attn_vec_ext_f32(
    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:

        if (mask) {
#pragma unroll
            for (int j = 0; j < ncols; ++j) {
                maskf_shared[j*D + tid] = slope*__half2float(maskh[j*ne11 + k_VKQ_0 + tid]);
            }

            __syncthreads();

            // When using multiple parallel sequences in llama.cpp, some KV slices can be fully masked out.
            // In such cases, skip the KV slice.
            // On AMD __all_sync would not work correctly because it assumes a warp size of 64.
#ifndef GGML_USE_HIP
            bool skip = true;
#pragma unroll
            for (int j = 0; j < ncols; ++j) {
#pragma unroll
                for (int i0 = 0; i0 < D; i0 += WARP_SIZE) {
                    const int i = i0 + threadIdx.x;

                    skip = skip && isinf(maskf_shared[j*D + i]);
                }
            }
            if (__all_sync(0xFFFFFFFF, skip)) {
                __syncthreads();
                continue;
            }
#endif // GGML_USE_HIP
        }

        float kqmax_new_arr[ncols];
#pragma unroll
        for (int j = 0; j < ncols; ++j) {

@@ -204,7 +246,7 @@ static __global__ void flash_attn_vec_ext_f32(
                sum = logit_softcap*tanhf(sum);
            }

            sum += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
            sum += maskf_shared[j*D + i_KQ];

            kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum);

@@ -326,7 +368,9 @@ void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml
    float logit_softcap;
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));

    if (Q->ne[1] == 1) {
    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;

    if (Q->ne[1] == 1 || GGML_CUDA_CC_IS_NVIDIA(cc)) {
        constexpr int cols_per_block = 1;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;

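The skip test added to both kernels relies on a fully masked-out KV slice holding -inf in every mask position: each thread scans its share of the shared mask and the warp vote (__all_sync) confirms all lanes agree before the block moves on. The same decision, reduced to a scalar C++ sketch:

#include <cmath>

// Decide whether a KV tile can be skipped: true only if every mask entry
// is infinite (masked-out positions store -inf after the slope multiply),
// i.e. no query in the tile attends to any key position.
static bool tile_fully_masked(const float * mask_tile, int ncols, int D) {
    for (int j = 0; j < ncols; j++) {
        for (int i = 0; i < D; i++) {
            if (!std::isinf(mask_tile[j*D + i])) {
                return false; // at least one live position: process the tile
            }
        }
    }
    return true; // every lane agreed, analogous to __all_sync(0xFFFFFFFF, skip)
}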
@@ -243,10 +243,10 @@ static ggml_cuda_device_info ggml_cuda_init() {

        info.default_tensor_split[id] = total_vram;
        total_vram += prop.totalGlobalMem;

        info.devices[id].nsm        = prop.multiProcessorCount;
        info.devices[id].smpb       = prop.sharedMemPerBlock;
        info.devices[id].warp_size  = prop.warpSize;
        info.devices[id].integrated = prop.integrated;
        info.devices[id].nsm        = prop.multiProcessorCount;
        info.devices[id].smpb       = prop.sharedMemPerBlock;
        info.devices[id].warp_size  = prop.warpSize;
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
        info.devices[id].smpbo = prop.sharedMemPerBlock;

@@ -1065,6 +1065,10 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
    GGML_UNUSED(buft);
}

static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
    return buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
}

static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    CUDA_CHECK(cudaFreeHost(buffer->context));
}

@@ -2192,6 +2196,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_UNARY_OP_SILU:
            ggml_cuda_op_silu(ctx, dst);
            break;
        case GGML_UNARY_OP_GELU_ERF:
            ggml_cuda_op_gelu_erf(ctx, dst);
            break;
        case GGML_UNARY_OP_GELU_QUICK:
            ggml_cuda_op_gelu_quick(ctx, dst);
            break;

@@ -2638,6 +2645,8 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {

static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
    bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
    // flag used to determine whether it is an integrated_gpu
    const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;

    while (!graph_evaluated_or_captured) {
        // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.

@@ -2656,7 +2665,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                if (node->src[j] != nullptr) {
                    assert(node->src[j]->buffer);
                    assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
                           ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
                           ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft)));
                }
            }
#endif

@@ -2977,6 +2986,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_UNARY_OP_SIGMOID:
        case GGML_UNARY_OP_HARDSIGMOID:
        case GGML_UNARY_OP_HARDSWISH:
        case GGML_UNARY_OP_GELU_ERF:
        case GGML_UNARY_OP_GELU_QUICK:
        case GGML_UNARY_OP_TANH:
        case GGML_UNARY_OP_EXP:

@@ -2990,9 +3000,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            {
                struct ggml_tensor * a = op->src[0];
                struct ggml_tensor * b = op->src[1];
                // for small weight matrices the active device can end up without any rows, don't use row split in those cases
                // this avoids some edge cases (and the performance would not be good anyways)
                if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
                    if (a->ne[2] > 1 || a->ne[3] > 1) {
                        return false;
                    }
                    // for small weight matrices the active device can end up without any rows, don't use row split in those cases
                    // this avoids some edge cases (and the performance would not be good anyways)
                    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
                    int64_t row_low;
                    int64_t row_high;

@@ -3259,7 +3272,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
}

static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
    return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
    const bool integrated = ggml_cuda_info().devices[dev_ctx->device].integrated;
    return (((ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev) || (integrated && ggml_backend_buft_is_cuda_host(buft)));
}

static int64_t get_op_batch_size(const ggml_tensor * op) {

@@ -23,6 +23,12 @@ static __device__ __forceinline__ float op_gelu(float x) {
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}

static __device__ __forceinline__ float op_gelu_erf(float x) {
    const float SQRT_2_INV = 0.70710678118654752440084436210484f;

    return 0.5f*x*(1.0f + erff(x*SQRT_2_INV));
}

static __device__ __forceinline__ float op_gelu_quick(float x) {
    const float GELU_QUICK_COEF = -1.702f;

@@ -134,6 +140,10 @@ void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_gelu>(ctx, dst);
}

void ggml_cuda_op_gelu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_gelu_erf>(ctx, dst);
}

void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_gelu_quick>(ctx, dst);
}

@@ -30,6 +30,8 @@ void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_gelu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

@@ -32,6 +32,8 @@
extern "C" {
#endif

void ggml_print_backtrace(void);

#ifndef MIN
#    define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif

@@ -386,7 +388,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
    return r;
}

#elif defined(__riscv) && defined(GGML_RV_ZFH)
#elif defined(__riscv) && defined(__riscv_zfhmin)

static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    float f;

@@ -149,6 +149,8 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_SIGMOID,
    GGML_METAL_KERNEL_TYPE_GELU,
    GGML_METAL_KERNEL_TYPE_GELU_4,
    GGML_METAL_KERNEL_TYPE_GELU_ERF,
    GGML_METAL_KERNEL_TYPE_GELU_ERF_4,
    GGML_METAL_KERNEL_TYPE_GELU_QUICK,
    GGML_METAL_KERNEL_TYPE_GELU_QUICK_4,
    GGML_METAL_KERNEL_TYPE_SILU,

@@ -1103,6 +1105,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
    GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIGMOID,      sigmoid,      true);
    GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU,         gelu,         true);
    GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_4,       gelu_4,       true);
    GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_ERF,     gelu_erf,     true);
    GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_ERF_4,   gelu_erf_4,   true);
    GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK,   gelu_quick,   true);
    GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, gelu_quick_4, true);
    GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU,         silu,         true);

@@ -1613,6 +1617,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
            case GGML_UNARY_OP_RELU:
            case GGML_UNARY_OP_SIGMOID:
            case GGML_UNARY_OP_GELU:
            case GGML_UNARY_OP_GELU_ERF:
            case GGML_UNARY_OP_GELU_QUICK:
            case GGML_UNARY_OP_SILU:
            case GGML_UNARY_OP_ELU:

@@ -2251,6 +2256,25 @@ static bool ggml_metal_encode_node(

                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
            } break;
        case GGML_UNARY_OP_GELU_ERF:
            {
                int64_t n = ggml_nelements(dst);

                id<MTLComputePipelineState> pipeline = nil;

                if (n % 4 == 0) {
                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_ERF_4].pipeline;
                    n /= 4;
                } else {
                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_ERF].pipeline;
                }

                [encoder setComputePipelineState:pipeline];
                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];

                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
            } break;
        case GGML_UNARY_OP_GELU_QUICK:
            {
                int64_t n = ggml_nelements(dst);

@@ -856,6 +856,7 @@ kernel void kernel_tanh(
constant float GELU_COEF_A     = 0.044715f;
constant float GELU_QUICK_COEF = -1.702f;
constant float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
constant float SQRT_2_INV      = 0.70710678118654752440084436210484f;

kernel void kernel_gelu(
    device const float * src0,

@@ -897,6 +898,42 @@ kernel void kernel_gelu_quick_4(
    dst[tpig] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
}

// based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation
// ref: https://www.johndcook.com/blog/python_erf/
constant float p_erf  = 0.3275911f;
constant float a1_erf =  0.254829592f;
constant float a2_erf = -0.284496736f;
constant float a3_erf =  1.421413741f;
constant float a4_erf = -1.453152027f;
constant float a5_erf =  1.061405429f;

template<typename T>
T erf_approx(T x) {
    T sign_x = sign(x);
    x = fabs(x);
    T t = 1.0f / (1.0f + p_erf * x);
    T y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x);
    return sign_x * y;
}

kernel void kernel_gelu_erf(
    device const float * src0,
    device       float * dst,
    uint tpig[[thread_position_in_grid]]) {
    device const float & x = src0[tpig];

    dst[tpig] = 0.5f*x*(1.0f+erf_approx<float>(x*SQRT_2_INV));
}

kernel void kernel_gelu_erf_4(
    device const float4 * src0,
    device       float4 * dst,
    uint tpig[[thread_position_in_grid]]) {
    device const float4 & x = src0[tpig];

    dst[tpig] = 0.5f*x*(1.0f+erf_approx<float4>(x*SQRT_2_INV));
}
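erf_approx is the classic Abramowitz and Stegun 7.1.26 rational approximation, with absolute error on the order of 1e-7. A quick host-side C++ check against std::erf using the same constants as the Metal kernel (a sketch; assumes nothing beyond the C++ standard library):

#include <algorithm>
#include <cmath>
#include <cstdio>

// Abramowitz & Stegun 7.1.26, as used by kernel_gelu_erf above.
static float erf_approx(float x) {
    const float p  = 0.3275911f;
    const float a1 =  0.254829592f, a2 = -0.284496736f, a3 = 1.421413741f;
    const float a4 = -1.453152027f, a5 =  1.061405429f;
    float sign = x < 0.0f ? -1.0f : 1.0f;
    x = std::fabs(x);
    float t = 1.0f / (1.0f + p * x);
    float y = 1.0f - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * std::exp(-x * x);
    return sign * y;
}

int main() {
    float max_err = 0.0f;
    for (float x = -4.0f; x <= 4.0f; x += 0.01f) {
        max_err = std::max(max_err, std::fabs(erf_approx(x) - std::erf(x)));
    }
    std::printf("max |error| = %g\n", max_err); // on the order of 1e-7
    return 0;
}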

kernel void kernel_silu(
    device const float * src0,
    device       float * dst,
@@ -3255,7 +3292,7 @@ template<
    typename kd4x4_t, // key type in device memory
    short nl_k,
    void (*deq_k)(device const kd4x4_t *, short, thread k4x4_t &),
    typename vd4x4_t, // key type in device memory
    typename vd4x4_t, // value type in device memory
    short nl_v,
    void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &),
    short DK, // K head size

@@ -3776,7 +3813,7 @@ template<
    typename kd4_t, // key type in device memory
    short nl_k,
    void (*deq_k_t4)(device const kd4_t *, short, thread k4_t &),
    typename vd4_t, // key type in device memory
    typename vd4_t, // value type in device memory
    short nl_v,
    void (*deq_v_t4)(device const vd4_t *, short, thread v4_t &),
    short DK, // K head size

@@ -27,12 +27,15 @@ if (MUSAToolkit_FOUND)

    file(GLOB   GGML_HEADERS_MUSA "../ggml-cuda/*.cuh")
    list(APPEND GGML_HEADERS_MUSA "../../include/ggml-cuda.h")
    list(APPEND GGML_HEADERS_MUSA "../ggml-musa/mudnn.cuh")

    file(GLOB GGML_SOURCES_MUSA "../ggml-cuda/*.cu")
    file(GLOB SRCS "../ggml-cuda/template-instances/fattn-mma*.cu")
    list(APPEND GGML_SOURCES_MUSA ${SRCS})
    file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu")
    list(APPEND GGML_SOURCES_MUSA ${SRCS})
    file(GLOB SRCS "../ggml-musa/*.cu")
    list(APPEND GGML_SOURCES_MUSA ${SRCS})

    if (GGML_CUDA_FA_ALL_QUANTS)
        file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")

@@ -62,7 +65,9 @@ if (MUSAToolkit_FOUND)
    )

    # TODO: do not use CUDA definitions for MUSA
    target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
    if (NOT GGML_BACKEND_DL)
        target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
    endif()

    add_compile_definitions(GGML_USE_MUSA)
    add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})

@@ -92,9 +97,10 @@ if (MUSAToolkit_FOUND)
    endif()

    if (GGML_STATIC)
        # TODO: mudnn has not provided static libraries yet
        target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
    else()
        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas)
        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas mudnn)
    endif()

    if (GGML_CUDA_NO_VMM)

ggml/src/ggml-musa/mudnn.cu (new file, +112 lines)
@@ -0,0 +1,112 @@
#include <mutex>
#include <mudnn.h>

#include "mudnn.cuh"

namespace mudnn = musa::dnn;

// Returns a human-readable error string for mudnn::Status
const char* mudnnGetErrorString(mudnn::Status err) {
    switch (err) {
        case mudnn::Status::SUCCESS:
            return "Success";
        case mudnn::Status::INVALID_PARAMETER:
            return "Invalid parameter";
        case mudnn::Status::NOT_INITIALIZED:
            return "Not initialized";
        case mudnn::Status::ALLOC_FAILED:
            return "Allocation failed";
        case mudnn::Status::NOT_SUPPORTED:
            return "Not supported";
        case mudnn::Status::INTERNAL_ERROR:
            return "Internal error";
        case mudnn::Status::ARCH_MISMATCH:
            return "Architecture mismatch";
        case mudnn::Status::EXECUTION_FAILED:
            return "Execution failed";
        default:
            return "Unknown mudnn status";
    }
}

// Error checking macro for MUDNN calls
#define MUDNN_CHECK(err) CUDA_CHECK_GEN(err, mudnn::Status::SUCCESS, mudnnGetErrorString)

namespace {
    // Thread-safe cache for mudnn::Handle objects per device
    std::unordered_map<int, std::unique_ptr<mudnn::Handle>> handle_cache;
    std::mutex handle_cache_mutex;

    mudnn::Handle* get_cached_handle(int device_id) {
        std::lock_guard<std::mutex> lock(handle_cache_mutex);
        auto it = handle_cache.find(device_id);
        if (it != handle_cache.end()) {
            return it->second.get();
        }
        auto handle = std::make_unique<mudnn::Handle>(device_id);
        mudnn::Handle* handle_ptr = handle.get();
        handle_cache[device_id] = std::move(handle);
        return handle_ptr;
    }
}

// Extracts dimensions and strides from a ggml_tensor
int get_ggml_dims_and_strides(const ggml_tensor* tensor,
                              std::vector<int64_t>& dims,
                              std::vector<int64_t>& strides) {
    const int ndims = ggml_n_dims(tensor);
    const size_t element_size = ggml_element_size(tensor);

    dims.resize(ndims);
    strides.resize(ndims);

    for (int i = 0; i < ndims; ++i) {
        dims[i]    = tensor->ne[i];
        strides[i] = tensor->nb[i] / static_cast<int64_t>(element_size);
    }
    return ndims;
}

// Converts ggml_type to mudnn::Tensor::Type
mudnn::Tensor::Type ggml_type_to_mudnn_type(ggml_type type) {
    switch (type) {
        case GGML_TYPE_F32:
            return mudnn::Tensor::Type::FLOAT;
        case GGML_TYPE_F16:
            return mudnn::Tensor::Type::HALF;

        // TODO: Add support for other types

        default:
            MUDNN_CHECK(mudnn::Status::NOT_SUPPORTED);
    }

    return mudnn::Tensor::Type::FLOAT; // Default fallback
}

// Asynchronous memory copy using mudnn::Unary::IDENTITY
musaError_t mudnnMemcpyAsync(ggml_backend_cuda_context& ctx, const ggml_tensor* dst, const ggml_tensor* src) {
    mudnn::Tensor tensor_dst, tensor_src;

    MUDNN_CHECK(tensor_dst.SetType(ggml_type_to_mudnn_type(dst->type)));
    MUDNN_CHECK(tensor_src.SetType(ggml_type_to_mudnn_type(src->type)));

    std::vector<int64_t> dims, strides;
    const int ndims = get_ggml_dims_and_strides(src, dims, strides);

    MUDNN_CHECK(tensor_dst.SetNdInfo(ndims, dims.data(), strides.data()));
    MUDNN_CHECK(tensor_src.SetNdInfo(ndims, dims.data(), strides.data()));
    MUDNN_CHECK(tensor_dst.SetAddr(dst->data));
    MUDNN_CHECK(tensor_src.SetAddr(src->data));

    mudnn::Unary op;
    MUDNN_CHECK(op.SetMode(mudnn::Unary::Mode::IDENTITY));
    MUDNN_CHECK(op.SetAlpha(0.0f));
    MUDNN_CHECK(op.SetBeta(0.0f));

    mudnn::Handle* handle = get_cached_handle(ctx.device);
    MUDNN_CHECK(handle->SetStream(ctx.stream()));
    MUDNN_CHECK(op.Run(*handle, tensor_dst, tensor_src));

    return musaSuccess;
}
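get_cached_handle above is a standard lazily populated, mutex-guarded per-device cache, which avoids re-creating library state on every copy. The shape of the pattern, stripped of muDNN specifics (generic C++, with a hypothetical make_handle factory supplied by the caller):

#include <memory>
#include <mutex>
#include <unordered_map>

// Generic per-device handle cache: one handle per device id, created lazily,
// guarded by a mutex so concurrent backend threads share instances safely.
template <typename Handle>
Handle * get_cached(int device_id, Handle * (*make_handle)(int)) {
    static std::unordered_map<int, std::unique_ptr<Handle>> cache;
    static std::mutex mtx;

    std::lock_guard<std::mutex> lock(mtx);
    auto it = cache.find(device_id);
    if (it == cache.end()) {
        it = cache.emplace(device_id, std::unique_ptr<Handle>(make_handle(device_id))).first;
    }
    return it->second.get();
}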
ggml/src/ggml-musa/mudnn.cuh (new file, +12 lines)
@@ -0,0 +1,12 @@
#pragma once

#include "../include/ggml.h"
#include "../ggml-cuda/common.cuh"

// Asynchronously copies data from src tensor to dst tensor using the provided context.
// Returns a musaError_t indicating success or failure.
musaError_t mudnnMemcpyAsync(
    ggml_backend_cuda_context &ctx,
    const ggml_tensor *dst,
    const ggml_tensor *src
);
@@ -55,14 +55,17 @@ endfunction()

set(GGML_OPENCL_KERNELS
    add
    argsort
    clamp
    cpy
    cvt
    diag_mask_inf
    div
    gelu
    gemv_noshuffle_general
    gemv_noshuffle
    get_rows
    group_norm
    im2col_f32
    im2col_f16
    mul_mat_Ab_Bi_8x4

@@ -83,11 +86,14 @@ set(GGML_OPENCL_KERNELS
    rms_norm
    rope
    scale
    sigmoid
    silu
    softmax_4_f32
    softmax_4_f16
    softmax_f32
    softmax_f16
    sub
    sum_rows
    transpose
)

(file diff suppressed because it is too large)

ggml/src/ggml-opencl/kernels/argsort.cl (new file, +86 lines)
@@ -0,0 +1,86 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif

#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif

#define SWAP(x, y, T) { T tmp = (x); (x) = (y); (y) = tmp; }

enum ggml_sort_order {
    GGML_SORT_ORDER_ASC,
    GGML_SORT_ORDER_DESC,
};

kernel void kernel_argsort_f32_i32(
    global float * src0,
    ulong offset0,
    global int * dst,
    ulong offsetd,
    const int ne00,
    const int ne00_pad,
    const int order,
    local int * dst_row
) {
    // bitonic sort
    int col = get_local_id(0);
    int row = get_group_id(1);

    if (col >= ne00_pad) {
        return;
    }

    src0 = (global float *)((global char *)src0 + offset0);
    dst  = (global int   *)((global char *)dst  + offsetd);

    global float * x_row = src0 + row * ne00;

    // initialize indices
    dst_row[col] = col;

    barrier(CLK_LOCAL_MEM_FENCE);

    for (int k = 2; k <= ne00_pad; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            int ixj = col ^ j;
            if (ixj > col) {
                if ((col & k) == 0) {
                    if (dst_row[col] >= ne00 ||
                        (dst_row[ixj] < ne00 && (order == GGML_SORT_ORDER_ASC ?
                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
                    ) {
                        SWAP(dst_row[col], dst_row[ixj], int);
                    }
                } else {
                    if (dst_row[ixj] >= ne00 ||
                        (dst_row[col] < ne00 && (order == GGML_SORT_ORDER_ASC ?
                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
                    ) {
                        SWAP(dst_row[col], dst_row[ixj], int);
                    }
                }
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }
    }

    // copy the result to dst without the padding
    if (col < ne00) {
        dst[row * ne00 + col] = dst_row[col];
    }
}
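The kernel assumes ne00_pad is ne00 rounded up to a power of two (bitonic networks need power-of-two sizes) and that the local work size equals ne00_pad, with padding indices sorting to the end of the row. A hedged host-side sketch of the padding computation (helper name invented):

    // next power of two >= n; used to size the local range and the
    // dst_row scratch buffer (ne00_pad * sizeof(int) bytes of local memory)
    static int next_pow2(int n) {
        int p = 1;
        while (p < n) {
            p *= 2;
        }
        return p;
    }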
ggml/src/ggml-opencl/kernels/div.cl (new file, 72 lines)
@@ -0,0 +1,72 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

//------------------------------------------------------------------------------
// div
//------------------------------------------------------------------------------
kernel void kernel_div(
    global char * src0,
    ulong offset0,
    global char * src1,
    ulong offset1,
    global char * dst,
    ulong offsetd,
    ulong nb00,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    int ne10,
    int ne11,
    int ne12,
    int ne13,
    ulong nb10,
    ulong nb11,
    ulong nb12,
    ulong nb13,
    int ne0,
    ulong nb0,
    ulong nb1,
    ulong nb2,
    ulong nb3
) {
    src0 = src0 + offset0;
    src1 = src1 + offset1;
    dst  = dst + offsetd;

    int i03 = get_group_id(2);
    int i02 = get_group_id(1);
    int i01 = get_group_id(0);

    int i13 = i03 % ne13;
    int i12 = i02 % ne12;
    int i11 = i01 % ne11;

    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;

    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const int i10 = i0 % ne10;
        *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) / *((global float *)(src1_ptr + i10*nb10));
    }
}

// assumption: src1 is a row
// broadcast src1 into src0
kernel void kernel_div_row(
    global float4 * src0,
    ulong offset0,
    global float4 * src1,
    ulong offset1,
    global float4 * dst,
    ulong offsetd,
    int ne
) {
    src0 = (global float4*)((global char*)src0 + offset0);
    src1 = (global float4*)((global char*)src1 + offset1);
    dst  = (global float4*)((global char*)dst  + offsetd);

    // This performs better than using %.
    uint gid = get_global_id(0);
    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
    dst[gid] = src0[gid] / src1[idx1];
}
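A small worked example of the row-broadcast indexing above (illustrative only): with ne = 4 and src1 = {10, 20, 30, 40} treated as one row, the element gid = 6 computes idx1 = 6 - (6/4)*4 = 2 and reads src1[2], so every row of src0 is divided elementwise by the same src1 row.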
ggml/src/ggml-opencl/kernels/group_norm.cl (new file, 72 lines)
@@ -0,0 +1,72 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif

#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif

// Workgroup must be a subgroup
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_32
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_group_norm(
    global float * src0,
    ulong offset0,
    global float * dst,
    ulong offsetd,
    int ne,
    int group_size,
    float eps
) {
    src0 = (global float *)((global char *)src0 + offset0);
    dst  = (global float *)((global char *)dst  + offsetd);

    int start = get_group_id(0) * group_size;
    int end   = start + group_size;

    start += get_local_id(0);

    if (end >= ne) {
        end = ne;
    }

    float tmp = 0.0f;

    for (int j = start; j < end; j += get_local_size(0)) {
        tmp += src0[j];
    }

    tmp = sub_group_reduce_add(tmp);

    const float mean = tmp / group_size;
    tmp = 0.0f;

    for (int j = start; j < end; j += get_local_size(0)) {
        float xi = src0[j] - mean;
        dst[j] = xi;
        tmp += xi * xi;
    }

    tmp = sub_group_reduce_add(tmp);

    const float variance = tmp / group_size;
    const float scale = 1.0f/sqrt(variance + eps);
    for (int j = start; j < end; j += get_local_size(0)) {
        dst[j] *= scale;
    }
}
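Reading the kernel above: the first strided loop plus sub_group_reduce_add yields the group mean, the second pass stores the centered values and accumulates squared deviations for the variance, and the final pass rescales by 1/sqrt(variance + eps). This only works because the workgroup is constrained to a single subgroup, so the subgroup reductions cover every partial sum.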
ggml/src/ggml-opencl/kernels/sigmoid.cl (new file, 29 lines)
@@ -0,0 +1,29 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

//------------------------------------------------------------------------------
// sigmoid
//------------------------------------------------------------------------------

kernel void kernel_sigmoid_f32(
    global float * src0,
    ulong offset0,
    global float * dst,
    ulong offsetd
) {
    src0 = (global float*)((global char*)src0 + offset0);
    dst  = (global float*)((global char*)dst  + offsetd);

    dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
}

kernel void kernel_sigmoid_f16(
    global half * src0,
    ulong offset0,
    global half * dst,
    ulong offsetd
) {
    src0 = (global half*)((global char*)src0 + offset0);
    dst  = (global half*)((global char*)dst  + offsetd);

    dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
}
ggml/src/ggml-opencl/kernels/sub.cl (new file, 72 lines)
@@ -0,0 +1,72 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

//------------------------------------------------------------------------------
// sub
//------------------------------------------------------------------------------
kernel void kernel_sub(
    global char * src0,
    ulong offset0,
    global char * src1,
    ulong offset1,
    global char * dst,
    ulong offsetd,
    ulong nb00,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    int ne10,
    int ne11,
    int ne12,
    int ne13,
    ulong nb10,
    ulong nb11,
    ulong nb12,
    ulong nb13,
    int ne0,
    ulong nb0,
    ulong nb1,
    ulong nb2,
    ulong nb3
) {
    src0 = src0 + offset0;
    src1 = src1 + offset1;
    dst  = dst + offsetd;

    int i03 = get_group_id(2);
    int i02 = get_group_id(1);
    int i01 = get_group_id(0);

    int i13 = i03 % ne13;
    int i12 = i02 % ne12;
    int i11 = i01 % ne11;

    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;

    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const int i10 = i0 % ne10;
        *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) - *((global float *)(src1_ptr + i10*nb10));
    }
}

// assumption: src1 is a row
// broadcast src1 into src0
kernel void kernel_sub_row(
    global float4 * src0,
    ulong offset0,
    global float4 * src1,
    ulong offset1,
    global float4 * dst,
    ulong offsetd,
    int ne
) {
    src0 = (global float4*)((global char*)src0 + offset0);
    src1 = (global float4*)((global char*)src1 + offset1);
    dst  = (global float4*)((global char*)dst  + offsetd);

    // This performs better than using %.
    uint gid = get_global_id(0);
    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
    dst[gid] = src0[gid] - src1[idx1];
}
ggml/src/ggml-opencl/kernels/sum_rows.cl (new file, 39 lines)
@@ -0,0 +1,39 @@
kernel void kernel_sum_rows_f32(
    global float * src0,
    ulong offset0,
    global float * dst,
    ulong offsetd,
    int ne00,
    int ne01,
    int ne02,
    int ne03,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    ulong nb1,
    ulong nb2,
    ulong nb3
) {
    src0 = (global float *)((global char *)src0 + offset0);
    dst  = (global float *)((global char *)dst  + offsetd);

    int i3 = get_global_id(2);
    int i2 = get_global_id(1);
    int i1 = get_global_id(0);

    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
        return;
    }

    global float * src_row = (global float *) ((global char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
    global float * dst_row = (global float *) ((global char *) dst  + i1*nb1  + i2*nb2  + i3*nb3);

    float row_sum = 0;

    for (int i0 = 0; i0 < ne00; i0++) {
        row_sum += src_row[i0];
    }

    dst_row[0] = row_sum;
}
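Note on the launch shape implied above: one work-item per output row (indexed by i1, i2, i3), each summing its ne00 elements serially; there is no intra-row parallel reduction, which is a reasonable trade-off since rows are independent and typically short.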
@@ -13,7 +13,7 @@ elseif(SUPPORTS_SYCL)
If you expected the oneAPI Release compiler, please install oneAPI & source it, like:
source /opt/intel/oneapi/setvars.sh")
else()
    message(FATAL_ERROR, "C++ compiler lacks SYCL support.")
    message(FATAL_ERROR "C++ compiler lacks SYCL support.")
endif()
message(STATUS "SYCL found")
#todo: AOT
@@ -170,7 +170,7 @@ else()
    target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA)
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
    if (NOT GGML_SYCL_DEVICE_ARCH)
        message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
        message(FATAL_ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
    endif()
    target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas)
    target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
@@ -1,74 +1,93 @@
#include "binbcast.hpp"

#include <array>
#include <cstddef>
#include <cstdint>
#include <sycl/sycl.hpp>

#include "dpct/helper.hpp"
#include "ggml.h"

template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static __dpct_inline__ void k_bin_bcast_contiguous(const src0_t * __restrict__ src0, const src1_t * __restrict__ src1,
                                                   dst_t * dst, std::size_t num_elements, const sycl::nd_item<1> & it) {
    auto element_id   = it.get_global_id(0);
    auto global_range = it.get_global_range(0);
    for (; element_id < num_elements; element_id += global_range) {
        auto src0_float_val = sycl::vec(src0[element_id]).template convert<float, sycl::rounding_mode::rte>();
        auto src1_float_val = sycl::vec(src1[element_id]).template convert<float, sycl::rounding_mode::rte>();
        float dst_val       = bin_op(src0_float_val[0], src1_float_val[0]);
        auto val_to_store   = sycl::vec(dst_val).template convert<dst_t, sycl::rounding_mode::rte>();
        dst[element_id]     = val_to_store;
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
        int ne0, int ne1, int ne2, int ne3,
        int ne10, int ne11, int ne12, int ne13,
        /*int s0, */ int s1, int s2, int s3,
        /*int s00,*/ int s01, int s02, int s03,
        /*int s10,*/ int s11, int s12, int s13,
        const sycl::nd_item<3> &item_ct1) {
    const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                    item_ct1.get_local_id(2);
    const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
                    item_ct1.get_local_id(1));
    const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
                    item_ct1.get_local_id(0)) /
                   ne3;
    const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
                    item_ct1.get_local_id(0)) %
                   ne3;

    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
        return;
    }

    const int i11 = i1 % ne11;
    const int i12 = i2 % ne12;
    const int i13 = i3 % ne13;

    const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
    const size_t i_dst  = i3*s3 + i2*s2 + i1*s1;

    const src0_t * src0_row = src0 + i_src0;
    const src1_t * src1_row = src1 + i_src1;
    dst_t * dst_row = dst + i_dst;

    for (int i0 = i0s; i0 < ne0;
         i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
        const int i10 = i0 % ne10;
        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
    }
}

template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static __dpct_inline__ void k_bin_bcast(const src0_t * __restrict__ src0, const src1_t * __restrict__ src1, dst_t * dst,
                                        int ne0, int ne1, int ne2, int ne3, int ne10, int ne11, int ne12, int ne13,
                                        int s0, int s1, int s2, int s3, int s00, int s01, int s02, int s03, int s10,
                                        int s11, int s12, int s13, std::size_t num_dst_elements,
                                        const sycl::nd_item<1> & item_ct1) {
    auto calculate_logical_index =
        [](const std::array<int, 4> & dims, std::size_t element_id) __attribute__((always_inline))->std::array<int, 4> {
        std::array<int, 4> logical_index;
#pragma unroll(4)
        for (int i = 3; i >= 0; i--) {
            logical_index[i] = element_id % dims[i];
            element_id /= dims[i];
        }
        return logical_index;
    };
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
        int ne0, int ne1, int ne2, int ne3,
        int ne10, int ne11, int ne12, int ne13,
        /*int s0, */ int s1, int s2, int s3,
        /*int s00,*/ int s01, int s02, int s03,
        /*int s10,*/ int s11, int s12, int s13,
        const sycl::nd_item<3> &item_ct1) {

    auto calculate_index = [](const std::array<int, 4> & dims, const std::array<int, 4> & strides,
                              const std::array<int, 4> & indices) __attribute__((always_inline))
        ->std::size_t {
        std::size_t index = 0;
#pragma unroll(4)
        for (int i = 0; i < 4; i++) {
            auto index_i = indices[i];
            if (indices[i] >= dims[i]) {
                index_i = indices[i] % dims[i];
            }
            index += strides[i] * index_i;
        }
        return index;
    };
    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
                  item_ct1.get_local_id(2);

    auto element_id = item_ct1.get_global_id(0);
    for (; element_id < num_dst_elements; element_id += item_ct1.get_global_range(0)) {
        auto logical_index = calculate_logical_index({ ne3, ne2, ne1, ne0 }, element_id);
        auto src_0_index = calculate_index({ ne3, ne2, ne1, ne0 }, { s03, s02, s01, s00 }, logical_index);
        auto src_1_index = calculate_index({ ne13, ne12, ne11, ne10 }, { s13, s12, s11, s10 }, logical_index);
        auto dst_index = calculate_index({ ne3, ne2, ne1, ne0 }, { s3, s2, s1, s0 }, logical_index);
        auto src0_float_val = sycl::vec(src0[src_0_index]).template convert<float, sycl::rounding_mode::rte>();
        auto src1_float_val = sycl::vec(src1[src_1_index]).template convert<float, sycl::rounding_mode::rte>();
        float dst_val = bin_op(src0_float_val[0], src1_float_val[0]);
        auto val_to_store = sycl::vec(dst_val).template convert<dst_t, sycl::rounding_mode::rte>();
        dst[dst_index] = val_to_store;
    const int i3 = i/(ne2*ne1*ne0);
    const int i2 = (i/(ne1*ne0)) % ne2;
    const int i1 = (i/ne0) % ne1;
    const int i0 = i % ne0;

    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
        return;
    }

    const int i11 = i1 % ne11;
    const int i12 = i2 % ne12;
    const int i13 = i3 % ne13;

    const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
    const size_t i_dst  = i3*s3 + i2*s2 + i1*s1;

    const src0_t * src0_row = src0 + i_src0;
    const src1_t * src1_row = src1 + i_src1;
    dst_t * dst_row = dst + i_dst;

    const int i10 = i0 % ne10;
    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
}

template <float (*bin_op)(const float, const float)> struct bin_bcast_sycl {

template<float (*bin_op)(const float, const float)>
struct bin_bcast_sycl {
    template <typename src0_t, typename src1_t, typename dst_t>
    void operator()(const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, const int64_t ne00,
                    const int64_t ne01, const int64_t ne02, const int64_t ne03, const int64_t ne10, const int64_t ne11,
@@ -77,73 +96,165 @@ template <float (*bin_op)(const float, const float)> struct bin_bcast_sycl {
                    const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0,
                    const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous,
                    const bool src1_is_contiguous, const bool dst_is_contiguous, queue_ptr stream) {
        auto check_bcast_required = [](const std::array<int64_t, 4> & src_dims,
                                       const std::array<int64_t, 4> & dst_dims) -> bool {
            for (int i = 0; i < 4; i++) {
                if (dst_dims[i] > src_dims[i]) {
                    return true;
                }
            }
            return false;
        int nr0 = ne10 / ne0;
        int nr1 = ne11/ne1;
        int nr2 = ne12/ne2;
        int nr3 = ne13/ne3;

        int nr[4] = { nr0, nr1, nr2, nr3 };

        // collapse dimensions until first broadcast dimension
        int64_t cne[] = {ne0, ne1, ne2, ne3};
        int64_t cne0[] = {ne00, ne01, ne02, ne03};
        int64_t cne1[] = {ne10, ne11, ne12, ne13};
        size_t cnb[] = {nb0, nb1, nb2, nb3};
        size_t cnb0[] = {nb00, nb01, nb02, nb03};
        size_t cnb1[] = {nb10, nb11, nb12, nb13};
        auto collapse = [](int64_t cne[]) {
            cne[0] *= cne[1];
            cne[1] = cne[2];
            cne[2] = cne[3];
            cne[3] = 1;
        };

        dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
            cnb[1] *= cne[1];
            cnb[2] *= cne[2];
            cnb[3] *= cne[3];
        };

        GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
        GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
        GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
        GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
        if (src0_is_contiguous && src1_is_contiguous && dst_is_contiguous) {
            for (int i = 0; i < 4; i++) {
                if (nr[i] != 1) {
                    break;
                }
                if (i > 0) {
                    collapse_nb(cnb, cne);
                    collapse_nb(cnb0, cne0);
                    collapse_nb(cnb1, cne1);
                    collapse(cne);
                    collapse(cne0);
                    collapse(cne1);
                }
            }
        }
        {
            int64_t ne0 = cne[0];
            int64_t ne1 = cne[1];
            int64_t ne2 = cne[2];
            int64_t ne3 = cne[3];

        GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
        GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
        GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
        GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
            int64_t ne10 = cne1[0];
            int64_t ne11 = cne1[1];
            int64_t ne12 = cne1[2];
            int64_t ne13 = cne1[3];

        GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
        GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
        GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
        GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
            size_t nb0 = cnb[0];
            size_t nb1 = cnb[1];
            size_t nb2 = cnb[2];
            size_t nb3 = cnb[3];

        // dst strides in number of elements
        size_t s0 = nb0 / sizeof(dst_t);
        size_t s1 = nb1 / sizeof(dst_t);
        size_t s2 = nb2 / sizeof(dst_t);
        size_t s3 = nb3 / sizeof(dst_t);
            size_t nb00 = cnb0[0];
            size_t nb01 = cnb0[1];
            size_t nb02 = cnb0[2];
            size_t nb03 = cnb0[3];

        // src1 strides in number of elements
        size_t s10 = nb10 / sizeof(src0_t);
        size_t s11 = nb11 / sizeof(src1_t);
        size_t s12 = nb12 / sizeof(src1_t);
        size_t s13 = nb13 / sizeof(src1_t);
            size_t nb10 = cnb1[0];
            size_t nb11 = cnb1[1];
            size_t nb12 = cnb1[2];
            size_t nb13 = cnb1[3];

        // src0 strides in number of elements
        size_t s00 = nb00 / sizeof(src0_t);
        size_t s01 = nb01 / sizeof(src0_t);
        size_t s02 = nb02 / sizeof(src0_t);
        size_t s03 = nb03 / sizeof(src0_t);
            size_t s0 = nb0 / sizeof(dst_t);
            size_t s1 = nb1 / sizeof(dst_t);
            size_t s2 = nb2 / sizeof(dst_t);
            size_t s3 = nb3 / sizeof(dst_t);

        std::size_t num_dst_elements = static_cast<std::size_t>(ne0) * static_cast<std::size_t>(ne1) *
                                       static_cast<std::size_t>(ne2) * static_cast<std::size_t>(ne3);
        std::size_t local_range  = 256;
        std::size_t global_range = ceil_div(num_dst_elements, local_range) * local_range;
            size_t s10 = nb10 / sizeof(src1_t);
            size_t s11 = nb11 / sizeof(src1_t);
            size_t s12 = nb12 / sizeof(src1_t);
            size_t s13 = nb13 / sizeof(src1_t);

        bool needs_broadcasting = check_bcast_required({ ne00, ne01, ne02, ne03 }, { ne0, ne1, ne2, ne3 }) ||
                                  check_bcast_required({ ne10, ne11, ne12, ne13 }, { ne0, ne1, ne2, ne3 });
        bool all_contiguous = src0_is_contiguous && src1_is_contiguous && dst_is_contiguous;
            size_t s00 = nb00 / sizeof(src0_t);
            size_t s01 = nb01 / sizeof(src0_t);
            size_t s02 = nb02 / sizeof(src0_t);
            size_t s03 = nb03 / sizeof(src0_t);

        if (!needs_broadcasting && all_contiguous) {
            stream->submit([&](sycl::handler & cgh) {
                cgh.parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), [=](sycl::nd_item<1> it) {
                    k_bin_bcast_contiguous<bin_op>(src0_dd, src1_dd, dst_dd, num_dst_elements, it);
                });
            });
        } else {
            stream->submit([&](sycl::handler & cgh) {
                cgh.parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), [=](sycl::nd_item<1> it) {
                    k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, ne12, ne13, s0, s1,
                                        s2, s3, s00, s01, s02, s03, s10, s11, s12, s13, num_dst_elements, it);
                });
            });
            GGML_UNUSED(s00);

            GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
            GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
            GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
            GGML_ASSERT(nb3 % sizeof(dst_t) == 0);

            GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
            GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
            GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
            GGML_ASSERT(nb03 % sizeof(src0_t) == 0);

            GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
            GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
            GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
            GGML_ASSERT(nb13 % sizeof(src1_t) == 0);

            GGML_ASSERT(s0 == 1);
            GGML_ASSERT(s10 == 1);

            const int block_size = 128;

            int64_t hne0 = std::max(ne0/2LL, 1LL);

            sycl::range<3> block_dims(1, 1, 1);
            block_dims[2] = std::min<unsigned int>(hne0, block_size);
            block_dims[1] = std::min<unsigned int>(
                ne1, block_size / (unsigned int)block_dims[2]);
            block_dims[0] = std::min(
                std::min<unsigned int>(
                    ne2 * ne3, block_size / (unsigned int)block_dims[2] /
                                   (unsigned int)block_dims[1]),
                64U);

            sycl::range<3> block_nums(
                (ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
                (ne1 + block_dims[1] - 1) / block_dims[1],
                (hne0 + block_dims[2] - 1) / block_dims[2]);

            if (block_nums[0] > 65535) {
                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
                {
                    dpct::has_capability_or_fail(stream->get_device(),
                                                 {sycl::aspect::fp16});

                    stream->parallel_for(
                        sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
                                              sycl::range<3>(1, 1, block_size),
                                          sycl::range<3>(1, 1, block_size)),
                        [=](sycl::nd_item<3> item_ct1) {
                            k_bin_bcast_unravel<bin_op>(
                                src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
                                ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
                                s03, s11, s12, s13, item_ct1);
                        });
                }
            } else {
                /*
                DPCT1049:16: The work-group size passed to the SYCL kernel may
                exceed the limit. To get the device limit, query
                info::device::max_work_group_size. Adjust the work-group size if
                needed.
                */
                dpct::has_capability_or_fail(stream->get_device(),
                                             {sycl::aspect::fp16});

                stream->parallel_for(
                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
                    [=](sycl::nd_item<3> item_ct1) {
                        k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
                                            ne2, ne3, ne10, ne11, ne12, ne13,
                                            s1, s2, s3, s01, s02, s03, s11, s12, s13,
                                            item_ct1);
                    });
            }
        }
    }
};
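The heart of the new generic path is unravelling a flat element id into 4D logical indices and mapping them through per-tensor strides, with modulo wrapping for broadcast dimensions. A standalone sketch of the same arithmetic, in plain C++ with invented names, for readers who want to check it on the host:

    #include <array>
    #include <cstddef>

    // mirror of calculate_logical_index: dims ordered outermost-first, {ne3, ne2, ne1, ne0}
    static std::array<int, 4> unravel(std::size_t id, const std::array<int, 4> & dims) {
        std::array<int, 4> idx{};
        for (int i = 3; i >= 0; i--) {
            idx[i] = static_cast<int>(id % dims[i]);
            id /= dims[i];
        }
        return idx;
    }

    // mirror of calculate_index: wrap each index into its source dims (broadcast), then apply strides
    static std::size_t restride(const std::array<int, 4> & dims, const std::array<int, 4> & strides,
                                const std::array<int, 4> & idx) {
        std::size_t off = 0;
        for (int i = 0; i < 4; i++) {
            off += static_cast<std::size_t>(strides[i]) * (idx[i] % dims[i]);
        }
        return off;
    }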
@@ -208,32 +319,27 @@ inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *ds


void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s\n", __func__);
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
    ggml_sycl_op_add(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s\n", __func__);
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
    ggml_sycl_op_sub(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s\n", __func__);
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
    ggml_sycl_op_mul(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s\n", __func__);
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
    ggml_sycl_op_div(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s\n", __func__);
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_repeat(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}
@@ -13,8 +13,10 @@
#ifndef GGML_SYCL_COMMON_HPP
#define GGML_SYCL_COMMON_HPP

#include <cstddef>
#include <fstream>
#include <iostream>
#include <string>

#include "dpct/helper.hpp"
#include "ggml-sycl.h"
@@ -44,11 +46,20 @@ extern int g_ggml_sycl_debug;
extern int g_ggml_sycl_disable_optimize;
extern int g_ggml_sycl_prioritize_dmmv;

#define GGML_SYCL_DEBUG(...)          \
    do {                              \
        if (g_ggml_sycl_debug)        \
            fprintf(stderr, __VA_ARGS__); \
    } while (0)
#if defined(__clang__) && __has_builtin(__builtin_expect)
// Hint the optimizer to pipeline the more likely following instruction in branches
# define LIKELY(expr) __builtin_expect(expr, true)
# define UNLIKELY(expr) __builtin_expect(expr, false)
#else
# define LIKELY(expr) (expr)
# define UNLIKELY(expr) (expr)
#endif

#define GGML_SYCL_DEBUG(...)              \
    do {                                  \
        if (UNLIKELY(g_ggml_sycl_debug))  \
            fprintf(stderr, __VA_ARGS__); \
    } while (0)
#define CHECK_TRY_ERROR(expr) \
  [&]() {                     \
@@ -471,6 +482,19 @@ static __dpct_inline__ float warp_reduce_max(float x,
    return x;
}

/* Helper for computing the linear offset of a ggml_tensor given
   per-dimension sizes, strides, and indices */
template<int N>
__dpct_inline__ size_t calculate_offset(const std::array<int, N> & strides, const std::array<int, N> & indices) {
    size_t offset = 0;
#pragma unroll
    for (int i = 0; i < N; i++) {
        auto index_i = indices[i];
        offset += strides[i] * index_i;
    }
    return offset;
}
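A quick spot-check of calculate_offset with made-up numbers: element strides {24, 12, 4, 1} and indices {1, 0, 2, 3} give 1*24 + 0*12 + 2*4 + 3*1 = 35, i.e. the usual row-major linearization.

    // illustrative only: calculate_offset<4>({24, 12, 4, 1}, {1, 0, 2, 3}) == 35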
// Helper for vec loading aligned data
template <typename Tp, int n>
inline sycl::vec<Tp, n> vec_aligned_load(const Tp * aligned_ptr) {
@@ -490,4 +514,76 @@ constexpr size_t ceil_div(const size_t m, const size_t n) {
}

bool gpu_has_xmx(sycl::device &dev);

template <int N, class T> void debug_print_array(const std::string & prefix, const T array[N]) {
    if (LIKELY(!g_ggml_sycl_debug)) {
        return;
    }
    std::stringstream ss;
    ss << prefix << "=[";
    for (std::size_t i = 0; i < N - 1; ++i) {
        ss << array[i] << ", ";
    }
    if constexpr (N > 0) {
        ss << array[N - 1];
    }
    ss << "]";
    GGML_SYCL_DEBUG("%s", ss.str().c_str());
}
inline void debug_print_tensor(const std::string & prefix, const ggml_tensor * tensor,
                               const std::string & suffix = "") {
    if (LIKELY(!g_ggml_sycl_debug)) {
        return;
    }
    GGML_SYCL_DEBUG("%s=", prefix.c_str());
    if (tensor) {
        GGML_SYCL_DEBUG("'%s':type=%s", tensor->name, ggml_type_name(tensor->type));
        debug_print_array<GGML_MAX_DIMS>(";ne", tensor->ne);
        debug_print_array<GGML_MAX_DIMS>(";nb", tensor->nb);
        if (!ggml_is_contiguous(tensor)) {
            GGML_SYCL_DEBUG(";strided");
        }
        if (ggml_is_permuted(tensor)) {
            GGML_SYCL_DEBUG(";permuted");
        }
    } else {
        GGML_SYCL_DEBUG("nullptr");
    }
    GGML_SYCL_DEBUG("%s", suffix.c_str());
}

// Use scope_op_debug_print to log operations coming from running a model
struct scope_op_debug_print {
    // Use string_views to avoid the cost of creating and concatenating strings.
    // The string_views must stay alive for as long as the object is alive; in practice
    // scope_op_debug_print is used with string literals, which are stored in constant
    // space and therefore always accessible.
    scope_op_debug_print(const std::string_view & func, const std::string_view & func_suffix, const ggml_tensor * dst,
                         std::size_t num_src, const std::string_view & suffix = "") :
        func(func),
        func_suffix(func_suffix) {
        if (LIKELY(!g_ggml_sycl_debug)) {
            return;
        }
        GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data());
        debug_print_tensor(" dst", dst);
        if (dst) {
            for (std::size_t i = 0; i < num_src; ++i) {
                debug_print_tensor("\tsrc" + std::to_string(i), dst->src[i]);
            }
        }
        GGML_SYCL_DEBUG("%s\n", suffix.data());
    }

    scope_op_debug_print(const std::string_view & func, const ggml_tensor * dst, std::size_t num_src,
                         const std::string_view & suffix = "") :
        scope_op_debug_print(func, "", dst, num_src, suffix) {}

    ~scope_op_debug_print() { GGML_SYCL_DEBUG("[SYCL][OP] call %s%s done\n", func.data(), func_suffix.data()); }

  private:
    std::string_view func;
    std::string_view func_suffix;
};

#endif // GGML_SYCL_COMMON_HPP
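scope_op_debug_print is an RAII logger: construction prints the op name and its dst/src tensors, destruction prints the matching "done" line, and both are no-ops unless g_ggml_sycl_debug is set. A minimal usage sketch (hypothetical op, mirroring the call sites in the hunks below):

    void ggml_sycl_example_op(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
        scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
        // ... enqueue kernels; "done" is logged when scope_dbg_print is destroyed
    }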
@@ -159,39 +159,37 @@ static void concat_f32_sycl_non_cont(
}

void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
  const ggml_tensor *src0 = dst->src[0];
  const ggml_tensor *src1 = dst->src[1];
  queue_ptr stream = ctx.stream();
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    queue_ptr              stream = ctx.stream();

  const int32_t dim = ((int32_t *)dst->op_params)[0];
    const int32_t dim = ((int32_t *) dst->op_params)[0];

  if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
    const float *src0_d = (const float *)src0->data;
    const float *src1_d = (const float *)src1->data;
    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
        const float * src0_d = (const float *) src0->data;
        const float * src1_d = (const float *) src1->data;

    float *dst_d = (float *)dst->data;
        float * dst_d = (float *) dst->data;

    if (dim != 3) {
      for (int i3 = 0; i3 < dst->ne[3]; i3++) {
        concat_f32_sycl(
            src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
            dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1],
            src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
      }
        if (dim != 3) {
            for (int i3 = 0; i3 < dst->ne[3]; i3++) {
                concat_f32_sycl(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
                                dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0],
                                dst->ne[1], dst->ne[2], dim, stream);
            }
        } else {
            const size_t size0 = ggml_nbytes(src0);
            const size_t size1 = ggml_nbytes(src1);

            SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
            SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
        }
    } else {
      const size_t size0 = ggml_nbytes(src0);
      const size_t size1 = ggml_nbytes(src1);

      SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
      SYCL_CHECK(CHECK_TRY_ERROR(
          stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
        concat_f32_sycl_non_cont(stream, (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
                                 src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1],
                                 src0->nb[2], src0->nb[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
                                 src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
                                 dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
    }
  } else
    concat_f32_sycl_non_cont(
        stream, (const char *)src0->data, (const char *)src1->data,
        (char *)dst->data, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
        src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src1->ne[0],
        src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1],
        src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
        dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
}
@@ -72,6 +72,7 @@ static void conv_transpose_1d_f32_f32_sycl(
}

void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
    const ggml_tensor *src0 = dst->src[0];
    const ggml_tensor *src1 = dst->src[1];
    const float * src0_d = (const float *)src0->data;
@@ -616,6 +616,9 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co
}

void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
    // Unlike other operators, ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor and relies on its src field
    scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0,
                                         std::string(" src0 type=") + ggml_type_name(src0->type));
    const int64_t ne = ggml_nelements(src0);
    GGML_ASSERT(ne == ggml_nelements(src1));
@@ -629,8 +632,6 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co

    char * src0_ddc = (char *) src0->data;
    char * src1_ddc = (char *) src1->data;
    GGML_SYCL_DEBUG("[SYCL] %s: Tensor supplied: %s to %s\n", __func__, ggml_type_name(src0->type),
                    ggml_type_name(src1->type));

    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
@@ -694,8 +695,6 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
}

void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    // TODO: why do we pass dst as src1 here?
    GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_cpy(ctx, dst->src[0], dst);
    GGML_SYCL_DEBUG("[SYCL] call %s done\n", __func__);
}
@@ -1092,6 +1092,8 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;

    if (src1_convert_f16) {
        scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
                                             " : converting src1 to fp16");
        src1_dfloat = src1_dfloat_a.alloc(ne00);
        const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
        GGML_ASSERT(to_fp16_sycl != nullptr);
@@ -84,6 +84,15 @@ static void gelu_quick(const T *x, T *dst, int k,
    dst[i] = x[i] * (static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF * x[i])));
}

template<typename T>
static void gelu_erf(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
    const T SQRT_2_INV = static_cast<T>(0.70710678118654752440084436210484f);
    for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
        auto x_i = x[i];
        dst[i] = static_cast<T>(0.5f) * x_i * (static_cast<T>(1.0f) + sycl::erf(x_i * SQRT_2_INV));
    }
}

template<typename T>
static void tanh(const T *x, T *dst, int k,
                 const sycl::nd_item<3> &item_ct1) {
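For reference, this is the exact (erf-based) GELU, gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))); the SQRT_2_INV constant above is 1/sqrt(2) ≈ 0.7071067811865475.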
@@ -400,6 +409,20 @@ static void gelu_quick_sycl(const T *x, T *dst, const int k,
    });
}


template<typename T>
static void gelu_erf_sycl(const T *x, T *dst, const int k,
                          queue_ptr stream) {
    const int num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
    stream->parallel_for(
        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
                              sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
                          sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
        [=](sycl::nd_item<3> item_ct1) {
            gelu_erf(x, dst, k, item_ct1);
        });
}

template<typename T>
static void tanh_sycl(const T *x, T *dst, const int k,
                      queue_ptr stream) {
@@ -816,6 +839,38 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor
    }
}

inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
#if defined (GGML_SYCL_F16)
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
#else
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
#endif
    GGML_ASSERT(dst->src[0]->type == dst->type);
    dpct::queue_ptr main_stream = ctx.stream();
    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
    switch (dst->type) {
#if defined (GGML_SYCL_F16)
        case GGML_TYPE_F16:
            {
                auto data_pts = cast_data<sycl::half>(dst);
                gelu_erf_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                break;
            }
#endif
        case GGML_TYPE_F32:
            {
                auto data_pts = cast_data<float>(dst);
                gelu_erf_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
                break;
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
    }
}


inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
#if defined (GGML_SYCL_F16)
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
@@ -1391,146 +1446,126 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst)


void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_sqrt(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_sin(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_cos(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
    ggml_sycl_op_acc(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_gelu(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_silu(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_gelu_quick(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_gelu_erf(ctx, dst);
}

void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_tanh(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_relu(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_sigmoid(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_hardsigmoid(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_hardswish(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}


void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_exp(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_log(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_neg(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_step(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_leaky_relu(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_sqr(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_upscale(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_pad(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_clamp(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_sgn(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_abs(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_elu(ctx, dst);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}

@@ -38,6 +38,8 @@ void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

@@ -257,8 +257,7 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens
    GGML_UNUSED(ctx);
}

void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {

void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

@@ -308,4 +307,3 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
        GGML_ABORT("fatal error");
    }
}

@@ -346,6 +346,8 @@ static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) {
static enum ggml_status
ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
                                     ggml_tensor *tensor) try {
    GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
    debug_print_tensor(": tensor=", tensor, "\n");
    ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;

    if (tensor->view_src != NULL) {
@@ -381,20 +383,23 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
                                                ggml_tensor *tensor,
                                                const void *data, size_t offset,
                                                size_t size) try {

    GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
    debug_print_tensor(": tensor=", tensor);
    GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
    ggml_sycl_set_device(ctx->device);
    auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
    SYCL_CHECK(
        CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
    SYCL_CHECK(CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
#ifndef _WIN32
    // Note: use a host buffer to save the data from mmap(), then copy to device. This is a workaround for an mmap() issue on PVC GPUs.
    // This function is called while loading the model from disk, so replacing the dynamic allocation with a reusable memory buffer would not save time and would add a potential memory-leak risk here.
    char* host_buf = (char*)malloc(size);
    char * host_buf = (char *) malloc(size);
    memcpy(host_buf, data, size);
    SYCL_CHECK(
        CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, host_buf, size)
                            .wait()));
    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, host_buf, size).wait()));
    free(host_buf);
#else
    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, data, size).wait()));
#endif
}
catch (sycl::exception const &exc) {
    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -406,7 +411,9 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
                                                const ggml_tensor *tensor,
                                                void *data, size_t offset,
                                                size_t size) try {

    GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
    debug_print_tensor(": tensor=", tensor);
    GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;

    ggml_sycl_set_device(ctx->device);
@@ -434,7 +441,12 @@ static bool
ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
                                    const ggml_tensor *src,
                                    ggml_tensor *dst) try {
    if (ggml_backend_buffer_is_sycl(src->buffer)) {
    bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer);
    GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
    debug_print_tensor(": dst=", dst);
    debug_print_tensor(" src=", src);
    GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
    if (is_cpy_supported) {
        ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
        ggml_backend_sycl_buffer_context * dst_ctx = (ggml_backend_sycl_buffer_context *)dst->buffer->context;
@@ -491,7 +503,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,

static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
                                           uint8_t value) try {
    ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
    GGML_SYCL_DEBUG("[SYCL] call %s: size=%zu\n", __func__, buffer->size);
    ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;

    ggml_sycl_set_device(ctx->device);
    queue_ptr stream = ctx->stream;
@@ -510,7 +523,9 @@ catch (sycl::exception const &exc) {

static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value,
                                                   size_t offset, size_t size) {
    GGML_SYCL_DEBUG(" [SYCL] call %s\n", __func__);
    GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
    debug_print_tensor(": tensor=", tensor);
    GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value);
    ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
    SYCL_CHECK(ggml_sycl_set_device(ctx->device));
    auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
@@ -788,6 +803,8 @@ static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buff
|
||||
static enum ggml_status
|
||||
ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor, "\n");
|
||||
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
||||
|
||||
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
||||
@@ -872,6 +889,9 @@ static void
|
||||
ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor, const void *data,
|
||||
size_t offset, size_t size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
@@ -925,6 +945,9 @@ static void
|
||||
ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
const ggml_tensor *tensor, void *data,
|
||||
size_t offset, size_t size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
@@ -1411,6 +1434,59 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
|
||||
reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
|
||||
}
|
||||
|
||||
template <int ElementsPerWI>
|
||||
static __dpct_inline__ void quantize_and_reorder_q8_1(const float * __restrict__ x, void * reordered_q8_tensor,
|
||||
const int kx, const int kx_padded, const sycl::nd_item<1> & it) {
|
||||
/*
|
||||
Quantizes and reorders the resultant q8 tensor in a per row fashion
|
||||
Each sub-group calculates one quant block. i.e. QK8_1 quant values and the d and sum values
|
||||
*/
|
||||
|
||||
auto subgroup_id = it.get_group(0);
|
||||
auto wi_id = it.get_local_id(0);
|
||||
|
||||
const int num_blocks_per_row = kx / QK8_1;
|
||||
auto row = subgroup_id / num_blocks_per_row;
|
||||
auto col = subgroup_id % num_blocks_per_row;
|
||||
|
||||
auto row_offset = row * (kx_padded / QK8_1) * sizeof(block_q8_1);
|
||||
auto col_offset = QK8_1 * col + wi_id * ElementsPerWI;
|
||||
|
||||
auto quant_ptr = (int8_t *) ((char *) reordered_q8_tensor + row_offset + col_offset);
|
||||
auto ds_ptr = (sycl::half2 *) ((char *) reordered_q8_tensor + row_offset + kx + col * sizeof(sycl::half2));
|
||||
|
||||
sycl::vec<float, ElementsPerWI> wi_f32_vals;
|
||||
sycl::vec<int8_t, ElementsPerWI> quantized_values;
|
||||
|
||||
auto float_ptr_offset = subgroup_id * QK8_1 + ElementsPerWI * wi_id;
|
||||
wi_f32_vals = *reinterpret_cast<const sycl::vec<float, ElementsPerWI> *>(x + float_ptr_offset);
|
||||
|
||||
float sum = 0.0f;
|
||||
float amax = 0.0f;
|
||||
|
||||
#pragma unroll(ElementsPerWI)
|
||||
for (int i = 0; i < ElementsPerWI; i++) {
|
||||
sum += wi_f32_vals[i];
|
||||
amax = sycl::fmax(amax, sycl::fabs(wi_f32_vals[i]));
|
||||
quantized_values[i] = 0;
|
||||
}
|
||||
sum = sycl::reduce_over_group(it.get_group(), sum, sycl::plus<float>());
|
||||
amax = sycl::reduce_over_group(it.get_group(), amax, sycl::maximum<float>());
|
||||
float d = amax == 0 ? 1 : amax / 127;
|
||||
|
||||
#pragma unroll(ElementsPerWI)
|
||||
for (int i = 0; i < ElementsPerWI; i++) {
|
||||
quantized_values[i] = sycl::round(wi_f32_vals[i] / d);
|
||||
}
|
||||
|
||||
d = amax == 0 ? 0 : d;
|
||||
|
||||
*reinterpret_cast<sycl::vec<int8_t, ElementsPerWI> *>(quant_ptr) = quantized_values;
|
||||
if (wi_id == 0) {
|
||||
*ds_ptr = sycl::half2(sycl::half(d), sycl::half(sum));
|
||||
}
|
||||
}
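
The kernel above writes each row's quantized values contiguously and appends the per-block (d, sum) pairs starting at byte offset kx, while keeping the row stride at (kx_padded / QK8_1) * sizeof(block_q8_1) so total storage matches the unreordered tensor. Below is a minimal host-side sketch of that address math; QK8_1 == 32 and a 36-byte block_q8_1 (32 int8 quants plus a 4-byte half2) are assumptions matching ggml's usual Q8_1 definition, and the helper names are illustrative.

// Sketch only: address math for the reordered q8_1 layout, not the backend's API.
#include <cstdio>
#include <cstddef>

constexpr size_t kQK8_1         = 32;          // quants per q8_1 block (assumption)
constexpr size_t kBlockQ81Bytes = kQK8_1 + 4;  // 32 quants + one 4-byte {d, sum} pair

// Byte offset of quant element `elem` of block `col_block` in row `row`.
size_t quant_offset(size_t row, size_t col_block, size_t elem, size_t kx_padded) {
    const size_t row_bytes = (kx_padded / kQK8_1) * kBlockQ81Bytes; // same bytes per row as before
    return row * row_bytes + col_block * kQK8_1 + elem;             // quants are packed first
}

// Byte offset of the (d, sum) pair of `col_block`; pairs start after the kx quant bytes.
size_t ds_offset(size_t row, size_t col_block, size_t kx, size_t kx_padded) {
    const size_t row_bytes = (kx_padded / kQK8_1) * kBlockQ81Bytes;
    return row * row_bytes + kx + col_block * 4;                    // half2 is 4 bytes
}

int main() {
    // row 1, block 7 of a 4096-wide row: quants at 4608 + 224, ds at 4608 + 4096 + 28
    printf("%zu %zu\n", quant_offset(1, 7, 0, 4096), ds_offset(1, 7, 4096, 4096));
}

A consumer therefore only needs the row base plus these two offsets, which is exactly what the reordered mul-mat-vec read path further below recomputes.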

static void mul_mat_p021_f16_f32(
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
@@ -1695,23 +1771,30 @@ static void pool2d_nchw_kernel(
o_ptr[cur_oh * ow + cur_ow] = res;
}

static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
const int ky, const int kx_padded,
queue_ptr stream) {
const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
const sycl::range<3> num_blocks(1, ky, block_num_x);
int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE;
static_assert(QK8_1 % WARP_SIZE == 0);
const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE);
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
static void quantize_row_q8_1_sycl(const float * x, void * vy, const int kx, const int ky, const int kx_padded,
bool reorder_q8_tensor, queue_ptr stream) {
if (reorder_q8_tensor) {
auto local_range = std::size_t(WARP_SIZE);
auto num_quant_blocks = ky * (kx / QK8_1);
auto global_range = num_quant_blocks * local_range;
stream->parallel_for(sycl::nd_range<1>({ global_range }, { local_range }),
[=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
quantize_and_reorder_q8_1<QK8_1 / WARP_SIZE>(x, vy, kx, kx_padded, it);
});
} else {
const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
const sycl::range<3> num_blocks(1, ky, block_num_x);
int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE;
static_assert(QK8_1 % WARP_SIZE == 0);
const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE);
{
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });

stream->parallel_for(
sycl::nd_range<3>(num_blocks * block_size, block_size),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
});
stream->parallel_for(sycl::nd_range<3>(num_blocks * block_size, block_size),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
});
}
}
}
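
For the reorder path, the dispatcher above launches one warp-sized sub-group per quant block, so the 1-D ND-range is simply num_quant_blocks * WARP_SIZE. A small standalone sketch of that launch arithmetic with illustrative numbers (assuming WARP_SIZE == 32 and QK8_1 == 32):

// Sketch only: launch-geometry arithmetic for the reorder path, with example numbers.
#include <cstdio>
#include <cstddef>

int main() {
    const int warp_size = 32, qk8_1 = 32;
    const int kx = 4096, ky = 2;                              // hypothetical src1 shape
    const int num_quant_blocks = ky * (kx / qk8_1);           // one sub-group per block: 256
    const size_t local_range  = warp_size;                    // warp-sized work-group
    const size_t global_range = num_quant_blocks * local_range; // 8192 work-items total
    // each work-item quantizes QK8_1 / WARP_SIZE = 1 float in this configuration
    printf("blocks=%d global=%zu local=%zu\n", num_quant_blocks, global_range, local_range);
}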

@@ -2014,12 +2097,12 @@ inline void ggml_sycl_op_mul_mat_sycl(
#else
bool use_fp16 = false;
#endif
if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] &&
dst->op_params[0] == GGML_PREC_DEFAULT) {
// GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");
if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && use_fp16 && ggml_is_contiguous(src0) &&
row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
if (src0->type != GGML_TYPE_F16) {
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
" : converting src0 to fp16");
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type, dst);
GGML_ASSERT(to_fp16_sycl != nullptr);
size_t ne = row_diff*ne00;
@@ -2032,6 +2115,8 @@ inline void ggml_sycl_op_mul_mat_sycl(

ggml_sycl_pool_alloc<sycl::half> src1_as_f16(ctx.pool());
if (src1->type != GGML_TYPE_F16) {
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
" : converting src1 to fp16");
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
GGML_ASSERT(to_fp16_sycl != nullptr);
size_t ne = src1_ncols*ne10;
@@ -2048,6 +2133,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
" : converting dst to fp32");
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
}
@@ -2063,21 +2150,25 @@ inline void ggml_sycl_op_mul_mat_sycl(
src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
dst_f16.get(), dpct::library_data_t::real_half, ldc,
dpct::library_data_t::real_half)));
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
" : converting dst to fp32");
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
}
}
else {
// GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp32 path\n");
} else {
ggml_sycl_pool_alloc<float> src0_ddq_as_f32(ctx.pool());
ggml_sycl_pool_alloc<float> src1_ddq_as_f32(ctx.pool());
if (src0->type != GGML_TYPE_F32) {
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
" : converting src0 to fp32");
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, dst);
GGML_ASSERT(to_fp32_sycl != nullptr);
src0_ddq_as_f32.alloc(row_diff*ne00);
to_fp32_sycl(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
}
if (src1->type != GGML_TYPE_F32) {
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
" : converting src1 to fp32");
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src1->type, dst);
GGML_ASSERT(to_fp32_sycl != nullptr);
src1_ddq_as_f32.alloc(src1_ncols*ne10);
@@ -2113,8 +2204,7 @@ catch (sycl::exception const &exc) {
std::exit(1);
}

static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {

static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
dpct::queue_ptr main_stream = ctx.stream();
@@ -2166,8 +2256,7 @@ inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream);
}

inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {

inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
dpct::queue_ptr main_stream = ctx.stream();
@@ -2198,8 +2287,7 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor *
argsort_f32_i32_sycl(src0_dd, (int *) dst_dd, ncols, nrows, order, main_stream);
}

inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {

inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_I32);

@@ -2214,8 +2302,7 @@ inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor *ds
argmax_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);
}

inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx,ggml_tensor *dst) {

inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
dpct::queue_ptr main_stream = ctx.stream();
@@ -2232,8 +2319,7 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx,ggml_tens
diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
}

inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {

inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
dpct::queue_ptr main_stream = ctx.stream();
@@ -2420,7 +2506,10 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);

if (src1_on_device && src1_is_contiguous) {
quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
bool reorder_q8_tensor = src0->extra && ((ggml_tensor_extra_gpu *)src0->extra)->optimized_feature.reorder;
scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
/*num_src=*/2, " : converting src1 to Q8_1");
quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, reorder_q8_tensor, stream);
/*
DPCT1010:90: SYCL uses exceptions to report errors and does not
use the error codes. The call was replaced with 0. You need to
@@ -2524,7 +2613,9 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
}

if (convert_src1_to_q8_1 && !src1_is_contiguous) {
quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
/*num_src=*/2, " : converting src1 to Q8_1");
quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, false, stream);
/*
DPCT1010:92: SYCL uses exceptions to report errors and does
not use the error codes. The call was replaced with 0. You
@@ -2618,33 +2709,28 @@ catch (sycl::exception const &exc) {


static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
ggml_sycl_op_get_rows(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}

static void ggml_sycl_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_norm(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}

static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_rms_norm(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}

static void ggml_sycl_l2_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_l2_norm(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}

static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_group_norm(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}

static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2772,6 +2858,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons

// convert src1 to fp16
if (src1->type != GGML_TYPE_F16) {
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_nc_sycl", dst, /*num_src=*/2,
" : converting src1 to fp16");
const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type);
GGML_ASSERT(to_fp16_nc_sycl != nullptr);
const int64_t ne_src1 = ggml_nelements(src1);
@@ -3027,7 +3115,7 @@ static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_ten
return !g_ggml_sycl_disable_optimize && // allow optimization, controlled by $GGML_SYCL_DISABLE_OPT
ctx.opt_feature.reorder && // allow this device due to good perf; skip devices with bad perf.
dst->op == GGML_OP_MUL_MAT && // limited to some supported cases of Q4_0; more cases remain to do.
dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
dst->src[1]->ne[1]==1 && dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
}
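
The added ne[1] == 1 condition above narrows reordering to GEMV-shaped MUL_MAT calls, i.e. src1 collapses to a single column. A standalone sketch of just that shape gate (the struct is an illustrative stand-in, not ggml's type):

// Sketch only: the shape part of the reorder gate.
#include <cstdio>

struct dims { long ne[4]; };             // illustrative stand-in for ggml_tensor::ne

bool src1_is_single_column(const dims & s1) {
    return s1.ne[1] == 1 && s1.ne[2] == 1 && s1.ne[3] == 1; // the new, stricter condition
}

int main() {
    dims decode = {{4096, 1, 1, 1}};     // token-by-token decode: eligible for reorder
    dims prompt = {{4096, 32, 1, 1}};    // batched prompt processing: not eligible
    printf("%d %d\n", src1_is_single_column(decode), src1_is_single_column(prompt));
}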

static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * /* src1 */,
@@ -3075,6 +3163,7 @@ static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor *
}

static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
int64_t min_compute_capability = INT_MAX;

@@ -3150,11 +3239,8 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1);
} else {
constexpr bool convert_src1_to_q8_1 = false;
// MUL_MAT_SYCL supports reorder
opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::MUL_MAT_SYCL);
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
}
GGML_SYCL_DEBUG("call %s done\n", __func__);
}


@@ -3225,6 +3311,7 @@ __dpct_inline__ static void k_copy_dst_from_contiguous(

static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
ggml_tensor *dst) try {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer) && "mul_mat_id does not support split buffers");
@@ -3393,37 +3480,45 @@ catch (sycl::exception const &exc) {
}

static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_scale(ctx, dst);
}

static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_diag_mask_inf(ctx, dst);
}

static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_pool2d(ctx, dst);
}

static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
ggml_sycl_op_im2col(ctx, dst);
}

static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
ggml_sycl_op_sum(ctx, dst);
}

static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
ggml_sycl_op_sum_rows(ctx, dst);
}

static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
ggml_sycl_op_argsort(ctx, dst);
}

static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
ggml_sycl_op_argmax(ctx, dst);
}
@@ -3509,6 +3604,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_UNARY_OP_GELU_QUICK:
ggml_sycl_gelu_quick(ctx, dst);
break;
case GGML_UNARY_OP_GELU_ERF:
ggml_sycl_gelu_erf(ctx, dst);
break;
case GGML_UNARY_OP_TANH:
ggml_sycl_tanh(ctx, dst);
break;
@@ -3717,6 +3815,9 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
ggml_tensor *tensor,
const void *data, size_t offset,
size_t size) try {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor);
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

@@ -3735,13 +3836,16 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
const ggml_tensor *tensor,
void *data, size_t offset,
size_t size) try {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor);
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
data, (const char *)tensor->data + offset, size).wait()));
data, (const char *)tensor->data + offset, size)));
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -3753,7 +3857,13 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
const ggml_tensor *src,
ggml_tensor *dst) try {
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
if (dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && ggml_backend_buffer_is_sycl(src->buffer)) {
bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) &&
ggml_backend_buffer_is_sycl(src->buffer);
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": dst=", dst);
debug_print_tensor(" src=", src);
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
if (is_cpy_supported) {
/*
DPCT1009:215: SYCL uses exceptions to report errors and does not use the
error codes. The original code was commented out and a warning string
@@ -3761,7 +3871,7 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
*/
const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
dst->data, src->data, ggml_nbytes(dst)).wait()));
dst->data, src->data, ggml_nbytes(dst))));
return true;
}

@@ -3774,6 +3884,7 @@ catch (sycl::exception const &exc) {
}

static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
SYCL_CHECK(CHECK_TRY_ERROR((stream)->wait()));
@@ -3810,11 +3921,43 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * syc
}
}

#ifdef GGML_SYCL_GRAPH
static bool check_graph_compatibility(ggml_cgraph * cgraph) {
if (ggml_sycl_info().device_count > 1) {
// A sycl_ex::command_graph object can only be created for a single device
GGML_LOG_INFO("%s: disabling SYCL graphs due to multiple devices\n", __func__);
return false;
}

for (int i = 0; i < cgraph->n_nodes; i++) {
const ggml_op node_op = cgraph->nodes[i]->op;
switch (node_op) {
default:
break;
case GGML_OP_CONCAT:
// ggml_sycl_op_concat() does a blocking host wait after memcpy operations,
// but wait() can't be called on the events returned by a queue recording
// to a graph.
[[fallthrough]];
case GGML_OP_MUL_MAT_ID:
// ggml_sycl_mul_mat_id() does a blocking host wait on the sycl queue after
// submitting a memcpy operation, but wait() can't be called on a queue that
// is recording to a graph.
GGML_LOG_INFO("%s: disabling SYCL graphs due to unsupported node type %s\n", __func__,
ggml_op_name(node_op));
return false;
}
}
return true;
}
#endif
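
The check above is effectively a device-count guard plus an op blocklist. A standalone sketch of the same gating logic, with illustrative enum values in place of ggml's op codes:

// Sketch only: graph-compatibility gate as a freestanding predicate.
#include <cstdio>
#include <vector>

enum op_t { OP_ADD, OP_MUL_MAT, OP_CONCAT, OP_MUL_MAT_ID };

bool graph_compatible(int device_count, const std::vector<op_t> & nodes) {
    if (device_count > 1) {
        return false;                               // a command_graph is per-device
    }
    for (op_t op : nodes) {
        if (op == OP_CONCAT || op == OP_MUL_MAT_ID) {
            return false;                           // these ops call wait() while recording
        }
    }
    return true;
}

int main() {
    printf("%d\n", graph_compatible(1, {OP_ADD, OP_MUL_MAT}));    // 1
    printf("%d\n", graph_compatible(1, {OP_ADD, OP_MUL_MAT_ID})); // 0
    printf("%d\n", graph_compatible(2, {OP_ADD}));                // 0
}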

static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
auto * sycl_ctx = static_cast<ggml_backend_sycl_context *>(backend->context);

#ifdef GGML_SYCL_GRAPH
if (!g_ggml_sycl_disable_graph) {
bool use_sycl_graph = !g_ggml_sycl_disable_graph && check_graph_compatibility(cgraph);
if (use_sycl_graph) {
const bool graph_support = dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_limited_graph);
if (!graph_support) {
GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device);
@@ -3875,7 +4018,7 @@ catch (sycl::exception const &exc)
}

static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_event_t event) try {

GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
sycl::event* sycl_event = static_cast<sycl::event*>(event->context);

if (ggml_backend_is_sycl(backend)) {
@@ -4017,6 +4160,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_UNARY_OP_HARDSIGMOID:
case GGML_UNARY_OP_HARDSWISH:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_GELU_ERF:
case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_EXP:
case GGML_UNARY_OP_SGN:
@@ -4162,6 +4306,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
#endif
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
return true;
case GGML_OP_L2_NORM:
case GGML_OP_GROUP_NORM:
return ggml_is_contiguous(op->src[0]);
@@ -4173,14 +4318,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_SOFT_MAX:
return true;
case GGML_OP_ROPE:
{
const int mode = ((const int32_t *) op->op_params)[2];
// mode is not used as a bitmask in practice; the various rope type modes are independent implementations
if (mode == GGML_ROPE_TYPE_MROPE) {
return false;
}
return true;
}
case GGML_OP_IM2COL:
return true;
case GGML_OP_UPSCALE:
@@ -4270,6 +4407,7 @@ static void ggml_backend_sycl_device_event_free(ggml_backend_dev_t dev, ggml_bac

static void ggml_backend_sycl_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) try {
GGML_UNUSED(dev);
GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);

sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
SYCL_CHECK(CHECK_TRY_ERROR(sycl_event->wait()));

@@ -76,6 +76,7 @@ static void gated_linear_attn_f32_kernel(const dpct::queue_ptr stream, u_int B,
}

void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/5);
const float * k_d = static_cast<const float *>(dst->src[0]->data);
const float * v_d = static_cast<const float *>(dst->src[1]->data);
const float * r_d = static_cast<const float *>(dst->src[2]->data);

@@ -29,8 +29,6 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
static_assert(blocks_per_subgroup > 0);
static_assert(block_elements_per_subgroup > 0);

const block_q8_1 * y = (const block_q8_1 *) vy;

float partial_sum = 0.0f;
for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += blocks_per_subgroup) {
const int ibx = row * blocks_per_row + i; // x block index
@@ -40,13 +38,15 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r

// Y block index that aligns with ibx
const int iby = i * block_type::block_to_q8_1_ratio();
const int8_t* q8_1_quant_ptr = (const int8_t*)vy + iby * QK8_1;
const sycl::half2* q8_1_ds_ptr = (const sycl::half2*)((const char*)vy + ncols + iby * sizeof(sycl::half2));

#pragma unroll
for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) {
// x block quant index when casting the quants to int
const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);

partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs, nblocks);
partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, q8_1_quant_ptr, q8_1_ds_ptr, iqs, nblocks);
}
}
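
The two pointers introduced above address the reordered q8_1 tensor directly instead of going through a block_q8_1 array: the quants occupy the first ncols bytes of the row and the (d, sum) pairs follow them. A small sketch of those read-side offsets (QK8_1 == 32 and a 4-byte half2 are assumptions matching ggml's Q8_1):

// Sketch only: read-side offsets into the reordered q8_1 tensor.
#include <cstdio>
#include <cstddef>

int main() {
    const int qk8_1 = 32;
    const int ncols = 4096;                                   // row length in quant values
    const int iby   = 7;                                      // y block aligned with an x block
    const size_t quant_off = (size_t) iby * qk8_1;            // start of block iby's int8 quants
    const size_t ds_off    = (size_t) ncols + (size_t) iby * 4; // block iby's (d, sum) pair
    printf("quants at +%zu, ds at +%zu bytes from vy\n", quant_off, ds_off);
}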

@@ -1059,8 +1059,10 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
case GGML_TYPE_Q4_K:
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_k_q8_1_sycl\n");
reorder_mul_mat_vec_q4_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
} else {
GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_K_q8_1_sycl\n");
mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
}
break;

@@ -1,40 +1,50 @@
#include "norm.hpp"
#include "ggml-sycl/common.hpp"
#include "ggml-sycl/presets.hpp"

static void norm_f32(const float* x, float* dst, const int ncols, const float eps,
const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
item_ct1.get_local_id(1);
const int tid = item_ct1.get_local_id(2);
static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {

const int nrows = item_ct1.get_group_range(2);
const int nchannels = item_ct1.get_group_range(1);

const int nthreads = item_ct1.get_local_range(2);
const int sample = item_ct1.get_group(0);
const int channel = item_ct1.get_group(1);
const int row = item_ct1.get_group(2);

const int tid = item_ct1.get_local_id(2);
const int nwarps = nthreads / WARP_SIZE;

const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row});
const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});

x += strided_offset;
dst += packed_offset;

sycl::float2 mean_var = sycl::float2(0.f, 0.f);

for (int col = tid; col < ncols; col += block_size) {
const float xi = x[row * ncols + col];
const float xi = x[col];
mean_var.x() += xi;
mean_var.y() += xi * xi;
}

// sum up partial sums
mean_var = warp_reduce_sum(mean_var, item_ct1);
if (block_size > WARP_SIZE) {

int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = mean_var;
if (block_size > WARP_SIZE) {
const auto sub_group = item_ct1.get_sub_group();
const auto sg_id = sub_group.get_group_linear_id();
const auto wi_in_sg = sub_group.get_local_linear_id();
if (wi_in_sg == 0) {
s_sum[sg_id] = mean_var;
}
/*
DPCT1118:0: SYCL group functions and algorithms must be encountered in
converged control flow. You may need to adjust the code.
*/
item_ct1.barrier(sycl::access::fence_space::local_space);
mean_var = 0.f;
size_t nreduce = nwarps / WARP_SIZE;
const size_t nreduce = ceil_div(nwarps, WARP_SIZE);
for (size_t i = 0; i < nreduce; i += 1)
{
mean_var += s_sum[lane_id + i * WARP_SIZE];
mean_var += s_sum[wi_in_sg + i * WARP_SIZE];
}
mean_var = warp_reduce_sum(mean_var, item_ct1);
}
@@ -44,7 +54,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const float ep
const float inv_std = sycl::rsqrt(var + eps);

for (int col = tid; col < ncols; col += block_size) {
dst[row * ncols + col] = (x[row * ncols + col] - mean) * inv_std;
dst[col] = (x[col] - mean) * inv_std;
}
}
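
Both offsets above are plain stride-dot-index sums: the strided one walks the (possibly non-contiguous) source, the packed one the contiguous destination. A sketch of what calculate_offset<3> is taken to compute, inferred from the call sites rather than quoted from the header:

// Sketch only: stride-dot-index offset helper, semantics assumed from usage.
#include <array>
#include <cstdint>
#include <cstdio>

template <size_t N>
int64_t calc_offset(const std::array<int64_t, N> & strides, const std::array<int64_t, N> & idx) {
    int64_t off = 0;
    for (size_t i = 0; i < N; i++) {
        off += strides[i] * idx[i];    // sum of stride_i * index_i, in elements
    }
    return off;
}

int main() {
    // strided source: {stride_sample, stride_channel, stride_row} . {sample, channel, row}
    printf("%lld\n", (long long) calc_offset<3>({1 << 20, 1 << 14, 1 << 7}, {1, 2, 3}));
    // packed dst for ncols=128, nrows=64, nchannels=8: {65536, 8192, 128} . {1, 2, 3}
    printf("%lld\n", (long long) calc_offset<3>({65536, 8192, 128}, {1, 2, 3}));
}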

@@ -135,39 +145,51 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
}
}

static void rms_norm_f32(const float* x, float* dst, const int ncols, const float eps,
const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
item_ct1.get_local_id(1);
const int tid = item_ct1.get_local_id(2);
static void rms_norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {

const int nrows = item_ct1.get_group_range(2);
const int nchannels = item_ct1.get_group_range(1);

const int sample = item_ct1.get_group(0);
const int channel = item_ct1.get_group(1);
const int row = item_ct1.get_group(2);

const int nthreads = item_ct1.get_local_range(2);

const int tid = item_ct1.get_local_id(2);
const int nwarps = nthreads / WARP_SIZE;

const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row});
const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});

x += strided_offset;
dst += packed_offset;


float tmp = 0.0f; // partial sum for thread in warp

for (int col = tid; col < ncols; col += block_size) {
const float xi = x[row * ncols + col];
const float xi = x[col];
tmp += xi * xi;
}

// sum up partial sums
tmp = warp_reduce_sum(tmp, item_ct1);
if (block_size > WARP_SIZE) {

int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = tmp;
const auto sub_group = item_ct1.get_sub_group();
const auto sg_id = sub_group.get_group_linear_id();
const auto wi_in_sg = sub_group.get_local_linear_id();
if (wi_in_sg == 0) {
s_sum[sg_id] = tmp;
}
/*
DPCT1118:3: SYCL group functions and algorithms must be encountered in
converged control flow. You may need to adjust the code.
*/

item_ct1.barrier(sycl::access::fence_space::local_space);
size_t nreduce = nwarps / WARP_SIZE;
const size_t nreduce = ceil_div(nwarps, WARP_SIZE);
tmp = 0.f;
for (size_t i = 0; i < nreduce; i += 1)
{
tmp += s_sum[lane_id + i * WARP_SIZE];
tmp += s_sum[wi_in_sg + i * WARP_SIZE];
}
tmp = warp_reduce_sum(tmp, item_ct1);
}
@@ -176,7 +198,7 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa
const float scale = sycl::rsqrt(mean + eps);

for (int col = tid; col < ncols; col += block_size) {
dst[row * ncols + col] = scale * x[row * ncols + col];
dst[col] = scale * x[col];
}
}

@@ -224,20 +246,20 @@ static void l2_norm_f32(const float* x, float* dst, const int ncols, const float
}
}

static void norm_f32_sycl(const float* x, float* dst, const int ncols,
const int nrows, const float eps,
queue_ptr stream, int device) {
static void norm_f32_sycl(const float * x, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample,
const float eps, queue_ptr stream, int device) {

const sycl::range<3> global_dims(nsamples, nchannels, nrows);
GGML_ASSERT(ncols % WARP_SIZE == 0);
if (ncols < 1024) {
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
stream->submit([&](sycl::handler& cgh) {
cgh.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
block_dims),
sycl::nd_range<3>(global_dims * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
norm_f32(x, dst, ncols, eps, item_ct1,
nullptr, WARP_SIZE);
norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
});
});
}
@@ -252,15 +274,12 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
*/
stream->submit([&](sycl::handler& cgh) {
sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
sycl::range<1>(work_group_size / WARP_SIZE), cgh);

sycl::range<1>(work_group_size / WARP_SIZE), cgh);
cgh.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
block_dims),
sycl::nd_range<3>(global_dims * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
norm_f32(x, dst, ncols, eps, item_ct1,
get_pointer(s_sum_acc_ct1), work_group_size);
norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
});
});
}
@@ -313,21 +332,20 @@ static void group_norm_f32_sycl(const float* x, float* dst,
}
}

static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
const int nrows, const float eps,
queue_ptr stream, int device) {
static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, queue_ptr stream, int device) {
GGML_ASSERT(ncols % WARP_SIZE == 0);
// printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);

const sycl::range<3> global_dims(nsamples, nchannels, nrows);
if (ncols < 1024) {
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
stream->submit([&](sycl::handler& cgh) {
cgh.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
block_dims),
sycl::nd_range<3>(global_dims * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
rms_norm_f32(x, dst, ncols, eps, item_ct1,
nullptr, WARP_SIZE);
rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
});
});
}
@@ -344,12 +362,10 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
cgh);
cgh.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
block_dims),
sycl::nd_range<3>(global_dims * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
rms_norm_f32(x, dst, ncols, eps, item_ct1,
get_pointer(s_sum_acc_ct1), work_group_size);
rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
});
});
}
@@ -398,12 +414,12 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
}

void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
const ggml_tensor * src0 = dst->src[0];

GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);

const int64_t ne00 = dst->src[0]->ne[0];
const int64_t nrows = ggml_nrows(dst->src[0]);
GGML_TENSOR_UNARY_OP_LOCALS
dpct::queue_ptr main_stream = ctx.stream();
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
@@ -411,8 +427,14 @@ void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {

float eps;
memcpy(&eps, dst->op_params, sizeof(float));
GGML_ASSERT(eps >= 0.0f);
const size_t ts0 = ggml_type_size(src0->type);
GGML_ASSERT(nb00 == ts0);
const int64_t s01 = nb01 / ts0;
const int64_t s02 = nb02 / ts0;
const int64_t s03 = nb03 / ts0;

norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device);
norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device);
}
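
The stride setup above converts ggml's byte strides nb[i] into element strides for the kernel, after asserting that dimension 0 is contiguous. A standalone sketch with hypothetical extents:

// Sketch only: deriving element strides from byte strides, as in the op above.
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ts0   = 4;                                            // sizeof(float)
    const int64_t nb[4] = {4, 4 * 128, 4 * 128 * 64, 4 * 128 * 64 * 8}; // byte strides
    assert(nb[0] == ts0);                    // dim 0 must be contiguous (nb00 == ts0)
    const int64_t s01 = nb[1] / ts0;         // 128 elements between rows
    const int64_t s02 = nb[2] / ts0;         // 8192 between channels
    const int64_t s03 = nb[3] / ts0;         // 65536 between samples
    printf("s01=%lld s02=%lld s03=%lld\n", (long long) s01, (long long) s02, (long long) s03);
}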

void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
@@ -436,11 +458,10 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {

void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {

const ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);

const int64_t ne00 = dst->src[0]->ne[0];
const int64_t nrows = ggml_nrows(dst->src[0]);
dpct::queue_ptr main_stream = ctx.stream();
SYCL_CHECK(ggml_sycl_set_device(ctx.device));

@@ -450,7 +471,13 @@ void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
float eps;
memcpy(&eps, dst->op_params, sizeof(float));

rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device);
GGML_TENSOR_UNARY_OP_LOCALS
const size_t ts0 = ggml_type_size(src0->type);
GGML_ASSERT(nb00 == ts0);
const int64_t s01 = nb01 / ts0;
const int64_t s02 = nb02 / ts0;
const int64_t s03 = nb03 / ts0;
rms_norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device);
}

void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {

@@ -1,6 +1,7 @@
#include "outprod.hpp"

void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];


@@ -49,10 +49,7 @@ static void rope_norm(const T * x, T * dst, const int ne0, const int ne1, const

if (i0 >= n_dims) {
const int i = row * ne0 + i0;

dst[i + 0] = x[i + 0];
dst[i + 1] = x[i + 1];

*reinterpret_cast<sycl::vec<T, 2> *>(dst + i) = *reinterpret_cast<const sycl::vec<T, 2> *>(x + i);
return;
}

@@ -93,10 +90,7 @@ static void rope_neox(const T * x, T * dst, const int ne0, const int ne1, const

if (i0 >= n_dims) {
const int i = row * ne0 + i0;

dst[i + 0] = x[i + 0];
dst[i + 1] = x[i + 1];

*reinterpret_cast<sycl::vec<T, 2> *>(dst + i) = *reinterpret_cast<const sycl::vec<T, 2> *>(x + i);
return;
}

@@ -122,6 +116,63 @@ static void rope_neox(const T * x, T * dst, const int ne0, const int ne1, const
dst[i + n_dims / 2] = x0 * sin_theta + x1 * cos_theta;
}

template <typename T, bool has_ff>
static void rope_multi(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1,
const size_t s2, const int n_dims, const int32_t * pos, const float freq_scale,
const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims,
const float theta_scale, const float * freq_factors, const mrope_sections sections,
const sycl::nd_item<3> & item_ct1) {
// get index pos
const int i0 = 2 * (item_ct1.get_group(1) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1));
if (i0 >= ne0) {
return;
}
const int row_dst = (item_ct1.get_group(2) * item_ct1.get_local_range(2)) + item_ct1.get_local_id(2);

if (i0 >= n_dims) {
const int i = row_dst*ne0 + i0;
*reinterpret_cast<sycl::vec<T, 2> *>(dst + i) = *reinterpret_cast<const sycl::vec<T, 2> *>(x + i);
return;
}

const int row_x = row_dst % ne1;
const int channel_x = row_dst / ne1;
const int idst = (row_dst * ne0) + (i0 / 2);
const size_t ix = ((size_t) channel_x * s2) + ((size_t) row_x * s1) + (i0 / 2);

const int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
const int sec_w = sections.v[1] + sections.v[0];
const int sector = (i0 / 2) % sect_dims;


float theta_base = 0.0;
if (sector < sections.v[0]) {
theta_base = pos[channel_x]*sycl::pow(theta_scale, i0/2.0f);
}
else if (sector >= sections.v[0] && sector < sec_w) {
theta_base = pos[channel_x + ne2 * 1]*sycl::pow(theta_scale, i0/2.0f);
}
else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
theta_base = pos[channel_x + ne2 * 2]*sycl::pow(theta_scale, i0/2.0f);
}
else if (sector >= sec_w + sections.v[2]) {
theta_base = pos[channel_x + ne2 * 3]*sycl::pow(theta_scale, i0/2.0f);
}

const float freq_factor = has_ff ? freq_factors[i0 / 2] : 1.0f;
float cos_theta;
float sin_theta;
rope_yarn(theta_base / freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
const float x0 = x[ix + 0];
const float x1 = x[ix + n_dims/2];

// store results in dst
dst[idst + 0] = x0 * cos_theta - x1 * sin_theta;
dst[idst + n_dims/2] = x0 * sin_theta + x1 * cos_theta;
}
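
The sector logic above splits the rotary dimension into four sections and picks one of four position streams, stored ne2 apart in pos[], per element pair. A standalone sketch of that selection with illustrative section sizes:

// Sketch only: M-RoPE sector -> position-stream selection.
#include <cstdio>
#include <initializer_list>

int main() {
    const int sections[4] = {16, 24, 12, 12};                          // hypothetical sizes
    const int sect_dims = sections[0] + sections[1] + sections[2] + sections[3]; // 64
    const int sec_w     = sections[0] + sections[1];                   // 40
    for (int half_i0 : {3, 20, 45, 60}) {                              // sample i0 / 2 values
        const int sector = half_i0 % sect_dims;
        int stream;                                                    // pos[channel_x + ne2 * stream]
        if (sector < sections[0]) {
            stream = 0;
        } else if (sector < sec_w) {
            stream = 1;
        } else if (sector < sec_w + sections[2]) {
            stream = 2;
        } else {
            stream = 3;
        }
        printf("i0/2=%d sector=%d stream=%d\n", half_i0, sector, stream);
    }
}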



template <typename T, bool has_ff>
static void rope_vision(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1,
const size_t s2, const int n_dims, const int32_t * pos, const float freq_scale,
@@ -171,7 +222,7 @@ static void rope_norm_sycl(const T * x, T * dst, const int ne0, const int ne1, c
const float * freq_factors, queue_ptr stream) {
GGML_ASSERT(ne0 % 2 == 0);
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
const int num_blocks_x = (ne0 + 2 * SYCL_ROPE_BLOCK_SIZE - 1) / (2 * SYCL_ROPE_BLOCK_SIZE);
const int num_blocks_x = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE));
const sycl::range<3> block_nums(1, num_blocks_x, nr);

const float theta_scale = powf(freq_base, -2.0f / n_dims);
@@ -208,7 +259,7 @@ static void rope_neox_sycl(const T * x, T * dst, const int ne0, const int ne1, c
const rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) {
GGML_ASSERT(ne0 % 2 == 0);
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
const int num_blocks_x = (ne0 + 2 * SYCL_ROPE_BLOCK_SIZE - 1) / (2 * SYCL_ROPE_BLOCK_SIZE);
const int num_blocks_x = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE));
const sycl::range<3> block_nums(1, num_blocks_x, nr);

const float theta_scale = powf(freq_base, -2.0f / n_dims);
@@ -228,6 +279,40 @@ static void rope_neox_sycl(const T * x, T * dst, const int ne0, const int ne1, c
}
}

template <typename T>
static void rope_multi_sycl(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1,
const size_t s2, const int n_dims, const int nr, const int32_t * pos,
const float freq_scale, const float freq_base, const float ext_factor,
const float attn_factor, const rope_corr_dims corr_dims, const float * freq_factors,
const mrope_sections sections, queue_ptr stream) {
GGML_ASSERT(ne0 % 2 == 0);
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
const int n_blocks_y = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE));
const sycl::range<3> grid_dims(1, n_blocks_y, nr);
const sycl::nd_range<3> nd_range(grid_dims * block_dims, block_dims);

const float theta_scale = std::pow(freq_base, -2.0f / n_dims);
// Add FP16 capability check if T could be sycl::half
if constexpr (std::is_same_v<T, sycl::half>) {
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
}
// launch kernel
if (freq_factors == nullptr) {
stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
rope_multi<T, false>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
corr_dims, theta_scale, freq_factors, sections, item_ct1);
});
} else {
stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
rope_multi<T, true>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
corr_dims, theta_scale, freq_factors, sections, item_ct1);
});
}
}




// rope vision
template <typename T>
static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1,
@@ -237,7 +322,7 @@ static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1,
const mrope_sections sections, queue_ptr stream) {
GGML_ASSERT(ne0 % 2 == 0);
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
const int n_blocks_y = (ne0 + 2 * SYCL_ROPE_BLOCK_SIZE - 1) / (2 * SYCL_ROPE_BLOCK_SIZE);
const int n_blocks_y = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE));
const sycl::range<3> grid_dims(1, n_blocks_y, nr);
const sycl::nd_range<3> nd_range(grid_dims * block_dims, block_dims);

@@ -298,8 +383,17 @@ inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
memcpy(&sections.v, (int32_t *) dst->op_params + 11, sizeof(int)*4);

const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;

if (is_mrope) {
GGML_ASSERT(sections.v[0] > 0 || sections.v[1] > 0 || sections.v[2] > 0);
}

if (is_vision) {
GGML_ASSERT(n_dims == ne00/2);
}

const int32_t * pos = (const int32_t *) dst->src[1]->data;

const float * freq_factors = nullptr;
@@ -326,6 +420,19 @@ inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
} else {
GGML_ABORT("fatal error");
}
} else if (is_mrope && !is_vision) {
GGML_SYCL_DEBUG("%s: mrope path\n", __func__);
if (dst->src[0]->type == GGML_TYPE_F16) {
rope_multi_sycl((const sycl::half *)dst->src[0]->data, (sycl::half *)dst->data, ne00, ne01, ne02, s01,
s02, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
freq_factors, sections, main_stream);
} else if (dst->src[0]->type == GGML_TYPE_F32) {
rope_multi_sycl((const float *) dst->src[0]->data, (float *) dst->data, ne00, ne01, ne02, s01, s02, n_dims,
nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections,
main_stream);
} else {
GGML_ABORT("Fatal error: Tensor type unsupported!");
}
} else if (is_vision) {
GGML_SYCL_DEBUG("%s: vision path\n", __func__);
if (dst->src[0]->type == GGML_TYPE_F16) {
@@ -355,8 +462,7 @@ inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
}

void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
ggml_sycl_op_rope(ctx, dst);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}


@@ -225,7 +225,7 @@ static void soft_max_f32_sycl(const float * x, const T * mask,
}

void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {

scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -249,16 +249,13 @@ void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {

if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F16) {
const sycl::half * src1_dd = static_cast<sycl::half *>(dst->src[1]->data);
GGML_SYCL_DEBUG("%s: F16 mask\n", __func__);
soft_max_f32_sycl<sycl::half>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias,
main_stream, ctx.device);
} else if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F32) {
const float * src1_dd = static_cast<const float *>(dst->src[1]->data);
GGML_SYCL_DEBUG("%s: F32 mask\n", __func__);
soft_max_f32_sycl<float>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
} else {
/* mask unavailable */
GGML_SYCL_DEBUG("%s: No mask\n", __func__);
soft_max_f32_sycl<float>(src0_dd, nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
}
}