Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-02-12 14:03:20 +02:00)
Compare commits
149 Commits
| SHA1 |
|---|
| 95402553a5 |
| 3865cff4f5 |
| d03172cc79 |
| dd8e59f443 |
| bbe98d2784 |
| c2056ed6d4 |
| c46503014d |
| 860a9e4eef |
| fe9d60e74a |
| e434e69183 |
| 89fea80d29 |
| 6adc3c3ebc |
| 0dbcabde8c |
| ad590be98c |
| 7d6d91babf |
| d3e64b9f49 |
| 3ba0d843c6 |
| 0bf49eb668 |
| 4ad243677b |
| c89c2d1ab9 |
| 3555b3004b |
| d7da8dc83a |
| cd355eda7d |
| 30e5b01de2 |
| e54b394082 |
| 2c2caa4443 |
| 5fce5f948d |
| 9ae4143bc6 |
| c311ac664d |
| b9912ac570 |
| 00ba772610 |
| 3cb203c89f |
| 2e42be42bd |
| fb85a288d7 |
| 40643edb86 |
| 3cfbbdb44e |
| 80709b70a2 |
| 26ff3685bf |
| 60c666347b |
| b7cc7745e3 |
| cc8d081879 |
| d714dadb57 |
| ffad043973 |
| 0889eba570 |
| c61285e739 |
| 09cf2c7c65 |
| c33fe8b8c4 |
| ed52f3668e |
| a681b4ba83 |
| 7d516443dd |
| f6e1a7aa87 |
| c3ee46fab4 |
| e2c0b6e46a |
| 9596506965 |
| a20b2b05bc |
| 2e89f76b7a |
| 532802f938 |
| d4e0d95cf5 |
| cc66a7f78f |
| bd248d4dc7 |
| 7781e5fe99 |
| 89a184fa71 |
| 2baf07727f |
| 7ae2932116 |
| 1f7d50b293 |
| 4c763c8d1b |
| dad5c44398 |
| 55f6b9fa65 |
| 3678b838bb |
| 652b70e667 |
| 3a12db23b6 |
| ae92c1855b |
| b7ce1ad1e3 |
| 97340b4c99 |
| 2bb0467043 |
| b8e2194efc |
| 1a3b5e80f7 |
| 1f63e75f3b |
| 40cbf571c9 |
| 7f4fbe5183 |
| f470bc36be |
| 8f47e25f56 |
| 201b31dc2e |
| e21d2d4ae2 |
| dc0623fddb |
| 87d34b381d |
| b460d16ae8 |
| 91a8ee6a6f |
| 056eb74534 |
| 247e5c6e44 |
| 5787b5da57 |
| 228f34c9ce |
| 0974ad7a7c |
| 745aa5319b |
| 487a5e0401 |
| d17a809ef0 |
| 1caae7fc6c |
| 669c13e0f6 |
| 146b88e8b3 |
| 7f37b6cf1e |
| 3a077146a4 |
| d01d112abb |
| 9f47fa5792 |
| 9e31bec4fd |
| 5a8ae3053c |
| 0d3984424f |
| 3e63a58ef7 |
| 2589ad3704 |
| 482548716f |
| 3ac67535c8 |
| 0b4be4c435 |
| e0e806f52e |
| 7e00e60ef8 |
| ea1431b0fa |
| 71e74a3ac9 |
| bfb1e012a0 |
| 3637576288 |
| ea394d7ab1 |
| 5582c49c39 |
| c9bbc77931 |
| bfd322796c |
| 093e3f1feb |
| 663445b0de |
| 7675c555a1 |
| 5e1c3aed40 |
| c496fe0b1d |
| e57bb87ced |
| f3a4b1659c |
| 108009f5c7 |
| d337252acf |
| af6f91db47 |
| a7b8d35f78 |
| 6eba72b71c |
| fedf034a98 |
| 8726392d3d |
| c04621711a |
| 0fc16b42e8 |
| 053b1539c0 |
| b3a89c3d9e |
| e15898d1c7 |
| 803f8baf4f |
| 3600cc2886 |
| c7e0a2054b |
| 3f55f781f1 |
| 51fa76f172 |
| 12d0188c0d |
| eb3949938e |
| e562eece7c |
| b47ab7b8e9 |
@@ -49,19 +49,23 @@ COPY --from=build /app/full /app

 WORKDIR /app

-RUN apt-get update \
-    && apt-get install -y \
-    git \
-    python3 \
-    python3-pip \
-    && pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
+RUN apt-get update && \
+    apt-get install -y \
+    git \
+    python3 \
+    python3-pip \
+    python3-venv && \
+    python3 -m venv /opt/venv && \
+    . /opt/venv/bin/activate && \
+    pip install --upgrade pip setuptools wheel && \
+    pip install -r requirements.txt && \
+    apt autoremove -y && \
+    apt clean -y && \
+    rm -rf /tmp/* /var/tmp/* && \
+    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
+    find /var/cache -type f -delete
+
+ENV PATH="/opt/venv/bin:$PATH"

 ENTRYPOINT ["/app/tools.sh"]
7  .github/labeler.yml  vendored

@@ -86,3 +86,10 @@ nix:
 embedding:
   - changed-files:
       - any-glob-to-any-file: examples/embedding/
+
+Ascend NPU:
+  - changed-files:
+      - any-glob-to-any-file:
+          - ggml/include/ggml-cann.h
+          - ggml/src/ggml-cann/**
+          - docs/backend/CANN.md
113  .github/workflows/build-linux-cross.yml  vendored

@@ -231,3 +231,116 @@ jobs:
            -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

          cmake --build build --config Release -j $(nproc)
+
+  debian-13-loongarch64-cpu-cross:
+    runs-on: ubuntu-24.04
+    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup LoongArch
+        run: |
+          rm -f /etc/apt/sources.list.d/*
+          cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
+          deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
+          EOF
+          ( echo 'quiet "true";'; \
+            echo 'APT::Get::Assume-Yes "true";'; \
+            echo 'APT::Install-Recommends "false";'; \
+            echo 'Acquire::Check-Valid-Until "false";'; \
+            echo 'Acquire::Retries "5";'; \
+          ) > /etc/apt/apt.conf.d/99snapshot-repos
+
+          apt-get update
+          apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
+          dpkg --add-architecture loong64
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
+          deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
+          EOF
+
+          apt-get update || true ;# Prevent failure due to missing URLs.
+
+          apt-get install -y --no-install-recommends \
+                  build-essential \
+                  gcc-14-loongarch64-linux-gnu \
+                  g++-14-loongarch64-linux-gnu
+
+      - name: Build
+        run: |
+          cmake -B build -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DCMAKE_SYSTEM_NAME=Linux \
+            -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
+            -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+            -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
+            -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+            -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+            -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
+
+  debian-13-loongarch64-vulkan-cross:
+    runs-on: ubuntu-24.04
+    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup LoongArch
+        run: |
+          rm -f /etc/apt/sources.list.d/*
+          cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
+          deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
+          EOF
+          ( echo 'quiet "true";'; \
+            echo 'APT::Get::Assume-Yes "true";'; \
+            echo 'APT::Install-Recommends "false";'; \
+            echo 'Acquire::Check-Valid-Until "false";'; \
+            echo 'Acquire::Retries "5";'; \
+          ) > /etc/apt/apt.conf.d/99snapshot-repos
+
+          apt-get update
+          apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
+          dpkg --add-architecture loong64
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
+          deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
+          EOF
+
+          apt-get update || true ;# Prevent failure due to missing URLs.
+
+          apt-get install -y --no-install-recommends \
+                  build-essential \
+                  glslc \
+                  gcc-14-loongarch64-linux-gnu \
+                  g++-14-loongarch64-linux-gnu \
+                  libvulkan-dev:loong64
+
+      - name: Build
+        run: |
+          cmake -B build -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_VULKAN=ON \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DCMAKE_SYSTEM_NAME=Linux \
+            -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
+            -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+            -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
+            -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+            -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+            -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
16  .github/workflows/build.yml  vendored

@@ -306,6 +306,7 @@ jobs:
       id: cmake_test
       run: |
         cd build
+        export GGML_VK_VISIBLE_DEVICES=0
         # This is using llvmpipe and runs slower than other backends
         ctest -L main --verbose --timeout 3600

@@ -687,12 +688,12 @@ jobs:
     strategy:
       matrix:
         include:
          - build: 'cpu-x64'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
          - build: 'cpu-x64 (static)'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF'
          - build: 'openblas-x64'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
          - build: 'vulkan-x64'
-           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
+           defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
          - build: 'llvm-arm64'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
          - build: 'llvm-arm64-opencl-adreno'

@@ -777,6 +778,7 @@ jobs:
         cmake -S . -B build ${{ matrix.defines }} `
          -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
         cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
+        cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release

     - name: Add libopenblas.dll
       id: add_libopenblas_dll

@@ -839,12 +841,12 @@ jobs:
           -DGGML_CUDA=ON
         cmake --build build

-  windows-2019-cmake-cuda:
-    runs-on: windows-2019
+  windows-2022-cmake-cuda:
+    runs-on: windows-2022

    strategy:
      matrix:
-       cuda: ['12.4', '11.7']
+       cuda: ['12.4']

    steps:
      - name: Clone

@@ -878,7 +880,7 @@ jobs:
      env:
        CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
      run: |
-       call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+       call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
        cmake -S . -B build -G "Ninja Multi-Config" ^
          -DLLAMA_BUILD_SERVER=ON ^
          -DGGML_NATIVE=OFF ^
17  .github/workflows/release.yml  vendored

@@ -131,8 +131,9 @@ jobs:
        include:
          - build: 'x64'
            os: ubuntu-22.04
-         - build: 'arm64'
-           os: ubuntu-22.04-arm
+         # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
+         # - build: 'arm64'
+         #   os: ubuntu-22.04-arm

    runs-on: ${{ matrix.os }}

@@ -159,6 +160,9 @@ jobs:
      id: cmake_build
      run: |
        cmake -B build \
+         -DGGML_BACKEND_DL=ON \
+         -DGGML_NATIVE=OFF \
+         -DGGML_CPU_ALL_VARIANTS=ON \
          -DLLAMA_FATAL_WARNINGS=ON \
          ${{ env.CMAKE_ARGS }}
        cmake --build build --config Release -j $(nproc)

@@ -207,6 +211,9 @@ jobs:
      id: cmake_build
      run: |
        cmake -B build \
+         -DGGML_BACKEND_DL=ON \
+         -DGGML_NATIVE=OFF \
+         -DGGML_CPU_ALL_VARIANTS=ON \
          -DGGML_VULKAN=ON \
          ${{ env.CMAKE_ARGS }}
        cmake --build build --config Release -j $(nproc)

@@ -373,11 +380,11 @@ jobs:
          name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip

  windows-cuda:
-   runs-on: windows-2019
+   runs-on: windows-2022

    strategy:
      matrix:
-       cuda: ['12.4', '11.7']
+       cuda: ['12.4']

    steps:
      - name: Clone

@@ -405,7 +412,7 @@ jobs:
      id: cmake_build
      shell: cmd
      run: |
-       call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+       call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
        cmake -S . -B build -G "Ninja Multi-Config" ^
          -DGGML_BACKEND_DL=ON ^
          -DGGML_NATIVE=OFF ^
2  .github/workflows/server.yml  vendored

@@ -180,7 +180,7 @@ jobs:

  server-windows:
-   runs-on: windows-2019
+   runs-on: windows-2022

    steps:
      - name: Clone
CMakeLists.txt

@@ -89,6 +89,14 @@ option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)

+if (NOT DEFINED LLAMA_BUILD_NUMBER)
+    set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
+endif()
+if (NOT DEFINED LLAMA_BUILD_COMMIT)
+    set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
+endif()
+set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
+
 # override ggml options
 set(GGML_ALL_WARNINGS   ${LLAMA_ALL_WARNINGS})
 set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})

@@ -155,10 +163,17 @@ if (LLAMA_USE_SYSTEM_GGML)
 endif()

 if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
+    set(GGML_BUILD_NUMBER ${LLAMA_BUILD_NUMBER})
+    set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT})
     add_subdirectory(ggml)
     # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()

+if (MINGW)
+    # Target Windows 8 for PrefetchVirtualMemory
+    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
+endif()
+
 #
 # build the library
 #

@@ -199,10 +214,6 @@ endif()
 include(GNUInstallDirs)
 include(CMakePackageConfigHelpers)

-set(LLAMA_BUILD_NUMBER        ${BUILD_NUMBER})
-set(LLAMA_BUILD_COMMIT        ${BUILD_COMMIT})
-set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
-
 set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header  files")
 set(LLAMA_LIB_INSTALL_DIR     ${CMAKE_INSTALL_LIBDIR}     CACHE PATH "Location of library files")
 set(LLAMA_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location of binary  files")
4  Makefile

@@ -367,7 +367,7 @@ ifdef LLAMA_SERVER_SSL
 endif

 ifndef GGML_NO_CPU_AARCH64
-	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
+	MK_CPPFLAGS += -DGGML_USE_CPU_REPACK
 endif

 # warnings

@@ -970,7 +970,7 @@ OBJ_GGML = \
	$(DIR_GGML)/src/ggml-threading.o \
	$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
	$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
-	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
+	$(DIR_GGML)/src/ggml-cpu/repack.o \
	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
46  README.md

@@ -3,9 +3,10 @@
 

 [](https://opensource.org/licenses/MIT)
+[](https://github.com/ggml-org/llama.cpp/releases)
 [](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)

-[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
+[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)

 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

@@ -17,7 +18,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 ## Hot topics

 - 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
-- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
 - A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
 - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639

@@ -28,6 +28,30 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ----

+## Quick start
+
+Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
+
+- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
+- Run with Docker - see our [Docker documentation](docs/docker.md)
+- Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
+- Build from source by cloning this repository - check out [our build guide](docs/build.md)
+
+Once installed, you'll need a model to work with. Head to the [Obtaining and quantizing models](#obtaining-and-quantizing-models) section to learn more.
+
+Example command:
+
+```sh
+# Use a local model file
+llama-cli -m my_model.gguf
+
+# Or download and run a model directly from Hugging Face
+llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
+
+# Launch OpenAI-compatible API server
+llama-server -hf ggml-org/gemma-3-1b-it-GGUF
+```
+
 ## Description

 The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide

@@ -130,6 +154,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 <details>
 <summary>Bindings</summary>

+- Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama)
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)

@@ -229,6 +254,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo

 </details>

 ## Supported backends

 | Backend | Target devices |

@@ -245,16 +271,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |

-## Building the project
-
-The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
-The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
-
-- Clone this repository and build locally, see [how to build](docs/build.md)
-- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
-- Use a Docker image, see [documentation for Docker](docs/docker.md)
-- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
-
 ## Obtaining and quantizing models

 The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:

@@ -262,7 +278,11 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
 - [Trending](https://huggingface.co/models?library=gguf&sort=trending)
 - [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)

-You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
+You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
+
+```sh
+llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
+```

 By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.
17  ci/run.sh

@@ -39,14 +39,27 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON"
+
+    if command -v nvidia-smi >/dev/null 2>&1; then
+        CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')
+        if [[ -n "$CUDA_ARCH" && "$CUDA_ARCH" =~ ^[0-9]+$ ]]; then
+            CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH}"
+        else
+            echo "Warning: Using fallback CUDA architectures"
+            CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=61;70;75;80;86;89"
+        fi
+    else
+        echo "Error: nvidia-smi not found, cannot build with CUDA"
+        exit 1
+    fi
 fi

 if [ ! -z ${GG_BUILD_SYCL} ]; then
common/CMakeLists.txt

@@ -7,8 +7,8 @@ llama_add_compile_flags()
 # Build info header
 #

-if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
-    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
+if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
+    set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")

     # Is git submodule
     if(NOT IS_DIRECTORY "${GIT_DIR}")

@@ -18,36 +18,26 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
         if (SLASH_POS EQUAL 0)
             set(GIT_DIR "${REAL_GIT_DIR}")
         else()
-            set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
+            set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
         endif()
     endif()

     if(EXISTS "${GIT_DIR}/index")
-        set(GIT_INDEX "${GIT_DIR}/index")
+        # For build-info.cpp below
+        set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index")
     else()
         message(WARNING "Git index not found in git repository.")
-        set(GIT_INDEX "")
     endif()
 else()
     message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
-    set(GIT_INDEX "")
 endif()

-# Add a custom command to rebuild build-info.cpp when .git/index changes
-add_custom_command(
-    OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
-    COMMENT "Generating build details from Git"
-    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
-            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-            -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
-            -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
-    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
-    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
-    VERBATIM
-)
+set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
+set(OUTPUT_FILE   "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
+configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})

 set(TARGET build_info)
-add_library(${TARGET} OBJECT build-info.cpp)
+add_library(${TARGET} OBJECT ${OUTPUT_FILE})
 if (BUILD_SHARED_LIBS)
     set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
common/arg.cpp

@@ -988,10 +988,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.tensor_buft_overrides.push_back({nullptr, nullptr});
     }

-    if (params.reranking && params.embedding) {
-        throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
-    }
-
     if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
         throw std::runtime_error(string_format(
             "error: the supplied chat template is not supported: %s%s\n",

@@ -1348,9 +1344,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--prio"}, "N",
-        string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
+        string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
         [](common_params & params, int prio) {
-            if (prio < 0 || prio > 3) {
+            if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
                 throw std::invalid_argument("invalid value");
             }
             params.cpuparams.priority = (enum ggml_sched_priority) prio;

@@ -2747,9 +2743,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
     add_opt(common_arg(
         {"--reranking", "--rerank"},
-        string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
+        string_format("enable reranking endpoint on server (default: %s)", "disabled"),
         [](common_params & params) {
-            params.reranking = true;
+            params.embedding = true;
+            params.pooling_type = LLAMA_POOLING_TYPE_RANK;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
     add_opt(common_arg(

@@ -2869,6 +2866,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "(default: deepseek)",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+            else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
             else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
             else { throw std::invalid_argument("invalid value"); }
         }
common/build-info.cpp.in

@@ -1,4 +1,4 @@
-int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
-char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
+int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
+char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
 char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
 char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
common/chat-parser.cpp

@@ -49,6 +49,7 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
     // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
     result_.tool_calls.emplace_back(tool_call);
+
     return true;
 }
 bool common_chat_msg_parser::add_tool_call(const json & tool_call) {

@@ -154,9 +155,10 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
             if (!rest.empty()) {
                 handle_reasoning(rest, /* closed */ !is_partial());
             }
-            if (!syntax_.thinking_forced_open) {
-                throw common_chat_msg_partial_exception(end_think);
-            }
+            // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
+            // if (!syntax_.thinking_forced_open) {
+            //     throw common_chat_msg_partial_exception(end_think);
+            // }
             return true;
         }
     }

@@ -377,3 +379,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
         /* .is_partial = */ found_healing_marker,
     };
 }
+
+void common_chat_msg_parser::clear_tools() {
+    result_.tool_calls.clear();
+}

common/chat-parser.h

@@ -115,4 +115,6 @@ class common_chat_msg_parser {
             const std::vector<std::vector<std::string>> & args_paths = {},
             const std::vector<std::vector<std::string>> & content_paths = {}
     );
+
+    void clear_tools();
 };
common/chat.cpp

@@ -82,10 +82,10 @@ json common_chat_msg::to_json_oaicompat() const

 std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg) {
     std::vector<common_chat_msg_diff> diffs;
-    // if (previous_msg.reasoning_content != current.reasoning_content) {
-    //     auto & diff = diffs.emplace_back();
-    //     diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, current.reasoning_content);
-    // }
+    if (previous_msg.reasoning_content != new_msg.reasoning_content) {
+        auto & diff = diffs.emplace_back();
+        diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, new_msg.reasoning_content);
+    }
     if (previous_msg.content != new_msg.content) {
         auto & diff = diffs.emplace_back();
         diff.content_delta = string_diff(previous_msg.content, new_msg.content);

@@ -385,9 +385,9 @@ json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & t

 template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
     json delta = json::object();
-    // if (!diff.reasoning_content_delta.empty()) {
-    //     delta["reasoning_content"] = msg.reasoning_content;
-    // }
+    if (!diff.reasoning_content_delta.empty()) {
+        delta["reasoning_content"] = diff.reasoning_content_delta;
+    }
     if (!diff.content_delta.empty()) {
         delta["content"] = diff.content_delta;
     }

@@ -598,6 +598,7 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
     switch (format) {
         case COMMON_REASONING_FORMAT_NONE:     return "none";
         case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
+        case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
         default:
             throw std::runtime_error("Unknown reasoning format");
     }

@@ -1837,7 +1838,7 @@ static common_chat_params common_chat_templates_apply_legacy(
     if (res < 0) {
         // if the custom "tmpl" is not supported, we throw an error
         // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
-        throw std::runtime_error("this custom template is not supported");
+        throw std::runtime_error("this custom template is not supported, try using --jinja");
     }

     // if it turns out that our buffer is too small, we resize it

@@ -1920,7 +1921,9 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
     } catch (const common_chat_msg_partial_exception & ex) {
         LOG_DBG("Partial parse: %s\n", ex.what());
         if (!is_partial) {
-            throw std::runtime_error(ex.what());
+            builder.clear_tools();
+            builder.move_to(0);
+            common_chat_parse_content_only(builder);
         }
     }
     auto msg = builder.result();
common/chat.h

@@ -70,7 +70,7 @@ struct common_chat_msg {
 };

 struct common_chat_msg_diff {
-    // std::string reasoning_content_delta;
+    std::string reasoning_content_delta;
     std::string content_delta;
     size_t tool_call_index = std::string::npos;
     common_chat_tool_call tool_call_delta;
@@ -1,24 +0,0 @@
-include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
-
-set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
-set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
-
-# Only write the build info if it changed
-if(EXISTS ${OUTPUT_FILE})
-    file(READ ${OUTPUT_FILE} CONTENTS)
-    string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_COMMIT ${CMAKE_MATCH_1})
-    string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_COMPILER ${CMAKE_MATCH_1})
-    string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_TARGET ${CMAKE_MATCH_1})
-    if (
-        NOT OLD_COMMIT   STREQUAL BUILD_COMMIT   OR
-        NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
-        NOT OLD_TARGET   STREQUAL BUILD_TARGET
-    )
-        configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
-    endif()
-else()
-    configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
-endif()
common/common.cpp

@@ -203,6 +203,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {

     DWORD p = NORMAL_PRIORITY_CLASS;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW:      p = BELOW_NORMAL_PRIORITY_CLASS; break;
         case GGML_SCHED_PRIO_NORMAL:   p = NORMAL_PRIORITY_CLASS;       break;
         case GGML_SCHED_PRIO_MEDIUM:   p = ABOVE_NORMAL_PRIORITY_CLASS; break;
         case GGML_SCHED_PRIO_HIGH:     p = HIGH_PRIORITY_CLASS;         break;

@@ -228,6 +229,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {

     int p = 0;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW:    p =   5; break;
         case GGML_SCHED_PRIO_NORMAL: p =   0; break;
         case GGML_SCHED_PRIO_MEDIUM: p =  -5; break;
         case GGML_SCHED_PRIO_HIGH:   p = -10; break;

@@ -464,7 +466,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_

 std::string regex_escape(const std::string & s) {
     static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
-    return std::regex_replace(s, special_chars, "\\$0");
+    return std::regex_replace(s, special_chars, "\\$&");
 }

 std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
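The `regex_escape` fix above is subtle: in the ECMAScript replacement-format grammar that `std::regex_replace` uses, `$&` names the entire match, while `$0` is not a recognized group reference and is typically emitted literally. A minimal standalone sketch (not part of the diff) of the behavior difference:

```cpp
#include <iostream>
#include <regex>
#include <string>

int main() {
    // Same character class as regex_escape() in common/common.cpp
    static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
    const std::string s = "a.b*c";

    // "$&" refers to the whole match, so each special character gets escaped
    std::cout << std::regex_replace(s, special_chars, "\\$&") << "\n"; // a\.b\*c

    // "$0" is not part of the format grammar; implementations generally
    // pass it through as literal text instead of the matched character
    std::cout << std::regex_replace(s, special_chars, "\\$0") << "\n"; // e.g. a\$0b\$0c
}
```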
@@ -765,6 +767,9 @@ bool fs_validate_filename(const std::string & filename) {
     return true;
 }

+#include <iostream>
+
+
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32

@@ -782,9 +787,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
     // process path from front to back, procedurally creating directories
     while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
         const std::wstring subpath = wpath.substr(0, pos_slash);
-        const wchar_t * test = subpath.c_str();

-        const bool success = CreateDirectoryW(test, NULL);
+        pos_slash += 1;
+
+        // skip the drive letter, in some systems it can return an access denied error
+        if (subpath.length() == 2 && subpath[1] == ':') {
+            continue;
+        }
+
+        const bool success = CreateDirectoryW(subpath.c_str(), NULL);
+
         if (!success) {
             const DWORD error = GetLastError();

@@ -798,8 +810,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
                 return false;
             }
         }
-
-        pos_slash += 1;
     }

     return true;

@@ -895,34 +905,6 @@ struct common_init_result common_init_from_params(common_params & params) {

     const llama_vocab * vocab = llama_model_get_vocab(model);

-    if (params.reranking) {
-        bool ok = true;
-
-        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
-        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
-
-        if (!has_eos && !has_sep) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
-            ok = false;
-        } else if (!has_eos) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
-        } else if (!has_sep) {
-            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (!ok) {
-            llama_model_free(model);
-
-            return iparams;
-        }
-    }
-
     auto cparams = common_context_params_to_llama(params);

     llama_context * lctx = llama_init_from_model(model, cparams);

@@ -932,7 +914,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }

-    if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
+    if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
         LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
     }

@@ -964,6 +946,35 @@ struct common_init_result common_init_from_params(common_params & params) {
         }
     }

+    if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
+        bool ok = true;
+
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+
+        if (!has_eos && !has_sep) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+            ok = false;
+        } else if (!has_eos) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+        } else if (!has_sep) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (!ok) {
+            llama_free(lctx);
+            llama_model_free(model);
+
+            return iparams;
+        }
+    }
+
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
         llama_adapter_lora_ptr lora;

@@ -1039,7 +1050,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (llama_model_has_decoder(model)) {
             llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
         }
-        llama_kv_self_clear(lctx);
+        llama_memory_clear(llama_get_memory(lctx), true);
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
         llama_set_warmup(lctx, false);

@@ -1141,11 +1152,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full   = params.swa_full;

-    if (params.reranking) {
-        cparams.embeddings   = true;
-        cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
-    }
-
     cparams.type_k = params.cache_type_k;
     cparams.type_v = params.cache_type_v;
common/common.h

@@ -215,7 +215,8 @@ struct common_params_vocoder {

 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
+    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
+    COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };

 struct common_params {

@@ -354,7 +355,6 @@ struct common_params {
     int32_t embd_normalize = 2;     // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep   = "\n";  // separator of embeddings
-    bool reranking = false;         // enable reranking support on server

     // server params
     int32_t port = 8080;            // server listens on this network port
common/speculative.cpp

@@ -144,6 +144,8 @@ llama_tokens common_speculative_gen_draft(
     auto & smpl   = spec->smpl;
     auto & prompt = spec->prompt;

+    auto * mem = llama_get_memory(ctx);
+
     int reuse_i = 0;
     int reuse_n = 0;

@@ -173,7 +175,7 @@ llama_tokens common_speculative_gen_draft(
     result.reserve(params.n_draft);

     if (reuse_n == 0) {
-        llama_kv_self_clear(ctx);
+        llama_memory_clear(mem, false);

         prompt.clear();
     } else {

@@ -192,14 +194,14 @@ llama_tokens common_speculative_gen_draft(
     }

     if (reuse_i > 0) {
-        llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
-        llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+        llama_memory_seq_rm (mem, 0, 0, reuse_i);
+        llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);

         prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
     }

     if (reuse_n < (int) prompt.size()) {
-        llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
+        llama_memory_seq_rm (mem, 0, reuse_n, -1);

         prompt.erase(prompt.begin() + reuse_n, prompt.end());
     }
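The common.cpp and speculative.cpp hunks above all apply the same mechanical port from the old `llama_kv_self_*` helpers to the `llama_memory_*` API: KV-cache operations now go through an explicit handle obtained via `llama_get_memory()` instead of taking the context directly. A condensed sketch of the mapping, using only calls and arguments that appear in this diff (the helper name `reuse_prefix` is ours, not from the source):

```cpp
#include "llama.h"

// Sketch of the sequence-reuse logic from common_speculative_gen_draft(),
// expressed against the new llama_memory_* API. Old equivalents in comments.
static void reuse_prefix(llama_context * ctx, int reuse_i, int reuse_n, int prompt_size) {
    llama_memory_t mem = llama_get_memory(ctx);

    if (reuse_n == 0) {
        // was: llama_kv_self_clear(ctx); second arg controls clearing the
        // data buffers too (the warmup path in common.cpp passes true)
        llama_memory_clear(mem, false);
        return;
    }
    if (reuse_i > 0) {
        llama_memory_seq_rm (mem, 0, 0, reuse_i);             // was: llama_kv_self_seq_rm (ctx, ...)
        llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);  // was: llama_kv_self_seq_add(ctx, ...)
    }
    if (reuse_n < prompt_size) {
        llama_memory_seq_rm(mem, 0, reuse_n, -1);
    }
}
```

The same handle also answers capability queries: `llama_memory_can_shift(mem)` replaces `llama_kv_self_can_shift(ctx)` in `common_init_from_params()`.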
@@ -519,7 +519,7 @@ class TextModel(ModelBase):
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_block_count(self.block_count)
|
||||
|
||||
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
|
||||
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None:
|
||||
self.gguf_writer.add_context_length(n_ctx)
|
||||
logger.info(f"gguf: context length = {n_ctx}")
|
||||
|
||||
@@ -1898,9 +1898,7 @@ class LlamaModel(TextModel):
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
|
||||
if "head_dim" in hparams:
|
||||
rope_dim = hparams["head_dim"]
|
||||
else:
|
||||
if (rope_dim := hparams.get("head_dim")) is None:
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
|
||||
@@ -1982,7 +1980,8 @@ class LlamaModel(TextModel):
|
||||
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
||||
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
||||
base = self.hparams.get("rope_theta", 10000.0)
|
||||
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||
if (dim := self.hparams.get("head_dim")) is None:
|
||||
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
||||
|
||||
factor = rope_scaling.get("factor", 8.0)
|
||||
@@ -2017,6 +2016,20 @@ class LlamaModel(TextModel):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("ArceeForCausalLM")
|
||||
class ArceeModel(LlamaModel):
|
||||
model_arch = gguf.MODEL_ARCH.ARCEE
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self._try_set_pooling_type()
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
|
||||
|
||||
@ModelBase.register(
|
||||
"LlavaForConditionalGeneration", # pixtral
|
||||
"Mistral3ForConditionalGeneration", # mistral small 3.1
|
||||
@@ -2304,9 +2317,7 @@ class DeciModel(TextModel):
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
|
||||
if "head_dim" in hparams:
|
||||
rope_dim = hparams["head_dim"]
|
||||
else:
|
||||
if (rope_dim := hparams.get("head_dim")) is None:
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
|
||||
@@ -2346,7 +2357,8 @@ class DeciModel(TextModel):
|
||||
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
||||
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
||||
base = self.hparams.get("rope_theta", 10000.0)
|
||||
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||
if (dim := self.hparams.get("head_dim")) is None:
|
||||
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
||||
|
||||
factor = rope_scaling.get("factor", 8.0)
|
||||
@@ -3664,9 +3676,7 @@ class InternLM3Model(TextModel):
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
|
||||
if "head_dim" in hparams:
|
||||
rope_dim = hparams["head_dim"]
|
||||
else:
|
||||
if (rope_dim := hparams.get("head_dim")) is None:
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
|
||||
@@ -3709,8 +3719,7 @@ class BertModel(TextModel):
|
||||
self._try_set_pooling_type()
|
||||
|
||||
if self.cls_out_labels:
|
||||
key_name = gguf.Keys.Classifier.OUTPUT_LABELS.format(arch = gguf.MODEL_ARCH_NAMES[self.model_arch])
|
||||
self.gguf_writer.add_array(key_name, [v for k, v in sorted(self.cls_out_labels.items())])
|
||||
self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())])
|
||||
|
||||
def set_vocab(self):
|
||||
tokens, toktypes, tokpre = self.get_vocab_base()
|
||||
@@ -3814,7 +3823,7 @@ class BertModel(TextModel):
|
||||
remove_whitespaces = tokenizer.clean_up_tokenization_spaces
|
||||
precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
|
||||
|
||||
vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)
|
||||
vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
|
||||
else:
|
||||
sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
|
||||
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
|
||||
@@ -3827,7 +3836,7 @@ class BertModel(TextModel):
|
||||
tokenizer = SentencePieceProcessor()
|
||||
tokenizer.LoadFromFile(str(tokenizer_path))
|
||||
|
||||
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
||||
vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size())
|
||||
|
||||
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
||||
scores: list[float] = [-10000.0] * vocab_size
|
||||
@@ -3857,33 +3866,26 @@ class BertModel(TextModel):
|
||||
unk_token = tokenizer_config_json.get("unk_token")
|
||||
unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
|
||||
|
||||
for token_id in range(vocab_size):
|
||||
for token_id in range(tokenizer.vocab_size):
|
||||
piece = tokenizer._convert_id_to_token(token_id)
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer_json["model"]["vocab"][token_id][1]
|
||||
if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer_json["model"]["vocab"][token_id][1]
|
||||
|
||||
toktype = SentencePieceTokenTypes.NORMAL
|
||||
if token_id == unk_token_id:
|
||||
toktype = SentencePieceTokenTypes.UNKNOWN
|
||||
elif token_id in tokenizer.all_special_ids:
|
||||
toktype = SentencePieceTokenTypes.CONTROL
|
||||
elif token_id in added_vocab.values():
|
||||
toktype = SentencePieceTokenTypes.USER_DEFINED
|
||||
# No reliable way to detect this, but jina doesn't have any
|
||||
# elif tokenizer.IsByte(token_id):
|
||||
# toktype = SentencePieceTokenTypes.BYTE
|
||||
toktype = SentencePieceTokenTypes.NORMAL
|
||||
if token_id == unk_token_id:
|
||||
toktype = SentencePieceTokenTypes.UNKNOWN
|
||||
elif token_id in tokenizer.all_special_ids:
|
||||
toktype = SentencePieceTokenTypes.CONTROL
|
||||
elif token_id in added_vocab.values():
|
||||
toktype = SentencePieceTokenTypes.USER_DEFINED
|
||||
# No reliable way to detect this, but jina doesn't have any
|
||||
# elif tokenizer.IsByte(token_id):
|
||||
# toktype = SentencePieceTokenTypes.BYTE
|
||||
|
||||
tokens[token_id] = text
|
||||
scores[token_id] = score
|
||||
toktypes[token_id] = toktype
|
||||
|
||||
if vocab_size > len(tokens):
|
||||
pad_count = vocab_size - len(tokens)
|
||||
logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
|
||||
for i in range(1, pad_count + 1):
|
||||
tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
|
||||
scores.append(-1000.0)
|
||||
toktypes.append(SentencePieceTokenTypes.UNUSED)
|
||||
tokens[token_id] = text
|
||||
scores[token_id] = score
|
||||
toktypes[token_id] = toktype
|
||||
|
||||
if isinstance(tokenizer, SentencePieceProcessor):
|
||||
# realign tokens (see HF tokenizer code)
|
||||
@@ -3896,6 +3898,12 @@ class BertModel(TextModel):
|
||||
SentencePieceTokenTypes.UNKNOWN,
|
||||
] + toktypes[3:-1]
|
||||
|
||||
if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
|
||||
# Add mask token missing from sentencepiece.bpe.model
|
||||
tokens[250001] = b'<mask>'
|
||||
scores[250001] = 0.0
|
||||
toktypes[250001] = SentencePieceTokenTypes.CONTROL
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("t5")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
@@ -4061,6 +4069,34 @@ class NomicBertModel(BertModel):
|
||||
raise ValueError(f"unknown tokenizer: {toktyp}")
|
||||
|
||||
|
||||
@ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification")
|
||||
class NeoBert(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.NEO_BERT
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
# NeoBERT uses 2/3 of the intermediate size as feed forward length
|
||||
self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3))
|
||||
self.gguf_writer.add_rope_freq_base(10000.0) # default value for NeoBERT
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
|
||||
f_rms_eps = self.hparams.get("norm_eps", 1e-6) # default value for NeoBERT
|
||||
self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
|
||||
logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
|
||||
|
||||
self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) # https://huggingface.co/chandar-lab/NeoBERT#how-to-use
|
||||
|
||||
def modify_tensors(self, data_torch, name, bid):
|
||||
if name.startswith("decoder."):
|
||||
return []
|
||||
|
||||
if name.startswith("model."):
|
||||
name = name[6:]
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
|
||||
class XLMRobertaModel(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.BERT
|
||||
@@ -4800,25 +4836,6 @@ class OlmoeModel(TextModel):
|
||||
class JinaBertV2Model(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.intermediate_size = self.hparams["intermediate_size"]
|
||||
|
||||
def get_tensors(self):
|
||||
for name, data in super().get_tensors():
|
||||
if 'gated_layer' in name:
|
||||
d1 = data[:self.intermediate_size, :]
|
||||
name1 = name.replace('gated_layers', 'gated_layers_w')
|
||||
name1 = name1.replace('up_gated_layer', 'gated_layers_v')
|
||||
d2 = data[self.intermediate_size:, :]
|
||||
name2 = name.replace('gated_layers', 'gated_layers_v')
|
||||
name2 = name2.replace('up_gated_layer', 'gated_layers_w')
|
||||
yield name1, d1
|
||||
yield name2, d2
|
||||
continue
|
||||
|
||||
yield name, data
|
||||
|
||||
def set_vocab(self):
|
||||
tokenizer_class = 'BertTokenizer'
|
||||
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
|
||||
@@ -4834,14 +4851,6 @@ class JinaBertV2Model(BertModel):
|
||||
self.gguf_writer.add_add_bos_token(True)
|
||||
self.gguf_writer.add_add_eos_token(True)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# if name starts with "bert.", remove the prefix
|
||||
# e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
||||
if name.startswith("bert."):
|
||||
name = name[5:]
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("OpenELMForCausalLM")
|
||||
class OpenELMModel(TextModel):
|
||||
@@ -5082,9 +5091,7 @@ class DeepseekModel(TextModel):
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
if "head_dim" in hparams:
|
||||
rope_dim = hparams["head_dim"]
|
||||
else:
|
||||
if (rope_dim := hparams.get("head_dim")) is None:
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
@@ -5288,6 +5295,34 @@ class DeepseekV2Model(TextModel):
            raise ValueError(f"Unprocessed experts: {experts}")


@ModelBase.register("Dots1ForCausalLM")
class Dots1Model(Qwen2MoeModel):
    model_arch = gguf.MODEL_ARCH.DOTS1

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.hparams["num_experts"] = self.hparams["n_routed_experts"]

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
        self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
        self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
        self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])

        if self.hparams["scoring_func"] == "noaux_tc":
            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
        else:
            raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
        if name.endswith("e_score_correction_bias"):
            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
        if "shared_experts" in name:
            return [(self.map_tensor_name(name), data_torch)]
        return super().modify_tensors(data_torch, name, bid)


@ModelBase.register("PLMForCausalLM")
class PLMModel(TextModel):
    model_arch = gguf.MODEL_ARCH.PLM
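The `__init__` override above only aliases a DeepSeek-style hyperparameter name so the inherited Qwen2MoE logic can find it; a minimal standalone illustration (hypothetical value):

```python
hparams = {"n_routed_experts": 64}  # hypothetical config entry

# map the DeepSeek-style key onto the name Qwen2MoeModel expects
hparams["num_experts"] = hparams["n_routed_experts"]
assert hparams["num_experts"] == 64
```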
@@ -5946,7 +5981,8 @@ class ExaoneModel(TextModel):
        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
            if rope_scaling.get("rope_type", '').lower() == "llama3":
                base = self.hparams.get("rope_theta", 10000.0)
-               dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+               if (dim := self.hparams.get("head_dim")) is None:
+                   dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

                factor = rope_scaling.get("factor", 8.0)
@@ -6058,7 +6094,8 @@ class BailingMoeModel(TextModel):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
-       rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
+       if (rope_dim := hparams.get("head_dim")) is None:
+           rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

        self.gguf_writer.add_rope_dimension_count(rope_dim)
        rope_scaling = self.hparams.get("rope_scaling") or {}
@@ -6090,7 +6127,8 @@ class BailingMoeModel(TextModel):
        n_head = self.hparams["num_attention_heads"]
        n_kv_head = self.hparams.get("num_key_value_heads")
        n_embd = self.hparams["hidden_size"]
-       head_dim = self.hparams.get("head_dim") or n_embd // n_head
+       if (head_dim := self.hparams.get("head_dim")) is None:
+           head_dim = n_embd // n_head

        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
@@ -8,6 +8,7 @@
- [DataType Supports](#datatype-supports)
- [Docker](#docker)
- [Linux](#linux)
- [Environment variable setup](#environment-variable-setup)
- [TODO](#todo)

@@ -290,5 +291,24 @@ Authors from Peking University: Bizhao Shi (bshi@pku.edu.cn), Yuxin Yang (yxyang
We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers from Huawei Technologies Co., Ltd for their help during the code development and pull request.

## Environment variable setup

### GGML_CANN_ASYNC_MODE

Enables asynchronous operator submission. Disabled by default.

### GGML_CANN_MEM_POOL

Specifies the memory pool management strategy:

- vmm: Utilizes a virtual memory manager pool. If hardware support for VMM is unavailable, it falls back to the legacy (leg) memory pool.
- prio: Employs priority-queue-based memory pool management.
- leg: Uses a fixed-size buffer pool.

### GGML_CANN_DISABLE_BUF_POOL_CLEAN

Controls automatic cleanup of the memory pool. This option is only effective when using the prio or leg memory pool strategies.
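Putting the variables together, a hypothetical launch might look like this (binary name and model path are placeholders):

```sh
# enable async operator submission and select the priority-queue pool
GGML_CANN_ASYNC_MODE=1 GGML_CANN_MEM_POOL=prio \
    ./build/bin/llama-cli -m model.gguf -ngl 99
```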
## TODO
- Support more models and data types.
@@ -1,5 +1,9 @@
# Build llama.cpp locally

The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).

The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server.

**To get the Code:**

```bash
@@ -63,6 +67,7 @@ cmake --build build --config Release
cmake --preset x64-windows-llvm-release
cmake --build build-x64-windows-llvm-release
```
- Curl usage is enabled by default and requires the libcurl development libraries to be installed; it can be turned off with `-DLLAMA_CURL=OFF`.
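For example, a build without libcurl might be configured as follows (a sketch; everything other than `-DLLAMA_CURL=OFF` is the default invocation shown elsewhere on this page):

```bash
cmake -B build -DLLAMA_CURL=OFF
cmake --build build --config Release
```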
## BLAS Build
@@ -11,7 +11,7 @@ Function calling is supported for all models (see https://github.com/ggml-org/ll
 - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
 - Functionary v3.1 / v3.2
 - Hermes 2/3, Qwen 2.5
-- Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034)
+- Qwen 2.5 Coder
 - Mistral Nemo
 - Firefunction v2
 - Command R7B
@@ -1,28 +1,42 @@
 # Install pre-built version of llama.cpp
 
-## Homebrew
+| Install via | Windows | Mac | Linux |
+|-------------|---------|-----|-------|
+| Winget      | ✅      |     |       |
+| Homebrew    |         | ✅  | ✅    |
+| MacPorts    |         | ✅  |       |
+| Nix         |         | ✅  | ✅    |
 
-On Mac and Linux, the homebrew package manager can be used via
+## Winget (Windows)
+
+```sh
+winget install llama.cpp
+```
+
+The package is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/issues/8188
+
+## Homebrew (Mac and Linux)
 
 ```sh
 brew install llama.cpp
 ```
 
 The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668
 
-## MacPorts
+## MacPorts (Mac)
 
 ```sh
 sudo port install llama.cpp
 ```
-see also: https://ports.macports.org/port/llama.cpp/details/
 
-## Nix
+See also: https://ports.macports.org/port/llama.cpp/details/
 
-On Mac and Linux, the Nix package manager can be used via
+## Nix (Mac and Linux)
 
 ```sh
 nix profile install nixpkgs#llama-cpp
 ```
 
 For flake enabled installs.
 
 Or
@@ -34,13 +48,3 @@ nix-env --file '<nixpkgs>' --install --attr llama-cpp
 For non-flake enabled installs.
 
 This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
-
-## Flox
-
-On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
-
-```sh
-flox install llama-cpp
-```
-
-Flox follows the nixpkgs build of llama.cpp.
@@ -107,3 +107,7 @@ NOTE: some models may require large context window, for example: `-c 8192`
(tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF
(tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF
```

## Finding more models:

GGUF models on Huggingface with vision capabilities can be found here: https://huggingface.co/models?pipeline_tag=image-text-to-text&sort=trending&search=gguf
@@ -116,7 +116,7 @@ if llama_decode(context, batch) != 0 {
 }
 
 for i in 1 ..< n_parallel {
-    llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
+    llama_memory_seq_cp(llama_get_memory(context), 0, Int32(i), 0, batch.n_tokens)
 }
 
 if n_parallel > 1 {
@@ -37,7 +37,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
 
     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_self_clear(ctx);
+    llama_memory_clear(llama_get_memory(ctx), true);
 
     // run model
     LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
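These call-site migrations all follow one pattern: fetch the memory handle once with `llama_get_memory`, then operate on it. A minimal sketch against the new API (assuming the `llama.h` declarations exercised throughout these diffs; not part of the diff itself):

```c
#include "llama.h"

// clear the cache, then fan sequence 0 out to n_parallel sequences
static void reset_and_fan_out(llama_context * ctx, int n_parallel) {
    llama_memory_t mem = llama_get_memory(ctx);

    llama_memory_clear(mem, true); // true: also clear the data buffers

    for (llama_seq_id s = 1; s < n_parallel; ++s) {
        llama_memory_seq_cp(mem, 0, s, -1, -1); // -1, -1 = full position range
    }
}
```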
@@ -236,9 +236,24 @@ int main(int argc, char ** argv) {
             LOG("\n");
         }
     } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
+        const uint32_t n_cls_out = llama_model_n_cls_out(model);
+        std::vector<std::string> cls_out_labels;
+
+        for (uint32_t i = 0; i < n_cls_out; i++) {
+            const char * label = llama_model_cls_label(model, i);
+            const std::string label_i(label == nullptr ? "" : label);
+            cls_out_labels.emplace_back(label_i.empty() ? std::to_string(i) : label_i);
+        }
+
         for (int j = 0; j < n_embd_count; j++) {
-            // NOTE: if you change this log - update the tests in ci/run.sh
-            LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
+            for (uint32_t i = 0; i < n_cls_out; i++) {
+                // NOTE: if you change this log - update the tests in ci/run.sh
+                if (n_cls_out == 1) {
+                    LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
+                } else {
+                    LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str());
+                }
+            }
         }
     } else {
         // print the first part of the embeddings or for a single prompt, the full embedding
@@ -41,12 +41,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
 
     // add input to batch (this increments n_tokens)
     for (int32_t j = 0; j < n_toks; j++) {
-        common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
+        common_batch_add(batch, inputs[j], j, { 0 }, true);
     }
 
     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_self_clear(ctx);
-    llama_set_embeddings(ctx, true);
+    llama_memory_clear(llama_get_memory(ctx), true);
     llama_set_causal_attn(ctx, false);
 
     // run model
@@ -102,8 +101,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
 
     llama_token eos_token = llama_vocab_eos(vocab);
 
-    llama_kv_self_clear(ctx);
-    llama_set_embeddings(ctx, false);
+    llama_memory_clear(llama_get_memory(ctx), true);
     llama_set_causal_attn(ctx, true);
 
     llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
@@ -166,6 +164,8 @@ int main(int argc, char * argv[]) {
     llama_model_params mparams = common_model_params_to_llama(params);
     llama_context_params cparams = common_context_params_to_llama(params);
 
+    cparams.embeddings = true;
+
     llama_backend_init();
 
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
@@ -213,6 +213,8 @@ int main(int argc, char * argv[]) {
         std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
     }
 
+    llama_set_embeddings(ctx, false);
+
     // ### Generation ###
     // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
     {
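The net effect of the last two hunks: embeddings are enabled once at context creation and only switched off before generation, instead of being toggled inside every encode/generate call. A sketch of the resulting pattern (assuming `llama_init_from_model` from `llama.h`; not taken verbatim from the example):

```c
#include "llama.h"

static llama_context * make_embedding_context(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.embeddings = true; // start in embedding mode
    return llama_init_from_model(model, cparams);
}

static void switch_to_generation(llama_context * ctx) {
    llama_set_embeddings(ctx, false);
    llama_set_causal_attn(ctx, true);
}
```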
@@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
     }
 
     batch->logits[batch->n_tokens - 1] = true;
-    llama_kv_self_clear(context);
+    llama_memory_clear(llama_get_memory(context), false);
 
     const auto t_pp_start = ggml_time_us();
     if (llama_decode(context, *batch) != 0) {
@@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
 
     LOGi("Benchmark text generation (tg)");
 
-    llama_kv_self_clear(context);
+    llama_memory_clear(llama_get_memory(context), false);
     const auto t_tg_start = ggml_time_us();
     for (i = 0; i < tg; i++) {
@@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
 
     const auto t_tg_end = ggml_time_us();
 
-    llama_kv_self_clear(context);
+    llama_memory_clear(llama_get_memory(context), false);
 
     const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
     const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
@@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
-    llama_kv_self_clear(reinterpret_cast<llama_context *>(context));
+    llama_memory_clear(llama_get_memory(reinterpret_cast<llama_context *>(context)), true);
 }
@@ -210,7 +210,7 @@ actor LlamaContext {
     }
     batch.logits[Int(batch.n_tokens) - 1] = 1 // true
 
-    llama_kv_self_clear(context)
+    llama_memory_clear(llama_get_memory(context), false)
 
     let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;
 
@@ -223,7 +223,7 @@ actor LlamaContext {
 
     // bench text generation
 
-    llama_kv_self_clear(context)
+    llama_memory_clear(llama_get_memory(context), false)
 
     let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;
 
@@ -242,7 +242,7 @@ actor LlamaContext {
 
     let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;
 
-    llama_kv_self_clear(context)
+    llama_memory_clear(llama_get_memory(context), false)
 
     let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
     let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
@@ -292,7 +292,7 @@ actor LlamaContext {
     func clear() {
         tokens_list.removeAll()
        temporary_invalid_cchars.removeAll()
-        llama_kv_self_clear(context)
+        llama_memory_clear(llama_get_memory(context), true)
     }
 
     private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
@@ -60,6 +60,8 @@ int main(int argc, char ** argv) {
     llama_model * model = llama_init.model.get();
     llama_context * ctx = llama_init.context.get();
 
+    auto * mem = llama_get_memory(ctx);
+
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     // Tokenize the prompt
@@ -94,7 +96,7 @@ int main(int argc, char ** argv) {
     llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
 
     for (int s = 1; s < W + G + 1; ++s) {
-        llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
+        llama_memory_seq_cp(mem, 0, s, -1, -1);
     }
 
     const auto t_enc_end = ggml_time_us();
@@ -427,17 +429,17 @@ int main(int argc, char ** argv) {
 
             // KV cache management
             // if no verification token matched, we simply remove all cells from this batch -> no fragmentation
-            llama_kv_self_seq_rm(ctx, -1, n_past, -1);
+            llama_memory_seq_rm(mem, -1, n_past, -1);
 
             if (seq_id_best != 0) {
                 // if a verification token matched, we keep the best sequence and remove the rest
                 // this leads to some KV cache fragmentation
-                llama_kv_self_seq_keep(ctx, seq_id_best);
-                llama_kv_self_seq_cp (ctx, seq_id_best, 0, -1, -1);
-                llama_kv_self_seq_rm (ctx, seq_id_best, -1, -1);
+                llama_memory_seq_keep(mem, seq_id_best);
+                llama_memory_seq_cp (mem, seq_id_best, 0, -1, -1);
+                llama_memory_seq_rm (mem, seq_id_best, -1, -1);
 
                 for (int s = 1; s < W + G + 1; ++s) {
-                    llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
+                    llama_memory_seq_cp(mem, 0, s, -1, -1);
                 }
             }
         }
@@ -181,7 +181,7 @@ int main(int argc, char ** argv){
 
         // KV cache management
         // clean the cache of draft tokens that weren't accepted
-        llama_kv_self_seq_rm(ctx, 0, n_past, -1);
+        llama_memory_seq_rm(llama_get_memory(ctx), 0, n_past, -1);
 
         common_batch_clear(batch_tgt);
         common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
@@ -158,7 +158,7 @@ int main(int argc, char ** argv) {
     common_params params;
 
     params.n_predict = 128;
-    params.n_junk = 0;
+    params.n_junk = 1;
 
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
         return 1;
@@ -182,7 +182,7 @@ int main(int argc, char ** argv) {
     const bool is_sp_shared = params.is_pp_shared;
 
     // extra text to insert in each client's prompt in order to make it larger
-    const int32_t n_junk = params.n_junk;
+    const int32_t n_junk = std::max(1, params.n_junk);
 
     // init llama.cpp
     llama_backend_init();
@@ -194,6 +194,8 @@ int main(int argc, char ** argv) {
     llama_model * model = llama_init.model.get();
     llama_context * ctx = llama_init.context.get();
 
+    auto * mem = llama_get_memory(ctx);
+
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     // load the prompts from an external file if there are any
@@ -259,7 +261,7 @@ int main(int argc, char ** argv) {
 
     // assign the system KV cache to all parallel sequences
     for (int32_t i = 1; i <= n_clients; ++i) {
-        llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
+        llama_memory_seq_cp(mem, 0, i, -1, -1);
     }
 
     LOG_INF("\n");
@@ -286,9 +288,9 @@ int main(int argc, char ** argv) {
         if (batch.n_tokens == 0) {
             // all sequences have ended - clear the entire KV cache
             for (int i = 1; i <= n_clients; ++i) {
-                llama_kv_self_seq_rm(ctx, i, -1, -1);
+                llama_memory_seq_rm(mem, i, -1, -1);
                 // but keep the system prompt
-                llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
+                llama_memory_seq_cp(mem, 0, i, -1, -1);
             }
 
             LOG_INF("%s: clearing the KV cache\n", __func__);
@@ -362,7 +364,9 @@ int main(int argc, char ** argv) {
         // process in chunks of params.n_batch
         int32_t n_batch = params.n_batch;
 
-        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
+        int32_t i_next = 0;
+
+        for (int32_t i = 0; i < batch.n_tokens; i = i_next) {
             // experiment: process in powers of 2
             //if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) {
             //    n_batch /= 2;
@@ -370,7 +374,7 @@ int main(int argc, char ** argv) {
             //    continue;
             //}
 
-            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+            const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
 
             llama_batch batch_view = {
                 n_tokens,
@@ -390,19 +394,24 @@ int main(int argc, char ** argv) {
                     return 1;
                 }
 
-                LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
+                LOG_WRN("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
 
                 n_cache_miss += 1;
 
                 // retry with half the batch size to try to find a free slot in the KV cache
                 n_batch /= 2;
-                i -= n_batch;
 
                 continue;
             }
 
             LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
 
+            // move the head of the batch forward with the number of tokens we just processed
+            i_next = i + n_tokens;
+
+            // on successful decode, restore the original batch size
+            n_batch = params.n_batch;
+
             for (auto & client : clients) {
                 if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
                     continue;
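The reworked loop advances by `i_next` only after a successful decode and retries the same window with a halved batch on failure. A toy, self-contained rendering of that control flow (`decode_ok` is a hypothetical stand-in for `llama_decode`):

```c
#include <stdio.h>

// toy stand-in for llama_decode: fail once for a large first batch
static int decode_ok(int i, int n_tokens) { return !(i == 0 && n_tokens > 256); }

int main(void) {
    const int params_n_batch = 512, batch_n_tokens = 1000;
    int n_batch = params_n_batch;
    int i_next  = 0;
    for (int i = 0; i < batch_n_tokens; i = i_next) {
        int n_tokens = n_batch < batch_n_tokens - i ? n_batch : batch_n_tokens - i;
        if (!decode_ok(i, n_tokens)) {
            n_batch /= 2;              // retry the same window with a smaller batch
            continue;
        }
        i_next  = i + n_tokens;        // advance only after a successful decode
        n_batch = params_n_batch;      // restore the configured batch size
        printf("decoded [%d, %d)\n", i, i_next);
    }
    return 0;
}
```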
@@ -440,8 +449,8 @@ int main(int argc, char ** argv) {
             }
 
             // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
-            llama_kv_self_seq_rm(ctx, client.id + 1, -1, -1);
-            llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1);
+            llama_memory_seq_rm(mem, client.id + 1, -1, -1);
+            llama_memory_seq_cp(mem, 0, client.id + 1, -1, -1);
 
             const auto t_main_end = ggml_time_us();
@@ -126,6 +126,8 @@ int main(int argc, char ** argv) {
 
     int n_past = 0;
 
+    auto * mem = llama_get_memory(ctx);
+
     // fill the KV cache
     for (int i = 0; i < n_ctx; i += n_batch) {
         if (i > 0 && n_grp > 1) {
@@ -133,11 +135,10 @@ int main(int argc, char ** argv) {
             const int ib = i/n_batch - 1;
             const int bd = n_batch_grp*(n_grp - 1);
 
-            llama_kv_self_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
-            llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
-            llama_kv_self_update  (ctx);
+            llama_memory_seq_add(mem, 0, n_past - n_batch, n_past, ib*bd);
+            llama_memory_seq_div(mem, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
 
-            n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
+            n_past = llama_memory_seq_pos_max(mem, 0) + 1;
         }
 
         common_batch_clear(batch);
@@ -167,12 +168,10 @@ int main(int argc, char ** argv) {
 
         LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
 
-        llama_kv_self_seq_rm (ctx, 0, n_keep, n_keep + n_discard);
-        llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
-        //llama_kv_self_defrag (ctx);
-        llama_kv_self_update (ctx);
+        llama_memory_seq_rm (mem, 0, n_keep, n_keep + n_discard);
+        llama_memory_seq_add(mem, 0, n_keep + n_discard, n_ctx, -n_discard);
 
-        n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
+        n_past = llama_memory_seq_pos_max(mem, 0) + 1;
 
         common_batch_clear(batch);
 
@@ -198,12 +197,10 @@ int main(int argc, char ** argv) {
         if (n_discard > 0) {
             LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
 
-            llama_kv_self_seq_rm (ctx, 0, n_keep, n_keep + n_discard);
-            llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
-            //llama_kv_self_defrag (ctx);
-            llama_kv_self_update (ctx);
+            llama_memory_seq_rm (mem, 0, n_keep, n_keep + n_discard);
+            llama_memory_seq_add(mem, 0, n_keep + n_discard, n_ctx, -n_discard);
 
-            n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
+            n_past = llama_memory_seq_pos_max(mem, 0) + 1;
         }
     }
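Both shift hunks share one idiom: remove a window of positions after the keep prefix, then slide the tail back so positions stay contiguous. A compact restatement against the new API (a sketch, not taken from the diff):

```c
#include "llama.h"

// discard n_discard positions after the first n_keep, then shift the rest left
static void shift_cache(llama_context * ctx, int n_keep, int n_discard, int n_ctx) {
    llama_memory_t mem = llama_get_memory(ctx);

    llama_memory_seq_rm (mem, 0, n_keep,             n_keep + n_discard);
    llama_memory_seq_add(mem, 0, n_keep + n_discard, n_ctx, -n_discard);
    // next usable position: llama_memory_seq_pos_max(mem, 0) + 1
}
```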
@@ -83,7 +83,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
 
 static void batch_process(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_self_clear(ctx);
+    llama_memory_clear(llama_get_memory(ctx), false);
 
     // run model
     LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
@@ -196,7 +196,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
 
     // erase whole kv
-    llama_kv_self_clear(ctx3);
+    llama_memory_clear(llama_get_memory(ctx3), true);
     fprintf(stderr, "%s : kv cache cleared\n", __func__);
 
     // restore kv into seq 1
@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
     auto generate = [&](const std::string & prompt) {
         std::string response;
 
-        const bool is_first = llama_kv_self_seq_pos_max(ctx, 0) == 0;
+        const bool is_first = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) == 0;
 
         // tokenize the prompt
         const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
@@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
         while (true) {
             // check if we have enough space in the context to evaluate this batch
             int n_ctx = llama_n_ctx(ctx);
-            int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0);
+            int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0);
             if (n_ctx_used + batch.n_tokens > n_ctx) {
                 printf("\033[0m\n");
                 fprintf(stderr, "context size exceeded\n");
@@ -217,7 +217,7 @@ int main(int argc, char ** argv) {
         {
             LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
 
-            llama_kv_self_seq_rm(ctx_tgt, 0, n_past, -1);
+            llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, n_past, -1);
         }
 
         if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
@@ -142,6 +142,8 @@ int main(int argc, char ** argv) {
         }
     }
 
+    auto * mem_tgt = llama_get_memory(ctx_tgt);
+    auto * mem_dft = llama_get_memory(ctx_dft);
 
     // Tokenize the prompt
     std::vector<llama_token> inp;
@@ -420,14 +422,14 @@ int main(int argc, char ** argv) {
         {
             LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
 
-            llama_kv_self_seq_keep(ctx_dft, s_keep);
-            llama_kv_self_seq_cp  (ctx_dft, s_keep, 0, -1, -1);
-            llama_kv_self_seq_keep(ctx_dft, 0);
+            llama_memory_seq_keep(mem_dft, s_keep);
+            llama_memory_seq_cp  (mem_dft, s_keep, 0, -1, -1);
+            llama_memory_seq_keep(mem_dft, 0);
 
-            llama_kv_self_seq_rm  (ctx_tgt, s_keep, n_past_tgt, -1);
-            llama_kv_self_seq_keep(ctx_tgt, s_keep);
-            llama_kv_self_seq_cp  (ctx_tgt, s_keep, 0, -1, -1);
-            llama_kv_self_seq_keep(ctx_tgt, 0);
+            llama_memory_seq_rm  (mem_tgt, s_keep, n_past_tgt, -1);
+            llama_memory_seq_keep(mem_tgt, s_keep);
+            llama_memory_seq_cp  (mem_tgt, s_keep, 0, -1, -1);
+            llama_memory_seq_keep(mem_tgt, 0);
         }
 
         for (int s = 0; s < n_seq_dft; ++s) {
@@ -444,7 +446,7 @@ int main(int argc, char ** argv) {
             common_batch_clear(batch_dft);
             common_batch_add  (batch_dft, token_id, n_past_dft, { 0 }, true);
 
-            llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1);
+            llama_memory_seq_rm(mem_dft, 0, n_past_dft, -1);
             // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
             llama_decode(ctx_dft, batch_dft);
 
@@ -503,8 +505,8 @@ int main(int argc, char ** argv) {
                 if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
                     LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
 
-                    llama_kv_self_seq_rm(ctx_dft, n_seq_cur, -1, -1);
-                    llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
+                    llama_memory_seq_rm(mem_dft, n_seq_cur, -1, -1);
+                    llama_memory_seq_cp(mem_dft, s, n_seq_cur, -1, -1);
 
                     // all previous tokens from this branch are now also part of the new branch
                     for (int t = 0; t < batch_tgt.n_tokens; ++t) {
@@ -585,9 +587,9 @@ int main(int argc, char ** argv) {
 
         // evaluate the target model on the drafted tokens
        {
-            llama_kv_self_seq_keep(ctx_tgt, 0);
+            llama_memory_seq_keep(mem_tgt, 0);
             for (int s = 1; s < n_seq_dft; ++s) {
-                llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1);
+                llama_memory_seq_cp(mem_tgt, 0, s, -1, -1);
             }
 
             // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
@@ -105,7 +105,7 @@ message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
 message(DEBUG "INS_ENB : ${INS_ENB}")
 
 option(GGML_CPU_HBM          "ggml: use memkind for CPU HBM" OFF)
-option(GGML_CPU_AARCH64      "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
+option(GGML_CPU_REPACK       "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
 option(GGML_CPU_KLEIDIAI     "ggml: use KleidiAI optimized kernels if applicable" OFF)
 option(GGML_SSE42            "ggml: enable SSE 4.2" ${INS_ENB})
 option(GGML_AVX              "ggml: enable AVX" ${INS_ENB})
@@ -137,7 +137,7 @@ set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
 set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
 
-if (MINGW)
+if (WIN32)
     set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
 endif()
@@ -172,6 +172,7 @@ option(GGML_HIP                             "ggml: use HIP"
 option(GGML_HIP_GRAPHS                      "ggml: use HIP graph, experimental, slow" OFF)
 option(GGML_HIP_NO_VMM                      "ggml: do not try to use HIP VMM" ON)
 option(GGML_HIP_ROCWMMA_FATTN               "ggml: enable rocWMMA for FlashAttention" OFF)
+option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12   "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
 option(GGML_VULKAN                          "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks" OFF)
 option(GGML_VULKAN_DEBUG                    "ggml: enable Vulkan debug output" OFF)
@@ -367,6 +368,8 @@ if (MSVC)
         /wd4005 # Macro redefinition
         /wd4244 # Conversion from one type to another type, possible loss of data
         /wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data
+        /wd4305 # Conversion from 'type1' to 'type2', possible loss of data
         /wd4566 # Conversion from 'char' to 'wchar_t', possible loss of data
         /wd4996 # Disable POSIX deprecation warnings
+        /wd4702 # Unreachable code warnings
     )
@@ -386,4 +389,46 @@ if (MSVC)
    disable_msvc_warnings(ggml-cpu-skylakex)
    disable_msvc_warnings(ggml-cpu-icelake)
    disable_msvc_warnings(ggml-cpu-alderlake)

    if (GGML_BUILD_EXAMPLES)
        disable_msvc_warnings(common-ggml)
        disable_msvc_warnings(common)

        disable_msvc_warnings(mnist-common)
        disable_msvc_warnings(mnist-eval)
        disable_msvc_warnings(mnist-train)

        disable_msvc_warnings(gpt-2-ctx)
        disable_msvc_warnings(gpt-2-alloc)
        disable_msvc_warnings(gpt-2-backend)
        disable_msvc_warnings(gpt-2-sched)
        disable_msvc_warnings(gpt-2-quantize)
        disable_msvc_warnings(gpt-2-batched)

        disable_msvc_warnings(gpt-j)
        disable_msvc_warnings(gpt-j-quantize)

        disable_msvc_warnings(magika)
        disable_msvc_warnings(yolov3-tiny)
        disable_msvc_warnings(sam)

        disable_msvc_warnings(simple-ctx)
        disable_msvc_warnings(simple-backend)
    endif()

    if (GGML_BUILD_TESTS)
        disable_msvc_warnings(test-mul-mat)
        disable_msvc_warnings(test-arange)
        disable_msvc_warnings(test-backend-ops)
        disable_msvc_warnings(test-cont)
        disable_msvc_warnings(test-conv-transpose)
        disable_msvc_warnings(test-conv-transpose-1d)
        disable_msvc_warnings(test-conv1d)
        disable_msvc_warnings(test-conv2d)
        disable_msvc_warnings(test-conv2d-dw)
        disable_msvc_warnings(test-customop)
        disable_msvc_warnings(test-dup)
        disable_msvc_warnings(test-opt)
        disable_msvc_warnings(test-pool)
    endif ()
endif()
@@ -36,8 +36,7 @@ function(ggml_get_system_arch)
         (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
              CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
         set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
-    elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR
-            "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc|power")
         set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
     elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
         set(GGML_SYSTEM_ARCH "loongarch64" PARENT_SCOPE)
@@ -2095,9 +2095,6 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_graph_get_grad    (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
     GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
 
-    GGML_API void                 ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
-    GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
-
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
 
@@ -2181,6 +2178,7 @@ extern "C" {
 
     // scheduling priorities
     enum ggml_sched_priority {
+        GGML_SCHED_PRIO_LOW = -1,
         GGML_SCHED_PRIO_NORMAL,
         GGML_SCHED_PRIO_MEDIUM,
        GGML_SCHED_PRIO_HIGH,
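Because C enumerators continue counting from the previous value, pinning the new `GGML_SCHED_PRIO_LOW` to `-1` leaves all existing enumerator values unchanged:

```c
enum ggml_sched_priority {
    GGML_SCHED_PRIO_LOW = -1, // new
    GGML_SCHED_PRIO_NORMAL,   // 0, as before
    GGML_SCHED_PRIO_MEDIUM,   // 1, as before
    GGML_SCHED_PRIO_HIGH,     // 2, as before
};
```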
@@ -125,7 +125,6 @@ if (NOT MSVC)
 endif()
 
 if (MINGW)
-    # Target Windows 8 for PrefetchVirtualMemory
     add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
 endif()
 
@@ -196,6 +195,7 @@ add_library(ggml-base
     ../include/ggml-opt.h
     ../include/gguf.h
     ggml.c
+    ggml.cpp
     ggml-alloc.c
     ggml-backend.cpp
     ggml-opt.cpp
@@ -212,6 +212,7 @@ endif()
 
 add_library(ggml
             ggml-backend-reg.cpp)
+add_library(ggml::ggml ALIAS ggml)
 
 target_link_libraries(ggml PUBLIC ggml-base)
 
@@ -226,6 +227,7 @@ function(ggml_add_backend_library backend)
         set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
         target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
+        add_dependencies(ggml ${backend})
         install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
     else()
         add_library(${backend} ${ARGN})
         target_link_libraries(ggml PUBLIC ${backend})
@@ -268,17 +270,23 @@ endfunction()
 function(ggml_add_cpu_backend_variant tag_name)
     set(GGML_CPU_TAG_NAME ${tag_name})
     # other: OPENMP LLAMAFILE CPU_HBM
-    foreach (feat NATIVE
-                  SSE42
-                  AVX AVX2 BMI2 AVX_VNNI FMA F16C
-                  AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
-                  AMX_TILE AMX_INT8 AMX_BF16)
-        set(GGML_${feat} OFF)
-    endforeach()
-
-    foreach (feat ${ARGN})
-        set(GGML_${feat} ON)
-    endforeach()
+    if (GGML_SYSTEM_ARCH STREQUAL "x86")
+        foreach (feat NATIVE
+                      SSE42
+                      AVX AVX2 BMI2 AVX_VNNI FMA F16C
+                      AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
+                      AMX_TILE AMX_INT8 AMX_BF16)
+            set(GGML_${feat} OFF)
+        endforeach()
+
+        foreach (feat ${ARGN})
+            set(GGML_${feat} ON)
+        endforeach()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "ARM")
+        foreach (feat ${ARGN})
+            set(GGML_INTERNAL_${feat} ON)
+        endforeach()
+    endif()
 
     ggml_add_cpu_backend_variant_impl(${tag_name})
 endfunction()
@@ -288,6 +296,8 @@ ggml_add_backend(CPU)
 if (GGML_CPU_ALL_VARIANTS)
     if (NOT GGML_BACKEND_DL)
         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
+    elseif (GGML_CPU_ARM_ARCH)
+        message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
     endif()
     if (GGML_SYSTEM_ARCH STREQUAL "x86")
         ggml_add_cpu_backend_variant(x64)
@@ -301,8 +311,30 @@ if (GGML_CPU_ALL_VARIANTS)
         # MSVC doesn't support AMX
         ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
         endif()
+    elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
+        if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+            # Many of these features are optional so we build versions with popular
+            # combinations and name the backends based on the version they were
+            # first released with
+            ggml_add_cpu_backend_variant(armv8.0_1)
+            ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD)
+            ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
+            ggml_add_cpu_backend_variant(armv8.2_3 DOTPROD FP16_VECTOR_ARITHMETIC SVE)
+            ggml_add_cpu_backend_variant(armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8)
+            ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2)
+            ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME)
+            ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME)
+        elseif (CMAKE_SYSTEM_NAME MATCHES "Android")
+            # Android-specific backends with SoC-compatible feature sets
+            ggml_add_cpu_backend_variant(android_armv8.0_1)
+            ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
+            ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
+            ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
+        else()
+            message(FATAL_ERROR "Unsupported ARM target OS: ${CMAKE_SYSTEM_NAME}")
+        endif()
     else()
-        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported on ${GGML_SYSTEM_ARCH}")
+        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
     endif()
 elseif (GGML_CPU)
     ggml_add_cpu_backend_variant_impl("")
@@ -1340,7 +1340,10 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     // allocate graph
     if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
         // the re-allocation may cause the split inputs to be moved to a different address
-        ggml_backend_sched_synchronize(sched);
+        // synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
+        for (int i = 0; i < sched->n_backends; i++) {
+            ggml_backend_synchronize(sched->backends[i]);
+        }
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
@@ -1564,7 +1567,6 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgra
 
     ggml_backend_sched_split_graph(sched, graph);
 
-
     if (!ggml_backend_sched_alloc_splits(sched)) {
         return false;
     }
@@ -1598,9 +1600,12 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
     for (int i = 0; i < sched->n_backends; i++) {
         ggml_backend_synchronize(sched->backends[i]);
     }
-    // reset the current copy to 0 so that the graphs will be similar during generation
-    // necessary for CUDA graphs
-    sched->cur_copy = 0;
+    if (!sched->is_alloc) {
+        // if the graph is not already allocated, always use copy 0 after a synchronization
+        // this ensures that during generation the same copy is used every time,
+        // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
+        sched->cur_copy = 0;
+    }
 }
 
 void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
@@ -81,7 +81,7 @@ if (BLAS_FOUND)
     target_link_libraries     (ggml-blas PRIVATE ${BLAS_LIBRARIES})
     target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
 else()
-    message(ERROR "BLAS not found, please refer to "
-                  "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
-                  " to set correct GGML_BLAS_VENDOR")
+    message(FATAL_ERROR "BLAS not found, please refer to "
+                        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
+                        " to set correct GGML_BLAS_VENDOR")
 endif()
@@ -37,6 +37,7 @@
 #include <thread>
 #include <unistd.h>
 #include <functional>
+#include <optional>
 
 #include "../include/ggml-cann.h"
 #include "../include/ggml.h"
@@ -103,6 +104,9 @@ const ggml_cann_device_info& ggml_cann_info();
 void ggml_cann_set_device(int32_t device);
 int32_t ggml_cann_get_device();
 
+std::optional<std::string> get_env(const std::string& name);
+bool parse_bool(const std::string& value);
+
 /**
  * @brief Abstract base class for memory pools used by CANN.
  */
@@ -354,7 +358,8 @@ struct ggml_backend_cann_context {
         : device(device), name("CANN" + std::to_string(device)), task_queue(1024, device) {
         ggml_cann_set_device(device);
         description = aclrtGetSocName();
-        async_mode = (getenv("GGML_CANN_ASYNC_MODE") != nullptr);
+
+        bool async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
         GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
                       device, async_mode ? "ON" : "OFF");
     }
@@ -31,6 +31,8 @@
 #include <mutex>
 #include <queue>
 #include <chrono>
+#include <unordered_set>
+#include <optional>
 
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
@@ -93,6 +95,26 @@ int32_t ggml_cann_get_device() {
     return id;
 }
 
+/**
+ * @brief Get the value of the specified environment variable (name);
+ *        if it is set, return it lower-cased as a std::string.
+ */
+std::optional<std::string> get_env(const std::string& name) {
+    const char* val = std::getenv(name.c_str());
+    if (!val) return std::nullopt;
+    std::string res = std::string(val);
+    std::transform(res.begin(), res.end(), res.begin(), ::tolower);
+    return res;
+}
+
+/**
+ * @brief Check whether the given environment variable value represents an enabled ("true") setting.
+ */
+bool parse_bool(const std::string& value) {
+    std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
+    return valid_values.find(value) != valid_values.end();
+}
+
 /**
  * @brief Initialize the CANN device information.
  *
@@ -214,7 +236,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
      * @param device The device ID to associate with this buffer pool.
      */
    explicit ggml_cann_pool_buf_prio(int device) : device(device) {
-        disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
+        disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
     }
 
     /**
@@ -410,7 +432,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
      * @param device The device ID to associate with this buffer pool.
     */
    explicit ggml_cann_pool_buf(int device) : device(device) {
-        disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
+        disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
     }
 
     /**
@@ -731,16 +753,18 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
  */
 std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
     int device) {
-    bool disable_vmm = (getenv("GGML_CANN_DISABLE_VMM_POOL") != nullptr);
-    if (!disable_vmm && ggml_cann_info().devices[device].vmm) {
-        GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
-        return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
-    }
-    bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr);
-    if (enable_buf_prio) {
+    std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
+
+    if (mem_pool_type == "prio") {
         GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
         return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
     }
+
+    if (ggml_cann_info().devices[device].vmm && mem_pool_type != "leg") {
+        GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
+        return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
+    }
+
     GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
     return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
 }
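The pool-selection order is now: `prio` wins, then VMM (unless `leg` is forced or VMM is unsupported), then the legacy buffer pool. A standalone demo of the two helpers, adapted from the hunk above so it can be compiled and run on its own:

```cpp
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <iostream>
#include <optional>
#include <string>
#include <unordered_set>

// standalone copies of the helpers introduced above (sketch, not the library build)
static std::optional<std::string> get_env(const std::string & name) {
    const char * val = std::getenv(name.c_str());
    if (!val) return std::nullopt;
    std::string res(val);
    std::transform(res.begin(), res.end(), res.begin(), ::tolower);
    return res;
}

static bool parse_bool(const std::string & value) {
    static const std::unordered_set<std::string> valid = {"on", "1", "yes", "y", "enable", "true"};
    return valid.count(value) != 0;
}

int main() {
    const std::string pool = get_env("GGML_CANN_MEM_POOL").value_or("");
    std::cout << "async: " << parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or("")) << "\n";
    std::cout << "pool:  " << (pool.empty() ? "(default)" : pool) << "\n";
}
```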
@@ -1074,6 +1074,10 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
     0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
 GGML_TABLE_END()
 
+GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
+    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
+GGML_TABLE_END()
+
 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
 #define IQ1M_DELTA 0.125f
@@ -1,3 +1,17 @@
+function(ggml_add_cpu_backend_features cpu_name arch)
+    # The feature detection code is compiled as a separate target so that
+    # it can be built without the architecture flags
+    # Since multiple variants of the CPU backend may be included in the same
+    # build, using set_source_files_properties() to set the arch flags is not possible
+    set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
+    add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
+    target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
+    target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
+    target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
+    set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME})
+endfunction()
+
 function(ggml_add_cpu_backend_variant_impl tag_name)
     if (tag_name)
         set(GGML_CPU_NAME ggml-cpu-${tag_name})
@@ -10,14 +24,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     list (APPEND GGML_CPU_SOURCES
         ggml-cpu/ggml-cpu.c
         ggml-cpu/ggml-cpu.cpp
-        ggml-cpu/ggml-cpu-aarch64.cpp
-        ggml-cpu/ggml-cpu-aarch64.h
-        ggml-cpu/ggml-cpu-hbm.cpp
-        ggml-cpu/ggml-cpu-hbm.h
-        ggml-cpu/ggml-cpu-quants.c
-        ggml-cpu/ggml-cpu-quants.h
-        ggml-cpu/ggml-cpu-traits.cpp
-        ggml-cpu/ggml-cpu-traits.h
+        ggml-cpu/repack.cpp
+        ggml-cpu/repack.h
+        ggml-cpu/hbm.cpp
+        ggml-cpu/hbm.h
+        ggml-cpu/quants.c
+        ggml-cpu/quants.h
+        ggml-cpu/traits.cpp
+        ggml-cpu/traits.h
         ggml-cpu/amx/amx.cpp
         ggml-cpu/amx/amx.h
         ggml-cpu/amx/mmq.cpp
@@ -84,6 +98,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
     if (GGML_SYSTEM_ARCH STREQUAL "ARM")
         message(STATUS "ARM detected")
+        list(APPEND GGML_CPU_SOURCES
+            ggml-cpu/arch/arm/quants.c
+            ggml-cpu/arch/arm/repack.cpp
+            )
+
         if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
             message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
         else()
@@ -138,6 +157,46 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         else()
             if (GGML_CPU_ARM_ARCH)
                 list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
+            elseif(GGML_CPU_ALL_VARIANTS)
+                # Begin with the lowest baseline
+                set(ARM_MCPU "armv8-a")
+                set(ARCH_TAGS "")
+                set(ARCH_DEFINITIONS "")
+
+                # When a feature is selected, bump the MCPU to the first
+                # version that supported it
+                if (GGML_INTERNAL_DOTPROD)
+                    set(ARM_MCPU "armv8.2-a")
+                    set(ARCH_TAGS "${ARCH_TAGS}+dotprod")
+                    list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD)
+                endif()
+                if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC)
+                    set(ARM_MCPU "armv8.2-a")
+                    set(ARCH_TAGS "${ARCH_TAGS}+fp16")
+                    list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC)
+                endif()
+                if (GGML_INTERNAL_SVE)
+                    set(ARM_MCPU "armv8.2-a")
+                    set(ARCH_TAGS "${ARCH_TAGS}+sve")
+                    list(APPEND ARCH_DEFINITIONS GGML_USE_SVE)
+                endif()
+                if (GGML_INTERNAL_MATMUL_INT8)
+                    set(ARM_MCPU "armv8.6-a")
+                    set(ARCH_TAGS "${ARCH_TAGS}+i8mm")
+                    list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8)
+                endif()
+                if (GGML_INTERNAL_SVE2)
+                    set(ARM_MCPU "armv8.6-a")
+                    set(ARCH_TAGS "${ARCH_TAGS}+sve2")
+                    list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2)
+                endif()
+                if (GGML_INTERNAL_SME)
+                    set(ARM_MCPU "armv9.2-a")
+                    set(ARCH_TAGS "${ARCH_TAGS}+sme")
+                    list(APPEND ARCH_DEFINITIONS GGML_USE_SME)
+                endif()
+                list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}")
+                ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS})
             endif()
         endif()
@@ -167,6 +226,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         endif()
     elseif (GGML_SYSTEM_ARCH STREQUAL "x86")
         message(STATUS "x86 detected")
+        list(APPEND GGML_CPU_SOURCES
+            ggml-cpu/arch/x86/quants.c
+            ggml-cpu/arch/x86/repack.cpp
+            )
+
         if (MSVC)
             # instruction set detection for MSVC only
             if (GGML_NATIVE)
@@ -296,21 +360,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
             message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
         endif()
 
-        # The feature detection code is compiled as a separate target so that
-        # it can be built without the architecture flags
-        # Since multiple variants of the CPU backend may be included in the same
-        # build, using set_source_files_properties() to set the arch flags is not possible
-        set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
-        add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
-        target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
-        target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
-        target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
-        set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-        target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
+        ggml_add_cpu_backend_features(${GGML_CPU_NAME} x86 ${ARCH_DEFINITIONS})
     endif()
 elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
     message(STATUS "PowerPC detected")
+    list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/powerpc/quants.c)
     if (GGML_NATIVE)
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
             file(READ "/proc/cpuinfo" POWER10_M)
@@ -318,7 +372,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
         endif()
 
-        string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
+        string(TOUPPER "${POWER10_M}" POWER10_M_UPPER)
+        string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}")
         string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
 
         if (EXTRACTED_NUMBER GREATER_EQUAL 10)
@@ -337,6 +392,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     endif()
 elseif (GGML_SYSTEM_ARCH STREQUAL "loongarch64")
     message(STATUS "loongarch64 detected")
+    list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/loongarch/quants.c)
 
     list(APPEND ARCH_FLAGS -march=loongarch64)
     if (GGML_LASX)
         list(APPEND ARCH_FLAGS -mlasx)
@@ -346,6 +403,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     endif()
 elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
     message(STATUS "riscv64 detected")
+    list(APPEND GGML_CPU_SOURCES
+        ggml-cpu/arch/riscv/quants.c
+        ggml-cpu/arch/riscv/repack.cpp
+        )
     if (GGML_RVV)
         if (GGML_XTHEADVECTOR)
             list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
@@ -357,6 +418,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     endif()
 elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
     message(STATUS "s390x detected")
+    list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
     file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
     string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
@@ -380,12 +442,16 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     if (GGML_VXE)
         list(APPEND ARCH_FLAGS -mvx -mzvector)
     endif()
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
+    message(STATUS "Wasm detected")
+    list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
 else()
-    message(STATUS "Unknown architecture")
+    message(WARNING "Unknown CPU architecture. Falling back to generic implementations.")
+    list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC)
 endif()
 
-    if (GGML_CPU_AARCH64)
-        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64)
+    if (GGML_CPU_REPACK)
+        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_REPACK)
     endif()
 
     if (GGML_CPU_KLEIDIAI)
@@ -5,7 +5,7 @@
 #include "ggml-backend.h"
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
-#include "ggml-cpu-traits.h"
+#include "traits.h"
 
 #if defined(__gnu_linux__)
 #include <sys/syscall.h>
@@ -8,7 +8,7 @@
 #include "mmq.h"
 #include "ggml-impl.h"
 #include "ggml-cpu-impl.h"
-#include "ggml-cpu-quants.h"
+#include "quants.h"
 #include "ggml-quants.h"
 #include <algorithm>
 #include <type_traits>
ggml/src/ggml-cpu/arch-fallback.h (new file, 184 lines)
@@ -0,0 +1,184 @@
#pragma once

// Rename `_generic` functions if no native implementation is available.
// This effectively selects the generic implementation.

#if defined(GGML_CPU_GENERIC)
// quants.c
#define quantize_row_q8_0_generic quantize_row_q8_0
#define quantize_row_q8_1_generic quantize_row_q8_1
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_q4_0_q8_0_generic ggml_vec_dot_q4_0_q8_0
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
#define ggml_vec_dot_q3_K_q8_K_generic ggml_vec_dot_q3_K_q8_K
#define ggml_vec_dot_q4_K_q8_K_generic ggml_vec_dot_q4_K_q8_K
#define ggml_vec_dot_q5_K_q8_K_generic ggml_vec_dot_q5_K_q8_K
#define ggml_vec_dot_q6_K_q8_K_generic ggml_vec_dot_q6_K_q8_K
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
// repack.cpp
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__POWERPC__) || defined(__powerpc__)
// ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__loongarch64)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#elif defined(__riscv)
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
||||
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
|
||||
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
|
||||
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
|
||||
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
||||
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
||||
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#elif defined(__s390x__)
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
|
||||
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
|
||||
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
||||
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
|
||||
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
|
||||
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
|
||||
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
||||
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#elif defined(__wasm__)
|
||||
// quants.c
|
||||
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
||||
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
|
||||
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
|
||||
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
|
||||
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
||||
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
||||
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#endif
|
||||
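Note: the `#define x_generic x` lines above work because this header is included before the generic definitions are compiled, so the preprocessor renames each generic function to the public symbol whenever no native kernel provides it. A minimal standalone sketch of the same pattern follows; the foo/foo_generic names and the HAVE_NATIVE_FOO guard are hypothetical, not from the source.

// arch-fallback sketch: when no native foo() exists, compile foo_generic() under the name foo()
#if !defined(HAVE_NATIVE_FOO)
#define foo_generic foo
#endif

#include <cstdio>

void foo_generic(void) { // expands to foo(void) when HAVE_NATIVE_FOO is undefined
    std::printf("generic implementation selected\n");
}

int main(void) {
    foo(); // resolves to the renamed generic definition on this build
    return 0;
}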
ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp (new file, 94 lines)
@@ -0,0 +1,94 @@
#include "ggml-backend-impl.h"

#if defined(__aarch64__)

#if defined(__linux__)
#include <sys/auxv.h>
#elif defined(__APPLE__)
#include <sys/sysctl.h>
#endif

#if !defined(HWCAP2_I8MM)
#define HWCAP2_I8MM (1 << 13)
#endif

#if !defined(HWCAP2_SME)
#define HWCAP2_SME (1 << 23)
#endif

struct aarch64_features {
    // has_neon not needed, aarch64 has NEON guaranteed
    bool has_dotprod = false;
    bool has_fp16_va = false;
    bool has_sve     = false;
    bool has_sve2    = false;
    bool has_i8mm    = false;
    bool has_sme     = false;

    aarch64_features() {
#if defined(__linux__)
        uint32_t hwcap  = getauxval(AT_HWCAP);
        uint32_t hwcap2 = getauxval(AT_HWCAP2);

        has_dotprod = !!(hwcap  & HWCAP_ASIMDDP);
        has_fp16_va = !!(hwcap  & HWCAP_FPHP);
        has_sve     = !!(hwcap  & HWCAP_SVE);
        has_sve2    = !!(hwcap2 & HWCAP2_SVE2);
        has_i8mm    = !!(hwcap2 & HWCAP2_I8MM);
        has_sme     = !!(hwcap2 & HWCAP2_SME);
#elif defined(__APPLE__)
        int    oldp = 0;
        size_t size = sizeof(oldp);

        if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) == 0) {
            has_dotprod = static_cast<bool>(oldp);
        }

        if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) == 0) {
            has_i8mm = static_cast<bool>(oldp);
        }

        if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) == 0) {
            has_sme = static_cast<bool>(oldp);
        }

        // Apple apparently does not implement SVE yet
#endif
    }
};

static int ggml_backend_cpu_aarch64_score() {
    int score = 1;
    aarch64_features af;

#ifdef GGML_USE_DOTPROD
    if (!af.has_dotprod) { return 0; }
    score += 1<<1;
#endif
#ifdef GGML_USE_FP16_VECTOR_ARITHMETIC
    if (!af.has_fp16_va) { return 0; }
    score += 1<<2;
#endif
#ifdef GGML_USE_SVE
    if (!af.has_sve) { return 0; }
    score += 1<<3;
#endif
#ifdef GGML_USE_MATMUL_INT8
    if (!af.has_i8mm) { return 0; }
    score += 1<<4;
#endif
#ifdef GGML_USE_SVE2
    if (!af.has_sve2) { return 0; }
    score += 1<<5;
#endif
#ifdef GGML_USE_SME
    if (!af.has_sme) { return 0; }
    score += 1<<6;
#endif

    return score;
}

GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_aarch64_score)

#endif // defined(__aarch64__)
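Note: the score packs each build-time feature requirement into a distinct power-of-two weight, so among the backend variants that can actually run on the host, the one compiled for the largest feature set always wins, and any variant requiring a missing feature is disqualified with a score of 0. A toy model of that logic, with made-up feature names and only two features:

// Toy model of the backend-variant scoring above: each required feature adds a
// distinct power-of-two weight; a variant whose requirement the host lacks scores 0.
#include <cstdio>

static int score_variant(bool needs_dotprod, bool needs_i8mm,
                         bool host_dotprod, bool host_i8mm) {
    int score = 1;                       // base score: the variant is runnable at all
    if (needs_dotprod) {
        if (!host_dotprod) return 0;     // required feature missing -> unusable
        score += 1 << 1;
    }
    if (needs_i8mm) {
        if (!host_i8mm) return 0;
        score += 1 << 4;
    }
    return score;
}

int main() {
    // host with dotprod but without i8mm: the dotprod-only variant scores highest
    std::printf("generic:         %d\n", score_variant(false, false, true, false)); // 1
    std::printf("dotprod:         %d\n", score_variant(true,  false, true, false)); // 3
    std::printf("dotprod + i8mm:  %d\n", score_variant(true,  true,  true, false)); // 0
    return 0;
}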
ggml/src/ggml-cpu/arch/arm/quants.c (new file, 4113 lines; diff too large, not shown)
ggml/src/ggml-cpu/arch/arm/repack.cpp (new file, 2174 lines; diff too large, not shown)
ggml/src/ggml-cpu/arch/loongarch/quants.c (new file, 2638 lines; diff too large, not shown)
ggml/src/ggml-cpu/arch/powerpc/quants.c (new file, 2731 lines; diff too large, not shown)
ggml/src/ggml-cpu/arch/riscv/quants.c (new file, 2068 lines; diff too large, not shown)

ggml/src/ggml-cpu/arch/riscv/repack.cpp (new file, 396 lines)
@@ -0,0 +1,396 @@
#define GGML_COMMON_IMPL_CPP
#define GGML_COMMON_DECL_CPP
#include "ggml-common.h"
#include "ggml-backend-impl.h"

#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "ggml-cpu-impl.h"
#include "traits.h"

#include <cmath>
#include <cstring>
#include <cassert>
#include <cstdlib> // for qsort
#include <cstdio>  // for GGML_ASSERT

#define GGML_CPU_CLANG_WORKAROUND
#include "../../repack.h"

#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Woverlength-strings"
#endif

#define UNUSED GGML_UNUSED

void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;

    assert (n % qk == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

#if defined __riscv_v
    if (__riscv_vlenb() >= QK4_0) {
        const size_t vl = QK4_0;

        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);

            vfloat32m1_t sumf = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
            for (int l = 0; l < nb; l++) {
                const int64_t a0 = *(const int64_t *)&a_ptr[l].qs[0];
                const int64_t a1 = *(const int64_t *)&a_ptr[l].qs[8];
                const int64_t a2 = *(const int64_t *)&a_ptr[l].qs[16];
                const int64_t a3 = *(const int64_t *)&a_ptr[l].qs[24];
                __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment constraints
                const vint8m2_t lhs_0_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a0, vl / 4));
                const vint8m2_t lhs_1_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a1, vl / 4));
                const vint8m2_t lhs_2_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a2, vl / 4));
                const vint8m2_t lhs_3_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a3, vl / 4));

                const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
                const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
                const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
                const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
                const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
                const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
                const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);

                const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
                const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
                const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
                const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);

                const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_hi_m));
                const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
                const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
                const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
                const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
                const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
                const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
                const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
                const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
                const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
                const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
                const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
                const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);

                // vector version needs Zvfhmin extension
                const float a_scale = GGML_FP16_TO_FP32(a_ptr[l].d);
                const float b_scales[8] = {
                    GGML_FP16_TO_FP32(b_ptr[l].d[0]),
                    GGML_FP16_TO_FP32(b_ptr[l].d[1]),
                    GGML_FP16_TO_FP32(b_ptr[l].d[2]),
                    GGML_FP16_TO_FP32(b_ptr[l].d[3]),
                    GGML_FP16_TO_FP32(b_ptr[l].d[4]),
                    GGML_FP16_TO_FP32(b_ptr[l].d[5]),
                    GGML_FP16_TO_FP32(b_ptr[l].d[6]),
                    GGML_FP16_TO_FP32(b_ptr[l].d[7])
                };
                const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
                const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4);
                sumf = __riscv_vfmacc_vv_f32m1(sumf, tmp1, b_scales_vec, vl / 4);
            }
            __riscv_vse32_v_f32m1(s + x * ncols_interleaved, sumf, vl / 4);
        }
        return;
    }

#endif
    {
        float sumf[8];
        int sumi;

        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);

            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int j = 0; j < ncols_interleaved; j++) {
                        sumi = 0;
                        for (int i = 0; i < blocklen; ++i) {
                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                        }
                        sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
                    }
                }
            }
            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
        }
    }
}

void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

#if defined __riscv_v
    if (__riscv_vlenb() >= QK4_0) {
        const size_t vl = QK4_0;

        for (int y = 0; y < nr / 4; y++) {
            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
            for (int x = 0; x < nc / ncols_interleaved; x++) {
                const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
                vfloat32m1_t sumf0 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
                vfloat32m1_t sumf1 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
                vfloat32m1_t sumf2 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
                vfloat32m1_t sumf3 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
                for (int l = 0; l < nb; l++) {
                    const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
                    const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
                    const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
                    const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
                    const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
                    const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
                    const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);

                    // vector version needs Zvfhmin extension
                    const float a_scales[4] = {
                        GGML_FP16_TO_FP32(a_ptr[l].d[0]),
                        GGML_FP16_TO_FP32(a_ptr[l].d[1]),
                        GGML_FP16_TO_FP32(a_ptr[l].d[2]),
                        GGML_FP16_TO_FP32(a_ptr[l].d[3])
                    };
                    const float b_scales[8] = {
                        GGML_FP16_TO_FP32(b_ptr[l].d[0]),
                        GGML_FP16_TO_FP32(b_ptr[l].d[1]),
                        GGML_FP16_TO_FP32(b_ptr[l].d[2]),
                        GGML_FP16_TO_FP32(b_ptr[l].d[3]),
                        GGML_FP16_TO_FP32(b_ptr[l].d[4]),
                        GGML_FP16_TO_FP32(b_ptr[l].d[5]),
                        GGML_FP16_TO_FP32(b_ptr[l].d[6]),
                        GGML_FP16_TO_FP32(b_ptr[l].d[7])
                    };
                    const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);

                    const int64_t A0 = *(const int64_t *)&a_ptr[l].qs[0];
                    const int64_t A4 = *(const int64_t *)&a_ptr[l].qs[32];
                    const int64_t A8 = *(const int64_t *)&a_ptr[l].qs[64];
                    const int64_t Ac = *(const int64_t *)&a_ptr[l].qs[96];
                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment constraints
                    vint16m4_t sumi_l0;
                    {
                        const vint8m2_t lhs_0_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A0, vl / 4));
                        const vint8m2_t lhs_1_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A4, vl / 4));
                        const vint8m2_t lhs_2_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A8, vl / 4));
                        const vint8m2_t lhs_3_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ac, vl / 4));
                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);

                        sumi_l0 = sumi_hi_m;
                    }

                    {
                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l0));
                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);

                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[0], vl / 4);
                        sumf0 = __riscv_vfmacc_vv_f32m1(sumf0, tmp1, b_scales_vec, vl / 4);
                    }

                    const int64_t A1 = *(const int64_t *)&a_ptr[l].qs[8];
                    const int64_t A5 = *(const int64_t *)&a_ptr[l].qs[40];
                    const int64_t A9 = *(const int64_t *)&a_ptr[l].qs[72];
                    const int64_t Ad = *(const int64_t *)&a_ptr[l].qs[104];
                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment constraints
                    vint16m4_t sumi_l1;
                    {
                        const vint8m2_t lhs_0_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A1, vl / 4));
                        const vint8m2_t lhs_1_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A5, vl / 4));
                        const vint8m2_t lhs_2_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A9, vl / 4));
                        const vint8m2_t lhs_3_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ad, vl / 4));
                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);

                        sumi_l1 = sumi_hi_m;
                    }

                    {
                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l1));
                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);

                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[1], vl / 4);
                        sumf1 = __riscv_vfmacc_vv_f32m1(sumf1, tmp1, b_scales_vec, vl / 4);
                    }

                    const int64_t A2 = *(const int64_t *)&a_ptr[l].qs[16];
                    const int64_t A6 = *(const int64_t *)&a_ptr[l].qs[48];
                    const int64_t Aa = *(const int64_t *)&a_ptr[l].qs[80];
                    const int64_t Ae = *(const int64_t *)&a_ptr[l].qs[112];
                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment constraints
                    vint16m4_t sumi_l2;
                    {
                        const vint8m2_t lhs_0_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A2, vl / 4));
                        const vint8m2_t lhs_1_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A6, vl / 4));
                        const vint8m2_t lhs_2_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Aa, vl / 4));
                        const vint8m2_t lhs_3_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ae, vl / 4));
                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);

                        sumi_l2 = sumi_hi_m;
                    }

                    {
                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l2));
                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);

                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[2], vl / 4);
                        sumf2 = __riscv_vfmacc_vv_f32m1(sumf2, tmp1, b_scales_vec, vl / 4);
                    }

                    const int64_t A3 = *(const int64_t *)&a_ptr[l].qs[24];
                    const int64_t A7 = *(const int64_t *)&a_ptr[l].qs[56];
                    const int64_t Ab = *(const int64_t *)&a_ptr[l].qs[88];
                    const int64_t Af = *(const int64_t *)&a_ptr[l].qs[120];
                    __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment constraints
                    vint16m4_t sumi_l3;
                    {
                        const vint8m2_t lhs_0_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A3, vl / 4));
                        const vint8m2_t lhs_1_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A7, vl / 4));
                        const vint8m2_t lhs_2_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ab, vl / 4));
                        const vint8m2_t lhs_3_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Af, vl / 4));
                        const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
                        const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
                        const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
                        const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);

                        sumi_l3 = sumi_hi_m;
                    }

                    {
                        const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l3));
                        const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
                        const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
                        const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
                        const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
                        const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
                        const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
                        const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
                        const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
                        const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
                        const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
                        const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
                        const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);

                        const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[3], vl / 4);
                        sumf3 = __riscv_vfmacc_vv_f32m1(sumf3, tmp1, b_scales_vec, vl / 4);
                    }
                }
                __riscv_vse32_v_f32m1(&s[(y * 4 + 0) * bs + x * ncols_interleaved], sumf0, vl / 4);
                __riscv_vse32_v_f32m1(&s[(y * 4 + 1) * bs + x * ncols_interleaved], sumf1, vl / 4);
                __riscv_vse32_v_f32m1(&s[(y * 4 + 2) * bs + x * ncols_interleaved], sumf2, vl / 4);
                __riscv_vse32_v_f32m1(&s[(y * 4 + 3) * bs + x * ncols_interleaved], sumf3, vl / 4);
            }
        }

        return;
    }

#endif // defined __riscv_v
    float sumf[4][8];
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
            }
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
                            }
                            sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
                        }
                    }
                }
            }
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++)
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
            }
        }
    }
}
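Note: the scalar fallback above decodes two q4_0 values per byte by shifting the low nibble into the sign position and masking the high nibble, so both come out sign-extended and pre-scaled by 16; the trailing `>> 4` then removes that factor after the multiply-accumulate. A small standalone check of this decoding (this is an illustration, not code from the source):

// Sanity check of the nibble decoding used in the scalar fallback:
// (int8_t)(q << 4) is the signed low nibble times 16, (int8_t)(q & 0xF0)
// is the signed high nibble times 16, and >> 4 removes the factor of 16.
#include <cassert>
#include <cstdint>

int main() {
    for (int lo = 0; lo < 16; ++lo) {
        for (int hi = 0; hi < 16; ++hi) {
            const uint8_t q  = (uint8_t)((hi << 4) | lo);
            const int     v0 = (int8_t)(q << 4);   // low nibble, scaled by 16
            const int     v1 = (int8_t)(q & 0xF0); // high nibble, scaled by 16
            assert((v0 >> 4) == (lo < 8 ? lo : lo - 16)); // two's-complement 4-bit value
            assert((v1 >> 4) == (hi < 8 ? hi : hi - 16));
        }
    }
    return 0;
}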
ggml/src/ggml-cpu/arch/s390/quants.c (new file, 1299 lines; diff too large, not shown)
ggml/src/ggml-cpu/arch/wasm/quants.c (new file, 1480 lines; diff too large, not shown)
ggml/src/ggml-cpu/arch/x86/quants.c (new file, 4310 lines; diff too large, not shown)
(one additional file diff suppressed because it is too large)
@@ -1,7 +1,7 @@
 #pragma once

 #include "ggml.h"
-#include "ggml-cpu-traits.h"
+#include "traits.h"
 #include "ggml-cpu-impl.h"
 #include "ggml-impl.h"
@@ -1,8 +0,0 @@ (file deleted)
-#pragma once
-
-#include "ggml-cpu-traits.h"
-#include "ggml.h"
-
-// GGML internal header
-
-ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
@@ -503,6 +503,9 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) {
 // TODO: move to ggml-threading
 void ggml_barrier(struct ggml_threadpool * tp);

+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
+int  ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);
+
 #ifdef __cplusplus
 }
 #endif
(file diff suppressed because it is too large)
@@ -3,11 +3,11 @@

 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
-#include "ggml-cpu-traits.h"
+#include "traits.h"
 #include "ggml-cpu-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-impl.h"
-#include "ggml-cpu-quants.h"
+#include "quants.h"
 #include "ggml-threading.h"
 #include "unary-ops.h"
 #include "binary-ops.h"
@@ -559,6 +559,14 @@ void ggml_barrier(struct ggml_threadpool * tp) {
 #endif
 }

+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value) {
+    atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
+int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
+    return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
 #if defined(__gnu_linux__)
 static cpu_set_t ggml_get_numa_affinity(void) {
     cpu_set_t cpuset;
@@ -2418,12 +2426,32 @@ static bool ggml_thread_apply_priority(int32_t prio) {
     // This is up to the applications.
     DWORD p = THREAD_PRIORITY_NORMAL;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW:      p = THREAD_PRIORITY_BELOW_NORMAL;  break;
         case GGML_SCHED_PRIO_NORMAL:   p = THREAD_PRIORITY_NORMAL;        break;
         case GGML_SCHED_PRIO_MEDIUM:   p = THREAD_PRIORITY_ABOVE_NORMAL;  break;
         case GGML_SCHED_PRIO_HIGH:     p = THREAD_PRIORITY_HIGHEST;       break;
         case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
     }

+    if (prio != GGML_SCHED_PRIO_LOW) {
+        // Tell Windows that this thread should not be throttled (needs its own CPU core).
+        // Newer Windows 11 versions aggressively park (offline) CPU cores and often place
+        // all our threads onto the first 4 cores which results in terrible performance with
+        // n_threads > 4
+#if _WIN32_WINNT >= 0x0602
+        THREAD_POWER_THROTTLING_STATE t;
+        ZeroMemory(&t, sizeof(t));
+        t.Version     = THREAD_POWER_THROTTLING_CURRENT_VERSION;
+        t.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED;
+        t.StateMask   = 0;
+
+        if (!SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &t, sizeof(t))) {
+            GGML_LOG_DEBUG("failed to disable thread power throttling %d : (%d)\n", prio, (int) GetLastError());
+            return false;
+        }
+#endif
+    }
+
     if (prio == GGML_SCHED_PRIO_NORMAL) {
         // Keep inherited policy/priority
         return true;
@@ -2451,6 +2479,8 @@ static bool ggml_thread_apply_priority(int32_t prio) {
     struct sched_param p;
     int32_t policy = SCHED_OTHER;
     switch (prio) {
+        // TODO: there seems to be no way to set lower prio on Apple platforms
+        case GGML_SCHED_PRIO_LOW:    policy = SCHED_OTHER; p.sched_priority = 0;  break;
         case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0;  break;
         case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO;  p.sched_priority = 40; break;
         case GGML_SCHED_PRIO_HIGH:   policy = SCHED_FIFO;  p.sched_priority = 80; break;
@@ -2507,6 +2537,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
     struct sched_param p;
     int32_t policy = SCHED_OTHER;
     switch (prio) {
+        case GGML_SCHED_PRIO_LOW:    policy = SCHED_BATCH; p.sched_priority = 0;  break;
         case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0;  break;
         case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO;  p.sched_priority = 40; break;
         case GGML_SCHED_PRIO_HIGH:   policy = SCHED_FIFO;  p.sched_priority = 80; break;
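Note: on Linux, the new low-priority case maps to SCHED_BATCH, which hints to the kernel scheduler that the thread is a CPU-bound batch job and may be deprioritized. A minimal standalone sketch of that mapping (Linux-specific; assumes the glibc pthread_setschedparam API, not code from the source):

// Demote the calling thread to SCHED_BATCH, as the added GGML_SCHED_PRIO_LOW case does.
#include <pthread.h>
#include <sched.h>
#include <cstdio>

int main() {
    sched_param p {};
    p.sched_priority = 0; // SCHED_BATCH ignores the static priority field
    const int rc = pthread_setschedparam(pthread_self(), SCHED_BATCH, &p);
    if (rc != 0) {
        std::fprintf(stderr, "pthread_setschedparam failed: %d\n", rc);
        return 1;
    }
    std::printf("thread now runs under SCHED_BATCH\n");
    return 0;
}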
@@ -1,8 +1,8 @@
 #include "ggml-backend.h"
 #include "ggml-backend-impl.h"
 #include "ggml-cpu.h"
-#include "ggml-cpu-aarch64.h"
-#include "ggml-cpu-traits.h"
+#include "repack.h"
+#include "traits.h"
 #include "ggml-impl.h"
 #include "amx/amx.h"
@@ -11,7 +11,7 @@
 #include <vector>

 #ifdef GGML_USE_CPU_HBM
-#  include "ggml-cpu-hbm.h"
+#  include "hbm.h"
 #endif

 #ifdef GGML_USE_CPU_KLEIDIAI
@@ -51,9 +51,9 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
 }
 #endif

-#ifdef GGML_USE_CPU_AARCH64
-    if (ggml_backend_cpu_aarch64_buffer_type()) {
-        bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
+#ifdef GGML_USE_CPU_REPACK
+    if (ggml_backend_cpu_repack_buffer_type()) {
+        bufts.push_back(ggml_backend_cpu_repack_buffer_type());
     }
 #endif
@@ -596,8 +596,8 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
 #ifdef GGML_USE_CPU_KLEIDIAI
     features.push_back({ "KLEIDIAI", "1" });
 #endif
-#ifdef GGML_USE_CPU_AARCH64
-    features.push_back({ "AARCH64_REPACK", "1" });
+#ifdef GGML_USE_CPU_REPACK
+    features.push_back({ "REPACK", "1" });
 #endif

     features.push_back({ nullptr, nullptr });
@@ -5,7 +5,7 @@
 #include "ggml-cpu.h"
 #include "ggml-impl.h"

-#include "ggml-cpu-hbm.h"
+#include "hbm.h"

 // buffer type HBM
@@ -26,7 +26,7 @@
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 #include "ggml-threading.h"
-#include "ggml-cpu-traits.h"
+#include "traits.h"

 #include "kernels.h"
@@ -53,7 +53,6 @@
 #include "ggml-cpu-impl.h"
 #include "ggml-quants.h"

-#include <atomic>
 #include <array>
 #include <type_traits>

@@ -394,8 +393,6 @@ class tinyBLAS {

     template <int RM, int RN, int BM>
     NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
-        static std::atomic<int64_t> current_chunk;
-
         GGML_ASSERT(m % (RM * BM) == 0);
         const int64_t ytiles = m / (RM * BM);
         const int64_t xtiles = (n + RN - 1) / RN;
@@ -410,7 +407,7 @@ class tinyBLAS {
         if (params->ith == 0) {
             GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
             // Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
-            std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
+            ggml_threadpool_chunk_set(params->threadpool, params->nth);
         }

         ggml_barrier(params->threadpool);
@@ -439,8 +436,7 @@ class tinyBLAS {
                 GGML_ASSERT(jj == jj2);
             }

-            // next step.
-            job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
+            job = ggml_threadpool_chunk_add(params->threadpool, 1);
         }

         ggml_barrier(params->threadpool);
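Note: this change replaces a function-local static atomic with a counter owned by the threadpool, so all worker threads claim tile indices from one shared cursor. The claiming pattern itself is unchanged; the sketch below models it with a plain std::atomic standing in for the threadpool counter (an illustration, not source code):

// Chunk-claiming pattern: each thread starts at its own index ith, and the shared
// cursor starts at nth, so the first round needs no coordination at all.
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    const int nth = 4, njobs = 16;
    std::atomic<int> current_chunk{nth}; // first unclaimed job after the initial round
    std::vector<std::thread> threads;

    for (int ith = 0; ith < nth; ++ith) {
        threads.emplace_back([&, ith] {
            // claim job ith first, then take further jobs from the shared cursor
            for (int job = ith; job < njobs;
                 job = current_chunk.fetch_add(1, std::memory_order_relaxed)) {
                std::printf("thread %d runs job %d\n", ith, job);
            }
        });
    }
    for (auto & t : threads) t.join();
    return 0;
}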
@@ -8132,8 +8132,8 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
 #define WKV_VECTOR_SIZE 4
 #endif

-    int wkv_vector_size;
 #ifdef WKV_VECTOR_SIZE
+    int wkv_vector_size;
     #if defined(__ARM_FEATURE_SVE)
         wkv_vector_size = svcntw();
     #else
@@ -8348,8 +8348,8 @@ static void ggml_compute_forward_gla_f32(
 #define GLA_VECTOR_SIZE 4
 #endif

-    int gla_vector_size;
 #ifdef GLA_VECTOR_SIZE
+    int gla_vector_size;
     #if defined(__ARM_FEATURE_SVE)
         gla_vector_size = svcntw();
     #else
ggml/src/ggml-cpu/quants.c (new file, 1157 lines; diff too large, not shown)
@@ -58,6 +58,32 @@ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const
 void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_iq3_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

+// Generic implementation
+void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
+void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
 #ifdef __cplusplus
 }
 #endif
ggml/src/ggml-cpu/repack.cpp (new file, 1555 lines; diff too large, not shown)

ggml/src/ggml-cpu/repack.h (new file, 98 lines)
@@ -0,0 +1,98 @@
#pragma once

#define GGML_COMMON_DECL_CPP
#include "ggml-common.h"

#include "traits.h"
#include "ggml.h"

// GGML internal header

ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void);

template <int K> constexpr int QK_0() {
    if constexpr (K == 4) {
        return QK4_0;
    }
    if constexpr (K == 8) {
        return QK8_0;
    }
    return -1;
}

template <int K, int N> struct block {
    ggml_half d[N];                        // deltas for N qK_0 blocks
    int8_t    qs[(QK_0<K>() * N * K) / 8]; // quants for N qK_0 blocks
};

// sanity-check block sizes
static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");
static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding");
static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding");

using block_q4_0x4 = block<4, 4>;
using block_q4_0x8 = block<4, 8>;
using block_q8_0x4 = block<8, 4>;
using block_q8_0x8 = block<8, 8>;

struct block_q4_Kx8 {
    ggml_half d[8];      // super-block scale for quantized scales
    ggml_half dmin[8];   // super-block scale for quantized mins
    uint8_t scales[96];  // scales and mins, quantized with 6 bits
    uint8_t qs[1024];    // 4-bit quants
};

static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");

struct block_q8_Kx4 {
    float d[4];              // delta
    int8_t qs[QK_K * 4];     // quants
    int16_t bsums[QK_K / 4]; // sum of quants in groups of 16
};

static_assert(sizeof(block_q8_Kx4) == sizeof(float) * 4 + QK_K * 4 + (QK_K / 4) * sizeof(int16_t), "wrong q8_K block size/padding");

struct block_iq4_nlx4 {
    ggml_half d[4];           // deltas for 4 iq4_nl blocks
    uint8_t   qs[QK4_NL * 2]; // nibbles / quants for 4 iq4_nl blocks
};

static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");

#if defined(__cplusplus)
extern "C" {
#endif

void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

// Generic implementations
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

#if defined(__cplusplus)
} // extern "C"
#endif
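Note: the static_asserts above pin the packed layout of each repacked block. As a quick sanity check of why, say, block<4, 8> is exactly 8 half-precision deltas plus 128 bytes of nibbles, here is a standalone re-derivation with a uint16_t standing in for ggml_half (an illustration, not the source header):

// Re-derivation of the block<K, N> sizing checked by the static_asserts:
// N half-precision deltas followed by N blocks of QK_0<K>() K-bit quants.
#include <cstdint>
#include <cstdio>

constexpr int QK4_0 = 32; // q4_0 block length (values per block)
constexpr int QK8_0 = 32; // q8_0 block length

template <int K> constexpr int QK_0() { return K == 4 ? QK4_0 : K == 8 ? QK8_0 : -1; }

template <int K, int N> struct block {
    uint16_t d[N];                        // per-block scales (ggml_half stand-in)
    int8_t   qs[(QK_0<K>() * N * K) / 8]; // N blocks of K-bit quants, bit-packed
};

int main() {
    // 8 scales (16 B) + 32 values * 8 blocks * 4 bits / 8 = 128 B of quants = 144 B
    std::printf("sizeof(block<4,8>) = %zu\n", sizeof(block<4, 8>)); // 144
    // 4 scales (8 B) + 32 values * 4 blocks * 8 bits / 8 = 128 B of quants = 136 B
    std::printf("sizeof(block<8,4>) = %zu\n", sizeof(block<8, 4>)); // 136
    return 0;
}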
@@ -1,4 +1,4 @@
-#include "ggml-cpu-traits.h"
+#include "traits.h"

 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
@@ -207,9 +207,9 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA

-#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
+#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
 #define FP16_MMA_AVAILABLE
-#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
+#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))

 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
@@ -262,11 +262,11 @@ static bool cp_async_available(const int cc) {
}

static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
    return __AMDGCN_WAVEFRONT_SIZE;
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
    return 64;
#else
    return 32;
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
}

[[noreturn]]
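The hunk above drops the deprecated __AMDGCN_WAVEFRONT_SIZE macro in favor of an explicit architecture check, so the warp size stays a compile-time constant. A minimal host-side C++ sketch of why constexpr matters here; the __GFX8__/__GFX9__ macros follow the hunk, while the static_assert and the array are illustrative only:

// Stand-in for ggml_cuda_get_physical_warp_size(): the result must be usable
// at compile time, e.g. to size per-lane arrays or drive loop unrolling.
constexpr int physical_warp_size() {
#if defined(__GFX8__) || defined(__GFX9__)
    return 64; // GCN/CDNA wavefronts are 64 lanes wide
#else
    return 32; // NVIDIA warps (and RDNA wave32) are 32 lanes wide
#endif
}

static_assert(physical_warp_size() == 32 || physical_warp_size() == 64, "unexpected warp size");

int main() {
    float lane_scratch[physical_warp_size()] = {}; // compile-time array size
    return (int) lane_scratch[0];
}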
@@ -466,9 +466,6 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
}

// TODO: move to ggml-common.h
static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);

static __device__ __forceinline__ float get_alibi_slope(

@@ -635,6 +632,7 @@ struct ggml_cuda_device_info {
    int    nsm;                 // number of streaming multiprocessors
    size_t smpb;                // max. shared memory per block
    size_t smpbo;               // max. shared memory per block (with opt-in)
    bool   integrated;          // Device is integrated as opposed to discrete
    bool   vmm;                 // virtual memory support
    size_t vmm_granularity;     // granularity of virtual memory
    size_t total_vram;
@@ -652,9 +652,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
    float KQ_max_scale[cols_per_thread];
#pragma unroll
    for (int col = 0; col < cols_per_thread; ++col) {
        KQ_max_scale[col] = expf(KQ_max[col] - KQ_max_new[col]);
        const float KQ_max_diff = KQ_max[col] - KQ_max_new[col];
        KQ_max_scale[col] = expf(KQ_max_diff);
        KQ_max[col] = KQ_max_new[col];

        *((uint32_t *) &KQ_max_scale[col]) *= KQ_max_diff >= SOFTMAX_FTZ_THRESHOLD;

        // Scale previous KQ_rowsum to account for a potential increase in KQ_max:
        KQ_rowsum[col] = KQ_max_scale[col]*KQ_rowsum[col] + KQ_rowsum_add[col];
    }
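The refactor above is the numerically stable online-softmax update: when the running maximum grows, previously accumulated values are rescaled by exp(old_max - new_max), and scales far below a threshold are flushed to zero by multiplying the float's bit pattern by 0 or 1. A self-contained C++ sketch of the same update; the threshold value here is illustrative, not the kernel's actual constant:

#include <cmath>
#include <cstdint>
#include <cstring>

const float SOFTMAX_FTZ_THRESHOLD = -20.0f; // illustrative value

// Rescale one column's accumulators after observing a new running maximum.
void online_softmax_rescale(float & kq_max, float kq_max_new,
                            float & kq_rowsum, float kq_rowsum_add) {
    const float kq_max_diff = kq_max - kq_max_new;
    float scale = std::exp(kq_max_diff);
    kq_max = kq_max_new;

    // Branchless flush-to-zero, mirroring the bit-pattern multiply in the hunk:
    // multiplying the float's bits by 0 yields exactly 0.0f.
    uint32_t bits;
    std::memcpy(&bits, &scale, sizeof(bits));
    bits *= (uint32_t)(kq_max_diff >= SOFTMAX_FTZ_THRESHOLD);
    std::memcpy(&scale, &bits, sizeof(bits));

    // Scale the previous row sum to account for the increase in the maximum.
    kq_rowsum = scale*kq_rowsum + kq_rowsum_add;
}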
@@ -1246,7 +1249,7 @@ static __global__ void flash_attn_ext_f16(
        NO_DEVICE_CODE;
        return;
    }
#endif __CUDA_ARCH__ == GGML_CUDA_CC_TURING
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_TURING

    static_assert(!mla || DKQ >= DV, "MLA needs DKQ >= DV");

@@ -243,10 +243,10 @@ static ggml_cuda_device_info ggml_cuda_init() {

        info.default_tensor_split[id] = total_vram;
        total_vram += prop.totalGlobalMem;

        info.devices[id].nsm       = prop.multiProcessorCount;
        info.devices[id].smpb      = prop.sharedMemPerBlock;
        info.devices[id].warp_size = prop.warpSize;
        info.devices[id].integrated = prop.integrated;
        info.devices[id].nsm        = prop.multiProcessorCount;
        info.devices[id].smpb       = prop.sharedMemPerBlock;
        info.devices[id].warp_size  = prop.warpSize;
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
        info.devices[id].smpbo = prop.sharedMemPerBlock;
@@ -615,9 +615,8 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;

    ggml_cuda_set_device(ctx->device);
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaMemsetAsync(ctx->dev_ptr, value, buffer->size, cudaStreamPerThread));
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
}

static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
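The buffer-clear hunk replaces two device-wide synchronizations around a blocking cudaMemset with a memset enqueued on the per-thread default stream, waiting only on that stream. A minimal standalone sketch of the pattern, with error handling trimmed to a single check:

#include <cuda_runtime.h>

int main() {
    void * dev_ptr = nullptr;
    const size_t size = 1 << 20;
    if (cudaMalloc(&dev_ptr, size) != cudaSuccess) {
        return 1;
    }

    // Enqueue the clear on the per-thread default stream and wait only on it,
    // instead of stalling every stream with cudaDeviceSynchronize().
    cudaMemsetAsync(dev_ptr, 0, size, cudaStreamPerThread);
    cudaStreamSynchronize(cudaStreamPerThread);

    cudaFree(dev_ptr);
    return 0;
}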
@@ -1065,6 +1064,10 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
    GGML_UNUSED(buft);
}

static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
    return buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name;
}

static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    CUDA_CHECK(cudaFreeHost(buffer->context));
}

@@ -1140,7 +1143,6 @@ typedef void (*ggml_cuda_op_mul_mat_t)(
static cudaError_t ggml_cuda_cpy_tensor_2d(
    void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {

    GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer));
    const char * src_ptr = (const char *) src->data;
    char * dst_ptr = (char *) dst;

@@ -1423,8 +1425,6 @@ static void ggml_cuda_op_mul_mat(
    const int64_t nb2 = dst->nb[2];
    const int64_t nb3 = dst->nb[3];

    GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
    GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
    ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context;
    ggml_backend_cuda_buffer_context * dst_ctx  = (ggml_backend_cuda_buffer_context *) dst->buffer->context;

@@ -1746,7 +1746,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
    GGML_ASSERT(!ggml_is_transposed(src0));
    GGML_ASSERT(!ggml_is_transposed(src1));

    GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
    GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft));
    GGML_ASSERT(src0->type == GGML_TYPE_F16);

    // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.

@@ -2641,6 +2641,8 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {

static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
    bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
    // flag used to determine whether it is an integrated_gpu
    const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;

    while (!graph_evaluated_or_captured) {
        // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.

@@ -2659,10 +2661,12 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                if (node->src[j] != nullptr) {
                    assert(node->src[j]->buffer);
                    assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
                           ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
                           ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft)));
                }
            }
#endif
#else
            GGML_UNUSED(integrated);
#endif // NDEBUG

            bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
            if (!ok) {
@@ -3266,7 +3270,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
}

static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
    return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
    ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
    const bool integrated = ggml_cuda_info().devices[dev_ctx->device].integrated;
    return (((ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev) || (integrated && ggml_backend_buft_is_cuda_host(buft)));
}

static int64_t get_op_batch_size(const ggml_tensor * op) {
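Condensed, the new predicate additionally accepts pinned host buffers on integrated GPUs, where device and host share physical memory. A trivial C++ restatement of the decision logic; the flag names below are illustrative:

// Device and split buffers must belong to this device; on an integrated GPU,
// pinned host buffers are additionally accepted because the device can read
// them directly over the shared memory bus.
bool supports_buffer_type(bool is_cuda, bool is_cuda_split, bool same_device,
                          bool integrated, bool is_cuda_host) {
    return ((is_cuda || is_cuda_split) && same_device) || (integrated && is_cuda_host);
}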
@@ -10,6 +10,8 @@ __global__ void __launch_bounds__(splitD, 2)
    float * __restrict__ dst, const int64_t L) {
    GGML_UNUSED(src1_nb0);
    GGML_UNUSED(src2_nb0);

    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    const int bidx = blockIdx.x; // split along B
    const int bidy = blockIdx.y; // split along D
    const int tid  = threadIdx.x;

@@ -44,16 +46,16 @@ __global__ void __launch_bounds__(splitD, 2)
    if (N == 16) {
#pragma unroll
        for (size_t i = 0; i < splitD / 4; i += 2) {
            float value = A_block[(wid * warpSize + i) * stride_A + wtid];
            float value = A_block[(wid * warp_size + i) * stride_A + wtid];
            // todo: bank conflict
            // I am always confused with how to use the swizzling method to solve
            // bank conflicts. Hoping somebody can tell me.
            smem_A[(wid * warpSize + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
            smem_A[(wid * warp_size + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
        }
#pragma unroll
        for (size_t i = 0; i < splitD / 4; i += 2) {
            float value = s0_block[(wid * warpSize + i) * stride_s0 + wtid];
            smem_s0[(wid * warpSize + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
            float value = s0_block[(wid * warp_size + i) * stride_s0 + wtid];
            smem_s0[(wid * warp_size + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
        }
    }
@@ -113,6 +113,10 @@ if (GGML_HIP_ROCWMMA_FATTN)
    add_compile_definitions(GGML_HIP_ROCWMMA_FATTN)
endif()

if (GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 OR ${hip_VERSION} VERSION_GREATER_EQUAL 7.0)
    add_compile_definitions(GGML_HIP_ROCWMMA_FATTN_GFX12)
endif()

if (NOT GGML_CUDA_FA)
    add_compile_definitions(GGML_CUDA_NO_FA)
endif()

@@ -32,6 +32,8 @@
extern "C" {
#endif

void ggml_print_backtrace(void);

#ifndef MIN
#    define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif

@@ -44,21 +44,22 @@ if (GGML_METAL_EMBED_LIBRARY)
    set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp")

    add_custom_command(
        OUTPUT ${METALLIB_EMBED_ASM}
        OUTPUT "${METALLIB_EMBED_ASM}"
        COMMAND echo "Embedding Metal library"
        COMMAND sed -e '/__embed_ggml-common.h__/r ${METALLIB_COMMON}' -e '/__embed_ggml-common.h__/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED_TMP}
        COMMAND sed -e '/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}' -e '/\#include \"ggml-metal-impl.h\"/d' < ${METALLIB_SOURCE_EMBED_TMP} > ${METALLIB_SOURCE_EMBED}
        COMMAND echo ".section __DATA,__ggml_metallib" > ${METALLIB_EMBED_ASM}
        COMMAND echo ".globl _ggml_metallib_start" >> ${METALLIB_EMBED_ASM}
        COMMAND echo "_ggml_metallib_start:" >> ${METALLIB_EMBED_ASM}
        COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM}
        COMMAND echo ".globl _ggml_metallib_end" >> ${METALLIB_EMBED_ASM}
        COMMAND echo "_ggml_metallib_end:" >> ${METALLIB_EMBED_ASM}
        COMMAND sed -e "/__embed_ggml-common.h__/r ${METALLIB_COMMON}" -e "/__embed_ggml-common.h__/d" < "${METALLIB_SOURCE}" > "${METALLIB_SOURCE_EMBED_TMP}"
        COMMAND sed -e "/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}" -e "/\#include \"ggml-metal-impl.h\"/d" < "${METALLIB_SOURCE_EMBED_TMP}" > "${METALLIB_SOURCE_EMBED}"
        COMMAND echo ".section __DATA,__ggml_metallib" > "${METALLIB_EMBED_ASM}"
        COMMAND echo ".globl _ggml_metallib_start" >> "${METALLIB_EMBED_ASM}"
        COMMAND echo "_ggml_metallib_start:" >> "${METALLIB_EMBED_ASM}"
        COMMAND echo .incbin "\"${METALLIB_SOURCE_EMBED}\"" >> "${METALLIB_EMBED_ASM}"
        COMMAND echo ".globl _ggml_metallib_end" >> "${METALLIB_EMBED_ASM}"
        COMMAND echo "_ggml_metallib_end:" >> "${METALLIB_EMBED_ASM}"
        DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h
        COMMENT "Generate assembly for embedded Metal library"
        VERBATIM
    )

    target_sources(ggml-metal PRIVATE ${METALLIB_EMBED_ASM})
    target_sources(ggml-metal PRIVATE "${METALLIB_EMBED_ASM}")
else()
    if (GGML_METAL_SHADER_DEBUG)
        # custom command to do the following:

@@ -4766,6 +4766,8 @@ static bool ggml_metal_encode_node(
    GGML_ASSERT(nqptg % 8  == 0);
    GGML_ASSERT(ncpsg % 32 == 0);

    const int is_q = ggml_is_quantized(src1->type) ? 1 : 0;

    // 2*(2*ncpsg + nqptg)*(nsg)
    // ncpsg soft_max values + ncpsg mask values + a diagonal scaling matrix (in float)
    //
@@ -4773,7 +4775,7 @@ static bool ggml_metal_encode_node(
    // the shared memory needed for the simdgroups to load the KV cache
    // each thread loads (dequantizes) 16 head elements, there are 32 threads in the SG
    //
#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(ne00 + 2*(2*ncpsg + nqptg)*(nsg)) + 16*32*(nsg))*(sizeof(float)/2), 16))
#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(2*ne00 + 2*(2*ncpsg + nqptg)*(nsg)) + is_q*(16*32*(nsg)))*(sizeof(float)/2), 16))

    int64_t nsgmax = 2;
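To see what the revised bound evaluates to, a small C++ sketch that computes the new FATTN_SMEM expression for sample parameters; all the numbers below are illustrative, not taken from a particular model:

#include <cstdio>
#include <cstddef>

size_t pad16(size_t x) { return (x + 15) & ~(size_t)15; } // GGML_PAD(x, 16)

// New FATTN_SMEM: threadgroup memory in bytes; the trailing *(sizeof(float)/2)
// converts an element count to half-precision bytes.
size_t fattn_smem(size_t nqptg, size_t ne00, size_t ncpsg, size_t nsg, size_t is_q) {
    return pad16((nqptg*(2*ne00 + 2*(2*ncpsg + nqptg)*nsg) + is_q*(16*32*nsg))*(sizeof(float)/2));
}

int main() {
    // e.g. 8 queries per threadgroup, head size 128, 32 cache values per
    // simdgroup, 2 simdgroups, quantized KV cache:
    printf("%zu bytes\n", fattn_smem(8, 128, 32, 2, 1)); // prints 10752 bytes
    return 0;
}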
@@ -4810,9 +4812,9 @@ static bool ggml_metal_encode_node(
    // and store the soft_max values and the mask
    //
    // ne00*(nsg)
    // each simdgroup has a full f16 head vector in shared mem to accumulate results
    // each simdgroup has a full f32 head vector in shared mem to accumulate results
    //
#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(GGML_PAD(ne00, 128) + 4*ncpsg*(nsg)) + ne20*(nsg))*(sizeof(float)/2), 16))
#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(GGML_PAD(ne00, 128) + 4*ncpsg*(nsg)) + 2*ne20*(nsg))*(sizeof(float)/2), 16))

    int64_t nsgmax = 2;
    while (true) {

@@ -3328,14 +3328,12 @@ kernel void kernel_flash_attn_ext(
    constexpr short NW = N_SIMDWIDTH;
    constexpr short SH = (2*C + Q); // shared memory per simdgroup (s_t == float)

    const short TS = nsg*SH;    // shared memory size per query in (s_t == float)
    const short T  = DK + 2*TS; // shared memory size per query in (half)
    const short TS = nsg*SH;      // shared memory size per query in (s_t == float)
    const short T  = 2*DK + 2*TS; // shared memory size per query in (half)

    threadgroup q_t  * sq  = (threadgroup q_t  *) (shmem_f16 + 0*DK); // holds the query data
    threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*DK); // same as above but in q4_t
    threadgroup o_t  * so  = (threadgroup o_t  *) (shmem_f16 + 0*DK); // reuse query data for accumulation
    threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 0*DK); // same as above but in o4_t
    threadgroup s_t  * ss  = (threadgroup s_t  *) (shmem_f16 + 2*sgitg*SH + Q*DK); // scratch buffer for attention, mask and diagonal matrix
    threadgroup q_t  * sq  = (threadgroup q_t  *) (shmem_f16 + 0*DK); // holds the query data
    threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*DK); // same as above but in q4_t
    threadgroup s_t  * ss  = (threadgroup s_t  *) (shmem_f16 + 2*sgitg*SH + 2*Q*DK); // scratch buffer for attention, mask and diagonal matrix

    threadgroup k_t    * sk    = (threadgroup k_t    *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K in shared memory
    threadgroup k4x4_t * sk4x4 = (threadgroup k4x4_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // same as above but in k4x4_t

@@ -3354,7 +3352,7 @@ kernel void kernel_flash_attn_ext(
                if (iq1 + j < args.ne01) {
                    sq4[j*DK4 + i] = (q4_t) q4[i];
                } else {
                    sq4[j*DK4 + i] = (q4_t) 0.0f;
                    sq4[j*DK4 + i] = 0;
                }
            }
        }

@@ -3548,20 +3546,20 @@ kernel void kernel_flash_attn_ext(

            // O = diag(ms)*O
            {
                s8x8_t mm;
                simdgroup_load(mm, ss + 2*C, TS, 0, false);
                s8x8_t ms;
                simdgroup_load(ms, ss + 2*C, TS, 0, false);

#pragma unroll(DV8)
                for (short i = 0; i < DV8; ++i) {
                    simdgroup_multiply(lo[i], mm, lo[i]);
                    simdgroup_multiply(lo[i], ms, lo[i]);
                }
            }

            // O = O + (Q*K^T)*V
            {
                for (short cc = 0; cc < C/8; ++cc) {
                    s8x8_t ms;
                    simdgroup_load(ms, ss + 8*cc, TS, 0, false);
                    s8x8_t vs;
                    simdgroup_load(vs, ss + 8*cc, TS, 0, false);

                    if (is_same<vd4x4_t, v4x4_t>::value) {
                        // we can read directly from global memory

@@ -3572,7 +3570,7 @@ kernel void kernel_flash_attn_ext(
                            v8x8_t mv;
                            simdgroup_load(mv, pv + i*8, args.nb21/sizeof(v_t), 0, false); // TODO: use ne20

                            simdgroup_multiply_accumulate(lo[i], ms, mv, lo[i]);
                            simdgroup_multiply_accumulate(lo[i], vs, mv, lo[i]);
                        }
                    } else {
                        for (short ii = 0; ii < DV16; ii += 4) {

@@ -3593,10 +3591,10 @@ kernel void kernel_flash_attn_ext(
                                v8x8_t mv;

                                simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false);
                                simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]);
                                simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], vs, mv, lo[2*(ii + k) + 0]);

                                simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false);
                                simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]);
                                simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], vs, mv, lo[2*(ii + k) + 1]);
                            }
                        } else {
                            if (ii + tx < DV16) {

@@ -3611,10 +3609,10 @@ kernel void kernel_flash_attn_ext(
                                v8x8_t mv;

                                simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false);
                                simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]);
                                simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], vs, mv, lo[2*(ii + k) + 0]);

                                simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false);
                                simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]);
                                simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], vs, mv, lo[2*(ii + k) + 1]);
                            }
                        }
                    }
@@ -3624,93 +3622,89 @@ kernel void kernel_flash_attn_ext(
        }

        // these are needed for reducing the results from the simdgroups (reuse the ss buffer)
        for (short j = 0; j < Q; ++j) {
            if (tiisg == 0) {
                ss[j*TS + 0] = S[j];
                ss[j*TS + 1] = M[j];
            }
        for (short j = tiisg; j < Q; j += NW) {
            ss[j*TS + 0] = S[j];
            ss[j*TS + 1] = M[j];
        }
    }

    threadgroup_barrier(mem_flags::mem_threadgroup);

    threadgroup float  * so  = (threadgroup float  *) (shmem_f16 + 0*DK); // reuse query data for accumulation
    threadgroup float4 * so4 = (threadgroup float4 *) (shmem_f16 + 0*DK);

    // store result to shared memory in F32
    if (sgitg == 0) {
        for (short i = 0; i < DV8; ++i) {
          //simdgroup_store(lo[i], so + i*8, DV, 0, false);
            simdgroup_float8x8 t(1.0f);
            simdgroup_multiply(t, lo[i], t);
            simdgroup_store(t, so + i*8, DV, 0, false);
        }
    }

    threadgroup_barrier(mem_flags::mem_threadgroup);

    // reduce the warps sequentially
    for (ushort sg = 1; sg < nsg; ++sg) {
        float S = { 0.0f };
        float M = { -__FLT_MAX__/2 };

        threadgroup_barrier(mem_flags::mem_threadgroup);

        // each simdgroup stores its output to shared memory, reusing sq
        if (sgitg == sg) {
            for (short i = 0; i < DV8; ++i) {
                simdgroup_store(lo[i], so + i*8, DV, 0, false);
            for (short j = tiisg; j < Q; j += NW) {
                const float S0 = ss[j*TS - 1*SH + 0];
                const float S1 = ss[j*TS        + 0];

                const float M0 = ss[j*TS - 1*SH + 1];
                const float M1 = ss[j*TS        + 1];

                const float M = max(M0, M1);

                float ms0 = exp(M0 - M);
                float ms1 = exp(M1 - M);

                const float S = S0*ms0 + S1*ms1;

                ss[j*TS + 0] = S;
                ss[j*TS + 1] = M;

                ss[j*TS + 2*C + j - 1*SH] = ms0;
                ss[j*TS + 2*C + j       ] = ms1;
            }
        }

        threadgroup_barrier(mem_flags::mem_threadgroup);

        // the first simdgroup accumulates the results from the other simdgroups
        if (sgitg == 0) {
            for (short j = 0; j < Q; ++j) {
                const float S0 = ss[j*TS +         0];
                const float S1 = ss[j*TS + sg*SH + 0];

                const float M0 = ss[j*TS +         1];
                const float M1 = ss[j*TS + sg*SH + 1];

                M = max(M0, M1);

                const float ms0 = exp(M0 - M);
                const float ms1 = exp(M1 - M);

                S = S0*ms0 + S1*ms1;

                if (tiisg == 0) {
                    ss[j*TS + 0] = S;
                    ss[j*TS + 1] = M;

                    ss[j*TS + 2*C + j        ] = ms0;
                    ss[j*TS + 2*C + j + sg*SH] = ms1;
                }
            }
          //simdgroup_barrier(mem_flags::mem_threadgroup);

            // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
            {
                s8x8_t ms0;
                s8x8_t ms1;

                simdgroup_load(ms0, ss + 2*C,         TS, 0, false);
                simdgroup_load(ms1, ss + 2*C + sg*SH, TS, 0, false);
                simdgroup_load(ms0, ss + 2*C - 1*SH, TS, 0, false);
                simdgroup_load(ms1, ss + 2*C,        TS, 0, false);

#pragma unroll(DV8)
                for (short i = 0; i < DV8; ++i) {
                    o8x8_t t;
                    simdgroup_float8x8 t;

                    simdgroup_load    (t, so + i*8, DV, 0, false);
                    simdgroup_multiply(t, ms1, t);
                    simdgroup_multiply(t, ms0, t);

                    simdgroup_multiply_accumulate(lo[i], ms0, lo[i], t);
                    simdgroup_multiply_accumulate(t, ms1, lo[i], t);
                    simdgroup_store(t, so + i*8, DV, 0, false);
                }
            }
        }

        threadgroup_barrier(mem_flags::mem_threadgroup);
    }

    // store result to shared memory (reuse sq)
    if (sgitg == 0) {
        for (short i = 0; i < DV8; ++i) {
            simdgroup_store(lo[i], so + i*8, DV, 0, false);
        }
    }

    device float4 * dst4 = (device float4 *) dst;
    threadgroup s_t * sf = (threadgroup s_t *) (shmem_f16 + 2*(nsg-1)*SH + 2*Q*DK);

    // final rescale with 1/S and store to global memory
    if (sgitg == 0) {
        for (short j = 0; j < Q && iq1 + j < args.ne01; ++j) {
            const float S = ss[j*TS + 0];
    for (short j = sgitg; j < Q && iq1 + j < args.ne01; j += nsg) {
        const float S = 1.0f/sf[j*TS + 0];

            for (short i = tiisg; i < DV4; i += NW) {
                dst4[((uint64_t)iq3*args.ne2*args.ne1 + iq2 + (uint64_t)(iq1 + j)*args.ne1)*DV4 + i] = (float4) so4[j*DV4 + i]/S;
            }
        device float4 * dst4 = (device float4 *) dst + ((uint64_t)iq3*args.ne2*args.ne1 + iq2 + (uint64_t)(iq1 + j)*args.ne1)*DV4;

        for (short i = tiisg; i < DV4; i += NW) {
            dst4[i] = (float4) so4[j*DV4 + i]*S;
        }
    }
    }
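The rewritten reduction merges partial softmax states pairwise: each simdgroup combines its (row sum S, row max M) with its predecessor's, producing the scales ms0/ms1 that are then applied to the partial output matrices. A compact C++ sketch of the merge rule; the struct names are illustrative:

#include <algorithm>
#include <cmath>

struct SoftmaxState { float S; float M; }; // partial row sum and row max
struct MergeResult  { SoftmaxState merged; float ms0; float ms1; };

// Combine two partial softmax accumulators; ms0/ms1 are the factors by which
// the corresponding partial outputs must be rescaled (O = ms0*O0 + ms1*O1).
MergeResult merge_softmax(SoftmaxState a, SoftmaxState b) {
    const float M   = std::max(a.M, b.M);
    const float ms0 = std::exp(a.M - M);
    const float ms1 = std::exp(b.M - M);
    return { { a.S*ms0 + b.S*ms1, M }, ms0, ms1 };
}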
@@ -3719,12 +3713,22 @@ kernel void kernel_flash_attn_ext(
// template to be able to explore different combinations
//
#define FA_TYPES \
    half,  half4,   simdgroup_half8x8,  \
    half,  half4x4, simdgroup_half8x8,  \
    half,  half4x4, simdgroup_half8x8,  \
    float,          simdgroup_float8x8, \
    float,          simdgroup_float8x8, \
    half,  half4,   simdgroup_half8x8
    float, float4,  simdgroup_float8x8, \
    half,  half4x4, simdgroup_half8x8,  \
    half,  half4x4, simdgroup_half8x8,  \
    float,          simdgroup_float8x8, \
    float,          simdgroup_float8x8, \
    half,  half4,   simdgroup_half8x8
  //float, float4,  simdgroup_float8x8

#define FA_TYPES_BF \
    bfloat, bfloat4,   simdgroup_bfloat8x8, \
    bfloat, bfloat4x4, simdgroup_bfloat8x8, \
    bfloat, bfloat4x4, simdgroup_bfloat8x8, \
    float,             simdgroup_float8x8,  \
    float,             simdgroup_float8x8,  \
    half,   half4,     simdgroup_half8x8
  //float,  float4,    simdgroup_float8x8

typedef decltype(kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 64, 64>) flash_attn_ext_t;

@@ -3739,15 +3743,15 @@ template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_f16_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 576, 512>;

#if defined(GGML_METAL_USE_BF16)
template [[host_name("kernel_flash_attn_ext_bf16_h64"  )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 64,  64>;
template [[host_name("kernel_flash_attn_ext_bf16_h80"  )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 80,  80>;
template [[host_name("kernel_flash_attn_ext_bf16_h96"  )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 96,  96>;
template [[host_name("kernel_flash_attn_ext_bf16_h112" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 112, 112>;
template [[host_name("kernel_flash_attn_ext_bf16_h128" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 128, 128>;
template [[host_name("kernel_flash_attn_ext_bf16_h192" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 192, 192>;
template [[host_name("kernel_flash_attn_ext_bf16_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 192, 128>;
template [[host_name("kernel_flash_attn_ext_bf16_h256" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 256, 256>;
template [[host_name("kernel_flash_attn_ext_bf16_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 576, 512>;
template [[host_name("kernel_flash_attn_ext_bf16_h64"  )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 64,  64>;
template [[host_name("kernel_flash_attn_ext_bf16_h80"  )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 80,  80>;
template [[host_name("kernel_flash_attn_ext_bf16_h96"  )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 96,  96>;
template [[host_name("kernel_flash_attn_ext_bf16_h112" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 112, 112>;
template [[host_name("kernel_flash_attn_ext_bf16_h128" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 128, 128>;
template [[host_name("kernel_flash_attn_ext_bf16_h192" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 192, 192>;
template [[host_name("kernel_flash_attn_ext_bf16_hk192_hv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 192, 128>;
template [[host_name("kernel_flash_attn_ext_bf16_h256" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 256, 256>;
template [[host_name("kernel_flash_attn_ext_bf16_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 576, 512>;
#endif

template [[host_name("kernel_flash_attn_ext_q4_0_h64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 64, 64>;

@@ -3801,6 +3805,7 @@ template [[host_name("kernel_flash_attn_ext_q8_0_h256")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q8_0_hk576_hv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 576, 512>;

#undef FA_TYPES
#undef FA_TYPES_BF

template<
    typename q4_t, // query types in shared memory

@@ -3847,12 +3852,12 @@ kernel void kernel_flash_attn_ext_vec(

    const short T = DK + nsg*SH; // shared memory size per query in (half)

  //threadgroup q_t   * sq  = (threadgroup q_t   *) (shmem_f16 +           0*DK); // holds the query data
    threadgroup q4_t  * sq4 = (threadgroup q4_t  *) (shmem_f16 +           0*DK); // same as above but in q4_t
    threadgroup s_t   * ss  = (threadgroup s_t   *) (shmem_f16 + sgitg*SH + Q*DK); // scratch buffer for attention
    threadgroup s4_t  * ss4 = (threadgroup s4_t  *) (shmem_f16 + sgitg*SH + Q*DK); // same as above but in s4_t
    threadgroup float * sm  = (threadgroup float *) (shmem_f16 + sgitg*SH + 2*C + Q*DK); // scratch buffer for mask
    threadgroup o4_t  * sr4 = (threadgroup o4_t  *) (shmem_f16 + sgitg*DV + Q*T); // scratch buffer for the results
  //threadgroup q_t   * sq  = (threadgroup q_t   *) (shmem_f16 +             0*DK); // holds the query data
    threadgroup q4_t  * sq4 = (threadgroup q4_t  *) (shmem_f16 +             0*DK); // same as above but in q4_t
    threadgroup s_t   * ss  = (threadgroup s_t   *) (shmem_f16 + sgitg*SH   + Q*DK); // scratch buffer for attention
    threadgroup s4_t  * ss4 = (threadgroup s4_t  *) (shmem_f16 + sgitg*SH   + Q*DK); // same as above but in s4_t
    threadgroup float * sm  = (threadgroup float *) (shmem_f16 + sgitg*SH   + 2*C + Q*DK); // scratch buffer for mask
    threadgroup o4_t  * sr4 = (threadgroup o4_t  *) (shmem_f16 + 2*sgitg*DV + Q*T); // scratch buffer for the results

    // store the result for all queries in local memory (the O matrix from the paper)
    o4_t lo[DV4/NL];

@@ -4157,7 +4162,7 @@ kernel void kernel_flash_attn_ext_vec(
    half4, \
    float, \
    float, float4, \
    half4
    float4

typedef decltype(kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 128, 128, 4>) flash_attn_ext_vec_t;

@@ -80,6 +80,7 @@ set(GGML_OPENCL_KERNELS
    mul_mv_q4_0_f32_1d_8x_flat
    mul_mv_q4_0_f32_1d_16x_flat
    mul_mv_q6_k
    mul_mv_id_q4_0_f32_8x_flat
    mul
    norm
    relu

@@ -95,6 +96,12 @@ set(GGML_OPENCL_KERNELS
    sub
    sum_rows
    transpose
    concat
    tsembd
    upscale
    tanh
    pad
    repeat
)

foreach (K ${GGML_OPENCL_KERNELS})

@@ -315,6 +315,13 @@ struct ggml_backend_opencl_context {
    cl_program program_softmax_4_f16;
    cl_program program_argsort_f32_i32;
    cl_program program_sum_rows_f32;
    cl_program program_repeat;
    cl_program program_pad;
    cl_program program_tanh;
    cl_program program_upscale;
    cl_program program_concat;
    cl_program program_tsembd;
    cl_program program_mul_mv_id_q4_0_f32_8x_flat;

    cl_kernel kernel_add, kernel_add_row;
    cl_kernel kernel_mul, kernel_mul_row;

@@ -351,6 +358,16 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_im2col_f32, kernel_im2col_f16;
    cl_kernel kernel_argsort_f32_i32;
    cl_kernel kernel_sum_rows_f32;
    cl_kernel kernel_repeat;
    cl_kernel kernel_pad;
    cl_kernel kernel_tanh_f32_nd;
    cl_kernel kernel_tanh_f16_nd;
    cl_kernel kernel_upscale;
    cl_kernel kernel_upscale_bilinear;
    cl_kernel kernel_concat_f32_contiguous;
    cl_kernel kernel_concat_f32_non_contiguous;
    cl_kernel kernel_timestep_embedding;
    cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;

#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
    // Transpose kernels

@@ -1097,6 +1114,166 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
        GGML_LOG_CONT(".");
    }

    // repeat
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "repeat.cl.h"
        };
#else
        const std::string kernel_src = read_file("repeat.cl");
#endif
        if (!kernel_src.empty()) {
            backend_ctx->program_repeat =
                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
            CL_CHECK((backend_ctx->kernel_repeat = clCreateKernel(backend_ctx->program_repeat, "kernel_repeat", &err), err));
            GGML_LOG_CONT(".");
        } else {
            GGML_LOG_WARN("ggml_opencl: repeat kernel source not found or empty. Repeat operations will not be available.\n");
            backend_ctx->program_repeat = nullptr;
            backend_ctx->kernel_repeat = nullptr;
        }
    }

    // pad
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "pad.cl.h"
        };
#else
        const std::string kernel_src = read_file("pad.cl");
#endif
        if (!kernel_src.empty()) {
            backend_ctx->program_pad =
                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
            CL_CHECK((backend_ctx->kernel_pad = clCreateKernel(backend_ctx->program_pad, "kernel_pad", &err), err));
            GGML_LOG_CONT(".");
        } else {
            GGML_LOG_WARN("ggml_opencl: pad kernel source not found or empty. Pad operations will not be available.\n");
            backend_ctx->program_pad = nullptr;
            backend_ctx->kernel_pad = nullptr;
        }
    }

    // tanh
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "tanh.cl.h"
        };
#else
        const std::string kernel_src = read_file("tanh.cl");
#endif
        if (!kernel_src.empty()) {
            backend_ctx->program_tanh =
                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
            CL_CHECK((backend_ctx->kernel_tanh_f32_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f32_nd", &err), err));
            CL_CHECK((backend_ctx->kernel_tanh_f16_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f16_nd", &err), err));
            GGML_LOG_CONT(".");
        } else {
            GGML_LOG_WARN("ggml_opencl: tanh kernel source not found or empty. Tanh operation will not be available.\n");
            backend_ctx->program_tanh = nullptr;
            backend_ctx->kernel_tanh_f32_nd = nullptr;
            backend_ctx->kernel_tanh_f16_nd = nullptr;
        }
    }

    // upscale
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "upscale.cl.h"
        };
#else
        const std::string kernel_src = read_file("upscale.cl");
#endif
        if (!kernel_src.empty()) {
            backend_ctx->program_upscale =
                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
            CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err));
            if (backend_ctx->program_upscale) {
                cl_int err_bilinear;
                backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear);
                if (err_bilinear != CL_SUCCESS) {
                    GGML_LOG_WARN("ggml_opencl: kernel_upscale_bilinear not found in upscale.cl. Bilinear upscale will not be available. Error: %d\n", err_bilinear);
                    backend_ctx->kernel_upscale_bilinear = nullptr;
                }
            } else {
                backend_ctx->kernel_upscale_bilinear = nullptr;
            }
            GGML_LOG_CONT(".");
        } else {
            GGML_LOG_WARN("ggml_opencl: upscale kernel source not found or empty. Upscale operations will not be available.\n");
            backend_ctx->program_upscale = nullptr;
            backend_ctx->kernel_upscale = nullptr;
            backend_ctx->kernel_upscale_bilinear = nullptr;
        }
    }

    // concat
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "concat.cl.h"
        };
#else
        const std::string kernel_src = read_file("concat.cl");
#endif
        if (!kernel_src.empty()) {
            backend_ctx->program_concat =
                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

            CL_CHECK((backend_ctx->kernel_concat_f32_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_contiguous", &err), err));
            CL_CHECK((backend_ctx->kernel_concat_f32_non_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_non_contiguous", &err), err));
            GGML_LOG_CONT(".");
        } else {
            GGML_LOG_WARN("ggml_opencl: concat kernel source not found or empty. Concat operations will not be available.\n");
            backend_ctx->program_concat = nullptr;
            backend_ctx->kernel_concat_f32_contiguous = nullptr;
            backend_ctx->kernel_concat_f32_non_contiguous = nullptr;
        }
    }

    // timestep_embedding
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "tsembd.cl.h"
        };
#else
        const std::string kernel_src = read_file("tsembd.cl");
#endif
        if (!kernel_src.empty()) {
            backend_ctx->program_tsembd =
                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
            CL_CHECK((backend_ctx->kernel_timestep_embedding = clCreateKernel(backend_ctx->program_tsembd, "kernel_timestep_embedding", &err), err));
            GGML_LOG_CONT(".");
        } else {
            GGML_LOG_WARN("ggml_opencl: timestep_embedding kernel source not found or empty. This op will not be available.\n");
            backend_ctx->program_tsembd = nullptr;
            backend_ctx->kernel_timestep_embedding = nullptr;
        }
    }

    // mul_mv_id_q4_0_f32_8x_flat
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_id_q4_0_f32_8x_flat.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_id_q4_0_f32_8x_flat.cl");
#endif
        backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat, "kernel_mul_mv_id_q4_0_f32_8x_flat", &err), err));
        GGML_LOG_CONT(".");
    }

    // Adreno kernels
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
    // transpose
@@ -1863,7 +2040,12 @@ static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t backend, const g
}

static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
    GGML_UNUSED(backend);
    auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);

    cl_event evt;
    CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, 0, nullptr, &evt));
    CL_CHECK(clWaitForEvents(1, &evt));
    CL_CHECK(clReleaseEvent(evt));
}

// Synchronizes the 'backend_ctx's device with others so that commands
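The synchronize hook above waits for everything already enqueued by inserting a barrier that emits an event and blocking on it. The same pattern as a standalone helper; error checks are omitted for brevity, and all three calls are standard OpenCL 1.2 API:

#include <CL/cl.h>

// Block the host until all commands previously enqueued on `queue` complete.
void opencl_queue_synchronize(cl_command_queue queue) {
    cl_event evt;
    clEnqueueBarrierWithWaitList(queue, 0, NULL, &evt); // barrier over all prior commands
    clWaitForEvents(1, &evt);                           // host-side wait
    clReleaseEvent(evt);
}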
@@ -1976,9 +2158,12 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
            case GGML_UNARY_OP_SILU:
            case GGML_UNARY_OP_RELU:
            case GGML_UNARY_OP_GELU_QUICK:
                return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
            case GGML_UNARY_OP_SIGMOID:
                return ggml_is_contiguous(op->src[0]);
            case GGML_UNARY_OP_TANH:
                return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
                       (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
            default:
                return false;
        }

@@ -1988,6 +2173,17 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
        case GGML_OP_NORM:
        case GGML_OP_RMS_NORM:
            return true;
        case GGML_OP_REPEAT:
            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
        case GGML_OP_PAD:
            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
                   op->src[0]->ne[3] == 1 && op->ne[3] == 1;
        case GGML_OP_UPSCALE:
            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
        case GGML_OP_CONCAT:
            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
        case GGML_OP_TIMESTEP_EMBEDDING:
            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
        case GGML_OP_GROUP_NORM:
            return ggml_is_contiguous(op->src[0]);
        case GGML_OP_MUL_MAT:

@@ -2000,6 +2196,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
            }
            return false;
        case GGML_OP_MUL_MAT_ID:
            if (op->src[0]->type == GGML_TYPE_Q4_0) {
                if (op->src[1]->type == GGML_TYPE_F32) {
                    return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
                }
            }
            return false;
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:

@@ -2052,7 +2255,7 @@ static ggml_backend_i ggml_backend_opencl_i = {
    /* .set_tensor_async        = */ NULL,  /* ggml_backend_opencl_set_tensor_async */
    /* .get_tensor_async        = */ NULL,  /* ggml_backend_opencl_get_tensor_async */
    /* .cpy_tensor_async        = */ NULL,  /* ggml_backend_opencl_cpy_tensor_async */
    /* .synchronize             = */ NULL,  /* ggml_backend_opencl_synchronize */
    /* .synchronize             = */ ggml_backend_opencl_synchronize,
    /* .graph_plan_create       = */ NULL,
    /* .graph_plan_free         = */ NULL,
    /* .graph_plan_update       = */ NULL,
@@ -4108,6 +4311,536 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
UNUSED(src1);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0_abs = extra0->offset + src0->view_offs;
|
||||
cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
|
||||
|
||||
cl_kernel kernel;
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_tanh_f32_nd;
|
||||
} else if (dst->type == GGML_TYPE_F16) {
|
||||
kernel = backend_ctx->kernel_tanh_f16_nd;
|
||||
} else {
|
||||
GGML_ASSERT(false && "Unsupported type for ggml_cl_tanh");
|
||||
}
|
||||
GGML_ASSERT(kernel != nullptr);
|
||||
|
||||
const int ne00 = src0->ne[0]; const int ne01 = src0->ne[1]; const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3];
|
||||
const cl_ulong nb00 = src0->nb[0]; const cl_ulong nb01 = src0->nb[1]; const cl_ulong nb02 = src0->nb[2]; const cl_ulong nb03 = src0->nb[3];
|
||||
|
||||
const int ne10 = dst->ne[0]; const int ne11 = dst->ne[1]; const int ne12 = dst->ne[2]; const int ne13 = dst->ne[3];
|
||||
const cl_ulong nb10 = dst->nb[0]; const cl_ulong nb11 = dst->nb[1]; const cl_ulong nb12 = dst->nb[2]; const cl_ulong nb13 = dst->nb[3];
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
|
||||
|
||||
size_t global_work_size[3];
|
||||
if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
|
||||
return;
|
||||
}
|
||||
global_work_size[0] = (size_t)ne10;
|
||||
global_work_size[1] = (size_t)ne11;
|
||||
global_work_size[2] = (size_t)ne12;
|
||||
|
||||
size_t lws0 = 16, lws1 = 4, lws2 = 1;
|
||||
if (ne10 < 16) lws0 = ne10;
|
||||
if (ne11 < 4) lws1 = ne11;
|
||||
if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
|
||||
|
||||
while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
|
||||
while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
|
||||
while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
|
||||
|
||||
|
||||
size_t local_work_size[] = {lws0, lws1, lws2};
|
||||
|
||||
size_t* local_work_size_ptr = local_work_size;
|
||||
if (!backend_ctx->non_uniform_workgroups) {
|
||||
if (global_work_size[0] % local_work_size[0] != 0 ||
|
||||
global_work_size[1] % local_work_size[1] != 0 ||
|
||||
global_work_size[2] % local_work_size[2] != 0) {
|
||||
local_work_size_ptr = NULL;
|
||||
}
|
||||
}
|
||||
if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
|
||||
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
GGML_ASSERT(dst->type == src0->type);
|
||||
|
||||
UNUSED(src1_shape_def);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
if (backend_ctx->kernel_repeat == nullptr) {
|
||||
GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
|
||||
cl_ulong off_dst = extra_dst->offset + dst->view_offs;
|
||||
|
||||
const int src0_ne0 = src0->ne[0]; const int src0_ne1 = src0->ne[1]; const int src0_ne2 = src0->ne[2]; const int src0_ne3 = src0->ne[3];
|
||||
const cl_ulong src0_nb0 = src0->nb[0]; const cl_ulong src0_nb1 = src0->nb[1]; const cl_ulong src0_nb2 = src0->nb[2]; const cl_ulong src0_nb3 = src0->nb[3];
|
||||
|
||||
const int dst_ne0 = dst->ne[0]; const int dst_ne1 = dst->ne[1]; const int dst_ne2 = dst->ne[2]; const int dst_ne3 = dst->ne[3];
|
||||
const cl_ulong dst_nb0 = dst->nb[0]; const cl_ulong dst_nb1 = dst->nb[1]; const cl_ulong dst_nb2 = dst->nb[2]; const cl_ulong dst_nb3 = dst->nb[3];
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_repeat;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra_dst->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &off_src0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &src0_ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &src0_ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &src0_ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &src0_ne3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &src0_nb0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &src0_nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &src0_nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &src0_nb3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &dst_ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &dst_ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &dst_ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dst_ne3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &dst_nb0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &dst_nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &dst_nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &dst_nb3));
|
||||
|
||||
size_t gws0 = dst_ne1 > 0 ? (size_t)dst_ne1 : 1;
|
||||
size_t gws1 = dst_ne2 > 0 ? (size_t)dst_ne2 : 1;
|
||||
size_t gws2 = dst_ne3 > 0 ? (size_t)dst_ne3 : 1;
|
||||
|
||||
size_t global_work_size[] = { gws0, gws1, gws2 };
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, (size_t[3]){0,0,0}, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
if (backend_ctx->kernel_pad == nullptr) {
|
||||
GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
|
||||
cl_ulong off_dst = extra_dst->offset + dst->view_offs;
|
||||
|
||||
const int s_ne0 = src0->ne[0];
|
||||
const int s_ne1 = src0->ne[1];
|
||||
const int s_ne2 = src0->ne[2];
|
||||
|
||||
const int d_ne0 = dst->ne[0];
|
||||
const int d_ne1 = dst->ne[1];
|
||||
const int d_ne2 = dst->ne[2];
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_pad;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &s_ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &s_ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &s_ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne2));
|
||||
|
||||
size_t lws0 = 64;
|
||||
size_t gws0 = (( (size_t)d_ne0 + lws0 - 1 ) / lws0) * lws0;
|
||||
|
||||
size_t global_work_size[] = { gws0, (size_t)d_ne1, (size_t)d_ne2 };
|
||||
size_t local_work_size[] = { lws0, 1, 1 };
|
||||
|
||||
size_t * local_work_size_ptr = local_work_size;
|
||||
if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
local_work_size_ptr = nullptr;
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
    cl_command_queue queue = backend_ctx->queue;

    const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
    cl_kernel kernel = nullptr;

    if (mode == GGML_SCALE_MODE_NEAREST) {
        kernel = backend_ctx->kernel_upscale;
        if (kernel == nullptr) {
            GGML_LOG_WARN("%s: nearest upscale kernel not available, skipping OpenCL execution.\n", __func__);
            return;
        }
    } else if (mode == GGML_SCALE_MODE_BILINEAR) {
        kernel = backend_ctx->kernel_upscale_bilinear;
        if (kernel == nullptr) {
            GGML_LOG_WARN("%s: bilinear upscale kernel not available, skipping OpenCL execution.\n", __func__);
            return;
        }
    } else {
        GGML_LOG_WARN("%s: unsupported upscale mode %d, skipping OpenCL execution.\n", __func__, mode);
        return;
    }

    ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra_dst  = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
    cl_ulong off_dst  = extra_dst->offset + dst->view_offs;

    const cl_ulong nb00 = src0->nb[0];
    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

    const int ne00_src = src0->ne[0];
    const int ne01_src = src0->ne[1];

    const int ne10_dst = dst->ne[0];
    const int ne11_dst = dst->ne[1];
    const int ne12_dst = dst->ne[2];
    const int ne13_dst = dst->ne[3];

    const float sf0 = (float)dst->ne[0] / src0->ne[0];
    const float sf1 = (float)dst->ne[1] / src0->ne[1];
    const float sf2 = (float)dst->ne[2] / src0->ne[2];
    const float sf3 = (float)dst->ne[3] / src0->ne[3];

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra_src0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extra_dst->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &nb00));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb03));

    if (mode == GGML_SCALE_MODE_NEAREST) {
        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),   &ne10_dst));
        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),   &ne11_dst));
        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),   &ne12_dst));
        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),   &ne13_dst));
        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &sf0));
        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &sf1));
        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf2));
        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3));
    } else if (mode == GGML_SCALE_MODE_BILINEAR) {
        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),   &ne00_src));
        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),   &ne01_src));
        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),   &ne10_dst));
        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),   &ne11_dst));
        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),   &ne12_dst));
        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),   &ne13_dst));
        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf0));
        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf1));
        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(float), &sf2));
        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(float), &sf3));
    }

    size_t dst_total_elements = (size_t)ne10_dst * ne11_dst * ne12_dst * ne13_dst;
    if (dst_total_elements == 0) {
        return;
    }
    size_t global_work_size[] = { dst_total_elements, 1, 1 };
    size_t local_work_size_pref = 256;
    size_t local_work_size[] = { MIN(local_work_size_pref, dst_total_elements), 1, 1};

    size_t * local_work_size_ptr = local_work_size;
    if (dst_total_elements % local_work_size[0] != 0 && !backend_ctx->non_uniform_workgroups) {
        local_work_size_ptr = nullptr;
    }

#ifdef GGML_OPENCL_PROFILING
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

    g_profiling_info.emplace_back();
    size_t profiling_gws[3] = {global_work_size[0], 1, 1};
    size_t profiling_lws[3] = {local_work_size_ptr ? local_work_size[0] : 0, 1, 1};
    populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
#else
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
#endif
}
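
The scale factors sf0..sf3 above are simply dst->ne[i] / src0->ne[i]; in nearest mode each destination coordinate maps back to a source coordinate by dividing out the corresponding factor. A minimal CPU sketch of that mapping, assuming floor(i/sf) nearest-neighbour semantics (the helper name is hypothetical, not part of the patch):

    #include <cstddef>

    // Hypothetical CPU reference for GGML_SCALE_MODE_NEAREST on dense f32
    // tensors with extents sne[] (source) and dne[] (destination).
    static void upscale_nearest_ref(const float * src, float * dst,
                                    const int sne[4], const int dne[4]) {
        const float sf[4] = {
            (float)dne[0]/sne[0], (float)dne[1]/sne[1],
            (float)dne[2]/sne[2], (float)dne[3]/sne[3],
        };
        for (int i3 = 0; i3 < dne[3]; ++i3)
        for (int i2 = 0; i2 < dne[2]; ++i2)
        for (int i1 = 0; i1 < dne[1]; ++i1)
        for (int i0 = 0; i0 < dne[0]; ++i0) {
            const int s0 = (int)(i0/sf[0]), s1 = (int)(i1/sf[1]);
            const int s2 = (int)(i2/sf[2]), s3 = (int)(i3/sf[3]);
            dst[((size_t)(i3*dne[2] + i2)*dne[1] + i1)*dne[0] + i0] =
                src[((size_t)(s3*sne[2] + s2)*sne[1] + s1)*sne[0] + s0];
        }
    }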

static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
    cl_command_queue queue = backend_ctx->queue;

    if (backend_ctx->kernel_concat_f32_contiguous == nullptr || backend_ctx->kernel_concat_f32_non_contiguous == nullptr) {
        GGML_LOG_WARN("%s: concat kernels not available, skipping OpenCL execution.\n", __func__);
        return;
    }

    ggml_tensor_extra_cl * extra0_cl = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1_cl = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad_cl = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong off_src0 = extra0_cl->offset + src0->view_offs;
    cl_ulong off_src1 = extra1_cl->offset + src1->view_offs;
    cl_ulong off_dst  = extrad_cl->offset + dst->view_offs;

    const int32_t dim = ((const int32_t *) dst->op_params)[0];
    GGML_ASSERT(dim >= 0 && dim <= 3);

    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
        if (dim == 3) {
            size_t nbytes_src0 = ggml_nbytes(src0);
            size_t nbytes_src1 = ggml_nbytes(src1);

            CL_CHECK(clEnqueueCopyBuffer(queue, extra0_cl->data_device, extrad_cl->data_device,
                off_src0, off_dst, nbytes_src0, 0, NULL, NULL));
            CL_CHECK(clEnqueueCopyBuffer(queue, extra1_cl->data_device, extrad_cl->data_device,
                off_src1, off_dst + nbytes_src0, nbytes_src1, 0, NULL, NULL));
        } else {
            cl_kernel kernel = backend_ctx->kernel_concat_f32_contiguous;
            size_t global_work_size[3];

            for (int i3 = 0; i3 < dst->ne[3]; ++i3) {
                cl_ulong current_off_src0 = off_src0 + (i3 * src0->nb[3]);
                cl_ulong current_off_src1 = off_src1 + (i3 * src1->nb[3]);
                cl_ulong current_off_dst  = off_dst  + (i3 * dst->nb[3]);

                int d_ne00 = src0->ne[0]; int d_ne01 = src0->ne[1]; int d_ne02 = src0->ne[2];
                int d_ne10 = src1->ne[0]; int d_ne11 = src1->ne[1]; int d_ne12 = src1->ne[2];
                int d_ne0  = dst->ne[0];  int d_ne1  = dst->ne[1];  int d_ne2  = dst->ne[2];

                CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_cl->data_device));
                CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &current_off_src0));
                CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1_cl->data_device));
                CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &current_off_src1));
                CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad_cl->data_device));
                CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &current_off_dst));
                CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &d_ne00));
                CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &d_ne01));
                CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &d_ne02));
                CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &d_ne10));
                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &d_ne11));
                CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &d_ne12));
                CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &d_ne0));
                CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &d_ne1));
                CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &d_ne2));
                CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &dim));

                global_work_size[0] = d_ne0;
                global_work_size[1] = d_ne1;
                global_work_size[2] = d_ne2;

                CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL));
            }
        }
    } else {
        cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous;

        long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
        cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3];

        cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3];

        long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3];
        cl_ulong d_nb0 = dst->nb[0], d_nb1 = dst->nb[1], d_nb2 = dst->nb[2], d_nb3 = dst->nb[3];

        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_cl->data_device));
        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &off_src0));
        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1_cl->data_device));
        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &off_src1));
        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad_cl->data_device));
        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &off_dst));

        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(long),     &ne00));
        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(long),     &ne01));
        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(long),     &ne02));
        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(long),     &ne03));
        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));

        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));

        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(long),     &d_ne0));
        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(long),     &d_ne1));
        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(long),     &d_ne2));
        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(long),     &d_ne3));
        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &d_nb0));
        CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &d_nb1));
        CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong), &d_nb2));
        CL_CHECK(clSetKernelArg(kernel, 25, sizeof(cl_ulong), &d_nb3));
        CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int),      &dim));

        size_t global_work_size_nc[] = { d_ne1 > 0 ? (size_t)d_ne1 : 1,
                                         d_ne2 > 0 ? (size_t)d_ne2 : 1,
                                         d_ne3 > 0 ? (size_t)d_ne3 : 1 };

        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size_nc, NULL, 0, NULL, NULL));
    }
}
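
For contiguous tensors concatenated along dim 3, the two inputs occupy disjoint, back-to-back byte ranges of the output, because ne[3] is the slowest-varying dimension in ggml's layout; that is why two plain clEnqueueCopyBuffer calls suffice above. The equivalent CPU picture (illustrative only):

    #include <cstddef>
    #include <cstring>

    // Illustrative: dim-3 concat of two contiguous tensors is two block copies.
    static void concat_dim3_ref(const char * src0, size_t nbytes0,
                                const char * src1, size_t nbytes1, char * dst) {
        std::memcpy(dst,           src0, nbytes0); // src0 slab fills the front
        std::memcpy(dst + nbytes0, src1, nbytes1); // src1 slab starts right after
    }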

static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
    cl_command_queue queue = backend_ctx->queue;

    if (backend_ctx->kernel_timestep_embedding == nullptr) {
        GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__);
        return;
    }

    ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra_dst  = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
    cl_ulong off_dst  = extra_dst->offset + dst->view_offs;

    const int logical_dim   = dst->op_params[0];
    const int max_period    = dst->op_params[1];
    const int dst_nb1_bytes = dst->nb[1];

    cl_kernel kernel = backend_ctx->kernel_timestep_embedding;

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra_src0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extra_dst->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &dst_nb1_bytes));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &logical_dim));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &max_period));

    size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1);

    size_t gws1 = (size_t)src0->ne[0];

    size_t global_work_size[] = {gws0, gws1, 1};

#ifdef GGML_OPENCL_PROFILING
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, &evt)); // work_dim = 2 for this 2D problem

    g_profiling_info.emplace_back();
    size_t profiling_gws[3] = {global_work_size[0], global_work_size[1], 1};
    size_t profiling_lws[3] = {0,0,0}; // reflects the NULL local work size
    populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
#else
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL)); // work_dim = 2 for this 2D problem
#endif
}
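
Each work-item j < dim/2 of the kernel evaluates one frequency of the standard sinusoidal timestep embedding, freq_j = max_period^(-j / (dim/2)), writing cos(t*freq_j) into the first half of the row and sin(t*freq_j) into the second. A CPU sketch of one output row (hypothetical helper; mirrors the kernel's odd-dim zero pad):

    #include <cmath>

    // Hypothetical CPU reference for one timestep t; `row` holds `dim` floats,
    // plus one trailing zero when dim is odd, matching the kernel's layout.
    static void timestep_embedding_row_ref(float t, int dim, int max_period, float * row) {
        const int half = dim / 2;
        for (int j = 0; j < half; ++j) {
            const float freq = std::exp(-std::log((float)max_period) * j / half);
            row[j]        = std::cos(t * freq);
            row[j + half] = std::sin(t * freq);
        }
        if (dim % 2 != 0) {
            row[dim] = 0.0f;
        }
    }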

static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
@@ -4828,6 +5561,136 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
    }
}

static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    const ggml_tensor * src2 = dst->src[2];
    GGML_ASSERT(src2);
    GGML_ASSERT(src2->extra);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
    cl_command_queue queue = backend_ctx->queue;

    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offset2 = extra2->offset + src2->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

#ifdef GGML_OPENCL_SOA_Q
    ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
#endif

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];

    const cl_ulong nb00 = src0->nb[0];
    const cl_ulong nb02 = src0->nb[2];

    const int ne10 = src1->ne[0];
    const int ne11 = src1->ne[1];
    const int ne12 = src1->ne[2];
    const int ne13 = src1->ne[3];

    const cl_ulong nb11 = src1->nb[1];
    const cl_ulong nb12 = src1->nb[2];

    const int ne20 = src2->ne[0];
    const int ne21 = src2->ne[1];

    const cl_ulong nb21 = src2->nb[1];

    const int ne0 = dst->ne[0];
    const int ne1 = dst->ne[1];

    const int r2 = ne12/ne02;
    const int r3 = ne13/ne03;
    const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows

    GGML_ASSERT(ne00 == ne10);

    int sgs   = 32; // subgroup size
    int nsg   = 1;  // number of subgroups
    int nrows = 1;  // number of rows in src1
    int ndst  = 4;  // number of values produced by each subgroup

    cl_kernel kernel;

    // subgroup mat vec
    switch (src0->type) {
        case GGML_TYPE_Q4_0: {
            kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;

            if (backend_ctx->gpu_family == INTEL) {
                sgs  = 16;
                nsg  = 1;
                ndst = 8;
            } else if (backend_ctx->gpu_family == ADRENO) {
                sgs  = 64;
                nsg  = 1;
                ndst = 8;
            } else {
                GGML_ASSERT(false && "TODO: Unknown GPU");
            }

            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q4_0->q));
            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q4_0->d));
            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extra2->data_device));
            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset2));
            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem),   &extrad->data_device));
            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne00));
            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne01));
            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne02));
            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00));
            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne10));
            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne11));
            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne12));
            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11));
            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12));
            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &ne20));
            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int),      &ne21));
            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21));
            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int),      &ne0));
            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &ne1));
            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &r2));
            CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int),      &r3));

            break;
        }
        default:
            GGML_ASSERT(false && "not implemented");
    }

    int _ne1  = 1;
    int ne123 = dst_rows;

    size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
    size_t local_work_size[]  = {(size_t)sgs, (size_t)nsg, 1};

#ifdef GGML_OPENCL_PROFILING
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));

    g_profiling_info.emplace_back();
    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
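
The first work-size dimension rounds the ne01 weight rows up to groups of ndst*nsg and multiplies by the subgroup size, so each subgroup of sgs lanes produces ndst output rows; the third dimension launches one such grid per (row, used-expert) pair. A worked instance of the arithmetic, assuming the Intel configuration above and a hypothetical ne01:

    #include <cstddef>
    #include <cstdio>

    // Sketch of the dispatch arithmetic with sgs = 16, nsg = 1, ndst = 8 and a
    // hypothetical ne01 = 100: ceil(100/8) = 13 groups of 16 lanes = 208 items.
    int main() {
        const int sgs = 16, nsg = 1, ndst = 8, ne01 = 100;
        const size_t gws0 = (size_t)(ne01 + ndst*nsg - 1)/(ndst*nsg)*sgs;
        std::printf("gws0 = %zu\n", gws0); // prints 208
        return 0;
    }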

static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
@@ -5667,6 +6530,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
            }
            func = ggml_cl_sigmoid;
            break;
        case GGML_UNARY_OP_TANH:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_tanh;
            break;
        default:
            return false;
    } break;
@@ -5694,12 +6563,48 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
            }
            func = ggml_cl_group_norm;
            break;
        case GGML_OP_REPEAT:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_repeat;
            break;
        case GGML_OP_PAD:
            if (!any_on_device) {
                return false;
            }
            ggml_cl_pad(backend, tensor->src[0], tensor);
            return true;
        case GGML_OP_UPSCALE:
            if (!any_on_device) {
                return false;
            }
            ggml_cl_upscale(backend, tensor->src[0], tensor);
            return true;
        case GGML_OP_CONCAT:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_concat;
            break;
        case GGML_OP_TIMESTEP_EMBEDDING:
            if (!any_on_device) {
                return false;
            }
            ggml_cl_timestep_embedding(backend, tensor->src[0], tensor);
            return true;
        case GGML_OP_MUL_MAT:
            if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
                return false;
            }
            func = ggml_cl_mul_mat;
            break;
        case GGML_OP_MUL_MAT_ID:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_mul_mat_id;
            break;
        case GGML_OP_SCALE:
            if (!any_on_device) {
                return false;
109  ggml/src/ggml-opencl/kernels/concat.cl  Normal file
@@ -0,0 +1,109 @@
kernel void kernel_concat_f32_contiguous(
    global const char * p_src0, ulong off_src0,
    global const char * p_src1, ulong off_src1,
    global char * p_dst, ulong off_dst,
    int d_ne00, int d_ne01, int d_ne02, // src0->ne[0..2] for the slice
    int d_ne10, int d_ne11, int d_ne12, // src1->ne[0..2] for the slice (d_ne1X must match d_ne0X on non-concat axes)
    int d_ne0,  int d_ne1,  int d_ne2,  // dst->ne[0..2] for the slice
    int dim
) {
    global const float * src0 = (global const float*)((global char*)p_src0 + off_src0);
    global const float * src1 = (global const float*)((global char*)p_src1 + off_src1);
    global float * dst = (global float*)((global char*)p_dst + off_dst);

    int i0 = get_global_id(0); // index along dst's 0th dimension
    int i1 = get_global_id(1); // index along dst's 1st dimension
    int i2 = get_global_id(2); // index along dst's 2nd dimension

    if (i0 >= d_ne0 || i1 >= d_ne1 || i2 >= d_ne2) {
        return;
    }

    ulong dst_idx = (ulong)i2 * d_ne0 * d_ne1 + (ulong)i1 * d_ne0 + i0;
    ulong src_idx;

    if (dim == 0) {
        if (i0 < d_ne00) { // data from src0
            src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
            dst[dst_idx] = src0[src_idx];
        } else { // data from src1
            src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + (i0 - d_ne00);
            dst[dst_idx] = src1[src_idx];
        }
    } else if (dim == 1) {
        if (i1 < d_ne01) { // data from src0
            src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
            dst[dst_idx] = src0[src_idx];
        } else { // data from src1
            src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)(i1 - d_ne01) * d_ne10 + i0;
            dst[dst_idx] = src1[src_idx];
        }
    } else if (dim == 2) {
        if (i2 < d_ne02) { // data from src0
            src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
            dst[dst_idx] = src0[src_idx];
        } else { // data from src1
            src_idx = (ulong)(i2 - d_ne02) * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + i0;
            dst[dst_idx] = src1[src_idx];
        }
    }
}

kernel void kernel_concat_f32_non_contiguous(
    global const char * p_src0, ulong off_src0,
    global const char * p_src1, ulong off_src1,
    global char * p_dst, ulong off_dst,

    long ne00, long ne01, long ne02, long ne03,
    ulong nb00, ulong nb01, ulong nb02, ulong nb03,

    ulong nb10, ulong nb11, ulong nb12, ulong nb13, // strides for src1

    long d_ne0, long d_ne1, long d_ne2, long d_ne3,
    ulong d_nb0, ulong d_nb1, ulong d_nb2, ulong d_nb3,
    int dim
) {
    global const char * src0_base = p_src0 + off_src0;
    global const char * src1_base = p_src1 + off_src1;
    global char * dst_base = p_dst + off_dst;

    long current_i1 = get_global_id(0); // index for dst dim 1
    long current_i2 = get_global_id(1); // index for dst dim 2
    long current_i3 = get_global_id(2); // index for dst dim 3

    if (current_i1 >= d_ne1 || current_i2 >= d_ne2 || current_i3 >= d_ne3) {
        return;
    }

    global const float * x_val_ptr;
    global float * y_val_ptr;

    for (long current_i0 = 0; current_i0 < d_ne0; ++current_i0) {
        bool use_src0;
        long s_i0 = current_i0, s_i1 = current_i1, s_i2 = current_i2, s_i3 = current_i3;

        if (dim == 0) {
            use_src0 = (current_i0 < ne00);
            if (!use_src0) { s_i0 = current_i0 - ne00; }
        } else if (dim == 1) {
            use_src0 = (current_i1 < ne01);
            if (!use_src0) { s_i1 = current_i1 - ne01; }
        } else if (dim == 2) {
            use_src0 = (current_i2 < ne02);
            if (!use_src0) { s_i2 = current_i2 - ne02; }
        } else { // dim == 3
            use_src0 = (current_i3 < ne03);
            if (!use_src0) { s_i3 = current_i3 - ne03; }
        }

        if (use_src0) {
            x_val_ptr = (global const float *)(src0_base + (ulong)s_i3*nb03 + (ulong)s_i2*nb02 + (ulong)s_i1*nb01 + (ulong)s_i0*nb00);
        } else {
            x_val_ptr = (global const float *)(src1_base + (ulong)s_i3*nb13 + (ulong)s_i2*nb12 + (ulong)s_i1*nb11 + (ulong)s_i0*nb10);
        }

        y_val_ptr = (global float *)(dst_base + (ulong)current_i3*d_nb3 + (ulong)current_i2*d_nb2 + (ulong)current_i1*d_nb1 + (ulong)current_i0*d_nb0);
        *y_val_ptr = *x_val_ptr;
    }
}
283  ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl  Normal file
@@ -0,0 +1,283 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif

#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif

#define QK4_0 32

typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;

//------------------------------------------------------------------------------
// block_q4_0
//------------------------------------------------------------------------------
struct block_q4_0
{
    half d;
    uint8_t qs[QK4_0 / 2];
};

// This function requires the original shuffled weights.
// As a reminder, the original weights are shuffled so that (q[0], q[16]) are
// packed together in a byte, so are (q[1], q[17]) and so on.
inline float block_q_4_0_dot_y_flat(
    global uchar * x,
    global half  * dh,
    float sumy,
    float16 yl,
    int il
) {
    float d = *dh;
    global ushort * qs = ((global ushort *)x + il/2);
    float acc = 0.f;

    acc += yl.s0 * (qs[0] & 0x000F);
    acc += yl.s1 * (qs[0] & 0x0F00);
    acc += yl.s8 * (qs[0] & 0x00F0);
    acc += yl.s9 * (qs[0] & 0xF000);

    acc += yl.s2 * (qs[1] & 0x000F);
    acc += yl.s3 * (qs[1] & 0x0F00);
    acc += yl.sa * (qs[1] & 0x00F0);
    acc += yl.sb * (qs[1] & 0xF000);

    acc += yl.s4 * (qs[2] & 0x000F);
    acc += yl.s5 * (qs[2] & 0x0F00);
    acc += yl.sc * (qs[2] & 0x00F0);
    acc += yl.sd * (qs[2] & 0xF000);

    acc += yl.s6 * (qs[3] & 0x000F);
    acc += yl.s7 * (qs[3] & 0x0F00);
    acc += yl.se * (qs[3] & 0x00F0);
    acc += yl.sf * (qs[3] & 0xF000);

    return d * (sumy * -8.f + acc);
}
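
The masks above extract nibbles from 16-bit words without shifting; the matching activations in yl are pre-divided by 16, 256, or 4096 (see the loop further down) so the implicit shifts cancel, and the uniform -8 quantization offset is applied once through sumy, giving d * (acc - 8 * sumy). A plain CPU dequantizer for this shuffled q4_0 layout (illustrative sketch; the helper name is not part of the patch):

    #include <cstdint>

    // Byte k of a shuffled q4_0 block packs q[k] in its low nibble and q[k+16]
    // in its high nibble; each weight dequantizes to (nibble - 8) * d.
    static void dequant_q4_0_shuffled(const uint8_t qs[16], float d, float out[32]) {
        for (int k = 0; k < 16; ++k) {
            out[k]      = (float)((qs[k] & 0x0F) - 8) * d; // low nibble  -> weight k
            out[k + 16] = (float)((qs[k] >>   4) - 8) * d; // high nibble -> weight k+16
        }
    }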

//
// This variant outputs 8 values.
//
#undef N_DST
#undef N_SIMDGROUP
#undef N_SIMDWIDTH

#ifdef INTEL_GPU
#define N_DST 8        // each SIMD group works on 8 rows
#define N_SIMDGROUP 1  // number of SIMD groups in a thread group
#define N_SIMDWIDTH 16 // subgroup size
#elif defined (ADRENO_GPU)
#define N_DST 8
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 64
#endif

inline void mul_vec_q_n_f32_8x_flat(
    global char  * src0_q,
    global half  * src0_d,
    global float * src1,
    global float * dst,
    int ne00,
    int ne01,
    int ne02,
    int ne10,
    int ne12,
    int ne0,
    int ne1,
    int r2,
    int r3
) {
    const ulong nb = ne00/QK4_0;

    int r0 = get_group_id(0);
    int r1 = get_group_id(1);
    int im = 0;

    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;

    int i12 = im%ne12;
    int i13 = im/ne12;

    // The number of scales is the same as the number of blocks.
    ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
    ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;

    global uchar * x = (global uchar *) src0_q + offset0_q;
    global half  * d = (global half  *) src0_d + offset0_d;
    global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;

    float16 yl;
    float8 sumf = 0.f;

    int ix = get_sub_group_local_id()/2;
    int il = 8*(get_sub_group_local_id()%2);

    global float * yb = y + ix*QK4_0 + il;

    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
        float sumy = 0.f;

        sumy += yb[0];
        sumy += yb[1];
        sumy += yb[2];
        sumy += yb[3];
        sumy += yb[4];
        sumy += yb[5];
        sumy += yb[6];
        sumy += yb[7];

        sumy += yb[16];
        sumy += yb[17];
        sumy += yb[18];
        sumy += yb[19];
        sumy += yb[20];
        sumy += yb[21];
        sumy += yb[22];
        sumy += yb[23];

        yl.s0 = yb[0];
        yl.s1 = yb[1]/256.f;

        yl.s2 = yb[2];
        yl.s3 = yb[3]/256.f;

        yl.s4 = yb[4];
        yl.s5 = yb[5]/256.f;

        yl.s6 = yb[6];
        yl.s7 = yb[7]/256.f;

        yl.s8 = yb[16]/16.f;
        yl.s9 = yb[17]/4096.f;

        yl.sa = yb[18]/16.f;
        yl.sb = yb[19]/4096.f;

        yl.sc = yb[20]/16.f;
        yl.sd = yb[21]/4096.f;

        yl.se = yb[22]/16.f;
        yl.sf = yb[23]/4096.f;

        sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
        sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
        sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
        sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);

        sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
        sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
        sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
        sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);

        yb += QK4_0 * (N_SIMDWIDTH/2);
    }

    float8 tot = (float8)(
        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
        sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
        sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
    );

    if (get_sub_group_local_id() == 0) {
        if (first_row + 0 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
        }
        if (first_row + 1 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
        }
        if (first_row + 2 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
        }
        if (first_row + 3 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
        }

        if (first_row + 4 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
        }
        if (first_row + 5 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
        }
        if (first_row + 6 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
        }
        if (first_row + 7 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
        }
    }
}

#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_id_q4_0_f32_8x_flat(
    global char * src0_q,
    global half * src0_d,
    global float * src1,
    ulong offset1,
    global char * src2,
    ulong offset2,
    global float * dst,
    ulong offsetd,
    int ne00,
    int ne01,
    int ne02,
    ulong nb00,
    ulong nb02,
    int ne10,
    int ne11,
    int ne12,
    ulong nb11,
    ulong nb12,
    int ne20,
    int ne21,
    ulong nb21,
    int ne0,
    int ne1,
    int r2,
    int r3
) {
    src1 = (global float *)((global char *)src1 + offset1);
    src2 = (global char *)((global char *)src2 + offset2);
    dst  = (global float *)((global char *)dst  + offsetd);

    const int iid1 = get_group_id(2)/ne20;
    const int idx  = get_group_id(2)%ne20;

    const int i02 = ((global int *)(src2 + iid1*nb21))[idx];

    const int i11 = idx%ne11;
    const int i12 = iid1;

    const int i1 = idx;
    const int i2 = i12;

    global char  * src0_q_cur = src0_q + (i02*nb02/nb00)*(QK4_0/2);
    global half  * src0_d_cur = src0_d + (i02*nb02/nb00);
    global float * src1_cur   = (global float *)((global char *) src1 + i11*nb11 + i12*nb12);
    global float * dst_cur    = dst + i1*ne0 + i2*ne1*ne0;

    mul_vec_q_n_f32_8x_flat(src0_q_cur, src0_d_cur, src1_cur, dst_cur, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
}
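
Work-group dimension 2 enumerates (src1 row, used-expert slot) pairs: iid1 selects the row of expert ids in src2, idx selects the slot within it, and the int read from src2 is the expert index i02 used to offset into the flattened q/d weight buffers. Host-side, the routing reduces to the following sketch (illustrative only, names hypothetical):

    // For work-group z: which expert does this launch use, and for which row?
    static int routed_expert(const int * ids, int ne20, int group_z, int * row_out) {
        const int row  = group_z / ne20; // iid1: which src1 row
        const int slot = group_z % ne20; // idx: which used-expert slot
        *row_out = row;
        return ids[row * ne20 + slot];   // i02: expert slice of src0
    }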
30  ggml/src/ggml-opencl/kernels/pad.cl  Normal file
@@ -0,0 +1,30 @@
kernel void kernel_pad(
    global const void * src0_ptr,
    ulong src0_offset,
    global void * dst_ptr,
    ulong dst_offset,
    int s_ne0, int s_ne1, int s_ne2,
    int d_ne0, int d_ne1, int d_ne2
) {
    global const float * src0 = (global const float *)((global const char *)src0_ptr + src0_offset);
    global float * dst = (global float *)((global char *)dst_ptr + dst_offset);

    int nidx   = get_global_id(0);
    int idx_d1 = get_group_id(1);
    int idx_d2 = get_group_id(2);

    if (nidx >= d_ne0) {
        return;
    }

    int dst_el_offset = nidx + idx_d1 * d_ne0 + idx_d2 * d_ne0 * d_ne1;

    bool in_src_bounds = (nidx < s_ne0) && (idx_d1 < s_ne1) && (idx_d2 < s_ne2);

    if (in_src_bounds) {
        int src_el_offset = nidx + idx_d1 * s_ne0 + idx_d2 * s_ne0 * s_ne1;
        dst[dst_el_offset] = src0[src_el_offset];
    } else {
        dst[dst_el_offset] = 0.0f;
    }
}
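
Each work-item owns exactly one destination element: inside the source extents it copies, outside it writes zero, which is ggml's zero-padding semantics for GGML_OP_PAD. A CPU reference of the same rule (hypothetical helper):

    #include <cstddef>

    // Zero-pad a dense 3-D block from (s0,s1,s2) up to (d0,d1,d2).
    static void pad_ref(const float * src, float * dst,
                        int s0, int s1, int s2, int d0, int d1, int d2) {
        for (int i2 = 0; i2 < d2; ++i2)
        for (int i1 = 0; i1 < d1; ++i1)
        for (int i0 = 0; i0 < d0; ++i0) {
            const bool in_src = i0 < s0 && i1 < s1 && i2 < s2;
            dst[((size_t)i2*d1 + i1)*d0 + i0] =
                in_src ? src[((size_t)i2*s1 + i1)*s0 + i0] : 0.0f;
        }
    }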
39  ggml/src/ggml-opencl/kernels/repeat.cl  Normal file
@@ -0,0 +1,39 @@
kernel void kernel_repeat(
    global const char * src0_data_in,
    global char * dst_data_in,
    ulong src0_offset,
    ulong dst_offset,
    int src0_ne0, int src0_ne1, int src0_ne2, int src0_ne3,
    ulong src0_nb0, ulong src0_nb1, ulong src0_nb2, ulong src0_nb3,
    int dst_ne0, int dst_ne1, int dst_ne2, int dst_ne3,
    ulong dst_nb0, ulong dst_nb1, ulong dst_nb2, ulong dst_nb3
) {
    global const char * src0_data = src0_data_in + src0_offset;
    global char * dst_data = dst_data_in + dst_offset;

    const int d3 = get_global_id(2);
    const int d2 = get_global_id(1);
    const int d1 = get_global_id(0);

    if (d3 >= dst_ne3 || d2 >= dst_ne2 || d1 >= dst_ne1) {
        return;
    }

    const int s3 = d3 % src0_ne3;
    const int s2 = d2 % src0_ne2;
    const int s1 = d1 % src0_ne1;

    const global char * p_src0_slice = src0_data + (ulong)s3*src0_nb3 + (ulong)s2*src0_nb2 + (ulong)s1*src0_nb1;
    global char * p_dst_slice = dst_data + (ulong)d3*dst_nb3 + (ulong)d2*dst_nb2 + (ulong)d1*dst_nb1;

    for (int d0 = 0; d0 < dst_ne0; ++d0) {
        // Determine the source index for dimension 0 based on tiling/broadcasting.
        const int s0 = d0 % src0_ne0;

        const global char * restrict current_src_el_ptr = p_src0_slice + (ulong)s0*src0_nb0;
        global char * restrict current_dst_el_ptr = p_dst_slice + (ulong)d0*dst_nb0;
        for (int k = 0; k < src0_nb0; ++k) {
            current_dst_el_ptr[k] = current_src_el_ptr[k];
        }
    }
}
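
The s = d % src_ne mapping is what implements the tiling: with src0_ne1 = 2 and dst_ne1 = 6, destination rows 0..5 read source rows 0,1,0,1,0,1. The same rule in a few self-contained lines (illustrative):

    #include <cstdio>

    // ggml-style repeat along one dimension via modulo indexing.
    int main() {
        const int src_ne1 = 2, dst_ne1 = 6;
        for (int d1 = 0; d1 < dst_ne1; ++d1) {
            std::printf("dst row %d <- src row %d\n", d1, d1 % src_ne1);
        }
        return 0;
    }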
63  ggml/src/ggml-opencl/kernels/tanh.cl  Normal file
@@ -0,0 +1,63 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif

kernel void kernel_tanh_f32_nd(
    global void * p_src0_base, ulong off_src0_abs,
    global void * p_dst_base,  ulong off_dst_abs,
    int ne00, int ne01, int ne02, int ne03,
    ulong nb00, ulong nb01, ulong nb02, ulong nb03,
    int ne10, int ne11, int ne12, int ne13,
    ulong nb10, ulong nb11, ulong nb12, ulong nb13
) {
    int i0 = get_global_id(0);
    int i1 = get_global_id(1);
    int i2 = get_global_id(2);

    if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
        for (int i3 = 0; i3 < ne13; ++i3) {
            ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
            global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);

            ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
            global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);

            *dst_val_ptr = tanh(*src_val_ptr);
        }
    }
}

kernel void kernel_tanh_f16_nd(
    global void * p_src0_base, ulong off_src0_abs,
    global void * p_dst_base,  ulong off_dst_abs,
    int ne00, int ne01, int ne02, int ne03,
    ulong nb00, ulong nb01, ulong nb02, ulong nb03,
    int ne10, int ne11, int ne12, int ne13,
    ulong nb10, ulong nb11, ulong nb12, ulong nb13
) {
    int i0 = get_global_id(0);
    int i1 = get_global_id(1);
    int i2 = get_global_id(2);

    if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
        for (int i3 = 0; i3 < ne13; ++i3) {
            ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
            global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);

            ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
            global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);

            *dst_val_ptr = tanh(*src_val_ptr);
        }
    }
}
48  ggml/src/ggml-opencl/kernels/tsembd.cl  Normal file
@@ -0,0 +1,48 @@
kernel void kernel_timestep_embedding(
    global const void * p_timesteps,
    ulong off_timesteps,
    global void * p_dst,
    ulong off_dst,
    int dst_nb1_bytes,
    int logical_dim,
    int max_period
) {
    int local_i;
    int local_j;
    int local_half_dim;
    float local_timestep_val;
    float local_freq;
    float local_arg;
    global float * local_embed_data_ptr;
    global const float * local_timesteps_input_ptr;
    global float * local_dst_output_base_ptr;

    local_timesteps_input_ptr = (global const float *)((global char *)p_timesteps + off_timesteps);
    local_dst_output_base_ptr = (global float *)((global char *)p_dst + off_dst);

    local_i = get_global_id(1);
    local_j = get_global_id(0);

    local_half_dim = logical_dim / 2;
    local_embed_data_ptr = (global float *)((global char *)local_dst_output_base_ptr + local_i * dst_nb1_bytes);

    if (logical_dim % 2 != 0 && local_j == ((logical_dim + 1) / 2)) {
        local_embed_data_ptr[logical_dim] = 0.0f;
    }

    if (local_j >= local_half_dim) {
        return;
    }

    local_timestep_val = local_timesteps_input_ptr[local_i];

    if (local_half_dim == 0) {
        local_freq = 1.0f;
    } else {
        local_freq = exp(-log((float)max_period) * (float)local_j / (float)local_half_dim);
    }

    local_arg = local_timestep_val * local_freq;
    local_embed_data_ptr[local_j] = cos(local_arg);
    local_embed_data_ptr[local_j + local_half_dim] = sin(local_arg);
}

Some files were not shown because too many files have changed in this diff.