Mirror of https://github.com/ggerganov/llama.cpp.git, synced 2026-02-05 13:53:23 +02:00

Compare commits: b4585...gg/ci-pyth (1 commit, 492eaad571)
@@ -2,10 +2,6 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
ARG TARGETARCH
ARG GGML_CPU_ARM_ARCH=armv8-a
RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev

@@ -13,14 +9,7 @@ WORKDIR /app
COPY . .
RUN if [ "$TARGETARCH" = "amd64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
elif [ "$TARGETARCH" = "arm64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
else \
echo "Unsupported architecture"; \
exit 1; \
fi && \
RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build build -j $(nproc)
RUN mkdir -p /app/lib && \
@@ -13,13 +13,9 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
exec ./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
exec ./llama-cli "$@"
elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
exec ./llama-bench "$@"
elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
exec ./llama-perplexity "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Converting PTH to GGML..."
for i in $(ls $1/$2/ggml-model-f16.bin*); do
for i in `ls $1/$2/ggml-model-f16.bin*`; do
if [ -f "${i/f16/q4_0}" ]; then
echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
else

@@ -34,10 +30,6 @@ else
echo "Available commands: "
echo " --run (-r): Run a model previously converted into ggml"
echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
echo " --bench (-b): Benchmark the performance of the inference for various parameters."
echo " ex: -m model.gguf"
echo " --perplexity (-p): Measure the perplexity of a model over a given text."
echo " ex: -m model.gguf -f file.txt"
echo " --convert (-c): Convert a llama model into ggml"
echo " ex: --outtype f16 \"/models/7B/\" "
echo " --quantize (-q): Optimize with quantization process ggml"
@@ -1,4 +1,4 @@
ARG UBUNTU_VERSION=24.04
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION AS build

@@ -7,7 +7,7 @@ RUN apt update && apt install -y git build-essential cmake wget
# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt update -y && \
apt-get install -y vulkan-sdk libcurl4-openssl-dev curl

@@ -34,7 +34,7 @@ RUN mkdir -p /app/full \
FROM ubuntu:$UBUNTU_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl libvulkan-dev \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \

@@ -55,9 +55,8 @@ RUN apt-get update \
git \
python3 \
python3-pip \
python3-wheel \
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
.github/workflows/build.yml (vendored, 157 lines changed)

@@ -56,7 +56,6 @@ jobs:
mkdir build
cd build
cmake .. \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_METAL_USE_BF16=ON \

@@ -88,7 +87,6 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
- name: Upload artifacts

@@ -121,7 +119,6 @@ jobs:
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_METAL=OFF \

@@ -152,7 +149,6 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
- name: Upload artifacts

@@ -162,8 +158,8 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
name: llama-bin-macos-x64.zip
ubuntu-cpu-cmake:
runs-on: ubuntu-22.04
ubuntu-latest-cmake:
runs-on: ubuntu-latest
steps:
- name: Clone

@@ -183,10 +179,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_RPC=ON
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
cmake --build . --config Release -j $(nproc)
- name: Test

@@ -224,7 +217,6 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
- name: Upload artifacts

@@ -242,7 +234,7 @@ jobs:
strategy:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug]
build_type: [Debug, Release]
steps:
- name: Clone

@@ -261,10 +253,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
- name: Build (no OpenMP)

@@ -273,11 +262,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=OFF
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
- name: Test

@@ -307,8 +292,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. \
-DGGML_RPC=ON
cmake -DGGML_RPC=ON ..
cmake --build . --config Release -j $(nproc)
- name: Test

@@ -338,8 +322,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. \
-DGGML_VULKAN=ON
cmake -DGGML_VULKAN=ON ..
cmake --build . --config Release -j $(nproc)
- name: Test
@@ -366,18 +349,13 @@ jobs:
- name: Build with native CMake HIP support
id: cmake_build
run: |
cmake -B build -S . \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGGML_HIP=ON
cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIP=ON
cmake --build build --config Release -j $(nproc)
- name: Build with legacy HIP support
id: cmake_build_legacy_hip
run: |
cmake -B build2 -S . \
-DCMAKE_C_COMPILER=hipcc \
-DCMAKE_CXX_COMPILER=hipcc \
-DGGML_HIP=ON
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
cmake --build build2 --config Release -j $(nproc)
ubuntu-22-cmake-musa:

@@ -398,8 +376,7 @@ jobs:
- name: Build with native CMake MUSA support
id: cmake_build
run: |
cmake -B build -S . \
-DGGML_MUSA=ON
cmake -B build -S . -DGGML_MUSA=ON
cmake --build build --config Release -j $(nproc)
ubuntu-22-cmake-sycl:

@@ -440,10 +417,7 @@ jobs:
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake .. \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx
cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
cmake --build . --config Release -j $(nproc)
ubuntu-22-cmake-sycl-fp16:

@@ -484,13 +458,42 @@ jobs:
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake .. \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DGGML_SYCL_F16=ON
cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
cmake --build . --config Release -j $(nproc)
# TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
# would be great if we fix these
macOS-latest-cmake:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: cmake_build
run: |
sysctl -a
mkdir build
cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
macOS-latest-cmake-ios:
runs-on: macos-latest

@@ -613,7 +616,6 @@ jobs:
msystem: ${{matrix.sys}}
install: >-
base-devel
git
mingw-w64-${{matrix.env}}-toolchain
mingw-w64-${{matrix.env}}-cmake
mingw-w64-${{matrix.env}}-openblas
@@ -794,7 +796,6 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
Copy-Item .\examples\run\linenoise.cpp\LICENSE .\build\bin\Release\linenoise.cpp.txt
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
- name: Upload artifacts

@@ -822,13 +823,7 @@ jobs:
- name: Build with CMake
run: |
cmake -S . -B build -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CUDA_ARCHITECTURES=89-real \
-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_NATIVE=OFF \
-DGGML_CUDA=ON
cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON
cmake --build build
windows-2019-cmake-cuda:

@@ -917,11 +912,7 @@ jobs:
shell: cmd
run: |
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -S . -B build -G "Ninja Multi-Config" ^
-DLLAMA_BUILD_SERVER=ON ^
-DGGML_NATIVE=OFF ^
-DGGML_CUDA=ON ^
-DGGML_RPC=ON
cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DGGML_RPC=ON
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release

@@ -1074,12 +1065,7 @@ jobs:
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_BUILD_TYPE=Release `
-DGGML_HIP=ON `
-DGGML_RPC=ON
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
windows-latest-cmake-hip-release:

@@ -1117,13 +1103,7 @@ jobs:
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_BUILD_TYPE=Release `
-DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
-DGGML_HIP=ON `
-DGGML_RPC=ON
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"

@@ -1217,7 +1197,8 @@ jobs:
runs-on: ubuntu-latest
needs:
- ubuntu-cpu-cmake
- ubuntu-latest-cmake
- macOS-latest-cmake
- windows-latest-cmake
- windows-2019-cmake-cuda
- windows-latest-cmake-hip-release

@@ -1476,37 +1457,3 @@ jobs:
# popd
# emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
# make
openEuler-latest-cmake-cann:
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
defaults:
run:
shell: bash -el {0}
runs-on: ubuntu-24.04-arm
strategy:
matrix:
cann:
- '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
device:
- 'ascend910b3'
build:
- 'Release'
container: ascendai/cann:${{ matrix.cann }}
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Dependencies
run: |
yum update -y
yum install -y git gcc gcc-c++ make cmake
- name: Build
run: |
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=${{ matrix.build }} \
-DGGML_CANN=on \
-DSOC_TYPE=${{ matrix.device }}
cmake --build build -j $(nproc)
.github/workflows/docker.yml (vendored, 3 lines changed)

@@ -28,11 +28,10 @@ jobs:
push_to_registry:
name: Push Docker image to Docker Hub
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
env:
COMMIT_SHA: ${{ github.sha }}
strategy:
fail-fast: false
matrix:
config:
# Multi-stage build
.github/workflows/server.yml (vendored, 25 lines changed)

@@ -112,9 +112,9 @@ jobs:
-DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build_sanitizers
if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
- name: Build
id: cmake_build
if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \

@@ -124,31 +124,12 @@ jobs:
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build
if: ${{ matrix.sanitizer == '' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Tests
id: server_integration_tests
if: ${{ matrix.sanitizer == '' }}
run: |
cd examples/server/tests
./tests.sh
- name: Tests (sanitizers)
id: server_integration_tests_sanitizers
if: ${{ matrix.sanitizer != '' }}
run: |
cd examples/server/tests
LLAMA_SANITIZE=1 ./tests.sh
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
.gitignore (vendored, 1 line changed)

@@ -18,7 +18,6 @@
*.metallib
*.o
*.so
*.swp
*.tmp
# IDE / OS
@@ -16,7 +16,6 @@ endif()
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(LLAMA_STANDALONE ON)

@@ -50,8 +49,6 @@ endif()
if (MSVC)
add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:/bigobj>")
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
endif()
#

@@ -86,8 +83,11 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
# override ggml options
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
# change the default for these ggml options
if (NOT DEFINED GGML_LLAMAFILE)

@@ -117,62 +117,16 @@ llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
if (NOT MSVC)
if (LLAMA_SANITIZE_THREAD)
message(STATUS "Using -fsanitize=thread")
add_compile_options(-fsanitize=thread)
link_libraries (-fsanitize=thread)
endif()
if (LLAMA_SANITIZE_ADDRESS)
message(STATUS "Using -fsanitize=address")
add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
link_libraries (-fsanitize=address)
endif()
if (LLAMA_SANITIZE_UNDEFINED)
message(STATUS "Using -fsanitize=undefined")
add_compile_options(-fsanitize=undefined)
link_libraries (-fsanitize=undefined)
endif()
endif()
#
# 3rd-party
# build the library
#
if (NOT TARGET ggml)
add_subdirectory(ggml)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
#
# build the library
#
add_subdirectory(src)
#
# utils, programs, examples and tests
#
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
add_subdirectory(tests)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(pocs)
endif()
#
# install
#
@@ -188,14 +142,27 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
# At the moment some compile definitions are placed within the ggml/src
# directory but not exported on the `ggml` target. This could be improved by
# determining _precisely_ which defines are necessary for the llama-config
# package.
#
set(GGML_TRANSIENT_DEFINES)
get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
if (GGML_DIR_DEFINES)
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
endif()
get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
if (GGML_TARGET_DEFINES)
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
endif()
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
# all public headers
set(LLAMA_PUBLIC_HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
set_target_properties(llama
PROPERTIES
PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
set_target_properties(llama PROPERTIES PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
install(TARGETS llama LIBRARY PUBLIC_HEADER)
configure_package_config_file(

@@ -233,3 +200,21 @@ configure_file(cmake/llama.pc.in
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
DESTINATION lib/pkgconfig)
#
# utils, programs, examples and tests
#
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
add_subdirectory(tests)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(pocs)
endif()
Makefile (2 lines changed)

@@ -1361,9 +1361,7 @@ llama-server: \
examples/server/httplib.h \
examples/server/index.html.hpp \
examples/server/loading.html.hpp \
common/chat-template.hpp \
common/json.hpp \
common/minja.hpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -16,10 +16,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
## Hot topics
- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggerganov/llama.cpp/pull/11427
- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
- Introducing GGUF-my-LoRA https://github.com/ggerganov/llama.cpp/discussions/10123
- **Introducing GGUF-my-LoRA** https://github.com/ggerganov/llama.cpp/discussions/10123
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)

@@ -207,7 +204,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
- [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
</details>
ci/run.sh (14 lines changed)

@@ -299,7 +299,7 @@ function gg_run_open_llama_7b_v2 {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"

@@ -433,7 +433,7 @@ function gg_run_pythia_1_4b {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"

@@ -564,7 +564,7 @@ function gg_run_pythia_2_8b {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"

@@ -699,7 +699,7 @@ function gg_run_embd_bge_small {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"

@@ -747,7 +747,7 @@ function gg_run_rerank_tiny {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"

@@ -814,8 +814,8 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
mkdir -p ${mnt_models}
ln -sfn ${mnt_models} ${SRC}/models-mnt
# Create a fresh python3 venv and enter it
if ! python3 -m venv "$MNT/venv"; then
# Create a fresh python venv and enter it
if ! python -m venv "$MNT/venv"; then
echo "Error: Failed to create Python virtual environment at $MNT/venv."
exit 1
fi
@@ -44,7 +44,7 @@ if(MSVC)
set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
else()
execute_process(
COMMAND sh -c "\"$@\" --version | head -1" _ ${CMAKE_C_COMPILER}
COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
OUTPUT_VARIABLE OUT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
@@ -3,13 +3,159 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(GGML_STATIC @GGML_STATIC@)
set(GGML_NATIVE @GGML_NATIVE@)
set(GGML_LTO @GGML_LTO@)
set(GGML_CCACHE @GGML_CCACHE@)
set(GGML_AVX @GGML_AVX@)
set(GGML_AVX2 @GGML_AVX2@)
set(GGML_AVX512 @GGML_AVX512@)
set(GGML_AVX512_VBMI @GGML_AVX512_VBMI@)
set(GGML_AVX512_VNNI @GGML_AVX512_VNNI@)
set(GGML_AVX512_BF16 @GGML_AVX512_BF16@)
set(GGML_AMX_TILE @GGML_AMX_TILE@)
set(GGML_AMX_INT8 @GGML_AMX_INT8@)
set(GGML_AMX_BF16 @GGML_AMX_BF16@)
set(GGML_FMA @GGML_FMA@)
set(GGML_LASX @GGML_LASX@)
set(GGML_LSX @GGML_LSX@)
set(GGML_RVV @GGML_RVV@)
set(GGML_SVE @GGML_SVE@)
set(GGML_ACCELERATE @GGML_ACCELERATE@)
set(GGML_OPENMP @GGML_OPENMP@)
set(GGML_CPU_HBM @GGML_CPU_HBM@)
set(GGML_BLAS_VENDOR @GGML_BLAS_VENDOR@)
set(GGML_CUDA_FORCE_MMQ @GGML_CUDA_FORCE_MMQ@)
set(GGML_CUDA_FORCE_CUBLAS @GGML_CUDA_FORCE_CUBLAS@)
set(GGML_CUDA_F16 @GGML_CUDA_F16@)
set(GGML_CUDA_PEER_MAX_BATCH_SIZE @GGML_CUDA_PEER_MAX_BATCH_SIZE@)
set(GGML_CUDA_NO_PEER_COPY @GGML_CUDA_NO_PEER_COPY@)
set(GGML_CUDA_NO_VMM @GGML_CUDA_NO_VMM@)
set(GGML_CUDA_FA_ALL_QUANTS @GGML_CUDA_FA_ALL_QUANTS@)
set(GGML_CUDA_GRAPHS @GGML_CUDA_GRAPHS@)
set(GGML_HIP_UMA @GGML_HIP_UMA@)
set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@)
set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@)
set(GGML_VULKAN_SHADER_DEBUG_INFO @GGML_VULKAN_SHADER_DEBUG_INFO@)
set(GGML_VULKAN_PERF @GGML_VULKAN_PERF@)
set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@)
set(GGML_VULKAN_RUN_TESTS @GGML_VULKAN_RUN_TESTS@)
set(GGML_METAL_USE_BF16 @GGML_METAL_USE_BF16@)
set(GGML_METAL_NDEBUG @GGML_METAL_NDEBUG@)
set(GGML_METAL_SHADER_DEBUG @GGML_METAL_SHADER_DEBUG@)
set(GGML_METAL_EMBED_LIBRARY @GGML_METAL_EMBED_LIBRARY@)
set(GGML_METAL_MACOSX_VERSION_MIN @GGML_METAL_MACOSX_VERSION_MIN@)
set(GGML_METAL_STD @GGML_METAL_STD@)
set(GGML_SYCL_F16 @GGML_SYCL_F16@)
set(GGML_SYCL_TARGET @GGML_SYCL_TARGET@)
set(GGML_SYCL_DEVICE_ARCH @GGML_SYCL_DEVICE_ARCH@)
@PACKAGE_INIT@
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)
find_package(Threads REQUIRED)
set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
set(_llama_link_deps "")
set(_llama_link_opts "")
foreach(_ggml_lib ggml ggml-base)
string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
find_library(${_ggml_lib_var} ${_ggml_lib}
REQUIRED
HINTS ${LLAMA_LIB_DIR}
NO_CMAKE_FIND_ROOT_PATH
)
list(APPEND _llama_link_deps "${${_ggml_lib_var}}")
message(STATUS "Found ${${_ggml_lib_var}}")
endforeach()
foreach(backend amx blas cann cpu cuda hip kompute metal musa rpc sycl vulkan)
string(TOUPPER "GGML_${backend}" backend_id)
set(_ggml_lib "ggml-${backend}")
string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
find_library(${_ggml_lib_var} ${_ggml_lib}
HINTS ${LLAMA_LIB_DIR}
NO_CMAKE_FIND_ROOT_PATH
)
if(${_ggml_lib_var})
list(APPEND _llama_link_deps "${${_ggml_lib_var}}")
set(${backend_id} ON)
message(STATUS "Found backend ${${_ggml_lib_var}}")
else()
set(${backend_id} OFF)
endif()
endforeach()
if (NOT LLAMA_SHARED_LIB)
if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
list(APPEND _llama_link_deps ${ACCELERATE_FRAMEWORK})
endif()
if (GGML_OPENMP)
find_package(OpenMP REQUIRED)
list(APPEND _llama_link_deps OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
endif()
if (GGML_CPU_HBM)
find_library(memkind memkind REQUIRED)
list(APPEND _llama_link_deps memkind)
endif()
if (GGML_BLAS)
find_package(BLAS REQUIRED)
list(APPEND _llama_link_deps ${BLAS_LIBRARIES})
list(APPEND _llama_link_opts ${BLAS_LINKER_FLAGS})
endif()
if (GGML_CUDA)
find_package(CUDAToolkit REQUIRED)
endif()
if (GGML_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
list(APPEND _llama_link_deps ${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
endif()
if (GGML_VULKAN)
find_package(Vulkan REQUIRED)
list(APPEND _llama_link_deps Vulkan::Vulkan)
endif()
if (GGML_HIP)
find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
list(APPEND _llama_link_deps hip::host roc::rocblas roc::hipblas)
endif()
if (GGML_SYCL)
find_package(DNNL)
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
list(APPEND _llama_link_deps DNNL::dnnl)
endif()
if (WIN32)
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
list(APPEND _llama_link_deps IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
endif()
endif()
endif()
find_library(llama_LIBRARY llama
REQUIRED

@@ -21,10 +167,12 @@ add_library(llama UNKNOWN IMPORTED)
set_target_properties(llama
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;"
INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
INTERFACE_LINK_OPTIONS "${_llama_link_opts}"
INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
IMPORTED_LOCATION "${llama_LIBRARY}"
INTERFACE_COMPILE_FEATURES c_std_90
POSITION_INDEPENDENT_CODE ON)
INTERFACE_COMPILE_FEATURES cxx_std_11
POSITION_INDEPENDENT_CODE ON )
check_required_components(Llama)
@@ -56,7 +56,6 @@ add_library(${TARGET} STATIC
arg.cpp
arg.h
base64.hpp
chat-template.hpp
common.cpp
common.h
console.cpp

@@ -65,7 +64,6 @@ add_library(${TARGET} STATIC
json.hpp
log.cpp
log.h
minja.hpp
ngram-cache.cpp
ngram-cache.h
sampling.cpp
@@ -133,8 +133,7 @@ static void common_params_handle_model_default(
const std::string & model_url,
std::string & hf_repo,
std::string & hf_file,
const std::string & hf_token,
const std::string & model_default) {
const std::string & hf_token) {
if (!hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
if (hf_file.empty()) {

@@ -164,7 +163,7 @@ static void common_params_handle_model_default(
model = fs_get_cache_file(string_split<std::string>(f, '/').back());
}
} else if (model.empty()) {
model = model_default;
model = DEFAULT_MODEL_PATH;
}
}

@@ -300,9 +299,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
}
// TODO: refactor model params in a common struct
common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token, DEFAULT_MODEL_PATH);
common_params_handle_model_default(params.speculative.model, params.speculative.model_url, params.speculative.hf_repo, params.speculative.hf_file, params.hf_token, "");
common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token, "");
common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token);
common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token);
if (params.escape) {
string_process_escapes(params.prompt);

@@ -325,14 +323,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
}
if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
throw std::runtime_error(string_format(
"error: the supplied chat template is not supported: %s%s\n",
params.chat_template.c_str(),
params.use_jinja ? "" : "\nnote: llama.cpp was started without --jinja, we only support commonly used templates"
));
}
return true;
}

@@ -386,30 +376,6 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
return devices;
}
static void add_rpc_devices(std::string servers) {
auto rpc_servers = string_split<std::string>(servers, ',');
if (rpc_servers.empty()) {
throw std::invalid_argument("no RPC servers specified");
}
ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
if (!rpc_reg) {
throw std::invalid_argument("failed to find RPC backend");
}
typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
if (!ggml_backend_rpc_add_device_fn) {
throw std::invalid_argument("failed to find RPC device add function");
}
for (const auto & server : rpc_servers) {
ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
if (dev) {
ggml_backend_device_register(dev);
} else {
throw std::invalid_argument("failed to register RPC device");
}
}
}
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
auto ctx_arg = common_params_parser_init(params, ex, print_usage);
const common_params params_org = ctx_arg.params; // the example can modify the default params

@@ -877,7 +843,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.warmup = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING}));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--spm-infill"},
string_format(

@@ -1419,8 +1385,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--rpc"}, "SERVERS",
"comma separated list of RPC servers",
[](common_params & params, const std::string & value) {
add_rpc_devices(value);
GGML_UNUSED(params);
params.rpc_servers = value;
}
).set_env("LLAMA_ARG_RPC"));
}

@@ -1639,13 +1604,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.hf_repo = value;
}
).set_env("LLAMA_ARG_HF_REPO"));
add_opt(common_arg(
{"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
"Same as --hf-repo, but for the draft model (default: unused)",
[](common_params & params, const std::string & value) {
params.speculative.hf_repo = value;
}
).set_env("LLAMA_ARG_HFD_REPO"));
add_opt(common_arg(
{"-hff", "--hf-file"}, "FILE",
"Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
@@ -1955,44 +1913,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--jinja"},
"use jinja template for chat (default: disabled)",
[](common_params & params) {
params.use_jinja = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE",
string_format(
"set custom jinja chat template (default: template taken from model's metadata)\n"
"if suffix/prefix are specified, template will be disabled\n"
"only commonly used templates are accepted (unless --jinja is set before this flag):\n"
"list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
),
[](common_params & params, const std::string & value) {
if (!common_chat_verify_template(value)) {
throw std::runtime_error(string_format(
"error: the supplied chat template is not supported: %s\n"
"note: llama.cpp does not use jinja parser, we only support commonly used templates\n",
value.c_str()
));
}
params.chat_template = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
add_opt(common_arg(
{"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
string_format(
"set custom jinja chat template file (default: template taken from model's metadata)\n"
"if suffix/prefix are specified, template will be disabled\n"
"only commonly used templates are accepted (unless --jinja is set before this flag):\n"
"list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
),
[](common_params & params, const std::string & value) {
std::ifstream file(value);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
}
std::copy(
std::istreambuf_iterator<char>(file),
std::istreambuf_iterator<char>(),
std::back_inserter(params.chat_template));
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
add_opt(common_arg(
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),

@@ -2291,13 +2229,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.vocoder.model = value;
}
).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--tts-use-guide-tokens"},
"Use guide tokens to improve TTS word recall",
[](common_params & params) {
params.vocoder.use_guide_tokens = true;
}
).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
// model-specific
add_opt(common_arg(
@@ -1,268 +0,0 @@
/*
Copyright 2024 Google LLC

Use of this source code is governed by an MIT-style
license that can be found in the LICENSE file or at
https://opensource.org/licenses/MIT.
*/
// SPDX-License-Identifier: MIT
#pragma once

#include "minja.hpp"
#include <json.hpp>
#include <string>
#include <vector>

using json = nlohmann::ordered_json;

namespace minja {

class chat_template {
public:

private:
bool supports_tools_ = true;
// Meta-Llama-3.1-8B-Instruct's template expects arguments to be an object.
// Most other templates (and OpenAI's API) expect the arguments object to be stringified.
bool requires_object_arguments_ = false;
bool requires_typed_content_ = false;
bool supports_system_role_ = true;
bool supports_parallel_tool_calls_ = false;
std::string source_;
std::string bos_token_;
std::string eos_token_;
std::shared_ptr<minja::TemplateNode> template_root_;

std::string try_raw_render(
const nlohmann::ordered_json & messages,
const nlohmann::ordered_json & tools,
bool add_generation_prompt,
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json()) const
{
try {
auto prompt = apply(messages, tools, add_generation_prompt, extra_context, /* adjust_inputs= */ false);
// fprintf(stderr, "Prompt: %s\n", prompt.c_str());
return prompt;
} catch (const std::exception & e) {
// fprintf(stderr, "Error: %s\n", e.what());
return "";
}
}
public:
chat_template(const std::string & source, const std::string & bos_token, const std::string & eos_token)
: source_(source), bos_token_(bos_token), eos_token_(eos_token)
{
template_root_ = minja::Parser::parse(source_, {
/* .trim_blocks = */ true,
/* .lstrip_blocks = */ true,
/* .keep_trailing_newline = */ false,
});
supports_tools_ = source.find("tools") != std::string::npos;

auto renders_string_arguments =
try_raw_render({
{
{"role", "user"},
{"content", "Hey"}
},
{
{"role", "assistant"},
{"tool_calls", json::array({
{
{"id", "call_1___"},
{"type", "function"},
{"function", {
{"arguments", "{\"code\": \"print('Hello, World!')\"}"},
{"name", "ipython"},
}},
},
})},
}
}, {}, false).find("{\"code\": \"print") != std::string::npos;
if (!renders_string_arguments) {
auto renders_object_arguments =
try_raw_render({
{
{"role", "user"},
{"content", "Hey"}
},
{
{"role", "assistant"},
{"tool_calls", json::array({
{
{"id", "call_1___"},
{"type", "function"},
{"function", {
{"arguments", {
{"code", "print('Hello, World!')"},
}},
{"name", "ipython"},
}},
},
})},
}
}, {}, false).find("{\"code\": \"print") != std::string::npos;
requires_object_arguments_ = renders_object_arguments;
}
supports_parallel_tool_calls_ = source.find("tool_call_id") != std::string::npos;

supports_system_role_ = try_raw_render({
{{"role", "system"}, {"content", "<System Needle>"}},
{{"role", "user"}, {"content", "Hey"}}
}, {}, false).find("<System Needle>") != std::string::npos;

requires_typed_content_ = try_raw_render({{{"role", "user"}, {"content", "Hey"}}}, {}, false).find("Hey") == std::string::npos
&& try_raw_render({{{"role", "user"}, {"content", {{{"type", "text"}, {"text", "Hey"}}}}}}, {}, false).find("Hey") != std::string::npos;
}
const std::string & source() const { return source_; }
const std::string & bos_token() const { return bos_token_; }
const std::string & eos_token() const { return eos_token_; }
bool supports_tools() const { return supports_tools_; }
bool supports_parallel_tool_calls() const { return supports_parallel_tool_calls_; }

std::string apply(
const nlohmann::ordered_json & messages,
const nlohmann::ordered_json & tools,
bool add_generation_prompt,
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json(),
bool adjust_inputs = true) const
{
json actual_messages;

// First, "fix" messages so they have a chance to be rendered correctly by the template

if (adjust_inputs && (requires_object_arguments_ || !supports_system_role_ || !supports_tools_ || requires_typed_content_)) {
actual_messages = json::array();

auto add_message = [&](const json & msg) {
if (requires_typed_content_ && msg.contains("content") && !msg.at("content").is_null() && msg.at("content").is_string()) {
actual_messages.push_back({
{"role", msg.at("role")},
{"content", {{
{"type", "text"},
{"text", msg.at("content")},
}}},
});
} else {
actual_messages.push_back(msg);
}
};

std::string pending_system;
auto flush_sys = [&]() {
if (!pending_system.empty()) {
add_message({
{"role", "user"},
{"content", pending_system},
});
pending_system.clear();
}
};
for (const auto & message_ : messages) {
auto message = message_;
if (!message.contains("role") || !message.contains("content")) {
throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
}
std::string role = message.at("role");

if (message.contains("tool_calls")) {
if (requires_object_arguments_ || !supports_tools_) {
for (auto & tool_call : message.at("tool_calls")) {
if (tool_call["type"] == "function") {
auto & function = tool_call.at("function");
std::string arguments = function.at("arguments");
function["arguments"] = json::parse(arguments);
}
}
}
if (!supports_tools_) {
auto content = message.at("content");
auto tool_calls = json::array();
for (const auto & tool_call : message.at("tool_calls")) {
if (tool_call.at("type") != "function") {
continue;
}
const auto & function = tool_call.at("function");
auto tc = json {
{"name", function.at("name")},
{"arguments", function.at("arguments")},
};
if (tool_call.contains("id")) {
tc["id"] = tool_call["id"];
}
tool_calls.push_back(tc);
}
auto obj = json {
{"tool_calls", tool_calls},
};
if (!content.is_null() && content != "") {
obj["content"] = content;
}
message["content"] = obj.dump(2);
message.erase("tool_calls");
}
}
if (!supports_tools_ && role == "tool") {
message["role"] = "user";
auto obj = json {
{"tool_response", {
{"tool", message.at("name")},
{"content", message.at("content")},
}},
};
if (message.contains("tool_call_id")) {
obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
}
message["content"] = obj.dump(2);
message.erase("name");
}
if (!message["content"].is_null() && !supports_system_role_) {
|
||||
std::string content = message.at("content");
|
||||
if (role == "system") {
|
||||
if (!pending_system.empty()) pending_system += "\n";
|
||||
pending_system += content;
|
||||
continue;
|
||||
} else {
|
||||
if (role == "user") {
|
||||
if (!pending_system.empty()) {
|
||||
message["content"] = pending_system + (content.empty() ? "" : "\n" + content);
|
||||
pending_system.clear();
|
||||
}
|
||||
} else {
|
||||
flush_sys();
|
||||
}
|
||||
}
|
||||
}
|
||||
add_message(message);
|
||||
}
|
||||
flush_sys();
|
||||
} else {
|
||||
actual_messages = messages;
|
||||
}
|
||||
|
||||
auto context = minja::Context::make(json({
|
||||
{"messages", actual_messages},
|
||||
{"add_generation_prompt", add_generation_prompt},
|
||||
{"bos_token", bos_token_},
|
||||
{"eos_token", eos_token_},
|
||||
}));
|
||||
|
||||
if (!tools.is_null()) {
|
||||
auto tools_val = minja::Value(tools);
|
||||
context->set("tools", tools_val);
|
||||
}
|
||||
if (!extra_context.is_null()) {
|
||||
for (auto & kv : extra_context.items()) {
|
||||
minja::Value val(kv.value());
|
||||
context->set(kv.key(), val);
|
||||
}
|
||||
}
|
||||
|
||||
return template_root_->render(context);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace minja
|
||||
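For reference, the minja::chat_template class shown in the @@ -1,268 +0,0 @@ hunk above is driven the same way common_chat_verify_template() drives it later in this compare: construct it from a template source plus BOS/EOS tokens, then call apply() with an OpenAI-style message array. A minimal sketch follows, assuming the header is available as common/chat-template.hpp; the ChatML-style template string is an illustrative assumption, not taken from this diff.

```cpp
// Sketch: drive minja::chat_template the way common.cpp does.
// The template source below is a hypothetical ChatML-style example.
#include "chat-template.hpp" // declares minja::chat_template (see hunk above)
#include <cstdio>
#include <string>

int main() {
    const std::string source =
        "{% for message in messages %}"
        "<|im_start|>{{ message.role }}\n{{ message.content }}<|im_end|>\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}";

    // Same constructor shape as in common_chat_verify_template(): source, bos, eos.
    minja::chat_template tmpl(source, "<s>", "</s>");

    nlohmann::ordered_json messages = nlohmann::ordered_json::array({
        {{"role", "user"}, {"content", "Hello!"}},
    });

    // tools = null json, add_generation_prompt = true, defaults for the rest.
    const std::string prompt = tmpl.apply(messages, nlohmann::ordered_json(), /* add_generation_prompt= */ true);
    printf("%s", prompt.c_str());
    return 0;
}
```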
@@ -12,7 +12,6 @@
#include "json.hpp"
#include "json-schema-to-grammar.h"
#include "llama.h"
#include "chat-template.hpp"

#include <algorithm>
#include <cinttypes>

@@ -484,48 +483,6 @@ void string_replace_all(std::string & s, const std::string & search, const std::
s = std::move(builder);
}

std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
std::ostringstream result;
for (size_t i = 0; i < values.size(); ++i) {
if (i > 0) {
result << separator;
}
result << values[i];
}
return result.str();
}

std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
std::vector<std::string> parts;
size_t start = 0;
size_t end = str.find(delimiter);

while (end != std::string::npos) {
parts.push_back(str.substr(start, end - start));
start = end + delimiter.length();
end = str.find(delimiter, start);
}

parts.push_back(str.substr(start));

return parts;
}

std::string string_repeat(const std::string & str, size_t n) {
if (n == 0) {
return "";
}

std::string result;
result.reserve(str.length() * n);

for (size_t i = 0; i < n; ++i) {
result += str;
}

return result;
}

std::string string_from(bool value) {
return value ? "true" : "false";
}
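The string helpers in the hunk above are small enough that their behavior follows directly from the code; as a quick illustration (a sketch, assuming both are declared in common/common.h), splitting on a delimiter and joining the parts back round-trips the input:

```cpp
// Sketch: exercise string_split/string_join from the hunk above.
// Assumes both helpers are declared in common/common.h.
#include "common.h"
#include <cassert>
#include <cstdio>
#include <string>
#include <vector>

int main() {
    const std::string csv = "ggml,llama,gguf";

    // std::string(",") selects the string-delimiter overload shown above.
    std::vector<std::string> parts = string_split(csv, std::string(","));
    assert(parts.size() == 3); // {"ggml", "llama", "gguf"}

    // Joining with the same separator reproduces the original string.
    const std::string joined = string_join(parts, ",");
    assert(joined == csv);

    printf("%zu parts -> %s\n", parts.size(), joined.c_str());
    return 0;
}
```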
@@ -1086,6 +1043,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.rpc_servers = params.rpc_servers.c_str();
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
@@ -1771,75 +1729,67 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
// Chat template utils
//

bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
if (use_jinja) {
try {
auto chat_template = minja::chat_template(tmpl, "<s>", "</s>");
chat_template.apply({{
{"role", "user"},
{"content", "test"},
}}, json(), true);
return true;
} catch (const std::exception & e) {
LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what());
return false;
}
}
std::string common_get_builtin_chat_template(const struct llama_model * model) {
const char * ptr_tmpl = llama_model_chat_template(model);
return ptr_tmpl == nullptr ? "" : ptr_tmpl;
}

bool common_chat_verify_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}};
const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
return res >= 0;
}
std::string common_chat_apply_template(
|
||||
const common_chat_template & tmpl,
|
||||
std::string common_chat_apply_template(const struct llama_model * model,
|
||||
const std::string & tmpl,
|
||||
const std::vector<common_chat_msg> & msgs,
|
||||
bool add_ass,
|
||||
bool use_jinja) {
|
||||
if (use_jinja) {
|
||||
auto messages = json::array();
|
||||
for (const auto & msg : msgs) {
|
||||
messages.push_back({{"role", msg.role}, {"content", msg.content}});
|
||||
}
|
||||
return tmpl.apply(messages, /* tools= */ json(), add_ass);
|
||||
}
|
||||
|
||||
bool add_ass) {
|
||||
int alloc_size = 0;
|
||||
bool fallback = false; // indicate if we must fallback to default chatml
|
||||
std::vector<llama_chat_message> chat;
|
||||
for (const auto & msg : msgs) {
|
||||
chat.push_back({msg.role.c_str(), msg.content.c_str()});
|
||||
alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
|
||||
}
|
||||
|
||||
const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model) : tmpl.c_str();
|
||||
std::vector<char> buf(alloc_size);
|
||||
|
||||
// run the first time to get the total output length
|
||||
int32_t res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
|
||||
// error: chat template is not supported
|
||||
if (res < 0) {
|
||||
// if the custom "tmpl" is not supported, we throw an error
|
||||
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
|
||||
throw std::runtime_error("this custom template is not supported");
|
||||
if (ptr_tmpl != nullptr) {
|
||||
// if the custom "tmpl" is not supported, we throw an error
|
||||
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
|
||||
throw std::runtime_error("this custom template is not supported");
|
||||
}
|
||||
|
||||
// If the built-in template is not supported, we default to chatml
|
||||
res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
fallback = true;
|
||||
}
|
||||
|
||||
// if it turns out that our buffer is too small, we resize it
|
||||
if ((size_t) res > buf.size()) {
|
||||
buf.resize(res);
|
||||
res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
res = llama_chat_apply_template(
|
||||
fallback ? "chatml" : ptr_tmpl,
|
||||
chat.data(), chat.size(), add_ass, buf.data(), buf.size());
|
||||
}
|
||||
|
||||
std::string formatted_chat(buf.data(), res);
|
||||
return formatted_chat;
|
||||
}
|
||||
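The legacy (non-Jinja) path above follows the usual size-then-fill pattern of `llama_chat_apply_template`: a first call reports the required length (or a negative value for an unsupported template), the buffer is grown, and the call is repeated. A condensed sketch of that pattern in isolation; the helper name is illustrative and the 6-argument signature is the one used in this hunk:

```cpp
// Sketch only: assumes the llama_chat_apply_template() signature used in the hunk above,
//   int32_t llama_chat_apply_template(const char * tmpl, const llama_chat_message * chat,
//                                     size_t n_msg, bool add_ass, char * buf, int32_t length);
// which returns the required length, or a negative value if the template is not supported.
#include "llama.h"

#include <string>
#include <vector>

static std::string apply_or_fallback_to_chatml(const char * tmpl,
                                               const std::vector<llama_chat_message> & chat,
                                               bool add_ass) {
    std::vector<char> buf(256); // initial guess; grown below if needed
    int32_t res = llama_chat_apply_template(tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
    if (res < 0) {
        // unsupported template: fall back to the built-in "chatml" template
        tmpl = "chatml";
        res  = llama_chat_apply_template(tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
    }
    if ((size_t) res > buf.size()) {
        // the first pass only reported the required size; resize and render for real
        buf.resize(res);
        res = llama_chat_apply_template(tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
    }
    return std::string(buf.data(), res);
}
```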

std::string common_chat_format_single(
    const common_chat_template & tmpl,
std::string common_chat_format_single(const struct llama_model * model,
        const std::string & tmpl,
        const std::vector<common_chat_msg> & past_msg,
        const common_chat_msg & new_msg,
        bool add_ass,
        bool use_jinja) {
        bool add_ass) {
    std::ostringstream ss;
    auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(tmpl, past_msg, false, use_jinja);
    auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
    std::vector<common_chat_msg> chat_new(past_msg);
    // if the past_msg ends with a newline, we must preserve it in the formatted version
    if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
@@ -1847,74 +1797,21 @@ std::string common_chat_format_single(
    };
    // format chat with new_msg
    chat_new.push_back(new_msg);
    auto fmt_new_msg = common_chat_apply_template(tmpl, chat_new, add_ass, use_jinja);
    auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
    // get the diff part
    ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
    return ss.str();
}

std::string common_chat_format_example(const common_chat_template & tmpl, bool use_jinja) {
std::string common_chat_format_example(const struct llama_model * model,
        const std::string & tmpl) {
    std::vector<common_chat_msg> msgs = {
        {"system", "You are a helpful assistant"},
        {"user", "Hello"},
        {"assistant", "Hi there"},
        {"user", "How are you?"},
    };
    return common_chat_apply_template(tmpl, msgs, true, use_jinja);
}

common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override)
{
    auto vocab = llama_model_get_vocab(model);
    std::string default_template_src = chat_template_override;
    std::string template_tool_use_src = chat_template_override;
    bool has_explicit_template = !chat_template_override.empty();
    if (chat_template_override.empty()) {
        auto str = llama_model_chat_template(model, /* name */ nullptr);
        if (str) {
            default_template_src = str;
            has_explicit_template = true;
        }
        str = llama_model_chat_template(model, /* name */ "tool_use");
        if (str) {
            template_tool_use_src = str;
            has_explicit_template = true;
        }
    }
    if (default_template_src.empty() || default_template_src == "chatml") {
        if (!template_tool_use_src.empty()) {
            default_template_src = template_tool_use_src;
        } else {
            default_template_src = R"(
{%- for message in messages -%}
{{- "<|im_start|>" + message.role + "\n" + message.content + "<|im_end|>\n" -}}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{- "<|im_start|>assistant\n" -}}
{%- endif -%}
)";
        }
    }
    const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
        if (token == LLAMA_TOKEN_NULL) {
            if (default_template_src.find(jinja_variable_name) != std::string::npos
                || template_tool_use_src.find(jinja_variable_name) != std::string::npos) {
                LOG_WRN("%s: warning: vocab does not have a %s token, jinja template won't work as intended.\n", __func__, name);
            }
            return std::string();
        } else {
            return common_token_to_piece(vocab, token, true);
        }
    };
    auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
    auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
    return {
        has_explicit_template,
        std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
        template_tool_use_src.empty()
            ? nullptr
            : std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos)
    };
    return common_chat_apply_template(model, tmpl, msgs, true);
}

//

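Taken together, a caller would typically obtain the templates once and render the canned example conversation for logging. A minimal sketch, assuming a loaded `llama_model *` and the `common_chat_*` API exactly as declared in these hunks:

```cpp
// Sketch only: assumes a loaded llama_model * and the new common_chat_* API from the hunks above.
#include "common.h"

#include <cstdio>
#include <string>

static void print_chat_template_example(const struct llama_model * model, bool use_jinja) {
    // pick up the model's built-in template(s), falling back to chatml when none is present
    common_chat_templates chat_templates =
        common_chat_templates_from_model(model, /* chat_template_override= */ "");

    // render a canned system/user/assistant exchange with the default template
    const std::string example = common_chat_format_example(*chat_templates.template_default, use_jinja);
    printf("example chat format:\n%s\n", example.c_str());
}
```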
@@ -175,11 +175,7 @@ struct common_params_speculative {
|
||||
struct cpu_params cpuparams;
|
||||
struct cpu_params cpuparams_batch;
|
||||
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
|
||||
std::string model = ""; // draft model for speculative decoding // NOLINT
|
||||
std::string model_url = ""; // model url to download // NOLINT
|
||||
std::string model = ""; // draft model for speculative decoding // NOLINT
|
||||
};
|
||||
|
||||
struct common_params_vocoder {
|
||||
@@ -188,8 +184,6 @@ struct common_params_vocoder {
|
||||
|
||||
std::string model = ""; // model path // NOLINT
|
||||
std::string model_url = ""; // model url to download // NOLINT
|
||||
|
||||
bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
|
||||
};
|
||||
|
||||
struct common_params {
|
||||
@@ -252,6 +246,7 @@ struct common_params {
|
||||
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
|
||||
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
|
||||
std::string logits_file = ""; // file for saving *all* logits // NOLINT
|
||||
std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
|
||||
|
||||
std::vector<std::string> in_files; // all input files
|
||||
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
|
||||
@@ -334,7 +329,6 @@ struct common_params {
|
||||
std::string hostname = "127.0.0.1";
|
||||
std::string public_path = ""; // NOLINT
|
||||
std::string chat_template = ""; // NOLINT
|
||||
bool use_jinja = false; // NOLINT
|
||||
bool enable_chat_template = true;
|
||||
|
||||
std::vector<std::string> api_keys;
|
||||
@@ -429,10 +423,6 @@ std::string string_format(const char * fmt, ...);
|
||||
std::string string_strip(const std::string & str);
|
||||
std::string string_get_sortable_timestamp();
|
||||
|
||||
std::string string_join(const std::vector<std::string> & values, const std::string & separator);
|
||||
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
|
||||
std::string string_repeat(const std::string & str, size_t n);
|
||||
|
||||
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
|
||||
|
||||
template<class T>
|
||||
@@ -517,14 +507,12 @@ struct llama_model * common_load_model_from_url(
|
||||
const std::string & local_path,
|
||||
const std::string & hf_token,
|
||||
const struct llama_model_params & params);
|
||||
|
||||
struct llama_model * common_load_model_from_hf(
|
||||
const std::string & repo,
|
||||
const std::string & remote_path,
|
||||
const std::string & local_path,
|
||||
const std::string & hf_token,
|
||||
const struct llama_model_params & params);
|
||||
|
||||
std::pair<std::string, std::string> common_get_hf_file(
|
||||
const std::string & hf_repo_with_tag,
|
||||
const std::string & hf_token);
|
||||
@@ -608,43 +596,30 @@ struct common_chat_msg {
|
||||
std::string content;
|
||||
};
|
||||
|
||||
// Get the built-in chat template for the model. Return empty string if not present.
|
||||
std::string common_get_builtin_chat_template(const struct llama_model * model);
|
||||
|
||||
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
||||
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
|
||||
|
||||
namespace minja {
|
||||
class chat_template;
|
||||
}
|
||||
|
||||
typedef minja::chat_template common_chat_template;
|
||||
|
||||
struct common_chat_templates {
|
||||
bool has_explicit_template; // Model had builtin template or template override was specified.
|
||||
std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
|
||||
std::unique_ptr<common_chat_template> template_tool_use;
|
||||
};
|
||||
bool common_chat_verify_template(const std::string & tmpl);
|
||||
|
||||
// CPP wrapper for llama_chat_apply_template
|
||||
// If the built-in template is not supported, we default to chatml
|
||||
// If the custom "tmpl" is not supported, we throw an error
|
||||
std::string common_chat_apply_template(
|
||||
const common_chat_template & tmpl,
|
||||
std::string common_chat_apply_template(const struct llama_model * model,
|
||||
const std::string & tmpl,
|
||||
const std::vector<common_chat_msg> & chat,
|
||||
bool add_ass,
|
||||
bool use_jinja);
|
||||
bool add_ass);
|
||||
|
||||
// Format single message, while taking into account the position of that message in chat history
|
||||
std::string common_chat_format_single(
|
||||
const common_chat_template & tmpl,
|
||||
std::string common_chat_format_single(const struct llama_model * model,
|
||||
const std::string & tmpl,
|
||||
const std::vector<common_chat_msg> & past_msg,
|
||||
const common_chat_msg & new_msg,
|
||||
bool add_ass,
|
||||
bool use_jinja);
|
||||
bool add_ass);
|
||||
|
||||
// Returns an example of formatted chat
|
||||
std::string common_chat_format_example(
|
||||
const common_chat_template & tmpl, bool use_jinja);
|
||||
|
||||
common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override);
|
||||
std::string common_chat_format_example(const struct llama_model * model,
|
||||
const std::string & tmpl);
|
||||
|
||||
//
|
||||
// KV cache utils
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
#include "json-schema-to-grammar.h"
|
||||
#include "common.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
@@ -13,6 +11,11 @@
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
template <typename Iterator>
|
||||
static std::string join(Iterator begin, Iterator end, const std::string & separator);
|
||||
|
||||
static std::string repeat(const std::string & str, size_t n);
|
||||
|
||||
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
|
||||
auto has_max = max_items != std::numeric_limits<int>::max();
|
||||
|
||||
@@ -125,8 +128,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
|
||||
if (sub_len > 0) {
|
||||
auto from_sub = from.substr(i + 1);
|
||||
auto to_sub = to.substr(i + 1);
|
||||
auto sub_zeros = string_repeat("0", sub_len);
|
||||
auto sub_nines = string_repeat("9", sub_len);
|
||||
auto sub_zeros = repeat("0", sub_len);
|
||||
auto sub_nines = repeat("9", sub_len);
|
||||
|
||||
auto to_reached = false;
|
||||
out << "(";
|
||||
@@ -185,8 +188,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
|
||||
auto max_digits = max_s.length();
|
||||
|
||||
for (auto digits = min_digits; digits < max_digits; digits++) {
|
||||
uniform_range(min_s, string_repeat("9", digits));
|
||||
min_s = "1" + string_repeat("0", digits);
|
||||
uniform_range(min_s, repeat("9", digits));
|
||||
min_s = "1" + repeat("0", digits);
|
||||
out << " | ";
|
||||
}
|
||||
uniform_range(min_s, max_s);
|
||||
@@ -315,6 +318,49 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
|
||||
std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
|
||||
std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
|
||||
|
||||
template <typename Iterator>
|
||||
std::string join(Iterator begin, Iterator end, const std::string & separator) {
|
||||
std::ostringstream result;
|
||||
if (begin != end) {
|
||||
result << *begin;
|
||||
for (Iterator it = begin + 1; it != end; ++it) {
|
||||
result << separator << *it;
|
||||
}
|
||||
}
|
||||
return result.str();
|
||||
}
|
||||
|
||||
static std::vector<std::string> split(const std::string & str, const std::string & delimiter) {
|
||||
std::vector<std::string> tokens;
|
||||
size_t start = 0;
|
||||
size_t end = str.find(delimiter);
|
||||
|
||||
while (end != std::string::npos) {
|
||||
tokens.push_back(str.substr(start, end - start));
|
||||
start = end + delimiter.length();
|
||||
end = str.find(delimiter, start);
|
||||
}
|
||||
|
||||
tokens.push_back(str.substr(start));
|
||||
|
||||
return tokens;
|
||||
}
|
||||
|
||||
static std::string repeat(const std::string & str, size_t n) {
|
||||
if (n == 0) {
|
||||
return "";
|
||||
}
|
||||
|
||||
std::string result;
|
||||
result.reserve(str.length() * n);
|
||||
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
result += str;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch &)> & replacement) {
|
||||
std::smatch match;
|
||||
std::string result;
|
||||
@@ -343,7 +389,6 @@ static std::string format_literal(const std::string & literal) {
|
||||
|
||||
class SchemaConverter {
|
||||
private:
|
||||
friend std::string build_grammar(const std::function<void(const llama_grammar_builder &)> & cb);
|
||||
std::function<json(const std::string &)> _fetch_json;
|
||||
bool _dotall;
|
||||
std::map<std::string, std::string> _rules;
|
||||
@@ -373,7 +418,7 @@ private:
|
||||
for (size_t i = 0; i < alt_schemas.size(); i++) {
|
||||
rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
|
||||
}
|
||||
return string_join(rules, " | ");
|
||||
return join(rules.begin(), rules.end(), " | ");
|
||||
}
|
||||
|
||||
std::string _visit_pattern(const std::string & pattern, const std::string & name) {
|
||||
@@ -436,7 +481,7 @@ private:
|
||||
for (const auto & item : ret) {
|
||||
results.push_back(to_rule(item));
|
||||
}
|
||||
return std::make_pair(string_join(results, " "), false);
|
||||
return std::make_pair(join(results.begin(), results.end(), " "), false);
|
||||
};
|
||||
|
||||
while (i < length) {
|
||||
@@ -494,7 +539,7 @@ private:
|
||||
}
|
||||
curly_brackets += '}';
|
||||
i++;
|
||||
auto nums = string_split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
|
||||
auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
|
||||
int min_times = 0;
|
||||
int max_times = std::numeric_limits<int>::max();
|
||||
try {
|
||||
@@ -809,7 +854,7 @@ public:
|
||||
return;
|
||||
}
|
||||
std::string pointer = ref.substr(ref.find('#') + 1);
|
||||
std::vector<std::string> tokens = string_split(pointer, "/");
|
||||
std::vector<std::string> tokens = split(pointer, "/");
|
||||
for (size_t i = 1; i < tokens.size(); ++i) {
|
||||
std::string sel = tokens[i];
|
||||
if (target.is_null() || !target.contains(sel)) {
|
||||
@@ -860,7 +905,7 @@ public:
|
||||
for (const auto & v : schema["enum"]) {
|
||||
enum_values.push_back(_generate_constant_rule(v));
|
||||
}
|
||||
return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
|
||||
return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
|
||||
} else if ((schema_type.is_null() || schema_type == "object")
|
||||
&& (schema.contains("properties") ||
|
||||
(schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
|
||||
@@ -974,10 +1019,10 @@ public:
|
||||
|
||||
void check_errors() {
|
||||
if (!_errors.empty()) {
|
||||
throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
|
||||
throw std::runtime_error("JSON schema conversion failed:\n" + join(_errors.begin(), _errors.end(), "\n"));
|
||||
}
|
||||
if (!_warnings.empty()) {
|
||||
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
|
||||
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", join(_warnings.begin(), _warnings.end(), "; ").c_str());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -991,27 +1036,10 @@ public:
|
||||
};
|
||||
|
||||
std::string json_schema_to_grammar(const json & schema) {
|
||||
return build_grammar([&](const llama_grammar_builder & callbacks) {
|
||||
auto copy = schema;
|
||||
callbacks.resolve_refs(copy);
|
||||
callbacks.add_schema("", copy);
|
||||
});
|
||||
}
|
||||
|
||||
std::string build_grammar(const std::function<void(const llama_grammar_builder &)> & cb) {
|
||||
SchemaConverter converter([&](const std::string &) { return json(); }, /* dotall= */ false);
|
||||
llama_grammar_builder builder {
|
||||
/* .add_rule = */ [&](const std::string & name, const std::string & rule) {
|
||||
return converter._add_rule(name, rule);
|
||||
},
|
||||
/* .add_schema = */ [&](const std::string & name, const nlohmann::ordered_json & schema) {
|
||||
return converter.visit(schema, name == "root" ? "" : name);
|
||||
},
|
||||
/* .resolve_refs = */ [&](nlohmann::ordered_json & schema) {
|
||||
converter.resolve_refs(schema, "");
|
||||
}
|
||||
};
|
||||
cb(builder);
|
||||
SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false);
|
||||
auto copy = schema;
|
||||
converter.resolve_refs(copy, "input");
|
||||
converter.visit(copy, "");
|
||||
converter.check_errors();
|
||||
return converter.format_grammar();
|
||||
}
|
||||
|
||||
@@ -5,12 +5,4 @@
|
||||
#define JSON_ASSERT GGML_ASSERT
|
||||
#include "json.hpp"
|
||||
|
||||
std::string json_schema_to_grammar(const nlohmann::ordered_json & schema);
|
||||
|
||||
struct llama_grammar_builder {
|
||||
std::function<std::string(const std::string &, const std::string &)> add_rule;
|
||||
std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
|
||||
std::function<void(nlohmann::ordered_json &)> resolve_refs;
|
||||
};
|
||||
|
||||
std::string build_grammar(const std::function<void(const llama_grammar_builder &)> & cb);
|
||||
std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
|
||||
|
||||
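For reference, the grammar conversion touched above can be driven directly from a JSON schema. A minimal sketch, assuming the `json_schema_to_grammar()` declaration shown in this header hunk and the bundled nlohmann JSON:

```cpp
// Sketch only: converts a tiny JSON schema into a GBNF grammar string using the
// json_schema_to_grammar() entry point declared in the header hunk above.
#include "json-schema-to-grammar.h"

#include <cstdio>
#include <string>

int main() {
    nlohmann::ordered_json schema = {
        {"type", "object"},
        {"properties", {
            {"name", {{"type", "string"}}},
            {"age",  {{"type", "integer"}, {"minimum", 0}}},
        }},
        {"required", {"name"}},
    };

    const std::string grammar = json_schema_to_grammar(schema);
    printf("%s\n", grammar.c_str());
    return 0;
}
```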
common/minja.hpp (2812 lines changed): file diff suppressed because it is too large
@@ -696,9 +696,6 @@ class Model:
|
||||
if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
|
||||
# ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
|
||||
res = "deepseek-v3"
|
||||
if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
|
||||
# ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
|
||||
res = "deepseek-r1-qwen"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -2885,66 +2882,6 @@ class InternLM2Model(Model):
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@Model.register("InternLM3ForCausalLM")
|
||||
class InternLM3Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.LLAMA
|
||||
|
||||
def set_vocab(self):
|
||||
tokens, scores, toktypes = self._create_vocab_sentencepiece()
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
if tokenizer_config_file.is_file():
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_config_json = json.load(f)
|
||||
if "add_prefix_space" in tokenizer_config_json:
|
||||
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
|
||||
|
||||
if "added_tokens_decoder" in tokenizer_config_json:
|
||||
for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items():
|
||||
if token_data.get("special"):
|
||||
token_id = int(token_id)
|
||||
token = token_data["content"]
|
||||
special_vocab._set_special_token(token, token_id)
|
||||
# update eos token
|
||||
if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids:
|
||||
special_vocab.special_token_ids["eos"] = token_id
|
||||
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
|
||||
if "head_dim" in hparams:
|
||||
rope_dim = hparams["head_dim"]
|
||||
else:
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
n_head = self.hparams["num_attention_heads"]
|
||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
|
||||
class BertModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.BERT
|
||||
|
||||
@@ -65,50 +65,49 @@ else:
|
||||
|
||||
# TODO: add models here, base models preferred
|
||||
models = [
|
||||
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
|
||||
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
|
||||
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
|
||||
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
|
||||
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
|
||||
{"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
|
||||
{"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
|
||||
{"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", },
|
||||
{"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
|
||||
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
|
||||
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
|
||||
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
|
||||
{"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
|
||||
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
|
||||
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
|
||||
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
|
||||
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
|
||||
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
|
||||
{"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
|
||||
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
||||
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
|
||||
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
|
||||
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
|
||||
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
|
||||
{"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
|
||||
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
|
||||
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
|
||||
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
|
||||
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
|
||||
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
|
||||
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
|
||||
{'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
|
||||
{'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
|
||||
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
|
||||
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
|
||||
{"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
|
||||
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
|
||||
{"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
|
||||
{"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
|
||||
{"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
|
||||
{"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
|
||||
{"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
|
||||
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
|
||||
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
|
||||
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
|
||||
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
|
||||
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
|
||||
{"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
|
||||
{"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
|
||||
{"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", },
|
||||
{"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
|
||||
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
|
||||
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
|
||||
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
|
||||
{"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
|
||||
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
|
||||
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
|
||||
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
|
||||
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
|
||||
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
|
||||
{"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
|
||||
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
||||
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
|
||||
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
|
||||
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
|
||||
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
|
||||
{"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
|
||||
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
|
||||
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
|
||||
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
|
||||
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
|
||||
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
|
||||
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
|
||||
{'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
|
||||
{'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
|
||||
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
|
||||
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
|
||||
{"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
|
||||
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
|
||||
{"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
|
||||
{"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
|
||||
{"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
|
||||
{"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -133,7 +133,7 @@ The docker build option is currently limited to *intel GPU* targets.
### Build image
```sh
# Using FP16
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
```

*Notes*:

@@ -286,7 +286,7 @@ You don't need to install Vulkan SDK. It will be installed inside the container.

```sh
# Build the image
docker build -t llama-cpp-vulkan --target light -f .devops/vulkan.Dockerfile .
docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .

# Then, use it:
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33

@@ -60,9 +60,9 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
## Building Docker locally

```bash
docker build -t local/llama.cpp:full-cuda --target full -f .devops/cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda --target light -f .devops/cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda --target server -f .devops/cuda.Dockerfile .
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
```

You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
@@ -95,9 +95,9 @@ Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/
## Building Docker locally

```bash
docker build -t local/llama.cpp:full-musa --target full -f .devops/musa.Dockerfile .
docker build -t local/llama.cpp:light-musa --target light -f .devops/musa.Dockerfile .
docker build -t local/llama.cpp:server-musa --target server -f .devops/musa.Dockerfile .
docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile .
docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile .
docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile .
```

You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture.

@@ -345,18 +345,8 @@ struct lora_merge_ctx {
        gf = ggml_new_graph(ctx0);
        struct ggml_tensor * cur = inp_base;
        for (size_t i = 0; i < adapters.size(); ++i) {
            struct ggml_tensor * delta;
            bool is_tok_embd = string_starts_with(name_base, "token_embd");
            if (is_tok_embd) {
                printf("%s : detected token embeddings tensor\n", __func__);
                delta = ggml_mul_mat(ctx0,
                    ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32),
                    ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32));
            } else {
                delta = ggml_mul_mat(ctx0,
                    ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32))),
                    ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
            }
            struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
            struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
            // scale
            const float alpha = adapters[i]->alpha;
            const float rank = (float) inp_b[i]->ne[0];

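The merge above builds each adapter's delta as B·Aᵀ and scales it; with the `alpha` and `rank` variables in this hunk that corresponds to the standard LoRA update W' = W + (alpha / rank) · B·Aᵀ. A plain CPU reference of that computation, independent of ggml (the shapes and the alpha/rank scaling are stated assumptions, since the graph code continues past this excerpt):

```cpp
// Sketch only: plain-C++ reference for the LoRA merge built with ggml above.
// W is n x m, A is r x m, B is n x r, all row-major; the merged weight is W + (alpha / r) * B * A.
#include <vector>

static void lora_merge_reference(std::vector<float> & W,        // n*m
                                 const std::vector<float> & A,  // r*m
                                 const std::vector<float> & B,  // n*r
                                 int n, int m, int r, float alpha) {
    const float scale = alpha / (float) r;
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < m; ++j) {
            float delta = 0.0f;
            for (int k = 0; k < r; ++k) {
                delta += B[i*r + k] * A[k*m + j]; // (B * A)[i][j]
            }
            W[i*m + j] += scale * delta;
        }
    }
}
```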
@@ -41,7 +41,7 @@ echo PASS
|
||||
echo
|
||||
|
||||
# 2b. Test the sharded model is loading properly
|
||||
$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
|
||||
$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
|
||||
echo PASS
|
||||
echo
|
||||
|
||||
@@ -51,7 +51,7 @@ echo PASS
|
||||
echo
|
||||
|
||||
# 3b. Test the merged model is loading properly
|
||||
$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
|
||||
$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
|
||||
echo PASS
|
||||
echo
|
||||
|
||||
@@ -61,7 +61,7 @@ echo PASS
|
||||
echo
|
||||
|
||||
# 4b. Test the sharded model is loading properly
|
||||
$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
|
||||
$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
|
||||
echo PASS
|
||||
echo
|
||||
|
||||
@@ -71,7 +71,7 @@ echo
|
||||
#echo
|
||||
|
||||
# 5b. Test the merged model is loading properly
|
||||
#$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
|
||||
#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
|
||||
#echo PASS
|
||||
#echo
|
||||
|
||||
@@ -81,7 +81,7 @@ echo PASS
|
||||
echo
|
||||
|
||||
# 6b. Test the sharded model is loading properly
|
||||
$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
|
||||
$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
|
||||
echo PASS
|
||||
echo
|
||||
|
||||
|
||||
@@ -683,7 +683,7 @@ struct cmd_params_instance {
|
||||
bool cpu_strict;
|
||||
int poll;
|
||||
int n_gpu_layers;
|
||||
std::string rpc_servers_str;
|
||||
std::string rpc_servers;
|
||||
llama_split_mode split_mode;
|
||||
int main_gpu;
|
||||
bool no_kv_offload;
|
||||
@@ -696,37 +696,8 @@ struct cmd_params_instance {
|
||||
llama_model_params mparams = llama_model_default_params();
|
||||
|
||||
mparams.n_gpu_layers = n_gpu_layers;
|
||||
if (!rpc_servers_str.empty()) {
|
||||
auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');
|
||||
|
||||
// add RPC devices
|
||||
if (!rpc_servers.empty()) {
|
||||
ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
|
||||
if (!rpc_reg) {
|
||||
fprintf(stderr, "%s: failed to find RPC backend\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
|
||||
ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
|
||||
if (!ggml_backend_rpc_add_device_fn) {
|
||||
fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
static std::vector<ggml_backend_dev_t> devices;
|
||||
devices.clear();
|
||||
for (const std::string & server : rpc_servers) {
|
||||
ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
|
||||
if (dev) {
|
||||
devices.push_back(dev);
|
||||
} else {
|
||||
fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
devices.push_back(nullptr);
|
||||
mparams.devices = devices.data();
|
||||
}
|
||||
if (!rpc_servers.empty()) {
|
||||
mparams.rpc_servers = rpc_servers.c_str();
|
||||
}
|
||||
mparams.split_mode = split_mode;
|
||||
mparams.main_gpu = main_gpu;
|
||||
@@ -737,7 +708,7 @@ struct cmd_params_instance {
|
||||
}
|
||||
|
||||
bool equal_mparams(const cmd_params_instance & other) const {
|
||||
return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
|
||||
return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers == other.rpc_servers &&
|
||||
split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
|
||||
tensor_split == other.tensor_split;
|
||||
}
|
||||
|
||||
@@ -347,7 +347,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
|
||||
jlong context_pointer,
|
||||
jlong batch_pointer,
|
||||
jstring jtext,
|
||||
jboolean format_chat,
|
||||
jint n_len
|
||||
) {
|
||||
|
||||
@@ -357,8 +356,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
|
||||
const auto context = reinterpret_cast<llama_context *>(context_pointer);
|
||||
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
|
||||
|
||||
bool parse_special = (format_chat == JNI_TRUE);
|
||||
const auto tokens_list = common_tokenize(context, text, true, parse_special);
|
||||
const auto tokens_list = common_tokenize(context, text, 1);
|
||||
|
||||
auto n_ctx = llama_n_ctx(context);
|
||||
auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
|
||||
@@ -370,7 +368,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
|
||||
}
|
||||
|
||||
for (auto id : tokens_list) {
|
||||
LOGi("token: `%s`-> %d ", common_token_to_piece(context, id).c_str(), id);
|
||||
LOGi("%s", common_token_to_piece(context, id).c_str());
|
||||
}
|
||||
|
||||
common_batch_clear(*batch);
|
||||
|
||||
@@ -65,7 +65,6 @@ class LLamaAndroid {
|
||||
context: Long,
|
||||
batch: Long,
|
||||
text: String,
|
||||
formatChat: Boolean,
|
||||
nLen: Int
|
||||
): Int
|
||||
|
||||
@@ -116,10 +115,10 @@ class LLamaAndroid {
|
||||
}
|
||||
}
|
||||
|
||||
fun send(message: String, formatChat: Boolean = false): Flow<String> = flow {
|
||||
fun send(message: String): Flow<String> = flow {
|
||||
when (val state = threadLocalState.get()) {
|
||||
is State.Loaded -> {
|
||||
val ncur = IntVar(completion_init(state.context, state.batch, message, formatChat, nlen))
|
||||
val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
|
||||
while (ncur.value <= nlen) {
|
||||
val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur)
|
||||
if (str == null) {
|
||||
|
||||
@@ -1,46 +0,0 @@
|
||||
## MiniCPM-o 2.6
|
||||
Currently, this readme covers only minicpm-omni's image capabilities; full-mode support will be added as soon as possible.
|
||||
|
||||
### Prepare models and code
|
||||
|
||||
Download the [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch model from Hugging Face into the "MiniCPM-o-2_6" folder.
|
||||
|
||||
Clone llama.cpp:
|
||||
```bash
|
||||
git clone git@github.com:OpenBMB/llama.cpp.git
|
||||
cd llama.cpp
|
||||
git checkout minicpm-omni
|
||||
```
|
||||
|
||||
### Usage of MiniCPM-o 2.6
|
||||
|
||||
Convert the PyTorch model to gguf files (you can also download our pre-converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) files)
|
||||
|
||||
```bash
|
||||
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6
|
||||
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
|
||||
|
||||
# quantize int4 version
|
||||
./llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
|
||||
```
|
||||
|
||||
Build llama.cpp using `CMake`:
|
||||
https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md
|
||||
|
||||
```bash
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
Inference on Linux or Mac
|
||||
```
|
||||
# run f16 version
|
||||
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
|
||||
# run quantized int4 version
|
||||
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
|
||||
# or run in interactive mode
|
||||
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
|
||||
```
|
||||
@@ -718,9 +718,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
else if (ctx->minicpmv_version == 3) {
|
||||
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
|
||||
}
|
||||
else if (ctx->minicpmv_version == 4) {
|
||||
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
|
||||
}
|
||||
ggml_set_name(pos_embed, "pos_embed");
|
||||
ggml_set_input(pos_embed);
|
||||
}
|
||||
@@ -1056,11 +1053,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
n_head = hidden_size/d_head;
|
||||
num_query = 64;
|
||||
}
|
||||
else if (ctx->minicpmv_version == 4) {
|
||||
hidden_size = 3584;
|
||||
n_head = hidden_size/d_head;
|
||||
num_query = 64;
|
||||
}
|
||||
|
||||
struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
|
||||
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
|
||||
@@ -2049,7 +2041,6 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
|
||||
images[images.size()-1].push_back(patch);
|
||||
}
|
||||
}
|
||||
clip_image_u8_free(refine_image);
|
||||
}
|
||||
return images;
|
||||
}
|
||||
@@ -2088,13 +2079,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||
clip_image_f32_free(res);
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < imgs.size(); ++i) {
|
||||
for (size_t j = 0; j < imgs[i].size(); ++j) {
|
||||
if (imgs[i][j] != nullptr) {
|
||||
clip_image_u8_free(imgs[i][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
else if (ctx->has_qwen2vl_merger) {
|
||||
@@ -2351,9 +2335,6 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
|
||||
else if (ctx->minicpmv_version == 3) {
|
||||
n_patches = 64;
|
||||
}
|
||||
else if (ctx->minicpmv_version == 4) {
|
||||
n_patches = 64;
|
||||
}
|
||||
} else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
|
||||
int patch_size = params.patch_size * 2;
|
||||
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
|
||||
@@ -2533,8 +2514,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
|
||||
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
|
||||
int* positions_data = (int*)malloc(ggml_nbytes(positions));
|
||||
int bucket_coords_h[1024];
|
||||
int bucket_coords_w[1024];
|
||||
int bucket_coords_h[70];
|
||||
int bucket_coords_w[70];
|
||||
for (int i = 0; i < pos_h; i++){
|
||||
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
|
||||
}
|
||||
@@ -2562,9 +2543,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
else if (ctx->minicpmv_version == 3) {
|
||||
embed_dim = 3584;
|
||||
}
|
||||
else if (ctx->minicpmv_version == 4) {
|
||||
embed_dim = 3584;
|
||||
}
|
||||
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
|
||||
|
||||
float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
|
||||
@@ -2808,9 +2786,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
else if (ctx->minicpmv_version == 3) {
|
||||
return 3584;
|
||||
}
|
||||
else if (ctx->minicpmv_version == 4) {
|
||||
return 3584;
|
||||
}
|
||||
}
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
|
||||
return ctx->vision_model.mm_1_b->ne[0];
|
||||
|
||||
@@ -216,7 +216,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
||||
return true;
|
||||
}
|
||||
|
||||
static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size) {
|
||||
static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
|
||||
int width = image->nx;
|
||||
int height = image->ny;
|
||||
int num_patches = (height / patch_size) * (width / patch_size);
|
||||
@@ -277,7 +277,13 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
|
||||
}
|
||||
else {
|
||||
encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
|
||||
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
|
||||
if (has_minicpmv_projector == 2) {
|
||||
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
|
||||
}
|
||||
else if (has_minicpmv_projector == 3) {
|
||||
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (!encoded) {
|
||||
@@ -307,9 +313,6 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
load_image_size->height = img->ny;
|
||||
clip_add_load_image_size(ctx_clip, load_image_size);
|
||||
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||
delete[] img_res_v.data;
|
||||
img_res_v.size = 0;
|
||||
img_res_v.data = nullptr;
|
||||
}
|
||||
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
|
||||
// flat / default llava-1.5 type embedding
|
||||
|
||||
@@ -140,9 +140,6 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
|
||||
else if (has_minicpmv_projector == 3) {
|
||||
system_prompt = "<|im_start|>user\n";
|
||||
}
|
||||
else if (has_minicpmv_projector == 4) {
|
||||
system_prompt = "<|im_start|>user\n";
|
||||
}
|
||||
LOG_INF("%s: image token past: %d\n", __func__, n_past);
|
||||
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
|
||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
@@ -230,9 +227,6 @@ static struct common_sampler * llama_init(struct llava_context * ctx_llava, comm
|
||||
else if (has_minicpmv_projector == 3) {
|
||||
user_prompt = "<|im_start|>user\n" + prompt;
|
||||
}
|
||||
else if (has_minicpmv_projector == 4) {
|
||||
user_prompt = "<|im_start|>user\n" + prompt;
|
||||
}
|
||||
}
|
||||
|
||||
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
||||
@@ -242,9 +236,6 @@ static struct common_sampler * llama_init(struct llava_context * ctx_llava, comm
|
||||
else if (has_minicpmv_projector == 3) {
|
||||
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
|
||||
}
|
||||
else if (has_minicpmv_projector == 4) {
|
||||
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
|
||||
}
|
||||
|
||||
// generate the response
|
||||
|
||||
@@ -317,6 +308,7 @@ int main(int argc, char ** argv) {
|
||||
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
|
||||
response += tmp;
|
||||
if (strcmp(tmp, "</s>") == 0) break;
|
||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||
printf("%s", tmp);// mistral llava-1.6
|
||||
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
|
||||
fflush(stdout);
|
||||
|
||||
@@ -501,7 +501,7 @@ default_image_mean = [0.48145466, 0.4578275, 0.40821073]
|
||||
default_image_std = [0.26862954, 0.26130258, 0.27577711]
|
||||
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
|
||||
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
|
||||
ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4', default=2)
|
||||
ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3', default=2)
|
||||
|
||||
# with proper
|
||||
args = ap.parse_args()
|
||||
@@ -545,19 +545,12 @@ if args.use_f32:
|
||||
|
||||
minicpmv_version = args.minicpmv_version
|
||||
emb_dim = 4096
|
||||
block_count = 26
|
||||
if minicpmv_version == 1:
|
||||
emb_dim = 2304
|
||||
block_count = 26
|
||||
elif minicpmv_version == 2:
|
||||
emb_dim = 4096
|
||||
block_count = 27
|
||||
elif minicpmv_version == 3:
|
||||
emb_dim = 3584
|
||||
block_count = 27
|
||||
elif minicpmv_version == 4:
|
||||
emb_dim = 3584
|
||||
block_count = 27
|
||||
|
||||
default_vision_config = {
|
||||
"hidden_size": 1152,
|
||||
@@ -574,9 +567,6 @@ model = Idefics2VisionTransformer(vision_config)
|
||||
if minicpmv_version == 3:
|
||||
vision_config = SiglipVisionConfig(**default_vision_config)
|
||||
model = SiglipVisionTransformer(vision_config)
|
||||
elif minicpmv_version == 4:
|
||||
vision_config = SiglipVisionConfig(**default_vision_config)
|
||||
model = SiglipVisionTransformer(vision_config)
|
||||
|
||||
processor = None
|
||||
# if model.attn_pool is not None:
|
||||
@@ -597,7 +587,7 @@ elif args.minicpmv_projector is not None:
|
||||
fname_middle = "mmproj-"
|
||||
has_text_encoder = False
|
||||
has_minicpmv_projector = True
|
||||
minicpmv_version = 4
|
||||
minicpmv_version = 3
|
||||
elif args.vision_only:
|
||||
fname_middle = "vision-"
|
||||
has_text_encoder = False
|
||||
@@ -635,6 +625,7 @@ if has_vision_encoder:
|
||||
fout.add_uint32("clip.vision.projection_dim", 0)
|
||||
fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
|
||||
fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
|
||||
block_count = 26
|
||||
fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
|
||||
|
||||
if processor is not None:
|
||||
|
||||
@@ -8,7 +8,7 @@ ap.add_argument("-m", "--model", help="Path to MiniCPM-V model")
|
||||
args = ap.parse_args()
|
||||
|
||||
# find the model part that includes the multimodal projector weights
|
||||
model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True, torch_dtype=torch.bfloat16)
|
||||
model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True)
|
||||
checkpoint = model.state_dict()
|
||||
|
||||
# get a list of mm tensor names
|
||||
|
||||
examples/main-cmake-pkg/CMakeLists.txt (new file, 32 lines)
@@ -0,0 +1,32 @@
cmake_minimum_required(VERSION 3.12)
project("llama-cli-cmake-pkg" C CXX)
set(TARGET llama-cli-cmake-pkg)

find_package(Llama 0.0.1 REQUIRED)

# Bake common functionality in with target. Because applications
# using the relocatable Llama package should be outside of the
# source tree, llama-cli-cmake-pkg pretends the dependencies are built-in.
set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
add_library(common OBJECT)
file(GLOB _common_files
    "${_common_path}/*.h"
    "${_common_path}/*.cpp"
)
target_sources(common PRIVATE ${_common_files})

# If the common project was part of "llama-cli-cmake-pkg" the transient
# defines would automatically be attached. Because the common func-
# tionality is separate, but dependent upon the defines, it must be
# explicitly extracted from the "llama" target.
#
get_target_property(_llama_transient_defines llama
    INTERFACE_COMPILE_DEFINITIONS)

target_compile_definitions(common PRIVATE "${_llama_transient_defines}")

add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
target_include_directories(${TARGET} PRIVATE ${_common_path})
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
examples/main-cmake-pkg/README.md (new file, 31 lines)
@@ -0,0 +1,31 @@
# llama.cpp/example/main-cmake-pkg

This program builds [llama-cli](../main) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.

## Building

Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions.

### Considerations

When hardware acceleration libraries are used (e.g. CUDA, Metal, etc.), CMake must be able to locate the associated CMake package.

### Build llama.cpp and install to C:\LlamaCPP directory

```cmd
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
cmake -B build -DBUILD_SHARED_LIBS=OFF -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/LlamaCPP
```

### Build llama-cli-cmake-pkg

```cmd
cd ..\examples\main-cmake-pkg
cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/MyLlamaApp
```
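Assuming the install above succeeded, the freshly installed binary can be smoke-tested directly from the install prefix. The `bin` subdirectory and the model path are assumptions for illustration, not part of the original example:

```cmd
REM hypothetical model path; any GGUF model works here
C:\MyLlamaApp\bin\llama-cli-cmake-pkg.exe -m C:\models\ggml-model-q4_0.gguf -p "Hello" -n 64
```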
@@ -310,9 +310,9 @@ These options help improve the performance and memory usage of the LLaMA models.

### Batch Size

- `-ub N`, `--ubatch-size N`: Physical batch size. This is the maximum number of tokens that may be processed at a time. Increasing this value may improve performance during prompt processing, at the expense of higher memory usage. Default: `512`.
- `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.

- `-b N`, `--batch-size N`: Logical batch size. Increasing this value above the value of the physical batch size may improve prompt processing performance when using multiple GPUs with pipeline parallelism. Default: `2048`.
- `-ub N`, `--ubatch-size N`: physical maximum batch size. This is for pipeline parallelization. Default: `512`.
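To make the relationship between the two options concrete, here is a hedged sketch of an invocation that keeps the logical batch at its default while lowering the physical batch to save memory (model path and prompt are placeholders):

```bash
# -b sets the logical batch; -ub caps how many tokens are physically processed per step
llama-cli -m ./model.gguf -p "Hello" -b 2048 -ub 256
```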

### Prompt Caching

@@ -4,7 +4,6 @@
|
||||
#include "log.h"
|
||||
#include "sampling.h"
|
||||
#include "llama.h"
|
||||
#include "chat-template.hpp"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
@@ -85,6 +84,14 @@ static void sigint_handler(int signo) {
|
||||
}
|
||||
#endif
|
||||
|
||||
static std::string chat_add_and_format(struct llama_model * model, std::vector<common_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
|
||||
common_chat_msg new_msg{role, content};
|
||||
auto formatted = common_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
|
||||
chat_msgs.push_back({role, content});
|
||||
LOG_DBG("formatted: '%s'\n", formatted.c_str());
|
||||
return formatted;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
g_params = ¶ms;
|
||||
@@ -158,7 +165,6 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
auto chat_templates = common_chat_templates_from_model(model, params.chat_template);
|
||||
|
||||
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
|
||||
|
||||
@@ -201,7 +207,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// auto enable conversation mode if chat template is available
|
||||
const bool has_chat_template = chat_templates.has_explicit_template && chat_templates.template_default;
|
||||
const bool has_chat_template = !common_get_builtin_chat_template(model).empty() || !params.chat_template.empty();
|
||||
if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
|
||||
if (has_chat_template) {
|
||||
LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
|
||||
@@ -219,7 +225,7 @@ int main(int argc, char ** argv) {
|
||||
// print chat template example in conversation mode
|
||||
if (params.conversation_mode) {
|
||||
if (params.enable_chat_template) {
|
||||
LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(*chat_templates.template_default, params.use_jinja).c_str());
|
||||
LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
|
||||
} else {
|
||||
LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
|
||||
}
|
||||
@@ -263,18 +269,10 @@ int main(int argc, char ** argv) {
|
||||
|
||||
std::vector<llama_token> embd_inp;
|
||||
|
||||
auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) {
|
||||
common_chat_msg new_msg{role, content};
|
||||
auto formatted = common_chat_format_single(*chat_templates.template_default, chat_msgs, new_msg, role == "user", g_params->use_jinja);
|
||||
chat_msgs.push_back({role, content});
|
||||
LOG_DBG("formatted: '%s'\n", formatted.c_str());
|
||||
return formatted;
|
||||
};
|
||||
|
||||
{
|
||||
auto prompt = (params.conversation_mode && params.enable_chat_template)
|
||||
// format the system prompt in conversation mode (fallback to default if empty)
|
||||
? chat_add_and_format("system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
|
||||
? chat_add_and_format(model, chat_msgs, "system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
|
||||
// otherwise use the prompt as is
|
||||
: params.prompt;
|
||||
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
|
||||
@@ -781,7 +779,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (params.enable_chat_template) {
|
||||
chat_add_and_format("assistant", assistant_ss.str());
|
||||
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
|
||||
}
|
||||
is_interacting = true;
|
||||
LOG("\n");
|
||||
@@ -846,7 +844,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
bool format_chat = params.conversation_mode && params.enable_chat_template;
|
||||
std::string user_inp = format_chat
|
||||
? chat_add_and_format("user", std::move(buffer))
|
||||
? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
|
||||
: std::move(buffer);
|
||||
// TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
|
||||
const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true);
|
||||
|
||||
@@ -47,7 +47,7 @@ echo PASS
|
||||
echo
|
||||
|
||||
# 3a. Test the requanted model is loading properly
|
||||
$MAIN -no-cnv --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32
|
||||
$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32
|
||||
echo PASS
|
||||
echo
|
||||
|
||||
@@ -57,7 +57,7 @@ echo PASS
|
||||
echo
|
||||
|
||||
# 4b. Test the requanted model is loading properly
|
||||
$MAIN -no-cnv --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32
|
||||
$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32
|
||||
echo PASS
|
||||
echo
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
set(TARGET llama-run)
|
||||
add_executable(${TARGET} run.cpp linenoise.cpp/linenoise.cpp)
|
||||
add_executable(${TARGET} run.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -3,10 +3,11 @@
The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.

```bash
llama-run granite3-moe
llama-run granite-code
```
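The options documented in the help text below can be combined with the model reference on a single line; a sketch under the assumption that a prompt may be passed as the trailing positional argument (model name and prompt are placeholders):

```bash
# 4096-token context, 99 layers offloaded to the GPU, slightly cooler sampling
llama-run -c 4096 -n 99 --temp 0.7 granite-code "Explain the KV cache in one paragraph"
```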

```bash
llama-run -h
Description:
  Runs a llm

@@ -16,7 +17,7 @@ Usage:
Options:
  -c, --context-size <value>
      Context size (default: 2048)
  -n, -ngl, --ngl <value>
  -n, --ngl <value>
      Number of GPU layers (default: 0)
  --temp <value>
      Temperature (default: 0.8)
@@ -1,26 +0,0 @@
|
||||
Copyright (c) 2010-2014, Salvatore Sanfilippo <antirez at gmail dot com>
|
||||
Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
|
||||
Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
File diff suppressed because it is too large
@@ -1,128 +0,0 @@
|
||||
/* linenoise.h -- VERSION 1.0
|
||||
*
|
||||
* Guerrilla line editing library against the idea that a line editing lib
|
||||
* needs to be 20,000 lines of C++ code.
|
||||
*
|
||||
* See linenoise.cpp for more information.
|
||||
*
|
||||
* ------------------------------------------------------------------------
|
||||
*
|
||||
* Copyright (c) 2010-2023, Salvatore Sanfilippo <antirez at gmail dot com>
|
||||
* Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
|
||||
* Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>
|
||||
*
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef __LINENOISE_H
|
||||
#define __LINENOISE_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stddef.h> /* For size_t. */
|
||||
#include <stdlib.h>
|
||||
|
||||
extern const char *linenoiseEditMore;
|
||||
|
||||
/* The linenoiseState structure represents the state during line editing.
|
||||
* We pass this state to functions implementing specific editing
|
||||
* functionalities. */
|
||||
struct linenoiseState {
|
||||
int in_completion; /* The user pressed TAB and we are now in completion
|
||||
* mode, so input is handled by completeLine(). */
|
||||
size_t completion_idx; /* Index of next completion to propose. */
|
||||
int ifd; /* Terminal stdin file descriptor. */
|
||||
int ofd; /* Terminal stdout file descriptor. */
|
||||
char *buf; /* Edited line buffer. */
|
||||
size_t buflen; /* Edited line buffer size. */
|
||||
const char *prompt; /* Prompt to display. */
|
||||
size_t plen; /* Prompt length. */
|
||||
size_t pos; /* Current cursor position. */
|
||||
size_t oldpos; /* Previous refresh cursor position. */
|
||||
size_t len; /* Current edited line length. */
|
||||
size_t cols; /* Number of columns in terminal. */
|
||||
size_t oldrows; /* Rows used by last refreshed line (multiline mode) */
|
||||
int history_index; /* The history index we are currently editing. */
|
||||
};
|
||||
|
||||
struct linenoiseCompletions {
|
||||
size_t len = 0;
|
||||
char ** cvec = nullptr;
|
||||
bool to_free = true;
|
||||
|
||||
~linenoiseCompletions() {
|
||||
if (!to_free) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
free(cvec[i]);
|
||||
}
|
||||
|
||||
free(cvec);
|
||||
}
|
||||
};
|
||||
|
||||
/* Non blocking API. */
|
||||
int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt);
|
||||
const char *linenoiseEditFeed(struct linenoiseState *l);
|
||||
void linenoiseEditStop(struct linenoiseState *l);
|
||||
void linenoiseHide(struct linenoiseState *l);
|
||||
void linenoiseShow(struct linenoiseState *l);
|
||||
|
||||
/* Blocking API. */
|
||||
const char *linenoise(const char *prompt);
|
||||
void linenoiseFree(void *ptr);
|
||||
|
||||
/* Completion API. */
|
||||
typedef void(linenoiseCompletionCallback)(const char *, linenoiseCompletions *);
|
||||
typedef const char*(linenoiseHintsCallback)(const char *, int *color, int *bold);
|
||||
typedef void(linenoiseFreeHintsCallback)(const char *);
|
||||
void linenoiseSetCompletionCallback(linenoiseCompletionCallback *);
|
||||
void linenoiseSetHintsCallback(linenoiseHintsCallback *);
|
||||
void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *);
|
||||
void linenoiseAddCompletion(linenoiseCompletions *, const char *);
|
||||
|
||||
/* History API. */
|
||||
int linenoiseHistoryAdd(const char *line);
|
||||
int linenoiseHistorySetMaxLen(int len);
|
||||
int linenoiseHistorySave(const char *filename);
|
||||
int linenoiseHistoryLoad(const char *filename);
|
||||
|
||||
/* Other utilities. */
|
||||
void linenoiseClearScreen(void);
|
||||
void linenoiseSetMultiLine(int ml);
|
||||
void linenoisePrintKeyCodes(void);
|
||||
void linenoiseMaskModeEnable(void);
|
||||
void linenoiseMaskModeDisable(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __LINENOISE_H */
|
||||
@@ -19,16 +19,13 @@
|
||||
#include <cstring>
|
||||
#include <filesystem>
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "common.h"
|
||||
#include "json.hpp"
|
||||
#include "linenoise.cpp/linenoise.h"
|
||||
#include "llama-cpp.h"
|
||||
#include "chat-template.hpp"
|
||||
|
||||
#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32)
|
||||
[[noreturn]] static void sigint_handler(int) {
|
||||
@@ -106,7 +103,6 @@ class Opt {
|
||||
llama_model_params model_params;
|
||||
std::string model_;
|
||||
std::string user;
|
||||
bool use_jinja = false;
|
||||
int context_size = -1, ngl = -1;
|
||||
float temperature = -1;
|
||||
bool verbose = false;
|
||||
@@ -147,8 +143,7 @@ class Opt {
|
||||
if (handle_option_with_value(argc, argv, i, context_size) == 1) {
|
||||
return 1;
|
||||
}
|
||||
} else if (options_parsing &&
|
||||
(strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
|
||||
} else if (options_parsing && (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0)) {
|
||||
if (handle_option_with_value(argc, argv, i, ngl) == 1) {
|
||||
return 1;
|
||||
}
|
||||
@@ -159,8 +154,6 @@ class Opt {
|
||||
} else if (options_parsing &&
|
||||
(parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
|
||||
verbose = true;
|
||||
} else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
|
||||
use_jinja = true;
|
||||
} else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
|
||||
help = true;
|
||||
return 0;
|
||||
@@ -181,10 +174,6 @@ class Opt {
|
||||
}
|
||||
}
|
||||
|
||||
if (model_.empty()){
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -199,7 +188,7 @@ class Opt {
|
||||
"Options:\n"
|
||||
" -c, --context-size <value>\n"
|
||||
" Context size (default: %d)\n"
|
||||
" -n, -ngl, --ngl <value>\n"
|
||||
" -n, --ngl <value>\n"
|
||||
" Number of GPU layers (default: %d)\n"
|
||||
" --temp <value>\n"
|
||||
" Temperature (default: %.1f)\n"
|
||||
@@ -323,10 +312,6 @@ class HttpClient {
|
||||
public:
|
||||
int init(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
|
||||
const bool progress, std::string * response_str = nullptr) {
|
||||
if (std::filesystem::exists(output_file)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::string output_file_partial;
|
||||
curl = curl_easy_init();
|
||||
if (!curl) {
|
||||
@@ -354,11 +339,7 @@ class HttpClient {
|
||||
data.file_size = set_resume_point(output_file_partial);
|
||||
set_progress_options(progress, data);
|
||||
set_headers(headers);
|
||||
CURLcode res = perform(url);
|
||||
if (res != CURLE_OK){
|
||||
printe("Fetching resource '%s' failed: %s\n", url.c_str(), curl_easy_strerror(res));
|
||||
return 1;
|
||||
}
|
||||
perform(url);
|
||||
if (!output_file.empty()) {
|
||||
std::filesystem::rename(output_file_partial, output_file);
|
||||
}
|
||||
@@ -423,12 +404,16 @@ class HttpClient {
|
||||
}
|
||||
}
|
||||
|
||||
CURLcode perform(const std::string & url) {
|
||||
void perform(const std::string & url) {
|
||||
CURLcode res;
|
||||
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
||||
curl_easy_setopt(curl, CURLOPT_DEFAULT_PROTOCOL, "https");
|
||||
curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);
|
||||
return curl_easy_perform(curl);
|
||||
res = curl_easy_perform(curl);
|
||||
if (res != CURLE_OK) {
|
||||
printe("curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
|
||||
}
|
||||
}
|
||||
|
||||
static std::string human_readable_time(double seconds) {
|
||||
@@ -551,7 +536,7 @@ class LlamaData {
|
||||
llama_sampler_ptr sampler;
|
||||
llama_context_ptr context;
|
||||
std::vector<llama_chat_message> messages;
|
||||
std::list<std::string> msg_strs;
|
||||
std::vector<std::string> msg_strs;
|
||||
std::vector<char> fmtted;
|
||||
|
||||
int init(Opt & opt) {
|
||||
@@ -566,14 +551,13 @@ class LlamaData {
|
||||
}
|
||||
|
||||
sampler = initialize_sampler(opt);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
private:
|
||||
#ifdef LLAMA_USE_CURL
|
||||
int download(const std::string & url, const std::string & output_file, const bool progress,
|
||||
const std::vector<std::string> & headers = {}, std::string * response_str = nullptr) {
|
||||
int download(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
|
||||
const bool progress, std::string * response_str = nullptr) {
|
||||
HttpClient http;
|
||||
if (http.init(url, headers, output_file, progress, response_str)) {
|
||||
return 1;
|
||||
@@ -582,85 +566,48 @@ class LlamaData {
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
int download(const std::string &, const std::string &, const bool, const std::vector<std::string> & = {},
|
||||
int download(const std::string &, const std::vector<std::string> &, const std::string &, const bool,
|
||||
std::string * = nullptr) {
|
||||
printe("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
|
||||
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Helper function to handle model tag extraction and URL construction
|
||||
std::pair<std::string, std::string> extract_model_and_tag(std::string & model, const std::string & base_url) {
|
||||
std::string model_tag = "latest";
|
||||
const size_t colon_pos = model.find(':');
|
||||
int huggingface_dl(const std::string & model, const std::vector<std::string> headers, const std::string & bn) {
|
||||
// Find the second occurrence of '/' after protocol string
|
||||
size_t pos = model.find('/');
|
||||
pos = model.find('/', pos + 1);
|
||||
if (pos == std::string::npos) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const std::string hfr = model.substr(0, pos);
|
||||
const std::string hff = model.substr(pos + 1);
|
||||
const std::string url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff;
|
||||
return download(url, headers, bn, true);
|
||||
}
|
||||
|
||||
int ollama_dl(std::string & model, const std::vector<std::string> headers, const std::string & bn) {
|
||||
if (model.find('/') == std::string::npos) {
|
||||
model = "library/" + model;
|
||||
}
|
||||
|
||||
std::string model_tag = "latest";
|
||||
size_t colon_pos = model.find(':');
|
||||
if (colon_pos != std::string::npos) {
|
||||
model_tag = model.substr(colon_pos + 1);
|
||||
model = model.substr(0, colon_pos);
|
||||
}
|
||||
|
||||
std::string url = base_url + model + "/manifests/" + model_tag;
|
||||
|
||||
return { model, url };
|
||||
}
|
||||
|
||||
// Helper function to download and parse the manifest
|
||||
int download_and_parse_manifest(const std::string & url, const std::vector<std::string> & headers,
|
||||
nlohmann::json & manifest) {
|
||||
std::string manifest_url = "https://registry.ollama.ai/v2/" + model + "/manifests/" + model_tag;
|
||||
std::string manifest_str;
|
||||
int ret = download(url, "", false, headers, &manifest_str);
|
||||
const int ret = download(manifest_url, headers, "", false, &manifest_str);
|
||||
if (ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
manifest = nlohmann::json::parse(manifest_str);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int huggingface_dl(std::string & model, const std::string & bn) {
|
||||
// Find the second occurrence of '/' after protocol string
|
||||
size_t pos = model.find('/');
|
||||
pos = model.find('/', pos + 1);
|
||||
std::string hfr, hff;
|
||||
std::vector<std::string> headers = { "User-Agent: llama-cpp", "Accept: application/json" };
|
||||
std::string url;
|
||||
|
||||
if (pos == std::string::npos) {
|
||||
auto [model_name, manifest_url] = extract_model_and_tag(model, "https://huggingface.co/v2/");
|
||||
hfr = model_name;
|
||||
|
||||
nlohmann::json manifest;
|
||||
int ret = download_and_parse_manifest(manifest_url, headers, manifest);
|
||||
if (ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
hff = manifest["ggufFile"]["rfilename"];
|
||||
} else {
|
||||
hfr = model.substr(0, pos);
|
||||
hff = model.substr(pos + 1);
|
||||
}
|
||||
|
||||
url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff;
|
||||
|
||||
return download(url, bn, true, headers);
|
||||
}
|
||||
|
||||
int ollama_dl(std::string & model, const std::string & bn) {
|
||||
const std::vector<std::string> headers = { "Accept: application/vnd.docker.distribution.manifest.v2+json" };
|
||||
if (model.find('/') == std::string::npos) {
|
||||
model = "library/" + model;
|
||||
}
|
||||
|
||||
auto [model_name, manifest_url] = extract_model_and_tag(model, "https://registry.ollama.ai/v2/");
|
||||
nlohmann::json manifest;
|
||||
int ret = download_and_parse_manifest(manifest_url, {}, manifest);
|
||||
if (ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::string layer;
|
||||
nlohmann::json manifest = nlohmann::json::parse(manifest_str);
|
||||
std::string layer;
|
||||
for (const auto & l : manifest["layers"]) {
|
||||
if (l["mediaType"] == "application/vnd.ollama.image.model") {
|
||||
layer = l["digest"];
|
||||
@@ -668,34 +615,8 @@ class LlamaData {
|
||||
}
|
||||
}
|
||||
|
||||
std::string blob_url = "https://registry.ollama.ai/v2/" + model_name + "/blobs/" + layer;
|
||||
|
||||
return download(blob_url, bn, true, headers);
|
||||
}
|
||||
|
||||
int github_dl(const std::string & model, const std::string & bn) {
|
||||
std::string repository = model;
|
||||
std::string branch = "main";
|
||||
const size_t at_pos = model.find('@');
|
||||
if (at_pos != std::string::npos) {
|
||||
repository = model.substr(0, at_pos);
|
||||
branch = model.substr(at_pos + 1);
|
||||
}
|
||||
|
||||
const std::vector<std::string> repo_parts = string_split(repository, "/");
|
||||
if (repo_parts.size() < 3) {
|
||||
printe("Invalid GitHub repository format\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
const std::string & org = repo_parts[0];
|
||||
const std::string & project = repo_parts[1];
|
||||
std::string url = "https://raw.githubusercontent.com/" + org + "/" + project + "/" + branch;
|
||||
for (size_t i = 2; i < repo_parts.size(); ++i) {
|
||||
url += "/" + repo_parts[i];
|
||||
}
|
||||
|
||||
return download(url, bn, true);
|
||||
std::string blob_url = "https://registry.ollama.ai/v2/" + model + "/blobs/" + layer;
|
||||
return download(blob_url, headers, bn, true);
|
||||
}
|
||||
|
||||
std::string basename(const std::string & path) {
|
||||
@@ -707,41 +628,37 @@ class LlamaData {
|
||||
return path.substr(pos + 1);
|
||||
}
|
||||
|
||||
int rm_until_substring(std::string & model_, const std::string & substring) {
|
||||
const std::string::size_type pos = model_.find(substring);
|
||||
int remove_proto(std::string & model_) {
|
||||
const std::string::size_type pos = model_.find("://");
|
||||
if (pos == std::string::npos) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
model_ = model_.substr(pos + substring.size()); // Skip past the substring
|
||||
model_ = model_.substr(pos + 3); // Skip past "://"
|
||||
return 0;
|
||||
}
|
||||
|
||||
int resolve_model(std::string & model_) {
|
||||
int ret = 0;
|
||||
if (string_starts_with(model_, "file://") || std::filesystem::exists(model_)) {
|
||||
rm_until_substring(model_, "://");
|
||||
remove_proto(model_);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
const std::string bn = basename(model_);
|
||||
if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://") ||
|
||||
string_starts_with(model_, "hf.co/")) {
|
||||
rm_until_substring(model_, "hf.co/");
|
||||
rm_until_substring(model_, "://");
|
||||
ret = huggingface_dl(model_, bn);
|
||||
} else if ((string_starts_with(model_, "https://") || string_starts_with(model_, "http://")) &&
|
||||
!string_starts_with(model_, "https://ollama.com/library/")) {
|
||||
ret = download(model_, bn, true);
|
||||
} else if (string_starts_with(model_, "github:") || string_starts_with(model_, "github://")) {
|
||||
rm_until_substring(model_, "github:");
|
||||
rm_until_substring(model_, "://");
|
||||
ret = github_dl(model_, bn);
|
||||
} else { // ollama:// or nothing
|
||||
rm_until_substring(model_, "ollama.com/library/");
|
||||
rm_until_substring(model_, "://");
|
||||
ret = ollama_dl(model_, bn);
|
||||
const std::string bn = basename(model_);
|
||||
const std::vector<std::string> headers = { "--header",
|
||||
"Accept: application/vnd.docker.distribution.manifest.v2+json" };
|
||||
if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) {
|
||||
remove_proto(model_);
|
||||
ret = huggingface_dl(model_, headers, bn);
|
||||
} else if (string_starts_with(model_, "ollama://")) {
|
||||
remove_proto(model_);
|
||||
ret = ollama_dl(model_, headers, bn);
|
||||
} else if (string_starts_with(model_, "https://")) {
|
||||
download(model_, headers, bn, true);
|
||||
} else {
|
||||
ret = ollama_dl(model_, headers, bn);
|
||||
}
|
||||
|
||||
model_ = bn;
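For orientation, the model argument handled by `resolve_model()` above can take several forms; the following sketch exercises the branches visible in the hunk (names are placeholders, and the exact scheme list differs between the two sides of this diff):

```bash
llama-run file://models/model.gguf        # local file; the protocol prefix is stripped
llama-run hf://org/repo/model.gguf        # resolved against huggingface.co
llama-run ollama://granite-code           # resolved against registry.ollama.ai
llama-run https://example.com/model.gguf  # plain HTTP(S) download
```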
|
||||
@@ -794,31 +711,13 @@ static void add_message(const char * role, const std::string & text, LlamaData &
|
||||
}
|
||||
|
||||
// Function to apply the chat template and resize `formatted` if needed
|
||||
static int apply_chat_template(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, bool use_jinja) {
|
||||
if (use_jinja) {
|
||||
json messages = json::array();
|
||||
for (const auto & msg : llama_data.messages) {
|
||||
messages.push_back({
|
||||
{"role", msg.role},
|
||||
{"content", msg.content},
|
||||
});
|
||||
}
|
||||
try {
|
||||
auto result = tmpl.apply(messages, /* tools= */ json(), append);
|
||||
llama_data.fmtted.resize(result.size() + 1);
|
||||
memcpy(llama_data.fmtted.data(), result.c_str(), result.size() + 1);
|
||||
return result.size();
|
||||
} catch (const std::exception & e) {
|
||||
printe("failed to render the chat template: %s\n", e.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
static int apply_chat_template(LlamaData & llama_data, const bool append) {
|
||||
int result = llama_chat_apply_template(
|
||||
tmpl.source().c_str(), llama_data.messages.data(), llama_data.messages.size(), append,
|
||||
llama_model_chat_template(llama_data.model.get()), llama_data.messages.data(), llama_data.messages.size(), append,
|
||||
append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0);
|
||||
if (append && result > static_cast<int>(llama_data.fmtted.size())) {
|
||||
llama_data.fmtted.resize(result);
|
||||
result = llama_chat_apply_template(tmpl.source().c_str(), llama_data.messages.data(),
|
||||
result = llama_chat_apply_template(llama_model_chat_template(llama_data.model.get()), llama_data.messages.data(),
|
||||
llama_data.messages.size(), append, llama_data.fmtted.data(),
|
||||
llama_data.fmtted.size());
|
||||
}
|
||||
@@ -828,12 +727,10 @@ static int apply_chat_template(const common_chat_template & tmpl, LlamaData & ll
|
||||
|
||||
// Function to tokenize the prompt
|
||||
static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
|
||||
std::vector<llama_token> & prompt_tokens, const LlamaData & llama_data) {
|
||||
const bool is_first = llama_get_kv_cache_used_cells(llama_data.context.get()) == 0;
|
||||
|
||||
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
|
||||
std::vector<llama_token> & prompt_tokens) {
|
||||
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
|
||||
prompt_tokens.resize(n_prompt_tokens);
|
||||
if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), is_first,
|
||||
if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
|
||||
true) < 0) {
|
||||
printe("failed to tokenize the prompt\n");
|
||||
return -1;
|
||||
@@ -879,7 +776,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
|
||||
const llama_vocab * vocab = llama_model_get_vocab(llama_data.model.get());
|
||||
|
||||
std::vector<llama_token> tokens;
|
||||
if (tokenize_prompt(vocab, prompt, tokens, llama_data) < 0) {
|
||||
if (tokenize_prompt(vocab, prompt, tokens) < 0) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -910,44 +807,24 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
|
||||
batch = llama_batch_get_one(&new_token_id, 1);
|
||||
}
|
||||
|
||||
printf("\033[0m");
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int read_user_input(std::string & user_input) {
|
||||
static const char * prompt_prefix = "> ";
|
||||
#ifdef WIN32
|
||||
printf(
|
||||
"\r%*s"
|
||||
"\r\033[0m%s",
|
||||
get_terminal_width(), " ", prompt_prefix);
|
||||
|
||||
std::getline(std::cin, user_input);
|
||||
static int read_user_input(std::string & user) {
|
||||
std::getline(std::cin, user);
|
||||
if (std::cin.eof()) {
|
||||
printf("\n");
|
||||
return 1;
|
||||
}
|
||||
#else
|
||||
std::unique_ptr<char, decltype(&std::free)> line(const_cast<char *>(linenoise(prompt_prefix)), free);
|
||||
if (!line) {
|
||||
|
||||
if (user == "/bye") {
|
||||
return 1;
|
||||
}
|
||||
|
||||
user_input = line.get();
|
||||
#endif
|
||||
|
||||
if (user_input == "/bye") {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (user_input.empty()) {
|
||||
if (user.empty()) {
|
||||
return 2;
|
||||
}
|
||||
|
||||
#ifndef WIN32
|
||||
linenoiseHistoryAdd(line.get());
|
||||
#endif
|
||||
|
||||
return 0; // Should have data in happy path
|
||||
}
|
||||
|
||||
@@ -970,8 +847,8 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
|
||||
}
|
||||
|
||||
// Helper function to apply the chat template and handle errors
|
||||
static int apply_chat_template_with_error_handling(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, int & output_length, bool use_jinja) {
|
||||
const int new_len = apply_chat_template(tmpl, llama_data, append, use_jinja);
|
||||
static int apply_chat_template_with_error_handling(LlamaData & llama_data, const bool append, int & output_length) {
|
||||
const int new_len = apply_chat_template(llama_data, append);
|
||||
if (new_len < 0) {
|
||||
printe("failed to apply the chat template\n");
|
||||
return -1;
|
||||
@@ -988,6 +865,10 @@ static int handle_user_input(std::string & user_input, const std::string & user)
|
||||
return 0; // No need for interactive input
|
||||
}
|
||||
|
||||
printf(
|
||||
"\r%*s"
|
||||
"\r\033[32m> \033[0m",
|
||||
get_terminal_width(), " ");
|
||||
return read_user_input(user_input); // Returns true if input ends the loop
|
||||
}
|
||||
|
||||
@@ -1030,11 +911,9 @@ static int get_user_input(std::string & user_input, const std::string & user) {
|
||||
}
|
||||
|
||||
// Main chat loop function
|
||||
static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_jinja) {
|
||||
static int chat_loop(LlamaData & llama_data, const std::string & user) {
|
||||
int prev_len = 0;
|
||||
llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
|
||||
auto chat_templates = common_chat_templates_from_model(llama_data.model.get(), "");
|
||||
GGML_ASSERT(chat_templates.template_default);
|
||||
static const bool stdout_a_terminal = is_stdout_a_terminal();
|
||||
while (true) {
|
||||
// Get user input
|
||||
@@ -1045,7 +924,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_
|
||||
|
||||
add_message("user", user.empty() ? user_input : user, llama_data);
|
||||
int new_len;
|
||||
if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, true, new_len, use_jinja) < 0) {
|
||||
if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -1060,7 +939,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_
|
||||
}
|
||||
|
||||
add_message("assistant", response, llama_data);
|
||||
if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, false, prev_len, use_jinja) < 0) {
|
||||
if (apply_chat_template_with_error_handling(llama_data, false, prev_len) < 0) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
@@ -1120,7 +999,7 @@ int main(int argc, const char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (chat_loop(llama_data, opt.user, opt.use_jinja)) {
|
||||
if (chat_loop(llama_data, opt.user)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ The project is under active development, and we are [looking for feedback and co
| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
| `--grammar-file FNAME` | file to read grammar from |
| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
| `--jinja` | Enable experimental Jinja templating engine (needed for tool use) |
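For the constrained-generation flags in the rows above, typical server invocations look roughly like this (model path hypothetical; `grammars/json.gbnf` is one of the sample grammars referenced in the first row):

```bash
# constrain all generations to valid JSON using a bundled sample grammar
llama-server -m ./model.gguf --grammar-file grammars/json.gbnf

# enable the experimental Jinja templating engine (needed for tool use)
llama-server -m ./model.gguf --jinja
```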

**Example-specific params**

File diff suppressed because it is too large
Binary file not shown.
@@ -19,7 +19,6 @@
|
||||
#include "loading.html.hpp"
|
||||
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
#include <condition_variable>
|
||||
#include <cstddef>
|
||||
#include <cinttypes>
|
||||
@@ -33,8 +32,6 @@
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
constexpr int HTTP_POLLING_SECONDS = 1;
|
||||
|
||||
enum stop_type {
|
||||
STOP_TYPE_NONE,
|
||||
STOP_TYPE_EOS,
|
||||
@@ -267,11 +264,6 @@ struct server_task {
|
||||
params.speculative.n_min = std::max(params.speculative.n_min, 2);
|
||||
params.speculative.n_max = std::max(params.speculative.n_max, 0);
|
||||
|
||||
// Use OpenAI API logprobs only if n_probs wasn't provided
|
||||
if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs){
|
||||
params.sampling.n_probs = json_value(data, "logprobs", defaults.sampling.n_probs);
|
||||
}
|
||||
|
||||
if (data.contains("lora")) {
|
||||
if (data.at("lora").is_array()) {
|
||||
params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));
|
||||
@@ -1427,10 +1419,6 @@ struct server_queue {
|
||||
int post(server_task task, bool front = false) {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
GGML_ASSERT(task.id != -1);
|
||||
// if this is cancel task make sure to clean up pending tasks
|
||||
if (task.type == SERVER_TASK_TYPE_CANCEL) {
|
||||
cleanup_pending_task(task.id_target);
|
||||
}
|
||||
QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
|
||||
if (front) {
|
||||
queue_tasks.push_front(std::move(task));
|
||||
@@ -1448,10 +1436,6 @@ struct server_queue {
|
||||
if (task.id == -1) {
|
||||
task.id = id++;
|
||||
}
|
||||
// if this is cancel task make sure to clean up pending tasks
|
||||
if (task.type == SERVER_TASK_TYPE_CANCEL) {
|
||||
cleanup_pending_task(task.id_target);
|
||||
}
|
||||
QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front);
|
||||
if (front) {
|
||||
queue_tasks.push_front(std::move(task));
|
||||
@@ -1552,20 +1536,6 @@ struct server_queue {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
void cleanup_pending_task(int id_target) {
|
||||
// no need lock because this is called exclusively by post()
|
||||
auto rm_func = [id_target](const server_task & task) {
|
||||
return task.id_target == id_target;
|
||||
};
|
||||
queue_tasks.erase(
|
||||
std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func),
|
||||
queue_tasks.end());
|
||||
queue_tasks_deferred.erase(
|
||||
std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), rm_func),
|
||||
queue_tasks_deferred.end());
|
||||
}
|
||||
};
|
||||
|
||||
struct server_response {
|
||||
@@ -1601,12 +1571,6 @@ struct server_response {
|
||||
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
waiting_task_ids.erase(id_task);
|
||||
// make sure to clean up all pending results
|
||||
queue_results.erase(
|
||||
std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) {
|
||||
return res->id == id_task;
|
||||
}),
|
||||
queue_results.end());
|
||||
}
|
||||
|
||||
void remove_waiting_task_ids(const std::unordered_set<int> & id_tasks) {
|
||||
@@ -1626,24 +1590,6 @@ struct server_response {
|
||||
return !queue_results.empty();
|
||||
});
|
||||
|
||||
for (size_t i = 0; i < queue_results.size(); i++) {
|
||||
if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
|
||||
server_task_result_ptr res = std::move(queue_results[i]);
|
||||
queue_results.erase(queue_results.begin() + i);
|
||||
return res;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// should never reach here
|
||||
}
|
||||
|
||||
// same as recv(), but have timeout in seconds
|
||||
// if timeout is reached, nullptr is returned
|
||||
server_task_result_ptr recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout) {
|
||||
while (true) {
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
|
||||
for (int i = 0; i < (int) queue_results.size(); i++) {
|
||||
if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
|
||||
server_task_result_ptr res = std::move(queue_results[i]);
|
||||
@@ -1651,11 +1597,6 @@ struct server_response {
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
|
||||
if (cr_res == std::cv_status::timeout) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// should never reach here
|
||||
@@ -1720,8 +1661,6 @@ struct server_context {
|
||||
// Necessary similarity of prompt for slot selection
|
||||
float slot_prompt_similarity = 0.0f;
|
||||
|
||||
common_chat_templates chat_templates;
|
||||
|
||||
~server_context() {
|
||||
// Clear any sampling context
|
||||
for (server_slot & slot : slots) {
|
||||
@@ -1762,16 +1701,13 @@ struct server_context {
|
||||
add_bos_token = llama_vocab_get_add_bos(vocab);
|
||||
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
||||
|
||||
if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
|
||||
if (!params_base.speculative.model.empty()) {
|
||||
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
|
||||
|
||||
auto params_dft = params_base;
|
||||
|
||||
params_dft.devices = params_base.speculative.devices;
|
||||
params_dft.hf_file = params_base.speculative.hf_file;
|
||||
params_dft.hf_repo = params_base.speculative.hf_repo;
|
||||
params_dft.model = params_base.speculative.model;
|
||||
params_dft.model_url = params_base.speculative.model_url;
|
||||
params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
|
||||
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
|
||||
params_dft.n_parallel = 1;
|
||||
@@ -1799,44 +1735,16 @@ struct server_context {
|
||||
// force F16 KV cache for the draft model for extra performance
|
||||
cparams_dft.type_k = GGML_TYPE_F16;
|
||||
cparams_dft.type_v = GGML_TYPE_F16;
|
||||
|
||||
// the context is not needed - we will create one for each slot
|
||||
llama_init_dft.context.reset();
|
||||
}
|
||||
|
||||
chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
|
||||
GGML_ASSERT(chat_templates.template_default.get() != nullptr);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool validate_builtin_chat_template(bool use_jinja) const {
|
||||
bool validate_builtin_chat_template() const {
|
||||
llama_chat_message chat[] = {{"user", "test"}};
|
||||
|
||||
if (use_jinja) {
|
||||
auto templates = common_chat_templates_from_model(model, "");
|
||||
GGML_ASSERT(templates.template_default);
|
||||
try {
|
||||
templates.template_default->apply({{
|
||||
{"role", "user"},
|
||||
{"content", "test"},
|
||||
}}, json(), true);
|
||||
if (templates.template_tool_use) {
|
||||
templates.template_tool_use->apply({{
|
||||
{"role", "user"},
|
||||
{"content", "test"},
|
||||
}}, json(), true);
|
||||
}
|
||||
return true;
|
||||
} catch (const std::exception & e) {
|
||||
SRV_ERR("failed to apply template: %s\n", e.what());
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
|
||||
const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
|
||||
return chat_res > 0;
|
||||
}
|
||||
const char * tmpl = llama_model_chat_template(model);
|
||||
const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
|
||||
return chat_res > 0;
|
||||
}
|
||||
|
||||
void init() {
|
||||
@@ -2403,8 +2311,8 @@ struct server_context {
|
||||
|
||||
server_task task(SERVER_TASK_TYPE_CANCEL);
|
||||
task.id_target = id_task;
|
||||
queue_results.remove_waiting_task_id(id_task);
|
||||
cancel_tasks.push_back(task);
|
||||
queue_results.remove_waiting_task_id(id_task);
|
||||
}
|
||||
// push to beginning of the queue, so it has highest priority
|
||||
queue_tasks.post(cancel_tasks, true);
|
||||
@@ -2414,21 +2322,10 @@ struct server_context {
|
||||
void receive_multi_results(
|
||||
const std::unordered_set<int> & id_tasks,
|
||||
const std::function<void(std::vector<server_task_result_ptr>&)> & result_handler,
|
||||
const std::function<void(json)> & error_handler,
|
||||
const std::function<bool()> & is_connection_closed) {
|
||||
const std::function<void(json)> & error_handler) {
|
||||
std::vector<server_task_result_ptr> results(id_tasks.size());
|
||||
for (int i = 0; i < (int)id_tasks.size(); i++) {
|
||||
server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS);
|
||||
|
||||
if (is_connection_closed()) {
|
||||
cancel_tasks(id_tasks);
|
||||
return;
|
||||
}
|
||||
|
||||
if (result == nullptr) {
|
||||
i--; // retry
|
||||
continue;
|
||||
}
|
||||
for (size_t i = 0; i < id_tasks.size(); i++) {
|
||||
server_task_result_ptr result = queue_results.recv(id_tasks);
|
||||
|
||||
if (result->is_error()) {
|
||||
error_handler(result->to_json());
|
||||
@@ -2452,20 +2349,10 @@ struct server_context {
|
||||
void receive_cmpl_results_stream(
|
||||
const std::unordered_set<int> & id_tasks,
|
||||
const std::function<bool(server_task_result_ptr&)> & result_handler,
|
||||
const std::function<void(json)> & error_handler,
|
||||
const std::function<bool()> & is_connection_closed) {
|
||||
const std::function<void(json)> & error_handler) {
|
||||
size_t n_finished = 0;
|
||||
while (true) {
|
||||
server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS);
|
||||
|
||||
if (is_connection_closed()) {
|
||||
cancel_tasks(id_tasks);
|
||||
return;
|
||||
}
|
||||
|
||||
if (result == nullptr) {
|
||||
continue; // retry
|
||||
}
|
||||
server_task_result_ptr result = queue_results.recv(id_tasks);
|
||||
|
||||
if (result->is_error()) {
|
||||
error_handler(result->to_json());
|
||||
@@ -3721,12 +3608,9 @@ int main(int argc, char ** argv) {
|
||||
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
|
||||
{ "total_slots", ctx_server.params_base.n_parallel },
|
||||
{ "model_path", ctx_server.params_base.model },
|
||||
{ "chat_template", ctx_server.chat_templates.template_default->source() },
|
||||
{ "chat_template", common_get_builtin_chat_template(ctx_server.model) },
|
||||
{ "build_info", build_info },
|
||||
};
|
||||
if (ctx_server.params_base.use_jinja && ctx_server.chat_templates.template_tool_use) {
|
||||
data["chat_template_tool_use"] = ctx_server.chat_templates.template_tool_use->source();
|
||||
}
|
||||
|
||||
res_ok(res, data);
|
||||
};
|
||||
@@ -3749,7 +3633,6 @@ int main(int argc, char ** argv) {
|
||||
const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
|
||||
server_task_type type,
|
||||
json & data,
|
||||
std::function<bool()> is_connection_closed,
|
||||
httplib::Response & res,
|
||||
oaicompat_type oaicompat) {
|
||||
GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
|
||||
@@ -3811,7 +3694,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}, [&](const json & error_data) {
|
||||
res_error(res, error_data);
|
||||
}, is_connection_closed);
|
||||
});
|
||||
|
||||
ctx_server.queue_results.remove_waiting_task_ids(task_ids);
|
||||
} else {
|
||||
@@ -3821,7 +3704,6 @@ int main(int argc, char ** argv) {
|
||||
if (res_json.is_array()) {
|
||||
for (const auto & res : res_json) {
|
||||
if (!server_sent_event(sink, "data", res)) {
|
||||
// sending failed (HTTP connection closed), cancel the generation
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -3831,9 +3713,6 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}, [&](const json & error_data) {
|
||||
server_sent_event(sink, "error", error_data);
|
||||
}, [&sink]() {
|
||||
// note: do not use req.is_connection_closed here because req is already destroyed
|
||||
return !sink.is_writable();
|
||||
});
|
||||
if (oaicompat != OAICOMPAT_TYPE_NONE) {
|
||||
static const std::string ev_done = "data: [DONE]\n\n";
|
||||
@@ -3856,7 +3735,6 @@ int main(int argc, char ** argv) {
|
||||
return handle_completions_impl(
|
||||
SERVER_TASK_TYPE_COMPLETION,
|
||||
data,
|
||||
req.is_connection_closed,
|
||||
res,
|
||||
OAICOMPAT_TYPE_NONE);
|
||||
};
|
||||
@@ -3866,7 +3744,6 @@ int main(int argc, char ** argv) {
|
||||
return handle_completions_impl(
|
||||
SERVER_TASK_TYPE_COMPLETION,
|
||||
data,
|
||||
req.is_connection_closed,
|
||||
res,
|
||||
OAICOMPAT_TYPE_COMPLETION);
|
||||
};
|
||||
@@ -3943,7 +3820,6 @@ int main(int argc, char ** argv) {
|
||||
return handle_completions_impl(
|
||||
SERVER_TASK_TYPE_INFILL,
|
||||
data,
|
||||
req.is_connection_closed,
|
||||
res,
|
||||
OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
|
||||
};
|
||||
@@ -3954,14 +3830,10 @@ int main(int argc, char ** argv) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto body = json::parse(req.body);
|
||||
const auto & chat_template = body.contains("tools") && ctx_server.chat_templates.template_tool_use ? *ctx_server.chat_templates.template_tool_use : *ctx_server.chat_templates.template_default;
|
||||
json data = oaicompat_completion_params_parse(body, chat_template, params.use_jinja);
|
||||
|
||||
json data = oaicompat_chat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
|
||||
return handle_completions_impl(
|
||||
SERVER_TASK_TYPE_COMPLETION,
|
||||
data,
|
||||
req.is_connection_closed,
|
||||
res,
|
||||
OAICOMPAT_TYPE_CHAT);
|
||||
};
|
||||
@@ -4108,7 +3980,7 @@ int main(int argc, char ** argv) {
|
||||
}, [&](const json & error_data) {
|
||||
res_error(res, error_data);
|
||||
error = true;
|
||||
}, req.is_connection_closed);
|
||||
});
|
||||
|
||||
ctx_server.queue_results.remove_waiting_task_ids(task_ids);
|
||||
}
|
||||
@@ -4198,7 +4070,7 @@ int main(int argc, char ** argv) {
|
||||
}, [&](const json & error_data) {
|
||||
res_error(res, error_data);
|
||||
error = true;
|
||||
}, req.is_connection_closed);
|
||||
});
|
||||
}
|
||||
|
||||
if (error) {
|
||||
@@ -4367,7 +4239,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
|
||||
if (params.chat_template.empty()) {
|
||||
if (!ctx_server.validate_builtin_chat_template(params.use_jinja)) {
|
||||
if (!ctx_server.validate_builtin_chat_template()) {
|
||||
LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
|
||||
params.chat_template = "chatml";
|
||||
}
|
||||
@@ -4375,8 +4247,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// print sample chat example to make it clear which template is used
|
||||
LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
|
||||
ctx_server.chat_templates.template_default->source().c_str(),
|
||||
common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());
|
||||
params.chat_template.empty() ? "(built-in)" : params.chat_template.c_str(),
|
||||
common_chat_format_example(ctx_server.model, params.chat_template).c_str());
|
||||
|
||||
ctx_server.queue_tasks.on_new_task(std::bind(
|
||||
&server_context::process_single_task, &ctx_server, std::placeholders::_1));
|
||||
|
||||
@@ -4,26 +4,22 @@ from utils import *
|
||||
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
|
||||
def create_server():
|
||||
global server
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason,jinja,chat_template",
|
||||
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason",
|
||||
[
|
||||
(None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length", False, None),
|
||||
(None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length", True, None),
|
||||
(None, "Book", "What is the best book", 8, "^ blue", 23, 8, "length", True, "This is not a chat template, it is"),
|
||||
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", False, None),
|
||||
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", True, None),
|
||||
(None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"),
|
||||
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"),
|
||||
]
|
||||
)
|
||||
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason, jinja, chat_template):
|
||||
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
|
||||
global server
|
||||
server.jinja = jinja
|
||||
server.chat_template = chat_template
|
||||
server.start()
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"model": model,
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import pytest
|
||||
import requests
|
||||
import time
|
||||
from openai import OpenAI
|
||||
from utils import *
|
||||
@@ -87,7 +86,7 @@ def test_completion_stream_vs_non_stream():
|
||||
assert content_stream == res_non_stream.body["content"]
|
||||
|
||||
|
||||
def test_completion_with_openai_library():
|
||||
def test_completion_stream_with_openai_library():
|
||||
global server
|
||||
server.start()
|
||||
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
|
||||
@@ -102,7 +101,7 @@ def test_completion_with_openai_library():
|
||||
assert match_regex("(going|bed)+", res.choices[0].text)
|
||||
|
||||
|
||||
def test_completion_stream_with_openai_library():
|
||||
def test_completion_with_openai_library():
|
||||
global server
|
||||
server.start()
|
||||
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
|
||||
@@ -406,23 +405,3 @@ def test_n_probs_post_sampling():
|
||||
assert "bytes" in prob and type(prob["bytes"]) == list
|
||||
# because the test model usually output token with either 100% or 0% probability, we need to check all the top_probs
|
||||
assert any(prob["prob"] == 1.0 for prob in tok["top_probs"])
|
||||
|
||||
|
||||
def test_cancel_request():
|
||||
global server
|
||||
server.n_ctx = 4096
|
||||
server.n_predict = -1
|
||||
server.n_slots = 1
|
||||
server.server_slots = True
|
||||
server.start()
|
||||
# send a request that will take a long time, but cancel it before it finishes
|
||||
try:
|
||||
server.make_request("POST", "/completion", data={
|
||||
"prompt": "I believe the meaning of life is",
|
||||
}, timeout=0.1)
|
||||
except requests.exceptions.ReadTimeout:
|
||||
pass # expected
|
||||
# make sure the slot is free
|
||||
time.sleep(1) # wait for HTTP_POLLING_SECONDS
|
||||
res = server.make_request("GET", "/slots")
|
||||
assert res.body[0]["is_processing"] == False
|
||||
|
||||
@@ -26,9 +26,6 @@ from re import RegexFlag
|
||||
import wget
|
||||
|
||||
|
||||
DEFAULT_HTTP_TIMEOUT = 10 if "LLAMA_SANITIZE" not in os.environ else 30
|
||||
|
||||
|
||||
class ServerResponse:
|
||||
headers: dict
|
||||
status_code: int
|
||||
@@ -72,14 +69,13 @@ class ServerProcess:
|
||||
pooling: str | None = None
|
||||
draft: int | None = None
|
||||
api_key: str | None = None
|
||||
response_format: str | None = None
|
||||
lora_files: List[str] | None = None
|
||||
disable_ctx_shift: int | None = False
|
||||
draft_min: int | None = None
|
||||
draft_max: int | None = None
|
||||
no_webui: bool | None = None
|
||||
jinja: bool | None = None
|
||||
chat_template: str | None = None
|
||||
chat_template_file: str | None = None
|
||||
|
||||
# session variables
|
||||
process: subprocess.Popen | None = None
|
||||
@@ -92,7 +88,7 @@ class ServerProcess:
|
||||
if "PORT" in os.environ:
|
||||
self.server_port = int(os.environ["PORT"])
|
||||
|
||||
def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
|
||||
def start(self, timeout_seconds: int = 10) -> None:
|
||||
if "LLAMA_SERVER_BIN_PATH" in os.environ:
|
||||
server_path = os.environ["LLAMA_SERVER_BIN_PATH"]
|
||||
elif os.name == "nt":
|
||||
@@ -170,12 +166,8 @@ class ServerProcess:
|
||||
server_args.extend(["--draft-min", self.draft_min])
|
||||
if self.no_webui:
|
||||
server_args.append("--no-webui")
|
||||
if self.jinja:
|
||||
server_args.append("--jinja")
|
||||
if self.chat_template:
|
||||
server_args.extend(["--chat-template", self.chat_template])
|
||||
if self.chat_template_file:
|
||||
server_args.extend(["--chat-template-file", self.chat_template_file])
|
||||
|
||||
args = [str(arg) for arg in [server_path, *server_args]]
|
||||
print(f"bench: starting server with: {' '.join(args)}")
|
||||
@@ -227,18 +219,17 @@ class ServerProcess:
|
||||
path: str,
|
||||
data: dict | Any | None = None,
|
||||
headers: dict | None = None,
|
||||
timeout: float | None = None,
|
||||
) -> ServerResponse:
|
||||
url = f"http://{self.server_host}:{self.server_port}{path}"
|
||||
parse_body = False
|
||||
if method == "GET":
|
||||
response = requests.get(url, headers=headers, timeout=timeout)
|
||||
response = requests.get(url, headers=headers)
|
||||
parse_body = True
|
||||
elif method == "POST":
|
||||
response = requests.post(url, headers=headers, json=data, timeout=timeout)
|
||||
response = requests.post(url, headers=headers, json=data)
|
||||
parse_body = True
|
||||
elif method == "OPTIONS":
|
||||
response = requests.options(url, headers=headers, timeout=timeout)
|
||||
response = requests.options(url, headers=headers)
|
||||
else:
|
||||
raise ValueError(f"Unimplemented method: {method}")
|
||||
result = ServerResponse()
|
||||
|
||||
@@ -16,8 +16,6 @@
|
||||
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
||||
#define JSON_ASSERT GGML_ASSERT
|
||||
#include "json.hpp"
|
||||
#include "minja.hpp"
|
||||
#include "chat-template.hpp"
|
||||
|
||||
#include <random>
|
||||
#include <sstream>
|
||||
@@ -351,7 +349,7 @@ static llama_tokens format_infill(
|
||||
}
|
||||
|
||||
// Format given chat. If tmpl is empty, we take the template from model metadata
|
||||
inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
|
||||
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
|
||||
std::vector<common_chat_msg> chat;
|
||||
|
||||
for (size_t i = 0; i < messages.size(); ++i) {
|
||||
@@ -379,7 +377,7 @@ inline std::string format_chat(const common_chat_template & tmpl, const std::vec
|
||||
chat.push_back({role, content});
|
||||
}
|
||||
|
||||
const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false);
|
||||
const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
|
||||
LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
|
||||
|
||||
return formatted_chat;
|
||||
@@ -578,23 +576,14 @@ static json oaicompat_completion_params_parse(const json & body) {
|
||||
return llama_params;
|
||||
}
|
||||
|
||||
static json oaicompat_completion_params_parse(
|
||||
const json & body, /* openai api json semantics */
|
||||
const common_chat_template & tmpl,
|
||||
bool use_jinja)
|
||||
{
|
||||
static json oaicompat_chat_completion_params_parse(
|
||||
const struct llama_model * model,
|
||||
const json & body, /* openai api json semantics */
|
||||
const std::string & chat_template) {
|
||||
json llama_params;
|
||||
|
||||
auto tools = json_value(body, "tools", json());
|
||||
auto has_tools = tools.is_array() && !tools.empty();
|
||||
|
||||
if (has_tools) {
|
||||
if (use_jinja) {
|
||||
LOG_WRN("tools param is not fully supported yet\n");
|
||||
} else {
|
||||
throw std::runtime_error("tools param requires --jinja flag");
|
||||
}
|
||||
}
|
||||
// Apply chat template to the list of messages
|
||||
llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
|
||||
|
||||
// Handle "stop" field
|
||||
if (body.contains("stop") && body.at("stop").is_string()) {
|
||||
@@ -617,13 +606,6 @@ static json oaicompat_completion_params_parse(
|
||||
}
|
||||
}
|
||||
|
||||
// Apply chat template to the list of messages
|
||||
if (use_jinja) {
|
||||
llama_params["prompt"] = tmpl.apply(body.at("messages"), tools, /* add_generation_prompt= */ true);
|
||||
} else {
|
||||
llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
|
||||
}
|
||||
|
||||
// Handle "n" field
|
||||
int n_choices = json_value(body, "n", 1);
|
||||
if (n_choices != 1) {
|
||||
@@ -639,7 +621,7 @@ static json oaicompat_completion_params_parse(
|
||||
}
|
||||
|
||||
// Params supported by OAI but unsupported by llama.cpp
|
||||
static const std::vector<std::string> unsupported_params { "tool_choice" };
|
||||
static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
|
||||
for (const auto & param : unsupported_params) {
|
||||
if (body.contains(param)) {
|
||||
throw std::runtime_error("Unsupported param: " + param);
|
||||
|
||||
@@ -141,7 +141,6 @@
|
||||
:msg="pendingMsg"
|
||||
:key="pendingMsg.id"
|
||||
:is-generating="isGenerating"
|
||||
:show-thought-in-progress="config.showThoughtInProgress"
|
||||
:edit-user-msg-and-regenerate="() => {}"
|
||||
:regenerate-msg="() => {}"></message-bubble>
|
||||
</div>
|
||||
@@ -203,20 +202,6 @@
|
||||
</template>
|
||||
</div>
|
||||
</details>
|
||||
<!-- Section: Reasoning models -->
|
||||
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
|
||||
<summary class="collapse-title font-bold">Reasoning models</summary>
|
||||
<div class="collapse-content">
|
||||
<div class="flex flex-row items-center mb-2">
|
||||
<input type="checkbox" class="checkbox" v-model="config.showThoughtInProgress" />
|
||||
<span class="ml-4">Expand though process by default for generating message</span>
|
||||
</div>
|
||||
<div class="flex flex-row items-center mb-2">
|
||||
<input type="checkbox" class="checkbox" v-model="config.excludeThoughtOnReq" />
|
||||
<span class="ml-4">Exclude thought process when sending request to API (Recommended for DeepSeek-R1)</span>
|
||||
</div>
|
||||
</div>
|
||||
</details>
|
||||
<!-- Section: Advanced config -->
|
||||
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
|
||||
<summary class="collapse-title font-bold">Advanced config</summary>
|
||||
@@ -276,17 +261,7 @@
|
||||
<span v-if="msg.content === null" class="loading loading-dots loading-md"></span>
|
||||
<!-- render message as markdown -->
|
||||
<div v-else dir="auto">
|
||||
<details v-if="msg.role === 'assistant' && splitMsgContent.cot" class="collapse bg-base-200 collapse-arrow mb-4" :open="splitMsgContent.isThinking && showThoughtInProgress">
|
||||
<summary class="collapse-title">
|
||||
<span v-if="splitMsgContent.isThinking">
|
||||
<span v-if="isGenerating" class="loading loading-spinner loading-md mr-2" style="vertical-align: middle;"></span>
|
||||
<b>Thinking</b>
|
||||
</span>
|
||||
<b v-else>Thought Process</b>
|
||||
</summary>
|
||||
<vue-markdown :source="splitMsgContent.cot" dir="auto" class="collapse-content"></vue-markdown>
|
||||
</details>
|
||||
<vue-markdown :source="splitMsgContent.content"></vue-markdown>
|
||||
<vue-markdown :source="msg.content"></vue-markdown>
|
||||
</div>
|
||||
<!-- render timings if enabled -->
|
||||
<div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond">
|
||||
|
||||
@@ -17,11 +17,6 @@ import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
|
||||
|
||||
const isDev = import.meta.env.MODE === 'development';
|
||||
|
||||
// types
|
||||
/** @typedef {{ id: number, role: 'user' | 'assistant', content: string, timings: any }} Message */
|
||||
/** @typedef {{ role: 'user' | 'assistant', content: string }} APIMessage */
|
||||
/** @typedef {{ id: string, lastModified: number, messages: Array<Message> }} Conversation */
|
||||
|
||||
// utility functions
|
||||
const isString = (x) => !!x.toLowerCase;
|
||||
const isBoolean = (x) => x === true || x === false;
|
||||
@@ -55,8 +50,6 @@ const CONFIG_DEFAULT = {
|
||||
apiKey: '',
|
||||
systemMessage: 'You are a helpful assistant.',
|
||||
showTokensPerSecond: false,
|
||||
showThoughtInProgress: false,
|
||||
excludeThoughtOnReq: true,
|
||||
// make sure these default values are in sync with `common.h`
|
||||
samplers: 'edkypmxt',
|
||||
temperature: 0.8,
|
||||
@@ -179,7 +172,6 @@ const MessageBubble = defineComponent({
|
||||
config: Object,
|
||||
msg: Object,
|
||||
isGenerating: Boolean,
|
||||
showThoughtInProgress: Boolean,
|
||||
editUserMsgAndRegenerate: Function,
|
||||
regenerateMsg: Function,
|
||||
},
|
||||
@@ -196,31 +188,7 @@ const MessageBubble = defineComponent({
|
||||
prompt_per_second: this.msg.timings.prompt_n / (this.msg.timings.prompt_ms / 1000),
|
||||
predicted_per_second: this.msg.timings.predicted_n / (this.msg.timings.predicted_ms / 1000),
|
||||
};
|
||||
},
|
||||
splitMsgContent() {
|
||||
const content = this.msg.content;
|
||||
if (this.msg.role !== 'assistant') {
|
||||
return { content };
|
||||
}
|
||||
let actualContent = '';
|
||||
let cot = '';
|
||||
let isThinking = false;
|
||||
let thinkSplit = content.split('<think>', 2);
|
||||
actualContent += thinkSplit[0];
|
||||
while (thinkSplit[1] !== undefined) {
|
||||
// <think> tag found
|
||||
thinkSplit = thinkSplit[1].split('</think>', 2);
|
||||
cot += thinkSplit[0];
|
||||
isThinking = true;
|
||||
if (thinkSplit[1] !== undefined) {
|
||||
// </think> closing tag found
|
||||
isThinking = false;
|
||||
thinkSplit = thinkSplit[1].split('<think>', 2);
|
||||
actualContent += thinkSplit[0];
|
||||
}
|
||||
}
|
||||
return { content: actualContent, cot, isThinking };
|
||||
},
|
||||
}
|
||||
},
|
||||
methods: {
|
||||
copyMsg() {
|
||||
@@ -240,10 +208,7 @@ const MessageBubble = defineComponent({
|
||||
// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
|
||||
// convId is a string prefixed with 'conv-'
|
||||
const StorageUtils = {
|
||||
/**
|
||||
* manage conversations
|
||||
* @returns {Array<Conversation>}
|
||||
*/
|
||||
// manage conversations
|
||||
getAllConversations() {
|
||||
const res = [];
|
||||
for (const key in localStorage) {
|
||||
@@ -254,19 +219,11 @@ const StorageUtils = {
|
||||
res.sort((a, b) => b.lastModified - a.lastModified);
|
||||
return res;
|
||||
},
|
||||
/**
|
||||
* can return null if convId does not exist
|
||||
* @param {string} convId
|
||||
* @returns {Conversation | null}
|
||||
*/
|
||||
// can return null if convId does not exist
|
||||
getOneConversation(convId) {
|
||||
return JSON.parse(localStorage.getItem(convId) || 'null');
|
||||
},
|
||||
/**
|
||||
* if convId does not exist, create one
|
||||
* @param {string} convId
|
||||
* @param {Message} msg
|
||||
*/
|
||||
// if convId does not exist, create one
|
||||
appendMsg(convId, msg) {
|
||||
if (msg.content === null) return;
|
||||
const conv = StorageUtils.getOneConversation(convId) || {
|
||||
@@ -278,24 +235,12 @@ const StorageUtils = {
|
||||
conv.lastModified = Date.now();
|
||||
localStorage.setItem(convId, JSON.stringify(conv));
|
||||
},
|
||||
/**
|
||||
* Get new conversation id
|
||||
* @returns {string}
|
||||
*/
|
||||
getNewConvId() {
|
||||
return `conv-${Date.now()}`;
|
||||
},
|
||||
/**
|
||||
* remove conversation by id
|
||||
* @param {string} convId
|
||||
*/
|
||||
remove(convId) {
|
||||
localStorage.removeItem(convId);
|
||||
},
|
||||
/**
|
||||
* remove all conversations
|
||||
* @param {string} convId
|
||||
*/
|
||||
filterAndKeepMsgs(convId, predicate) {
|
||||
const conv = StorageUtils.getOneConversation(convId);
|
||||
if (!conv) return;
|
||||
@@ -303,11 +248,6 @@ const StorageUtils = {
|
||||
conv.lastModified = Date.now();
|
||||
localStorage.setItem(convId, JSON.stringify(conv));
|
||||
},
|
||||
/**
|
||||
* remove last message from conversation
|
||||
* @param {string} convId
|
||||
* @returns {Message | undefined}
|
||||
*/
|
||||
popMsg(convId) {
|
||||
const conv = StorageUtils.getOneConversation(convId);
|
||||
if (!conv) return;
|
||||
@@ -382,12 +322,10 @@ const mainApp = createApp({
|
||||
data() {
|
||||
return {
|
||||
conversations: StorageUtils.getAllConversations(),
|
||||
/** @type {Array<Message>} */
|
||||
messages: [],
|
||||
messages: [], // { id: number, role: 'user' | 'assistant', content: string }
|
||||
viewingConvId: StorageUtils.getNewConvId(),
|
||||
inputMsg: '',
|
||||
isGenerating: false,
|
||||
/** @type {Array<Message> | null} */
|
||||
pendingMsg: null, // the on-going message from assistant
|
||||
stopGeneration: () => {},
|
||||
selectedTheme: StorageUtils.getTheme(),
|
||||
@@ -395,7 +333,6 @@ const mainApp = createApp({
|
||||
showConfigDialog: false,
|
||||
// const
|
||||
themes: THEMES,
|
||||
/** @type {CONFIG_DEFAULT} */
|
||||
configDefault: {...CONFIG_DEFAULT},
|
||||
configInfo: {...CONFIG_INFO},
|
||||
isDev,
|
||||
@@ -488,50 +425,42 @@ const mainApp = createApp({
|
||||
this.isGenerating = true;
|
||||
|
||||
try {
|
||||
/** @type {CONFIG_DEFAULT} */
|
||||
const config = this.config;
|
||||
const abortController = new AbortController();
|
||||
this.stopGeneration = () => abortController.abort();
|
||||
/** @type {Array<APIMessage>} */
|
||||
let messages = [
|
||||
{ role: 'system', content: config.systemMessage },
|
||||
...normalizeMsgsForAPI(this.messages),
|
||||
];
|
||||
if (config.excludeThoughtOnReq) {
|
||||
messages = filterThoughtFromMsgs(messages);
|
||||
}
|
||||
if (isDev) console.log({messages});
|
||||
const params = {
|
||||
messages,
|
||||
messages: [
|
||||
{ role: 'system', content: this.config.systemMessage },
|
||||
...this.messages,
|
||||
],
|
||||
stream: true,
|
||||
cache_prompt: true,
|
||||
samplers: config.samplers,
|
||||
temperature: config.temperature,
|
||||
dynatemp_range: config.dynatemp_range,
|
||||
dynatemp_exponent: config.dynatemp_exponent,
|
||||
top_k: config.top_k,
|
||||
top_p: config.top_p,
|
||||
min_p: config.min_p,
|
||||
typical_p: config.typical_p,
|
||||
xtc_probability: config.xtc_probability,
|
||||
xtc_threshold: config.xtc_threshold,
|
||||
repeat_last_n: config.repeat_last_n,
|
||||
repeat_penalty: config.repeat_penalty,
|
||||
presence_penalty: config.presence_penalty,
|
||||
frequency_penalty: config.frequency_penalty,
|
||||
dry_multiplier: config.dry_multiplier,
|
||||
dry_base: config.dry_base,
|
||||
dry_allowed_length: config.dry_allowed_length,
|
||||
dry_penalty_last_n: config.dry_penalty_last_n,
|
||||
max_tokens: config.max_tokens,
|
||||
timings_per_token: !!config.showTokensPerSecond,
|
||||
...(config.custom.length ? JSON.parse(config.custom) : {}),
|
||||
samplers: this.config.samplers,
|
||||
temperature: this.config.temperature,
|
||||
dynatemp_range: this.config.dynatemp_range,
|
||||
dynatemp_exponent: this.config.dynatemp_exponent,
|
||||
top_k: this.config.top_k,
|
||||
top_p: this.config.top_p,
|
||||
min_p: this.config.min_p,
|
||||
typical_p: this.config.typical_p,
|
||||
xtc_probability: this.config.xtc_probability,
|
||||
xtc_threshold: this.config.xtc_threshold,
|
||||
repeat_last_n: this.config.repeat_last_n,
|
||||
repeat_penalty: this.config.repeat_penalty,
|
||||
presence_penalty: this.config.presence_penalty,
|
||||
frequency_penalty: this.config.frequency_penalty,
|
||||
dry_multiplier: this.config.dry_multiplier,
|
||||
dry_base: this.config.dry_base,
|
||||
dry_allowed_length: this.config.dry_allowed_length,
|
||||
dry_penalty_last_n: this.config.dry_penalty_last_n,
|
||||
max_tokens: this.config.max_tokens,
|
||||
timings_per_token: !!this.config.showTokensPerSecond,
|
||||
...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
|
||||
};
|
||||
const chunks = sendSSEPostRequest(`${BASE_URL}/v1/chat/completions`, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
...(config.apiKey ? {'Authorization': `Bearer ${config.apiKey}`} : {})
|
||||
...(this.config.apiKey ? {'Authorization': `Bearer ${this.config.apiKey}`} : {})
|
||||
},
|
||||
body: JSON.stringify(params),
|
||||
signal: abortController.signal,
|
||||
@@ -548,7 +477,7 @@ const mainApp = createApp({
|
||||
};
|
||||
}
|
||||
const timings = chunk.timings;
|
||||
if (timings && config.showTokensPerSecond) {
|
||||
if (timings && this.config.showTokensPerSecond) {
|
||||
// only extract what's really needed, to save some space
|
||||
this.pendingMsg.timings = {
|
||||
prompt_n: timings.prompt_n,
|
||||
@@ -669,33 +598,3 @@ try {
|
||||
<button class="btn" onClick="localStorage.clear(); window.location.reload();">Clear localStorage</button>
|
||||
</div>`;
|
||||
}
|
||||
|
||||
/**
|
||||
* filter out redundant fields upon sending to API
|
||||
* @param {Array<APIMessage>} messages
|
||||
* @returns {Array<APIMessage>}
|
||||
*/
|
||||
function normalizeMsgsForAPI(messages) {
|
||||
return messages.map((msg) => {
|
||||
return {
|
||||
role: msg.role,
|
||||
content: msg.content,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* recommended for DeepsSeek-R1, filter out content between <think> and </think> tags
|
||||
* @param {Array<APIMessage>} messages
|
||||
* @returns {Array<APIMessage>}
|
||||
*/
|
||||
function filterThoughtFromMsgs(messages) {
|
||||
return messages.map((msg) => {
|
||||
return {
|
||||
role: msg.role,
|
||||
content: msg.role === 'assistant'
|
||||
? msg.content.split('</think>').at(-1).trim()
|
||||
: msg.content,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
@@ -98,12 +98,10 @@ int main(int argc, char ** argv) {
|
||||
auto generate = [&](const std::string & prompt) {
|
||||
std::string response;
|
||||
|
||||
const bool is_first = llama_get_kv_cache_used_cells(ctx) == 0;
|
||||
|
||||
// tokenize the prompt
|
||||
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
|
||||
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
|
||||
std::vector<llama_token> prompt_tokens(n_prompt_tokens);
|
||||
if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), is_first, true) < 0) {
|
||||
if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) {
|
||||
GGML_ABORT("failed to tokenize the prompt\n");
|
||||
}
|
||||
|
||||
@@ -163,7 +161,7 @@ int main(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
|
||||
const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
|
||||
const char * tmpl = llama_model_chat_template(model);
|
||||
|
||||
// add the user input to the message list and format it
|
||||
messages.push_back({"user", strdup(user.c_str())});
|
||||
|
||||
@@ -1,11 +0,0 @@
|
||||
cmake_minimum_required(VERSION 3.12)
|
||||
project(llama-simple-cmake-pkg)
|
||||
|
||||
set(TARGET llama-simple-cmake-pkg)
|
||||
|
||||
find_package(Llama REQUIRED)
|
||||
|
||||
add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../simple/simple.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ggml::all ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
@@ -1,34 +0,0 @@
|
||||
# llama.cpp/example/simple-cmake-pkg
|
||||
|
||||
This program builds [simple](../simple) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
|
||||
|
||||
## Building
|
||||
|
||||
Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions.
|
||||
|
||||
### Considerations
|
||||
|
||||
When hardware acceleration libraries are used (e.g. CUDA, Metal, Vulkan, etc.), the appropriate dependencies will be searched for automatically. So, for example, when finding a package
|
||||
|
||||
### Build llama.cpp and install to llama.cpp/inst
|
||||
|
||||
```sh
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
cmake -S . -B build
|
||||
cmake --build build
|
||||
cmake --install build --prefix inst
|
||||
|
||||
### Build simple-cmake-pkg
|
||||
|
||||
```sh
|
||||
cd examples/simple-cmake-pkg
|
||||
cmake -S . -B build -DCMAKE_PREFIX_PATH=../../inst/lib/cmake
|
||||
cmake --build build
|
||||
```
|
||||
|
||||
### Run simple-cmake-pkg
|
||||
|
||||
```sh
|
||||
./build/llama-simple-cmake-pkg -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"
|
||||
```
|
||||
@@ -425,33 +425,6 @@ static void prompt_init(llama_tokens & prompt, const llama_vocab * vocab) {
|
||||
prompt_add(prompt, vocab, "<|im_start|>\n", true, true);
|
||||
}
|
||||
|
||||
static std::vector<llama_token> prepare_guide_tokens(const llama_vocab * vocab, const std::string & str) {
|
||||
const std::string& delimiter = "<|text_sep|>";
|
||||
|
||||
std::vector<llama_token> result;
|
||||
size_t start = 0;
|
||||
size_t end = str.find(delimiter);
|
||||
|
||||
//first token is always a newline, as it was not previously added
|
||||
result.push_back(common_tokenize(vocab, "\n", false, true)[0]);
|
||||
|
||||
while (end != std::string::npos) {
|
||||
std::string current_word = str.substr(start, end - start);
|
||||
auto tmp = common_tokenize(vocab, current_word, false, true);
|
||||
result.push_back(tmp[0]);
|
||||
start = end + delimiter.length();
|
||||
end = str.find(delimiter, start);
|
||||
}
|
||||
|
||||
// Add the last part
|
||||
std::string current_word = str.substr(start);
|
||||
auto tmp = common_tokenize(vocab, current_word, false, true);
|
||||
if (tmp.size() > 0) {
|
||||
result.push_back(tmp[0]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
@@ -521,7 +494,6 @@ int main(int argc, char ** argv) {
|
||||
const auto t_main_start = ggml_time_us();
|
||||
|
||||
std::vector<llama_token> codes;
|
||||
std::vector<llama_token> guide_tokens;
|
||||
|
||||
// process prompt and generate voice codes
|
||||
{
|
||||
@@ -536,9 +508,6 @@ int main(int argc, char ** argv) {
|
||||
// convert the input text into the necessary format expected by OuteTTS
|
||||
{
|
||||
std::string prompt_clean = process_text(params.prompt);
|
||||
if (params.vocoder.use_guide_tokens) {
|
||||
guide_tokens = prepare_guide_tokens(vocab, prompt_clean);
|
||||
}
|
||||
|
||||
LOG_INF("%s: prompt: '%s'\n", __func__, prompt_clean.c_str());
|
||||
|
||||
@@ -748,8 +717,6 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
|
||||
int n_past = batch.n_tokens;
|
||||
int n_decode = 0;
|
||||
|
||||
bool next_token_uses_guide_token = true;
|
||||
|
||||
while (n_decode <= n_predict) {
|
||||
// prepare the next batch
|
||||
common_batch_clear(batch);
|
||||
@@ -761,17 +728,7 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
|
||||
continue;
|
||||
}
|
||||
|
||||
llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]);
|
||||
|
||||
//guide tokens help prevent hallucinations by forcing the TTS to use the correct word
|
||||
if (!guide_tokens.empty() && next_token_uses_guide_token && !llama_vocab_is_control(vocab, new_token_id) && !llama_vocab_is_eog(vocab, new_token_id)) {
|
||||
llama_token guide_token = guide_tokens[0];
|
||||
guide_tokens.erase(guide_tokens.begin());
|
||||
new_token_id = guide_token; //ensure correct word fragment is used
|
||||
}
|
||||
|
||||
//this is the token id that always precedes a new word
|
||||
next_token_uses_guide_token = (new_token_id == 198);
|
||||
const llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]);
|
||||
|
||||
common_sampler_accept(smpl[i], new_token_id, true);
|
||||
|
||||
|
||||
@@ -58,8 +58,7 @@ else()
|
||||
set(GGML_BLAS_VENDOR_DEFAULT "Generic")
|
||||
endif()
|
||||
|
||||
if (CMAKE_CROSSCOMPILING OR DEFINED ENV{SOURCE_DATE_EPOCH})
|
||||
message(STATUS "Setting GGML_NATIVE_DEFAULT to OFF")
|
||||
if (CMAKE_CROSSCOMPILING)
|
||||
set(GGML_NATIVE_DEFAULT OFF)
|
||||
else()
|
||||
set(GGML_NATIVE_DEFAULT ON)
|
||||
@@ -154,8 +153,6 @@ option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashA
|
||||
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
|
||||
|
||||
option(GGML_HIP "ggml: use HIP" OFF)
|
||||
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
|
||||
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
|
||||
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
|
||||
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
||||
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
||||
@@ -267,74 +264,3 @@ if (GGML_STANDALONE)
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
|
||||
DESTINATION share/pkgconfig)
|
||||
endif()
|
||||
|
||||
#
|
||||
# Create CMake package
|
||||
#
|
||||
|
||||
# Generate version info based on git commit.
|
||||
|
||||
find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH)
|
||||
execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE GGML_BUILD_NUMBER
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
)
|
||||
|
||||
if(GGML_BUILD_NUMBER EQUAL 1)
|
||||
message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.")
|
||||
endif()
|
||||
|
||||
execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE GGML_BUILD_COMMIT
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
)
|
||||
|
||||
# Capture variables prefixed with GGML_.
|
||||
|
||||
set(variable_set_statements
|
||||
"
|
||||
####### Expanded from @GGML_VARIABLES_EXPANED@ by configure_package_config_file() #######
|
||||
####### Any changes to this file will be overwritten by the next CMake run #######
|
||||
|
||||
")
|
||||
|
||||
set(GGML_SHARED_LIB ${BUILD_SHARED_LIBS})
|
||||
|
||||
get_cmake_property(all_variables VARIABLES)
|
||||
foreach(variable_name IN LISTS all_variables)
|
||||
if(variable_name MATCHES "^GGML_")
|
||||
string(REPLACE ";" "\\;"
|
||||
variable_value "${${variable_name}}")
|
||||
|
||||
set(variable_set_statements
|
||||
"${variable_set_statements}set(${variable_name} \"${variable_value}\")\n")
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
set(GGML_VARIABLES_EXPANDED ${variable_set_statements})
|
||||
|
||||
# Create the CMake package and set install location.
|
||||
|
||||
set(GGML_INSTALL_VERSION 0.0.${GGML_BUILD_NUMBER})
|
||||
set(GGML_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
|
||||
set(GGML_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
|
||||
set(GGML_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
|
||||
|
||||
configure_package_config_file(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cmake/ggml-config.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
|
||||
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml
|
||||
PATH_VARS GGML_INCLUDE_INSTALL_DIR
|
||||
GGML_LIB_INSTALL_DIR
|
||||
GGML_BIN_INSTALL_DIR)
|
||||
|
||||
write_basic_package_version_file(
|
||||
${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
|
||||
VERSION ${GGML_INSTALL_VERSION}
|
||||
COMPATIBILITY SameMajorVersion)
|
||||
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
|
||||
|
||||
@@ -1,147 +0,0 @@
|
||||
|
||||
@GGML_VARIABLES_EXPANDED@
|
||||
|
||||
@PACKAGE_INIT@
|
||||
|
||||
set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
|
||||
set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
|
||||
set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
find_library(GGML_LIBRARY ggml
|
||||
REQUIRED
|
||||
HINTS ${GGML_LIB_DIR}
|
||||
NO_CMAKE_FIND_ROOT_PATH)
|
||||
|
||||
add_library(ggml::ggml UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::ggml
|
||||
PROPERTIES
|
||||
IMPORTED_LOCATION "${GGML_LIBRARY}")
|
||||
|
||||
find_library(GGML_BASE_LIBRARY ggml-base
|
||||
REQUIRED
|
||||
HINTS ${GGML_LIB_DIR}
|
||||
NO_CMAKE_FIND_ROOT_PATH)
|
||||
|
||||
add_library(ggml::ggml-base UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::ggml-base
|
||||
PROPERTIES
|
||||
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
|
||||
|
||||
if (NOT GGML_SHARED_LIB)
|
||||
if (APPLE AND GGML_ACCELERATE)
|
||||
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${ACCELERATE_FRAMEWORK})
|
||||
endif()
|
||||
|
||||
if (GGML_OPENMP)
|
||||
find_package(OpenMP REQUIRED)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
endif()
|
||||
|
||||
if (GGML_CPU_HBM)
|
||||
find_library(memkind memkind REQUIRED)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES memkind)
|
||||
endif()
|
||||
|
||||
if (GGML_BLAS)
|
||||
find_package(BLAS REQUIRED)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS})
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA)
|
||||
find_package(CUDAToolkit REQUIRED)
|
||||
endif()
|
||||
|
||||
if (GGML_METAL)
|
||||
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
||||
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
|
||||
list(APPEND GGML_METAL_INTERFACE_LINK_LIBRARIES
|
||||
${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
|
||||
endif()
|
||||
|
||||
if (GGML_VULKAN)
|
||||
find_package(Vulkan REQUIRED)
|
||||
list(APPEND GGML_VULKAN_INTERFACE_LINK_LIBRARIES Vulkan::Vulkan)
|
||||
endif()
|
||||
|
||||
if (GGML_HIP)
|
||||
find_package(hip REQUIRED)
|
||||
find_package(hipblas REQUIRED)
|
||||
find_package(rocblas REQUIRED)
|
||||
list(APPEND GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas)
|
||||
endif()
|
||||
|
||||
if (GGML_SYCL)
|
||||
find_package(DNNL)
|
||||
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl)
|
||||
endif()
|
||||
if (WIN32)
|
||||
find_package(IntelSYCL REQUIRED)
|
||||
find_package(MKL REQUIRED)
|
||||
list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(_ggml_all_targets "")
|
||||
foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
|
||||
string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
|
||||
string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
|
||||
|
||||
find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
|
||||
REQUIRED
|
||||
HINTS ${GGML_LIB_DIR}
|
||||
NO_CMAKE_FIND_ROOT_PATH)
|
||||
|
||||
message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")
|
||||
|
||||
add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
|
||||
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
|
||||
IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
|
||||
INTERFACE_COMPILE_FEATURES c_std_90
|
||||
POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
|
||||
if(is_cpu_variant)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml" "ggml::ggml-base")
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")
|
||||
|
||||
if(GGML_CPU_INTERFACE_LINK_OPTIONS)
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
|
||||
endif()
|
||||
|
||||
else()
|
||||
list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml" "ggml::ggml-base")
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")
|
||||
|
||||
if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
|
||||
set_target_properties(ggml::${_ggml_backend}
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
list(APPEND _ggml_all_targets ggml::${_ggml_backend})
|
||||
endforeach()
|
||||
|
||||
add_library(ggml::all INTERFACE IMPORTED)
|
||||
set_target_properties(ggml::all
|
||||
PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}")
|
||||
|
||||
check_required_components(ggml)
|
||||
@@ -203,8 +203,6 @@ extern "C" {
|
||||
// Backend registry
|
||||
//
|
||||
|
||||
GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
|
||||
|
||||
// Backend (reg) enumeration
|
||||
GGML_API size_t ggml_backend_reg_count(void);
|
||||
GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
|
||||
|
||||
@@ -1384,20 +1384,16 @@ extern "C" {
|
||||
float scale,
|
||||
float max_bias);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
|
||||
GGML_API struct ggml_tensor * ggml_soft_max_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
float scale,
|
||||
float max_bias);
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_soft_max_ext_back_inplace(
|
||||
GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
float scale,
|
||||
float max_bias);
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// rotary position embedding
|
||||
// if (mode & 1) - skip n_past elements (NOT SUPPORTED)
|
||||
|
||||
@@ -250,17 +250,6 @@ function(ggml_add_backend_library backend)
|
||||
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD)
|
||||
target_compile_definitions(${backend} PUBLIC GGML_BACKEND_SHARED)
|
||||
endif()
|
||||
|
||||
if(NOT GGML_AVAILABLE_BACKENDS)
|
||||
set(GGML_AVAILABLE_BACKENDS "${backend}"
|
||||
CACHE INTERNAL "List of backends for cmake package")
|
||||
else()
|
||||
list(FIND GGML_AVAILABLE_BACKENDS "${backend}" has_backend)
|
||||
if(has_backend EQUAL -1)
|
||||
set(GGML_AVAILABLE_BACKENDS "${GGML_AVAILABLE_BACKENDS};${backend}"
|
||||
CACHE INTERNAL "List of backends for cmake package")
|
||||
endif()
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
function(ggml_add_backend backend)
|
||||
@@ -308,7 +297,7 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
# MSVC doesn't support AMX
|
||||
ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||
endif()
|
||||
elseif (GGML_CPU)
|
||||
else ()
|
||||
ggml_add_cpu_backend_variant_impl("")
|
||||
endif()
|
||||
|
||||
|
||||
@@ -37,7 +37,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
|
||||
return true;
|
||||
}
|
||||
|
||||
// ops that return true for this function must not use restrict pointers for their backend implementations
|
||||
static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
switch (op) {
|
||||
case GGML_OP_SCALE:
|
||||
@@ -53,12 +52,8 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
case GGML_OP_LOG:
|
||||
case GGML_OP_UNARY:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_ROPE_BACK:
|
||||
case GGML_OP_SILU_BACK:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_RMS_NORM_BACK:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_SOFT_MAX_BACK:
|
||||
return true;
|
||||
|
||||
default:
|
||||
|
||||
@@ -208,6 +208,7 @@ extern "C" {
|
||||
|
||||
// Internal backend registry API
|
||||
GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
|
||||
GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
|
||||
|
||||
// Add backend dynamic loading support to the backend
|
||||
|
||||
|
||||
@@ -5573,88 +5573,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
||||
|
||||
uint32_t utmp[4];
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
|
||||
|
||||
memcpy(utmp, x[i].scales, K_SCALE_SIZE);
|
||||
|
||||
uint32x2_t mins8 = { 0 };
|
||||
mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
|
||||
mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
|
||||
|
||||
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||
utmp[0] &= kmask1;
|
||||
|
||||
const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8)));
|
||||
const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)),
|
||||
vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)));
|
||||
sumf -= dmin * vaddvq_s32(prod);
|
||||
|
||||
const uint8_t * scales = (const uint8_t *)utmp;
|
||||
|
||||
const uint8_t * restrict q4 = x[i].qs;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
const int vector_length = ggml_cpu_get_sve_cnt()*8;
|
||||
const svuint8_t m4b = svdup_n_u8(0xf);
|
||||
const svint32_t mzero = svdup_n_s32(0);
|
||||
svint32_t sumi1 = svdup_n_s32(0);
|
||||
svint32_t sumi1_1 = svdup_n_s32(0);
|
||||
svint32_t sumi1_2 = svdup_n_s32(0);
|
||||
svint32_t sumi2 = svdup_n_s32(0);
|
||||
svint32_t sumi2_1 = svdup_n_s32(0);
|
||||
svint32_t sumi2_2 = svdup_n_s32(0);
|
||||
switch (vector_length) {
|
||||
case 128:
|
||||
{
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), m4b));
|
||||
svint8_t q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
|
||||
sumi1_1 = svmla_n_s32_x(svptrue_b32(), sumi1_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]);
|
||||
q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4+16), m4b));
|
||||
q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
|
||||
sumi1_2 = svmla_n_s32_x(svptrue_b32(), sumi1_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]);
|
||||
|
||||
q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), 4));
|
||||
q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
|
||||
sumi2_1 = svmla_n_s32_x(svptrue_b32(), sumi2_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]);
|
||||
q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4+16), 4));
|
||||
q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
|
||||
sumi2_2 = svmla_n_s32_x(svptrue_b32(), sumi2_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]);
|
||||
q4 += 32;
|
||||
}
|
||||
sumi1 = svadd_s32_x(svptrue_b32(), sumi1_1, sumi1_2);
|
||||
sumi2 = svadd_s32_x(svptrue_b32(), sumi2_1, sumi2_2);
|
||||
sumf += d * (svaddv_s32(svptrue_b32(), svadd_s32_x(svptrue_b32(), sumi1, sumi2)));
|
||||
} break;
|
||||
case 256:
|
||||
case 512:
|
||||
{
|
||||
for (int j = 0; j < QK_K/64; ++j) {
|
||||
const svuint8_t q4bits = svld1_u8(svptrue_pat_b8(SV_VL32), q4); q4 += 32;
|
||||
svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_pat_b8(SV_VL32), q4bits, m4b));
|
||||
svint8_t q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32;
|
||||
sumi1 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]);
|
||||
|
||||
q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q4bits, 4));
|
||||
q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32;
|
||||
sumi2 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]);
|
||||
}
|
||||
sumf += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), svadd_s32_x(svptrue_pat_b32(SV_VL8), sumi1, sumi2)));
|
||||
} break;
|
||||
default:
|
||||
assert(false && "Unsupported vector length");
|
||||
break;
|
||||
}
|
||||
}
|
||||
*s = sumf;
|
||||
#elif __ARM_NEON
|
||||
#ifdef __ARM_NEON
|
||||
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
||||
const int32x4_t mzero = vdupq_n_s32(0);
|
||||
|
||||
|
||||
@@ -1302,7 +1302,7 @@ struct ggml_threadpool {
|
||||
// these are atomic as an annotation for thread-sanitizer
|
||||
atomic_bool stop; // Used for stopping the threadpool altogether
|
||||
atomic_bool pause; // Used for pausing the threadpool or individual threads
|
||||
atomic_int abort; // Used for aborting processing of a graph
|
||||
atomic_bool abort; // Used for aborting processing of a graph
|
||||
|
||||
struct ggml_compute_state * workers; // per thread state
|
||||
int n_threads_max; // number of threads in the pool
|
||||
@@ -3967,57 +3967,6 @@ static void ggml_compute_forward_dup_bytes(
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_dup_q(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
const struct ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const enum ggml_type type = src0->type;
|
||||
ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
|
||||
|
||||
size_t qk = ggml_blck_size(type);
|
||||
const int64_t nr = ggml_nelements(src1) / qk;
|
||||
|
||||
// destination must be contiguous in the first dimension
|
||||
GGML_ASSERT(nb10 == ggml_type_size(dst->type));
|
||||
// must either have first dimension large enough to hold a row, or fully contiguous
|
||||
GGML_ASSERT((ne10 % qk) == 0 || ggml_is_contiguous(dst));
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
for (int64_t ir = ir0; ir < ir1; ++ir) {
|
||||
|
||||
uint32_t i = ir * qk;
|
||||
|
||||
const int64_t i03 = i/(ne00 * ne01 * ne02);
|
||||
const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
||||
const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
||||
const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
||||
const int64_t x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
||||
|
||||
const int64_t i13 = i/(ne10 * ne11 * ne12);
|
||||
const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
||||
const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
||||
const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
|
||||
|
||||
dequantize_row_q(
|
||||
(const void *) ((char *) src0->data + x_offset),
|
||||
(float *) ((char *) dst->data + dst_offset), qk);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_dup(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
@@ -4044,10 +3993,6 @@ static void ggml_compute_forward_dup(
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
if (ggml_is_quantized(src0->type) && dst->type == GGML_TYPE_F32) {
|
||||
ggml_compute_forward_dup_q(params, dst);
|
||||
break;
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
@@ -6746,20 +6691,20 @@ static void ggml_compute_forward_silu_back_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
const struct ggml_tensor * grad = dst->src[0];
|
||||
const struct ggml_tensor * src1 = dst->src[1];
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
const struct ggml_tensor * grad = dst->src[1];
|
||||
|
||||
assert(ggml_is_contiguous_1(grad));
|
||||
assert(ggml_is_contiguous_1(src1));
|
||||
assert(ggml_is_contiguous_1(src0));
|
||||
assert(ggml_is_contiguous_1(dst));
|
||||
assert(ggml_are_same_shape(src1, dst));
|
||||
assert(ggml_are_same_shape(src1, grad));
|
||||
assert(ggml_are_same_shape(src0, dst));
|
||||
assert(ggml_are_same_shape(src0, grad));
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int nc = src1->ne[0];
|
||||
const int nr = ggml_nrows(src1);
|
||||
const int nc = src0->ne[0];
|
||||
const int nr = ggml_nrows(src0);
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
@@ -6771,7 +6716,7 @@ static void ggml_compute_forward_silu_back_f32(
|
||||
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||
ggml_vec_silu_backward_f32(nc,
|
||||
(float *) ((char *) dst->data + i1*( dst->nb[1])),
|
||||
(float *) ((char *) src1->data + i1*(src1->nb[1])),
|
||||
(float *) ((char *) src0->data + i1*(src0->nb[1])),
|
||||
(float *) ((char *) grad->data + i1*(grad->nb[1])));
|
||||
|
||||
#ifndef NDEBUG
|
||||
@@ -6950,7 +6895,7 @@ static void ggml_compute_forward_norm_f32(
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
GGML_ASSERT(eps >= 0.0f);
|
||||
GGML_ASSERT(eps > 0.0f);
|
||||
|
||||
// TODO: optimize
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
@@ -7021,7 +6966,7 @@ static void ggml_compute_forward_rms_norm_f32(
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
GGML_ASSERT(eps >= 0.0f);
|
||||
GGML_ASSERT(eps > 0.0f);
|
||||
|
||||
// TODO: optimize
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
@@ -7073,13 +7018,12 @@ static void ggml_compute_forward_rms_norm_back_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
const struct ggml_tensor * src0 = dst->src[0]; // gradients from forward pass output
|
||||
const struct ggml_tensor * src1 = dst->src[1]; // src1 from forward pass
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
const struct ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1));
|
||||
|
||||
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src1->nb[0] == sizeof(float));
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
@@ -7098,8 +7042,8 @@ static void ggml_compute_forward_rms_norm_back_f32(
|
||||
const int64_t i12 = i02;
|
||||
const int64_t i13 = i03;
|
||||
|
||||
const float * dz = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
const float * x = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13);
|
||||
const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
const float * dz = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13);
|
||||
|
||||
ggml_float sum_xx = 0.0;
|
||||
ggml_float sum_xdz = 0.0;
|
||||
@@ -7122,9 +7066,9 @@ static void ggml_compute_forward_rms_norm_back_f32(
|
||||
{
|
||||
// z = rms_norm(x)
|
||||
//
|
||||
// rms_norm(src1) =
|
||||
// rms_norm(src0) =
|
||||
// scale(
|
||||
// src1,
|
||||
// src0,
|
||||
// div(
|
||||
// 1,
|
||||
// sqrt(
|
||||
@@ -7132,13 +7076,13 @@ static void ggml_compute_forward_rms_norm_back_f32(
|
||||
// scale(
|
||||
// sum(
|
||||
// sqr(
|
||||
// src1)),
|
||||
// src0)),
|
||||
// (1.0/N)),
|
||||
// eps))));
|
||||
|
||||
// postorder:
|
||||
// ## op args grad
|
||||
// 00 param src1 grad[#00]
|
||||
// 00 param src0 grad[#00]
|
||||
// 01 const 1
|
||||
// 02 sqr (#00) grad[#02]
|
||||
// 03 sum (#02) grad[#03]
|
||||
@@ -7215,7 +7159,6 @@ static void ggml_compute_forward_rms_norm_back_f32(
|
||||
// dx := scale(dx, rrms)
|
||||
float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
|
||||
|
||||
// dx[i00] = (x*(-sum_xdz/sum_eps) + dz) / sqrtf(mean_eps)
|
||||
ggml_vec_cpy_f32 (ne00, dx, x);
|
||||
// ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps);
|
||||
ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps);
|
||||
@@ -7807,13 +7750,12 @@ static void ggml_compute_forward_out_prod_f32(
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
GGML_ASSERT(ne0 == ne00);
|
||||
GGML_ASSERT(ne1 == ne10);
|
||||
GGML_ASSERT(ne2 == ne12);
|
||||
GGML_ASSERT(ne3 == ne13);
|
||||
|
||||
GGML_ASSERT(ne2 % ne02 == 0);
|
||||
GGML_ASSERT(ne3 % ne03 == 0);
|
||||
GGML_ASSERT(ne0 == ne00);
|
||||
GGML_ASSERT(ne1 == ne10);
|
||||
GGML_ASSERT(ne2 == ne02);
|
||||
GGML_ASSERT(ne02 == ne12);
|
||||
GGML_ASSERT(ne3 == ne13);
|
||||
GGML_ASSERT(ne03 == ne13);
|
||||
|
||||
// we don't support permuted src0 or src1
|
||||
GGML_ASSERT(nb00 == sizeof(float));
|
||||
@@ -7855,10 +7797,6 @@ static void ggml_compute_forward_out_prod_f32(
|
||||
const int64_t blck_0 = MAX(GGML_VEC_MAD_UNROLL, 32);
|
||||
const int64_t blck_1 = 16;
|
||||
|
||||
// dps == dst per src0, used for group query attention
|
||||
const int64_t dps2 = ne2 / ne02;
|
||||
const int64_t dps3 = ne3 / ne03;
|
||||
|
||||
for (int64_t bir = ir0; bir < ir1; bir += blck_1) {
|
||||
const int64_t bir1 = MIN(bir + blck_1, ir1);
|
||||
for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) {
|
||||
@@ -7869,8 +7807,8 @@ static void ggml_compute_forward_out_prod_f32(
|
||||
const int64_t i2 = (ir - i3*ne2*ne1)/ne1;
|
||||
const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1);
|
||||
|
||||
const int64_t i02 = i2 / dps2;
|
||||
const int64_t i03 = i3 / dps3;
|
||||
const int64_t i02 = i2;
|
||||
const int64_t i03 = i3;
|
||||
|
||||
//const int64_t i10 = i1;
|
||||
const int64_t i12 = i2;
|
||||
@@ -7883,7 +7821,7 @@ static void ggml_compute_forward_out_prod_f32(
|
||||
|
||||
float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03));
|
||||
float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
|
||||
float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
|
||||
float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
|
||||
|
||||
ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1);
|
||||
}
|
||||
@@ -7892,7 +7830,7 @@ static void ggml_compute_forward_out_prod_f32(
|
||||
|
||||
float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03));
|
||||
float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
|
||||
float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
|
||||
float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
|
||||
|
||||
ggml_vec_mad_f32(ne0, d, s0, *s1);
|
||||
}
|
||||
@@ -8968,9 +8906,9 @@ static void ggml_compute_forward_soft_max(
|
||||
}
|
||||
|
||||
|
||||
// ggml_compute_forward_soft_max_ext_back
|
||||
// ggml_compute_forward_soft_max_back
|
||||
|
||||
static void ggml_compute_forward_soft_max_ext_back_f32(
|
||||
static void ggml_compute_forward_soft_max_back_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
@@ -8983,14 +8921,6 @@ static void ggml_compute_forward_soft_max_ext_back_f32(
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
||||
GGML_ASSERT(ggml_are_same_shape(src1, dst));
|
||||
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
|
||||
memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float));
|
||||
|
||||
GGML_ASSERT(max_bias == 0.0f);
|
||||
|
||||
// TODO: handle transposed/permuted matrices
|
||||
|
||||
const int ith = params->ith;
|
||||
@@ -9039,11 +8969,10 @@ static void ggml_compute_forward_soft_max_ext_back_f32(
|
||||
|
||||
// linear runtime, no additional memory
|
||||
float dot_y_dy = 0;
|
||||
ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
|
||||
ggml_vec_cpy_f32 (nc, dx, dy);
|
||||
ggml_vec_acc1_f32 (nc, dx, -dot_y_dy);
|
||||
ggml_vec_mul_f32 (nc, dx, dx, y);
|
||||
ggml_vec_scale_f32(nc, dx, scale);
|
||||
ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
|
||||
ggml_vec_cpy_f32 (nc, dx, dy);
|
||||
ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
|
||||
ggml_vec_mul_f32 (nc, dx, dx, y);
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int i = 0; i < nc; ++i) {
|
||||
@@ -9054,7 +8983,7 @@ static void ggml_compute_forward_soft_max_ext_back_f32(
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_soft_max_ext_back(
|
||||
static void ggml_compute_forward_soft_max_back(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
@@ -9063,7 +8992,7 @@ static void ggml_compute_forward_soft_max_ext_back(
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_soft_max_ext_back_f32(params, dst);
|
||||
ggml_compute_forward_soft_max_back_f32(params, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
@@ -10056,10 +9985,9 @@ static void ggml_compute_forward_im2col_back_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
const struct ggml_tensor * src0 = dst->src[0]; // gradients of forward pass output
|
||||
const struct ggml_tensor * src1 = dst->src[1]; // convolution kernel
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
const struct ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
@@ -10081,11 +10009,11 @@ static void ggml_compute_forward_im2col_back_f32(
|
||||
const int64_t IH = is_2D ? ne1 : 1;
|
||||
const int64_t IW = ne0;
|
||||
|
||||
const int64_t KH = is_2D ? ne11 : 1;
|
||||
const int64_t KW = ne10;
|
||||
const int64_t KH = is_2D ? ne01 : 1;
|
||||
const int64_t KW = ne00;
|
||||
|
||||
const int64_t OH = is_2D ? ne02 : 1;
|
||||
const int64_t OW = ne01;
|
||||
const int64_t OH = is_2D ? ne12 : 1;
|
||||
const int64_t OW = ne11;
|
||||
|
||||
int ofs0 = is_2D ? nb3 : nb2;
|
||||
int ofs1 = is_2D ? nb2 : nb1;
|
||||
@@ -10131,9 +10059,9 @@ static void ggml_compute_forward_im2col_back_f32(
|
||||
continue;
|
||||
}
|
||||
|
||||
const float * const grad_in = (const float *) src0->data
|
||||
const float * const src_data = (const float *) src1->data
|
||||
+ (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
|
||||
grad += grad_in[iic*(KH*KW) + ikh*KW + ikw];
|
||||
grad += src_data[iic*(KH*KW) + ikh*KW + ikw];
|
||||
}
|
||||
}
|
||||
float * dst_data = (float *)((char *) wdata + (in*ofs0 + iic*ofs1)); // [IH, IW]
|
||||
@@ -12556,22 +12484,22 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
const struct ggml_tensor * grad = dst->src[0]; // gradient of forward pass output
|
||||
const struct ggml_tensor * src0f = dst->src[1]; // src0 of forward pass
|
||||
const struct ggml_tensor * src1f = dst->src[2]; // src1 of forward pass
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
const struct ggml_tensor * src1 = dst->src[1];
|
||||
const struct ggml_tensor * opt0 = dst->src[2];
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
GGML_ASSERT(ggml_is_contiguous(src0f));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1f));
|
||||
GGML_ASSERT(ggml_is_contiguous(grad));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0f, src1f) && ggml_are_same_shape(src0f, dst));
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
GGML_ASSERT(ggml_is_contiguous(opt0));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
|
||||
|
||||
const int64_t ith = params->ith;
|
||||
const int64_t nth = params->nth;
|
||||
|
||||
// TODO: handle transposed/permuted matrices
|
||||
const int64_t nc = src0f->ne[0];
|
||||
const int64_t nr = ggml_nrows(src0f);
|
||||
const int64_t nc = src0->ne[0];
|
||||
const int64_t nr = ggml_nrows(src0);
|
||||
|
||||
// rows per thread
|
||||
const int64_t dr = (nr + nth - 1)/nth;
|
||||
@@ -12580,12 +12508,12 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
||||
const int64_t ir0 = dr*ith;
|
||||
const int64_t ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
const float d_by_nr = ((const float *) grad->data)[0] / (float) nr;
|
||||
const float d_by_nr = ((const float *) opt0->data)[0] / (float) nr;
|
||||
|
||||
for (int64_t i1 = ir0; i1 < ir1; i1++) {
|
||||
float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
|
||||
const float * s0 = (const float *)((const char *) src0f->data + i1*src0f->nb[1]);
|
||||
const float * s1 = (const float *)((const char *) src1f->data + i1*src1f->nb[1]);
|
||||
float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
|
||||
float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
|
||||
float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int64_t i = 0; i < nc; ++i) {
|
||||
@@ -12598,11 +12526,11 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
||||
// soft_max
float max = -INFINITY;
ggml_vec_max_f32(nc, &max, s0);
const ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max);
ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max);
assert(sum > 0.0);
ggml_vec_scale_f32(nc, ds0, 1.0/sum);

// grad(src0f) = (softmax(src0f) - src1f) * grad(cross_entropy_loss(src0f, src1f)) / nr
// grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
ggml_vec_sub_f32(nc, ds0, ds0, s1);
ggml_vec_scale_f32(nc, ds0, d_by_nr);
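A compact per-row reference for the gradient formula stated in the comments above (helper name and signature hypothetical): ds0 = (softmax(s0) - s1) * d/nr.

#include <math.h>

static void cross_entropy_loss_back_row_ref(const int nc, float * ds0, const float * s0, const float * s1, const float d_by_nr) {
    float max = -INFINITY;
    for (int i = 0; i < nc; ++i) {
        max = fmaxf(max, s0[i]);      // running maximum for a stable softmax
    }
    float sum = 0.0f;
    for (int i = 0; i < nc; ++i) {
        ds0[i] = expf(s0[i] - max);
        sum   += ds0[i];
    }
    for (int i = 0; i < nc; ++i) {
        ds0[i] = (ds0[i]/sum - s1[i])*d_by_nr; // (softmax - labels) scaled by d/nr
    }
}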
@@ -12899,7 +12827,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
} break;
|
||||
case GGML_OP_SOFT_MAX_BACK:
|
||||
{
|
||||
ggml_compute_forward_soft_max_ext_back(params, tensor);
|
||||
ggml_compute_forward_soft_max_back(params, tensor);
|
||||
} break;
|
||||
case GGML_OP_ROPE:
|
||||
{
|
||||
@@ -13851,14 +13779,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
/*.threadpool=*/ tp,
|
||||
};

for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) {
struct ggml_tensor * node = cgraph->nodes[node_n];

ggml_compute_forward(&params, node);

if (state->ith == 0 && cplan->abort_callback &&
cplan->abort_callback(cplan->abort_callback_data)) {
atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
tp->abort = true;
tp->ec = GGML_STATUS_ABORTED;
}
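A self-contained sketch of the abort handshake this hunk switches to (all names here are illustrative, not from the patch): the abort flag becomes an atomic node index, with -1 meaning not aborted, so every worker stops before the first node that must not run.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int abort_node; // set to -1 before the workers start; -1 means "not aborted"

static void worker_sketch(const int n_nodes, bool (*abort_cb)(void)) {
    for (int n = 0; n < n_nodes && atomic_load_explicit(&abort_node, memory_order_relaxed) != n; n++) {
        // ... compute graph node n ...
        if (abort_cb && abort_cb()) {
            // record the first node that must not run; peers stop once they reach it
            atomic_store_explicit(&abort_node, n + 1, memory_order_relaxed);
        }
    }
}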
@@ -14031,7 +13959,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
|
||||
threadpool->current_chunk = 0;
|
||||
threadpool->stop = false;
|
||||
threadpool->pause = tpp->paused;
|
||||
threadpool->abort = -1;
|
||||
threadpool->abort = false;
|
||||
threadpool->workers = NULL;
|
||||
threadpool->n_threads_max = tpp->n_threads;
|
||||
threadpool->n_threads_cur = tpp->n_threads;
|
||||
@@ -14110,7 +14038,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
||||
threadpool->cgraph = cgraph;
|
||||
threadpool->cplan = cplan;
|
||||
threadpool->current_chunk = 0;
|
||||
threadpool->abort = -1;
|
||||
threadpool->abort = false;
|
||||
threadpool->ec = GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -403,21 +403,10 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
|
||||
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
||||
case GGML_OP_MUL_MAT:
|
||||
return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
|
||||
case GGML_OP_SOFT_MAX_BACK: {
|
||||
if (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
float max_bias = 0.0f;
|
||||
|
||||
memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));
|
||||
|
||||
return max_bias == 0.0f;
|
||||
}
|
||||
case GGML_OP_IM2COL_BACK:
|
||||
return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
|
||||
case GGML_OP_OUT_PROD:
|
||||
return (src0->type == GGML_TYPE_F32 || (ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
|
||||
src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
|
||||
return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -93,31 +93,26 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
|
||||
|
||||
template <typename T>
|
||||
static __global__ void k_repeat_back(
|
||||
const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
|
||||
const size_t s00, const size_t s01, const size_t s02, const size_t s03,
|
||||
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3) {
|
||||
const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02,
|
||||
const int64_t ne0, const int64_t ne1, const int64_t ne2) {
|
||||
|
||||
const int64_t tid0 = int64_t(blockIdx.x)*blockDim.x + threadIdx.x;
|
||||
const int64_t tid1 = int64_t(blockIdx.y)*blockDim.y + threadIdx.y;
|
||||
const int64_t tid23 = int64_t(blockIdx.z)*blockDim.z + threadIdx.z;
|
||||
const int64_t tid2 = tid23 % ne2;
|
||||
const int64_t tid3 = tid23 / ne2;
|
||||
const int64_t tid0 = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
|
||||
const int64_t tid1 = (int64_t) blockIdx.y*blockDim.y + threadIdx.y;
|
||||
const int64_t tid2 = (int64_t) blockIdx.z*blockDim.z + threadIdx.z;
|
||||
|
||||
if (tid0 >= ne0) {
|
||||
return;
|
||||
}

T sum = 0;
for (int64_t i3 = tid3; i3 < ne03; i3 += ne3) {
for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) {
for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) {
for (int64_t i0 = tid0; i0 < ne00; i0 += ne0) {
sum += src[i3*s03 + i2*s02 + i1*s01 + i0*s00];
}
for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) {
for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) {
for (int64_t i0 = tid0; i0 < ne00; i0 += ne0) {
sum += src[i2*ne01*ne00 + i1*ne00 + i0];
}
}
}
dst[tid3*ne2*ne1*ne0 + tid2*ne1*ne0 + tid1*ne0 + tid0] = sum;
dst[tid2*ne1*ne0 + tid1*ne0 + tid0] = sum;
}
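For orientation, a plain CPU sketch of the reduction k_repeat_back performs in the contiguous 3-D case shown above (function name hypothetical): every dst element sums all src elements that fold onto it when dst is tiled up to src's shape.

static void repeat_back_ref(const float * src, float * dst,
                            const int64_t ne00, const int64_t ne01, const int64_t ne02,  // src dims
                            const int64_t ne0,  const int64_t ne1,  const int64_t ne2) { // dst dims
    for (int64_t i2 = 0; i2 < ne2; ++i2) {
        for (int64_t i1 = 0; i1 < ne1; ++i1) {
            for (int64_t i0 = 0; i0 < ne0; ++i0) {
                float sum = 0.0f;
                for (int64_t j2 = i2; j2 < ne02; j2 += ne2) {
                    for (int64_t j1 = i1; j1 < ne01; j1 += ne1) {
                        for (int64_t j0 = i0; j0 < ne00; j0 += ne0) {
                            sum += src[j2*ne01*ne00 + j1*ne00 + j0]; // all repeats of (i0,i1,i2)
                        }
                    }
                }
                dst[i2*ne1*ne0 + i1*ne0 + i0] = sum;
            }
        }
    }
}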
template<float (*bin_op)(const float, const float)>
|
||||
@@ -279,14 +274,12 @@ struct bin_bcast_cuda {
|
||||
|
||||
template <typename T>
|
||||
static void repeat_back_cuda(
|
||||
const T * src, T * dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
|
||||
const size_t s00, const size_t s01, const size_t s02, const size_t s03,
|
||||
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
|
||||
const T * src, T * dst, const int64_t ne00, const int64_t ne01, const int64_t ne02,
|
||||
const int64_t ne0, const int64_t ne1, const int64_t ne2, cudaStream_t stream) {
|
||||
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
const dim3 block_nums((ne0 + WARP_SIZE - 1) / WARP_SIZE, ne1, ne2*ne3);
|
||||
k_repeat_back<T><<<block_nums, block_dims, 0, stream>>>
|
||||
(src, dst, ne00, ne01, ne02, ne03, s00, s01, s02, s03, ne0, ne1, ne2, ne3);
|
||||
const dim3 block_nums((ne0 + WARP_SIZE - 1) / WARP_SIZE, ne1, ne2);
|
||||
k_repeat_back<T><<<block_nums, block_dims, 0, stream>>>(src, dst, ne00, ne01, ne02, ne0, ne1, ne2);
|
||||
}
|
||||
|
||||
template<class op>
|
||||
@@ -333,26 +326,27 @@ void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
GGML_ASSERT(src0->type == dst->type);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
GGML_ASSERT(ggml_can_repeat(dst, src0));
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS;
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
GGML_ASSERT(src0->ne[3] == 1);
|
||||
|
||||
GGML_ASSERT(ne2*ne3 <= (1 << 15));
|
||||
|
||||
const size_t ts = ggml_type_size(src0->type);
|
||||
const size_t s00 = nb00 / ts;
|
||||
const size_t s01 = nb01 / ts;
|
||||
const size_t s02 = nb02 / ts;
|
||||
const size_t s03 = nb03 / ts;
|
||||
const int64_t ne0 = dst->ne[0];
|
||||
const int64_t ne1 = dst->ne[1];
|
||||
const int64_t ne2 = dst->ne[2];
|
||||
GGML_ASSERT(dst->ne[3] == 1);
|
||||
|
||||
switch (dst->type) {
|
||||
case GGML_TYPE_F32: {
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
repeat_back_cuda(src0_d, dst_d, ne00, ne01, ne02, ne03, s00, s01, s02, s03, ne0, ne1, ne2, ne3, stream);
|
||||
repeat_back_cuda<float>(src0_d, dst_d, ne00, ne01, ne02, ne0, ne1, ne2, stream);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ASSERT(false);
|
||||
|
||||
@@ -46,20 +46,20 @@
|
||||
#define GGML_CUDA_CC_VOLTA 700
|
||||
#define GGML_CUDA_CC_TURING 750
|
||||
#define GGML_CUDA_CC_AMPERE 800
|
||||
#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
|
||||
#define GGML_CUDA_CC_OFFSET_AMD 1000000
|
||||
|
||||
// GCN/CNDA, wave size is 64
|
||||
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
|
||||
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
|
||||
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
|
||||
#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
|
||||
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renameing
|
||||
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
|
||||
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
|
||||
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
|
||||
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
|
||||
#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
|
||||
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing
|
||||
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300
|
||||
|
||||
// RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
|
||||
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
|
||||
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
|
||||
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
|
||||
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
|
||||
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
|
||||
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
|
||||
|
||||
#define GGML_CUDA_CC_QY1 210
|
||||
#define GGML_CUDA_CC_QY2 220
|
||||
@@ -131,10 +131,6 @@ typedef float dfloat; // dequantize float
|
||||
typedef float2 dfloat2;
|
||||
#endif // GGML_CUDA_F16
|
||||
|
||||
#if (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
|
||||
#define GGML_USE_VMM
|
||||
#endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
|
||||
|
||||
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
|
||||
#define FP16_AVAILABLE
|
||||
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
|
||||
@@ -592,7 +588,7 @@ struct ggml_tensor_extra_gpu {
|
||||
};
|
||||
|
||||
|
||||
#if ((CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)) || defined(GGML_HIP_GRAPHS)
|
||||
#if (CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)
|
||||
#define USE_CUDA_GRAPH
|
||||
#endif
|
||||
|
||||
|
||||
@@ -5,89 +5,95 @@
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
|
||||
template <bool use_shared>
|
||||
static __global__ void cross_entropy_loss_f32(
|
||||
const float * __restrict__ logits, const float * __restrict__ labels, float * __restrict__ dst, const int nclasses, const int k) {
|
||||
extern __shared__ float tmp[];
|
||||
static __global__ void cross_entropy_loss_f32(const float * logits, const float * labels, float * dst, const int nclasses, const int k) {
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
const int i0 = blockDim.x*blockIdx.x + warp_id*WARP_SIZE;
|
||||
|
||||
logits += int64_t(blockIdx.x)*nclasses;
|
||||
labels += int64_t(blockIdx.x)*nclasses;
|
||||
const int ne_tmp = WARP_SIZE*nclasses;
|
||||
|
||||
extern __shared__ float tmp_all[];
|
||||
float * tmp_logits = tmp_all + (2*warp_id + 0)*ne_tmp;
|
||||
float * tmp_labels = tmp_all + (2*warp_id + 1)*ne_tmp;
|
||||
|
||||
// Each warp first loads ne_tmp logits/labels into shared memory:
|
||||
for (int i = lane_id; i < ne_tmp; i += WARP_SIZE) {
|
||||
const int ig = i0*nclasses + i; // ig == i global
|
||||
|
||||
tmp_logits[i] = ig < k*nclasses ? logits[ig] : 0.0f;
|
||||
tmp_labels[i] = ig < k*nclasses ? labels[ig] : 0.0f;
|
||||
}
|
||||
|
||||
// Each thread in the warp then calculates the cross entropy loss for a single row.
|
||||
// TODO: pad in order to avoid shared memory bank conflicts.
|
||||
|
||||
// Find maximum for softmax:
|
||||
float max_logit = -INFINITY;
|
||||
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
|
||||
const float val = logits[i];
|
||||
max_logit = fmaxf(max_logit, val);
|
||||
|
||||
if (use_shared) {
|
||||
tmp[i] = val;
|
||||
}
|
||||
float max = -INFINITY;
|
||||
for (int i = 0; i < nclasses; ++i) {
|
||||
max = fmaxf(max, tmp_logits[lane_id*nclasses + i]);
|
||||
}
|
||||
max_logit = warp_reduce_max(max_logit);
|
||||
|
||||
// Calculate log(softmax(logits)) which is just logits - max:
|
||||
float sum = 0.0f;
|
||||
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
|
||||
const float logit_i = use_shared ? tmp[i] : logits[i];
|
||||
sum += expf(logit_i - max_logit);
|
||||
for (int i = 0; i < nclasses; ++i) {
|
||||
float val = tmp_logits[lane_id*nclasses + i] - max;
|
||||
sum += expf(val);
|
||||
tmp_logits[lane_id*nclasses + i] = val;
|
||||
}
|
||||
sum = warp_reduce_sum(sum);
|
||||
sum = logf(sum);
|
||||
|
||||
// log(exp(logits - max) / sum) = (logits - max) - log(sum)
|
||||
float loss = 0.0f;
|
||||
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
|
||||
const float logit_i = use_shared ? tmp[i] : logits[i];
|
||||
loss += (logit_i - max_logit - sum) * labels[i];
|
||||
for (int i = 0; i < nclasses; ++i) {
|
||||
loss += (tmp_logits[lane_id*nclasses + i] - sum) * tmp_labels[lane_id*nclasses + i];
|
||||
}
loss = -warp_reduce_sum(loss) / (float)k;

if (threadIdx.x != 0) {
__syncthreads();

if (lane_id == 0) {
tmp_all[warp_id] = loss;
}

__syncthreads();

if (warp_id != 0) {
return;
}

loss = lane_id < CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE/WARP_SIZE ? tmp_all[lane_id] : 0.0f;
loss = warp_reduce_sum(loss);

if (lane_id != 0) {
return;
}

dst[blockIdx.x] = loss;
}
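As a reading aid, a scalar per-row reference for the quantity both kernel variants above compute before their warp/block reductions (helper name hypothetical): loss_row = -sum_i labels[i] * log_softmax(logits)[i]; the kernels then sum rows and divide by k.

#include <math.h>

static float cross_entropy_loss_row_ref(const int nclasses, const float * logits, const float * labels) {
    float max_logit = -INFINITY;
    for (int i = 0; i < nclasses; ++i) {
        max_logit = fmaxf(max_logit, logits[i]);
    }
    float sum = 0.0f;
    for (int i = 0; i < nclasses; ++i) {
        sum += expf(logits[i] - max_logit);
    }
    const float log_sum = logf(sum);
    float loss = 0.0f;
    for (int i = 0; i < nclasses; ++i) {
        loss += (logits[i] - max_logit - log_sum)*labels[i]; // log_softmax weighted by label
    }
    return -loss;
}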
template <bool use_shared>
|
||||
static __global__ void cross_entropy_loss_back_f32(
|
||||
const float * __restrict__ grad, const float * __restrict__ logits, const float * __restrict__ labels,
|
||||
float * __restrict__ dst, const int nclasses) {
|
||||
static __global__ void cross_entropy_loss_back_f32(const float * logits, const float * labels, const float * loss, float * dst, const int nclasses) {
|
||||
extern __shared__ float tmp[];
|
||||
|
||||
logits += int64_t(blockIdx.x)*nclasses;
|
||||
labels += int64_t(blockIdx.x)*nclasses;
|
||||
dst += int64_t(blockIdx.x)*nclasses;
|
||||
|
||||
float maxval = -INFINITY;
|
||||
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
|
||||
const float val = logits[i];
|
||||
const float val = logits[blockIdx.x*nclasses + i];
|
||||
maxval = fmaxf(maxval, val);
|
||||
|
||||
if (use_shared) {
|
||||
tmp[i] = val;
|
||||
}
|
||||
tmp[i] = val;
|
||||
}
|
||||
maxval = warp_reduce_max(maxval);
|
||||
|
||||
float sum = 0.0f;
|
||||
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
|
||||
const float val = expf((use_shared ? tmp[i] : logits[i]) - maxval);
|
||||
const float val = expf(tmp[i] - maxval);
|
||||
sum += val;
|
||||
|
||||
if (use_shared) {
|
||||
tmp[i] = val;
|
||||
} else {
|
||||
dst[i] = val;
|
||||
}
|
||||
tmp[i] = val;
|
||||
}
|
||||
sum = warp_reduce_sum(sum);
|
||||
const float sm_scale = 1.0f/sum;
|
||||
|
||||
const float d_by_nrows = *grad/gridDim.x;
|
||||
const float d_by_nrows = *loss/gridDim.x;
|
||||
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
|
||||
const float val = use_shared ? tmp[i] : dst[i];
|
||||
dst[i] = (val*sm_scale - labels[i])*d_by_nrows;
|
||||
dst[blockIdx.x*nclasses + i] = (tmp[i]*sm_scale - labels[blockIdx.x*nclasses + i])*d_by_nrows;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -113,77 +119,48 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
ggml_cuda_pool & pool = ctx.pool();
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
const dim3 blocks_dim(WARP_SIZE, 1, 1);
|
||||
const dim3 blocks_num(nrows, 1, 1);
|
||||
const size_t nbytes_shared = ne00*sizeof(float);
|
||||
|
||||
const int id = ggml_cuda_get_device();
|
||||
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
|
||||
const dim3 blocks_dim(CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1);
|
||||
const dim3 blocks_num((nrows + CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE - 1) / CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1);
|
||||
const int shmem = 2*CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE*ne00*sizeof(float);
|
||||
|
||||
ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);
|
||||
|
||||
if (nbytes_shared <= smpbo) {
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
|
||||
if (!shared_memory_limit_raised[id]) {
|
||||
CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
|
||||
shared_memory_limit_raised[id] = true;
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
cross_entropy_loss_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
|
||||
} else {
|
||||
cross_entropy_loss_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
|
||||
}
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
|
||||
|
||||
// Combine results from individual blocks:
|
||||
sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * grad = dst->src[0];
|
||||
const ggml_tensor * src0f = dst->src[1];
|
||||
const ggml_tensor * src1f = dst->src[2];
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
const ggml_tensor * opt0 = dst->src[2];
|
||||
|
||||
GGML_ASSERT(src0f->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1f->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( grad->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(opt0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(ggml_is_scalar(grad));
|
||||
GGML_ASSERT(ggml_is_contiguous(src0f));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1f));
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
GGML_ASSERT(ggml_is_contiguous(opt0));
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0f, src1f));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0f, dst));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src1));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
||||
|
||||
const int64_t ne00 = src0f->ne[0];
|
||||
const int64_t nrows = ggml_nrows(src0f);
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
const float * grad_d = (const float *) grad->data;
|
||||
const float * src0f_d = (const float *) src0f->data;
|
||||
const float * src1f_d = (const float *) src1f->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
const float * opt0_d = (const float *) opt0->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
const dim3 blocks_dim(WARP_SIZE, 1, 1);
|
||||
const dim3 blocks_num(nrows, 1, 1);
|
||||
const size_t nbytes_shared = ne00*sizeof(float);
|
||||
const int shmem = ne00*sizeof(float);
|
||||
|
||||
const int id = ggml_cuda_get_device();
|
||||
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
|
||||
|
||||
if (nbytes_shared <= smpbo) {
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
|
||||
if (!shared_memory_limit_raised[id]) {
|
||||
CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
|
||||
shared_memory_limit_raised[id] = true;
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
cross_entropy_loss_back_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
|
||||
} else {
|
||||
cross_entropy_loss_back_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
|
||||
}
|
||||
cross_entropy_loss_back_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, opt0_d, dst_d, ne00);
|
||||
}
|
||||
|
||||
@@ -3,15 +3,15 @@
|
||||
|
||||
template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static __global__ void k_get_rows(
|
||||
const void * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
|
||||
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
|
||||
/*const int64_t ne10, const int64_t ne11,*/ const int64_t ne12, /*const int64_t ne13,*/
|
||||
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
|
||||
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
|
||||
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
|
||||
const void * src0, const int32_t * src1, dst_t * dst,
|
||||
int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
|
||||
/*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
|
||||
/*size_t s0,*/ size_t s1, size_t s2, size_t s3,
|
||||
/*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
|
||||
size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
|
||||
|
||||
const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
|
||||
const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
|
||||
const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
|
||||
|
||||
@@ -22,10 +22,10 @@ static __global__ void k_get_rows(
|
||||
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
|
||||
|
||||
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
|
||||
const void * src0_row = (const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03;
|
||||
const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
|
||||
|
||||
const int ib = i00/qk; // block index
|
||||
const int iqs = (i00%qk)/qr; // quant index
|
||||
const int ib = i00/qk; // block index
|
||||
const int iqs = (i00%qk)/qr; // quant index
|
||||
const int iybs = i00 - i00%qk; // dst block start index
|
||||
const int y_offset = qr == 1 ? 1 : qk/2;
|
||||
|
||||
@@ -39,15 +39,15 @@ static __global__ void k_get_rows(
|
||||
|
||||
template<typename src0_t, typename dst_t>
|
||||
static __global__ void k_get_rows_float(
|
||||
const src0_t * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
|
||||
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
|
||||
/*const int64_t ne10, const int64_t ne11,*/ const int64_t ne12, /*const int64_t ne13,*/
|
||||
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
|
||||
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
|
||||
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
|
||||
const src0_t * src0, const int32_t * src1, dst_t * dst,
|
||||
int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
|
||||
/*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
|
||||
/*size_t s0,*/ size_t s1, size_t s2, size_t s3,
|
||||
/*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
|
||||
size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
|
||||
|
||||
const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
|
||||
const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
|
||||
|
||||
@@ -58,38 +58,14 @@ static __global__ void k_get_rows_float(
|
||||
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
|
||||
|
||||
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
|
||||
const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
|
||||
const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
|
||||
|
||||
dst_row[i00] = src0_row[i00];
|
||||
}
|
||||
|
||||
template<typename grad_t, typename dst_t>
static __global__ void k_get_rows_back_float(
const grad_t * __restrict__ grad, const int32_t * __restrict__ rows, dst_t * __restrict__ dst, const int64_t ncols, const int64_t nrows_grad) {
const int col = blockIdx.x*blockDim.x + threadIdx.x;

if (col >= ncols) {
return;
}

const int dst_row = blockIdx.y*blockDim.y + threadIdx.y;

float sum = 0.0f;

for (int64_t i = 0; i < nrows_grad; ++i) {
if (rows[i] != dst_row) {
continue;
}
sum += grad[i*ncols + col];
}

dst[dst_row*ncols + col] = sum;
}
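For context, a CPU sketch of the same scatter-add (name and signature hypothetical): the backward pass of get_rows adds each gradient row onto the source row it was gathered from.

#include <stdint.h>

static void get_rows_back_ref(const float * grad, const int32_t * rows, float * dst,
                              const int64_t ncols, const int64_t nrows_grad, const int64_t nrows_dst) {
    for (int64_t r = 0; r < nrows_dst; ++r) {
        for (int64_t c = 0; c < ncols; ++c) {
            float sum = 0.0f;
            for (int64_t i = 0; i < nrows_grad; ++i) {
                if (rows[i] == r) {
                    sum += grad[i*ncols + c]; // gradient row i was gathered from dst row r
                }
            }
            dst[r*ncols + c] = sum;
        }
    }
}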
template<int qk, int qr, dequantize_kernel_t dq>
|
||||
static void get_rows_cuda(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
|
||||
static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
@@ -111,25 +87,22 @@ static void get_rows_cuda(
|
||||
GGML_ASSERT(ne00 % 2 == 0);
|
||||
|
||||
k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
|
||||
src0_dd, src1_dd, dst_dd,
|
||||
ne00, /*ne01, ne02, ne03,*/
|
||||
/*ne10, ne11,*/ ne12, /*ne13,*/
|
||||
/* s0,*/ s1, s2, s3,
|
||||
/* nb00,*/ nb01, nb02, nb03,
|
||||
s10, s11, s12/*, s13*/);
|
||||
src0_dd, src1_dd, dst_dd,
|
||||
ne00, /*ne01, ne02, ne03,*/
|
||||
/*ne10, ne11,*/ ne12, /*ne13,*/
|
||||
/* s0,*/ s1, s2, s3,
|
||||
/* nb00,*/ nb01, nb02, nb03,
|
||||
s10, s11, s12/*, s13*/);
|
||||
|
||||
GGML_UNUSED(dst);
|
||||
}
|
||||
|
||||
template<typename src0_t>
|
||||
static void get_rows_cuda_float(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
|
||||
static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
GGML_ASSERT(ne13 == 1);
|
||||
|
||||
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
|
||||
const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
|
||||
const dim3 block_nums(block_num_x, ne10, ne11*ne12);
|
||||
@@ -146,12 +119,12 @@ static void get_rows_cuda_float(
|
||||
//const size_t s13 = nb13 / ggml_element_size(src1);
|
||||
|
||||
k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
|
||||
src0_dd, src1_dd, dst_dd,
|
||||
ne00, /*ne01, ne02, ne03,*/
|
||||
/*ne10, ne11,*/ ne12, /*ne13,*/
|
||||
/* s0,*/ s1, s2, s3,
|
||||
/* nb00,*/ nb01, nb02, nb03,
|
||||
s10, s11, s12/*, s13*/);
|
||||
src0_dd, src1_dd, dst_dd,
|
||||
ne00, /*ne01, ne02, ne03,*/
|
||||
/*ne10, ne11,*/ ne12, /*ne13,*/
|
||||
/* s0,*/ s1, s2, s3,
|
||||
/* nb00,*/ nb01, nb02, nb03,
|
||||
s10, s11, s12/*, s13*/);
|
||||
|
||||
GGML_UNUSED(dst);
|
||||
}
|
||||
@@ -159,41 +132,42 @@ static void get_rows_cuda_float(
|
||||
void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
const void * src0_d = (const void *) src0->data;
|
||||
const int32_t * src1_d = (const int32_t *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
const float * src1_d = (const float *)src1->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
|
||||
GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
|
||||
GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
|
||||
GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
|
||||
|
||||
const int32_t * src1_i32 = (const int32_t *) src1_d;
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F16:
|
||||
get_rows_cuda_float(src0, src1, dst, (const half *) src0_d, src1_d, dst_d, stream);
|
||||
get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
|
||||
break;
|
||||
case GGML_TYPE_F32:
|
||||
get_rows_cuda_float(src0, src1, dst, (const float *) src0_d, src1_d, dst_d, stream);
|
||||
get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
|
||||
get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
|
||||
get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
|
||||
get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
|
||||
get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
|
||||
get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
|
||||
break;
|
||||
default:
|
||||
// TODO: k-quants
|
||||
@@ -201,34 +175,3 @@ void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0]; // gradients of forward pass output
|
||||
const ggml_tensor * src1 = dst->src[1]; // src1 in forward pass
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const int32_t * src1_d = (const int32_t *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
|
||||
GGML_ASSERT(ne02*ne03 == 1);
|
||||
GGML_ASSERT(ne12*ne13 == 1);
|
||||
GGML_ASSERT(ne2*ne3 == 1);
|
||||
|
||||
const dim3 block_dims(CUDA_GET_ROWS_BACK_BLOCK_SIZE, 1, 1);
|
||||
const int block_num_x = (ne00 + CUDA_GET_ROWS_BACK_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BACK_BLOCK_SIZE;
|
||||
const dim3 block_nums(block_num_x, ne1, 1);
|
||||
|
||||
k_get_rows_back_float<<<block_nums, block_dims, 0, stream>>>(src0_d, src1_d, dst_d, ne00, ne10);
|
||||
}
|
||||
|
||||
@@ -1,8 +1,5 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_GET_ROWS_BLOCK_SIZE 256
|
||||
#define CUDA_GET_ROWS_BACK_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -42,7 +42,6 @@
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <charconv>
|
||||
#include <cinttypes>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
@@ -63,7 +62,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
||||
[[noreturn]]
|
||||
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
|
||||
int id = -1; // in case cudaGetDevice fails
|
||||
(void)cudaGetDevice(&id);
|
||||
cudaGetDevice(&id);
|
||||
|
||||
GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
|
||||
GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
|
||||
@@ -120,78 +119,12 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
static int ggml_cuda_parse_id(char devName[]) {
|
||||
// A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
|
||||
// these values are not stable so this is susceptible to breakage
|
||||
// https://github.com/ROCm/clr/blob/amd-staging/rocclr/device/device.cpp
|
||||
int archMajor = 0x0;
|
||||
int archMinor = 0x0;
|
||||
int archNum = GGML_CUDA_CC_OFFSET_AMD;
|
||||
int archLen = strlen(devName);
|
||||
char archName[archLen + 1];
|
||||
|
||||
// strip leading 'gfx' while copying into our buffer
|
||||
if (archLen > 3) {
|
||||
strcpy(archName, &devName[3]);
|
||||
archLen -= 3;
|
||||
}
|
||||
|
||||
// trim trailing :xnack- or :sramecc- statuses
|
||||
archLen = strcspn(archName, ":");
|
||||
archName[archLen] = '\0';
|
||||
|
||||
// tease out the version information
|
||||
if (archLen > 8) {
|
||||
// versions labeled generic use '-' as delimiter
|
||||
// strip the trailing "-generic" then iterate through what remains
|
||||
if ((strstr(archName, "-generic"))) {
|
||||
archName[archLen - 8] = '\0';
|
||||
char * pch;
|
||||
if ((pch = strtok(archName, "-"))) {
|
||||
archMajor = (int)strtoul(pch, 0, 16);
|
||||
if ((pch = strtok(NULL, "-"))) {
|
||||
archMinor = 0x10 * (int)strtoul(pch, 0, 16);
|
||||
}
|
||||
}
|
||||
}
} else if (archLen >= 3) {
// last two digits should be the minor * 0x10 + stepping
archMinor = (int)strtoul(&archName[archLen - 2], 0, 16);
archName[archLen - 2] = '\0';

// only the major version remains
archMajor = (int)strtoul(archName, 0, 16);
}
archNum += archMajor * 0x100;
archNum += archMinor;
return archNum;
}
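A worked example of the parsing above, illustrative only: for the device name "gfx1100" the code strips "gfx", reads the last two hex digits as minor and stepping (0x00) and the rest as major (0x11), giving GGML_CUDA_CC_OFFSET_AMD + 0x11*0x100 + 0x00, which equals the GGML_CUDA_CC_RDNA3 value defined earlier in this diff.

// hypothetical usage sketch; ggml_cuda_parse_id is file-local, so this only illustrates the mapping
static int example_cc_for_gfx1100(void) {
    char name[] = "gfx1100";
    return ggml_cuda_parse_id(name); // GGML_CUDA_CC_OFFSET_AMD + 0x1100 == GGML_CUDA_CC_RDNA3
}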
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
static ggml_cuda_device_info ggml_cuda_init() {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
||||
// https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
|
||||
{
|
||||
int major_version = 0;
|
||||
size_t version_length = 0;
|
||||
if (rocblas_get_version_string_size(&version_length) == rocblas_status_success) {
|
||||
std::string version(version_length, '\0');
|
||||
if (rocblas_get_version_string(version.data(), version.size()) == rocblas_status_success) {
|
||||
version.resize(::strlen(version.c_str()));
|
||||
int parsed_value = 0;
|
||||
if (std::from_chars(version.c_str(), version.c_str() + version.length(), parsed_value).ec == std::errc()) {
|
||||
major_version = parsed_value;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (major_version < 4) {
|
||||
GGML_LOG_DEBUG(GGML_CUDA_NAME " calling rocblas_initialize as a workaround for a rocBLAS bug\n");
|
||||
rocblas_initialize();
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
}
|
||||
rocblas_initialize();
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
#endif
|
||||
|
||||
ggml_cuda_device_info info = {};
|
||||
@@ -219,7 +152,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
int device_vmm = 0;
|
||||
|
||||
#if defined(GGML_USE_VMM)
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||
CUdevice device;
|
||||
CU_CHECK(cuDeviceGet(&device, id));
|
||||
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
|
||||
@@ -231,11 +164,12 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
alloc_prop.location.id = id;
|
||||
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
|
||||
}
|
||||
#endif // defined(GGML_USE_VMM)
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||
info.devices[id].vmm = !!device_vmm;
|
||||
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
||||
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
||||
|
||||
info.default_tensor_split[id] = total_vram;
|
||||
total_vram += prop.totalGlobalMem;
|
||||
@@ -244,25 +178,10 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
info.devices[id].smpb = prop.sharedMemPerBlock;
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
info.devices[id].smpbo = prop.sharedMemPerBlock;
|
||||
|
||||
info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
|
||||
if ((info.devices[id].cc & 0xff00) == 0x0) {
|
||||
GGML_LOG_WARN("invalid architecture ID received for device %d %s: %s cc %d.%d\n",
|
||||
id, prop.name, prop.gcnArchName, prop.major, prop.minor);
|
||||
|
||||
// Fallback to prop.major and prop.minor
|
||||
if (prop.major > 0) {
|
||||
info.devices[id].cc = GGML_CUDA_CC_OFFSET_AMD + prop.major * 0x100;
|
||||
info.devices[id].cc += prop.minor * 0x10;
|
||||
}
|
||||
}
|
||||
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s\n",
|
||||
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff, device_vmm ? "yes" : "no");
|
||||
info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD;
|
||||
#else
|
||||
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
||||
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
||||
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
|
||||
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
}
|
||||
|
||||
@@ -381,7 +300,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
};
|
||||
|
||||
// pool with virtual memory
|
||||
#if defined(GGML_USE_VMM)
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
||||
|
||||
@@ -390,9 +309,6 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
size_t pool_used = 0;
|
||||
size_t pool_size = 0;
|
||||
size_t granularity;
|
||||
#if defined(GGML_USE_HIP)
|
||||
std::vector<std::pair<CUdeviceptr, size_t>> mappings;
|
||||
#endif
|
||||
|
||||
explicit ggml_cuda_pool_vmm(int device) :
|
||||
device(device),
|
||||
@@ -401,14 +317,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
|
||||
~ggml_cuda_pool_vmm() {
|
||||
if (pool_addr != 0) {
|
||||
#if defined(GGML_USE_HIP)
|
||||
// Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
|
||||
for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
|
||||
CU_CHECK(cuMemUnmap(mapping.first, mapping.second));
|
||||
}
|
||||
#else
|
||||
CU_CHECK(cuMemUnmap(pool_addr, pool_size));
|
||||
#endif
|
||||
CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
|
||||
}
|
||||
}
|
||||
@@ -441,11 +350,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
}
|
||||
|
||||
// map at the end of the pool
|
||||
CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
|
||||
CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
|
||||
#if defined(GGML_USE_HIP)
|
||||
mappings.push_back({start_ptr, reserve_size});
|
||||
#endif
|
||||
CU_CHECK(cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0));
|
||||
|
||||
// the memory allocation handle is no longer needed after mapping
|
||||
CU_CHECK(cuMemRelease(handle));
|
||||
@@ -455,7 +360,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||
access.location.id = device;
|
||||
access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
||||
CU_CHECK(cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1));
|
||||
CU_CHECK(cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1));
|
||||
|
||||
// add to the pool
|
||||
pool_size += reserve_size;
|
||||
@@ -467,7 +372,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
|
||||
GGML_ASSERT(pool_addr != 0);
|
||||
|
||||
void * ptr = (void *) ((CUdeviceptr)((char *)(pool_addr) + pool_used));
|
||||
void * ptr = (void *) (pool_addr + pool_used);
|
||||
*actual_size = size;
|
||||
pool_used += size;
|
||||
|
||||
@@ -486,17 +391,17 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
pool_used -= size;
|
||||
|
||||
// all deallocations must be in reverse order of the allocations
|
||||
GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
|
||||
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
|
||||
}
|
||||
};
|
||||
#endif // defined(GGML_USE_VMM)
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||
|
||||
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
|
||||
#if defined(GGML_USE_VMM)
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||
if (ggml_cuda_info().devices[device].vmm) {
|
||||
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
|
||||
}
|
||||
#endif // defined(GGML_USE_VMM)
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
|
||||
}
|
||||
|
||||
@@ -642,7 +547,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
|
||||
cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
|
||||
if (err != cudaSuccess) {
|
||||
// clear the error
|
||||
(void)cudaGetLastError();
|
||||
cudaGetLastError();
|
||||
GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
|
||||
return nullptr;
|
||||
}
|
||||
@@ -1057,7 +962,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
|
||||
cudaError_t err = cudaMallocHost((void **) &ptr, size);
|
||||
if (err != cudaSuccess) {
|
||||
// clear the error
|
||||
(void)cudaGetLastError();
|
||||
cudaGetLastError();
|
||||
GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
|
||||
size / 1024.0 / 1024.0, cudaGetErrorString(err));
|
||||
return nullptr;
|
||||
@@ -1177,9 +1082,7 @@ static void ggml_cuda_op_mul_mat_cublas(
|
||||
|
||||
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
||||
|
||||
const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
|
||||
|
||||
if (compute_capability >= GGML_CUDA_CC_VOLTA && use_fp16) {
|
||||
if (compute_capability >= GGML_CUDA_CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
|
||||
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
|
||||
ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
|
||||
if (src0->type != GGML_TYPE_F16) {
|
||||
@@ -1200,38 +1103,28 @@ static void ggml_cuda_op_mul_mat_cublas(
|
||||
to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
|
||||
}
|
||||
const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
|
||||
ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);
|
||||
|
||||
const half alpha_f16 = 1.0f;
|
||||
const half beta_f16 = 0.0f;
|
||||
|
||||
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
|
||||
if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
|
||||
cu_compute_type = CUBLAS_COMPUTE_32F;
|
||||
}
|
||||
|
||||
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
|
||||
CUBLAS_CHECK(
|
||||
cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
row_diff, src1_ncols, ne10,
|
||||
&alpha_f16, src0_ptr, CUDA_R_16F, ne00,
|
||||
src1_ptr, CUDA_R_16F, ne10,
|
||||
&beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
|
||||
cu_compute_type,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
|
||||
if (compute_capability == GGML_CUDA_CC_CDNA) {
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
CUBLAS_CHECK(
|
||||
cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
row_diff, src1_ncols, ne10,
|
||||
&alpha, src0_ptr, CUDA_R_16F, ne00,
|
||||
src1_ptr, CUDA_R_16F, ne10,
|
||||
&beta, dst_dd_i, CUDA_R_32F, ldc,
|
||||
CUBLAS_COMPUTE_32F,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
} else {
|
||||
ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);
|
||||
|
||||
const half alpha_f16 = 1.0f;
|
||||
const half beta_f16 = 0.0f;
|
||||
|
||||
CUBLAS_CHECK(
|
||||
cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
row_diff, src1_ncols, ne10,
|
||||
&alpha_f16, src0_ptr, CUDA_R_16F, ne00,
|
||||
src1_ptr, CUDA_R_16F, ne10,
|
||||
&beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
|
||||
CUBLAS_COMPUTE_16F,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
|
||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
||||
to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
|
||||
}
|
||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
||||
to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
|
||||
} else {
|
||||
ggml_cuda_pool_alloc<float> src0_ddq_as_f32(ctx.pool(id));
|
||||
ggml_cuda_pool_alloc<float> src1_ddq_as_f32(ctx.pool(id));
|
||||
@@ -1304,7 +1197,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
|
||||
CUDA_CHECK(err);
|
||||
} else {
|
||||
// reset the error
|
||||
(void)cudaGetLastError();
|
||||
cudaGetLastError();
|
||||
}
|
||||
} else {
|
||||
cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
|
||||
@@ -1312,7 +1205,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
|
||||
CUDA_CHECK(err);
|
||||
} else {
|
||||
// reset the error
|
||||
(void)cudaGetLastError();
|
||||
cudaGetLastError();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1720,6 +1613,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
||||
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
|
||||
cudaDataType_t cu_data_type = CUDA_R_16F;
|
||||
|
||||
if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
|
||||
cu_compute_type = CUBLAS_COMPUTE_32F;
|
||||
}
|
||||
|
||||
// dst strides
|
||||
size_t nbd2 = dst->nb[2];
|
||||
size_t nbd3 = dst->nb[3];
|
||||
@@ -1748,12 +1645,6 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
||||
beta = &beta_f32;
|
||||
}
|
||||
|
||||
if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
|
||||
cu_compute_type = CUBLAS_COMPUTE_32F;
|
||||
alpha = &alpha_f32;
|
||||
beta = &beta_f32;
|
||||
}
|
||||
|
||||
GGML_ASSERT(ne12 % ne02 == 0);
|
||||
GGML_ASSERT(ne13 % ne03 == 0);
|
||||
|
||||
@@ -2112,9 +2003,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_GET_ROWS:
|
||||
ggml_cuda_op_get_rows(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_GET_ROWS_BACK:
|
||||
ggml_cuda_op_get_rows_back(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_DUP:
|
||||
ggml_cuda_dup(ctx, dst);
|
||||
break;
|
||||
@@ -2203,15 +2091,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
ggml_cuda_op_leaky_relu(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SILU_BACK:
|
||||
ggml_cuda_op_silu_back(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_RMS_NORM:
|
||||
ggml_cuda_op_rms_norm(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_RMS_NORM_BACK:
|
||||
ggml_cuda_op_rms_norm_back(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
|
||||
GGML_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
|
||||
@@ -2256,9 +2138,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_SOFT_MAX:
|
||||
ggml_cuda_op_soft_max(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SOFT_MAX_BACK:
|
||||
ggml_cuda_op_soft_max_back(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_ROPE:
|
||||
ggml_cuda_op_rope(ctx, dst);
|
||||
break;
|
||||
@@ -2547,7 +2426,7 @@ static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vecto
|
||||
if (stat == cudaErrorInvalidDeviceFunction) {
|
||||
// Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
|
||||
// We don't need to update blas nodes, so clear error and move on.
|
||||
(void)cudaGetLastError();
|
||||
cudaGetLastError();
|
||||
} else {
|
||||
GGML_ASSERT(stat == cudaSuccess);
|
||||
}
|
||||
@@ -2602,20 +2481,14 @@ static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx,
|
||||
static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
|
||||
|
||||
cudaGraphExecUpdateResultInfo result_info;
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
hipGraphNode_t errorNode;
|
||||
hipError_t stat = hipGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
|
||||
#else
|
||||
cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
|
||||
#endif
|
||||
if (stat == cudaErrorGraphExecUpdateFailure) {
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
|
||||
#endif
|
||||
|
||||
// The pre-existing graph exec cannot be updated due to violated constraints
|
||||
// so instead clear error and re-instantiate
|
||||
(void)cudaGetLastError();
|
||||
cudaGetLastError();
|
||||
CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
|
||||
cuda_ctx->cuda_graph->instance = nullptr;
|
||||
CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
|
||||
@@ -2843,7 +2716,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
if (err != cudaSuccess) {
// clear the error
(void)cudaGetLastError();
cudaGetLastError();

GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, cudaGetErrorString(err));
@@ -2863,7 +2736,7 @@ void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
cudaError_t err = cudaHostUnregister(buffer);
if (err != cudaSuccess) {
// clear the error
(void)cudaGetLastError();
cudaGetLastError();
}
}

@@ -3039,7 +2912,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
}
} break;
case GGML_OP_OUT_PROD:
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
case GGML_OP_GET_ROWS:
{
switch (op->src[0]->type) {
@@ -3055,10 +2928,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return false;
}
} break;
case GGML_OP_GET_ROWS_BACK:
{
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
} break;
case GGML_OP_CPY:
{
ggml_type src0_type = op->src[0]->type;
@@ -3117,7 +2986,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
} break;
case GGML_OP_REPEAT_BACK:
return op->type == GGML_TYPE_F32 && (op->src[0]->ne[2]*op->src[0]->ne[3]) <= (1 << 15);
return op->type == GGML_TYPE_F32 && op->src[0]->ne[3] == 1;
case GGML_OP_CONCAT:
{
ggml_type src0_type = op->src[0]->type;
@@ -3132,12 +3001,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
}
return false;
} break;
case GGML_OP_SILU_BACK:
return ggml_is_contiguous(op->src[0]);
break;
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_RMS_NORM_BACK:
return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
break;
case GGML_OP_NONE:
@@ -3162,11 +3027,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
return true;
case GGML_OP_SOFT_MAX_BACK: {
float max_bias = 0.0f;
memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));
return max_bias == 0.0f;
}
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK: {
const size_t ts = ggml_type_size(op->src[0]->type);
@@ -3331,7 +3191,7 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
features.push_back({ "FORCE_CUBLAS", "1" });
#endif

#ifndef GGML_USE_VMM
#ifdef GGML_CUDA_NO_VMM
features.push_back({ "NO_VMM", "1" });
#endif

@@ -142,7 +142,7 @@ static void mul_mat_vec_q_cuda(
int64_t nwarps = 1;
int64_t rows_per_cuda_block = 1;

if (ggml_cuda_info().devices[id].cc < GGML_CUDA_CC_RDNA2) { // NVIDIA and AMD older than RDNA2
if (ggml_cuda_info().devices[id].cc < GGML_CUDA_CC_CDNA || ggml_cuda_info().devices[id].cc == GGML_CUDA_CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
switch(ncols_y) {
case 1:
nwarps = 4;
@@ -166,7 +166,6 @@ static void mul_mat_vec_q_cuda(
break;
}
}

const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
const dim3 block_nums(nblocks, 1, 1);
const dim3 block_dims(WARP_SIZE, nwarps, 1);

@@ -5,24 +5,20 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
const int tid = threadIdx.x;
|
||||
|
||||
x += int64_t(row)*ncols;
|
||||
dst += int64_t(row)*ncols;
|
||||
|
||||
float2 mean_var = make_float2(0.0f, 0.0f);
|
||||
float2 mean_var = make_float2(0.f, 0.f);
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const float xi = x[col];
|
||||
const float xi = x[row*ncols + col];
|
||||
mean_var.x += xi;
|
||||
mean_var.y += xi * xi;
|
||||
}
|
||||
|
||||
// sum up partial sums
|
||||
mean_var = warp_reduce_sum(mean_var);
|
||||
if constexpr (block_size > WARP_SIZE) {
|
||||
static_assert(block_size == 1024, "unexpected block_size");
|
||||
if (block_size > WARP_SIZE) {
|
||||
__shared__ float2 s_sum[32];
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
int warp_id = threadIdx.x / WARP_SIZE;
|
||||
int lane_id = threadIdx.x % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = mean_var;
|
||||
}
|
||||
@@ -36,7 +32,7 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c
|
||||
const float inv_std = rsqrtf(var + eps);
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
dst[col] = (x[col] - mean) * inv_std;
|
||||
dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,8 +40,14 @@ template <int block_size>
|
||||
static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
|
||||
// blockIdx.x: num_groups idx
|
||||
// threadIdx.x: block_size idx
|
||||
const int start = blockIdx.x*group_size + threadIdx.x;
|
||||
const int end = min(blockIdx.x*group_size + group_size, ne_elements);
|
||||
int start = blockIdx.x * group_size;
|
||||
int end = start + group_size;
|
||||
|
||||
start += threadIdx.x;
|
||||
|
||||
if (end >= ne_elements) {
|
||||
end = ne_elements;
|
||||
}
|
||||
|
||||
float tmp = 0.0f; // partial sum for thread in warp
|
||||
|
||||
@@ -54,11 +56,10 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
|
||||
}
|
||||
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
if constexpr (block_size > WARP_SIZE) {
|
||||
static_assert(block_size == 1024, "unexpected block_size");
|
||||
if (block_size > WARP_SIZE) {
|
||||
__shared__ float s_sum[32];
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
int warp_id = threadIdx.x / WARP_SIZE;
|
||||
int lane_id = threadIdx.x % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = tmp;
|
||||
}
|
||||
@@ -67,11 +68,11 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
}
|
||||
|
||||
const float mean = tmp / group_size;
|
||||
float mean = tmp / group_size;
|
||||
tmp = 0.0f;
|
||||
|
||||
for (int j = start; j < end; j += block_size) {
|
||||
const float xi = x[j] - mean;
|
||||
float xi = x[j] - mean;
|
||||
dst[j] = xi;
|
||||
tmp += xi * xi;
|
||||
}
|
||||
@@ -79,8 +80,8 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
if (block_size > WARP_SIZE) {
|
||||
__shared__ float s_sum[32];
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
int warp_id = threadIdx.x / WARP_SIZE;
|
||||
int lane_id = threadIdx.x % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = tmp;
|
||||
}
|
||||
@@ -89,8 +90,8 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
}
|
||||
|
||||
const float variance = tmp / group_size;
|
||||
const float scale = rsqrtf(variance + eps);
|
||||
float variance = tmp / group_size;
|
||||
float scale = rsqrtf(variance + eps);
|
||||
for (int j = start; j < end; j += block_size) {
|
||||
dst[j] *= scale;
|
||||
}
|
||||
@@ -101,23 +102,19 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
const int tid = threadIdx.x;
|
||||
|
||||
x += int64_t(row)*ncols;
|
||||
dst += int64_t(row)*ncols;
|
||||
|
||||
float tmp = 0.0f; // partial sum for thread in warp
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const float xi = x[col];
|
||||
const float xi = x[row*ncols + col];
|
||||
tmp += xi * xi;
|
||||
}
|
||||
|
||||
// sum up partial sums
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
if constexpr (block_size > WARP_SIZE) {
|
||||
static_assert(block_size == 1024, "unexpected block_size");
|
||||
if (block_size > WARP_SIZE) {
|
||||
__shared__ float s_sum[32];
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
int warp_id = threadIdx.x / WARP_SIZE;
|
||||
int lane_id = threadIdx.x % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = tmp;
|
||||
}
|
||||
@@ -130,63 +127,12 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
|
||||
const float scale = rsqrtf(mean + eps);
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
dst[col] = scale * x[col];
|
||||
}
|
||||
}
|
||||
|
||||
template <int block_size>
|
||||
static __global__ void rms_norm_back_f32(
|
||||
const float * grad, const float * xf, float * dst, const int ncols, const float eps) {
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
const int tid = threadIdx.x;
|
||||
|
||||
grad += int64_t(row)*ncols;
|
||||
xf += int64_t(row)*ncols;
|
||||
dst += int64_t(row)*ncols;
|
||||
|
||||
float sum_xx = 0.0f; // sum for squares of x, equivalent to forward pass
|
||||
float sum_xg = 0.0f; // sum for x * gradient, needed because RMS norm mixes inputs
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
const float xfi = xf[col];
|
||||
sum_xx += xfi * xfi;
|
||||
sum_xg += xfi * grad[col];
|
||||
}
|
||||
|
||||
// sum up partial sums
|
||||
sum_xx = warp_reduce_sum(sum_xx);
|
||||
sum_xg = warp_reduce_sum(sum_xg);
|
||||
if constexpr (block_size > WARP_SIZE) {
|
||||
static_assert(block_size == 1024, "unexpected block_size");
|
||||
__shared__ float s_sum_xx[32];
|
||||
__shared__ float s_sum_xg[32];
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
s_sum_xx[warp_id] = sum_xx;
|
||||
s_sum_xg[warp_id] = sum_xg;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
sum_xx = s_sum_xx[lane_id];
|
||||
sum_xx = warp_reduce_sum(sum_xx);
|
||||
|
||||
sum_xg = s_sum_xg[lane_id];
|
||||
sum_xg = warp_reduce_sum(sum_xg);
|
||||
}
|
||||
|
||||
const float mean_eps = sum_xx / ncols + eps;
|
||||
const float sum_eps = sum_xx + ncols*eps;
|
||||
|
||||
const float scale_grad = rsqrtf(mean_eps);
|
||||
const float scale_x = -scale_grad * sum_xg/sum_eps;
|
||||
|
||||
for (int col = tid; col < ncols; col += block_size) {
|
||||
dst[col] = scale_grad*grad[col] + scale_x*xf[col];
|
||||
dst[row*ncols + col] = scale * x[row*ncols + col];
|
||||
}
|
||||
}
|
||||
|
||||
static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
||||
if (ncols < 1024) {
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
|
||||
@@ -196,8 +142,7 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
|
||||
}
|
||||
}
|
||||
|
||||
static void group_norm_f32_cuda(
|
||||
const float * x, float * dst, const int num_groups, const float eps, const int group_size, const int ne_elements, cudaStream_t stream) {
|
||||
static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const float eps, const int group_size, const int ne_elements, cudaStream_t stream) {
|
||||
if (group_size < 1024) {
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
|
||||
@@ -208,6 +153,7 @@ static void group_norm_f32_cuda(
|
||||
}
|
||||
|
||||
static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
||||
if (ncols < 1024) {
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
|
||||
@@ -217,16 +163,6 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
|
||||
}
|
||||
}
|
||||
|
||||
static void rms_norm_back_f32_cuda(const float * grad, const float * xf, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
|
||||
if (ncols < 1024) {
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
rms_norm_back_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(grad, xf, dst, ncols, eps);
|
||||
} else {
|
||||
const dim3 block_dims(1024, 1, 1);
|
||||
rms_norm_back_f32<1024><<<nrows, block_dims, 0, stream>>>(grad, xf, dst, ncols, eps);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
@@ -243,7 +179,6 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
GGML_ASSERT(eps >= 0.0f);
|
||||
|
||||
norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
|
||||
}
|
||||
@@ -263,7 +198,6 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
|
||||
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params + 1, sizeof(float));
|
||||
GGML_ASSERT(eps >= 0.0f);
|
||||
|
||||
int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
|
||||
group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], eps, group_size, ggml_nelements(src0), stream);
|
||||
@@ -285,33 +219,6 @@ void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
GGML_ASSERT(eps >= 0.0f);
|
||||
|
||||
rms_norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * grad = dst->src[0]; // gradients
|
||||
const ggml_tensor * src0f = dst->src[1]; // src0 from forward pass
|
||||
|
||||
const float * grad_d = (const float *) grad->data;
|
||||
const float * src0f_d = (const float *) src0f->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(grad));
|
||||
|
||||
GGML_ASSERT( grad->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0f->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int64_t ne00 = src0f->ne[0];
|
||||
const int64_t nrows = ggml_nrows(src0f);
|
||||
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
GGML_ASSERT(eps >= 0.0f);
|
||||
|
||||
rms_norm_back_f32_cuda(grad_d, src0f_d, dst_d, ne00, nrows, eps, stream);
|
||||
}
|
||||
|
||||
@@ -5,5 +5,3 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -11,15 +11,16 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
|
||||
GGML_ASSERT(ne01 == ne11);
|
||||
GGML_ASSERT(ne0 == ne00);
|
||||
GGML_ASSERT(ne1 == ne10);
|
||||
|
||||
GGML_ASSERT(ne2 % src0->ne[2] == 0);
|
||||
GGML_ASSERT(ne3 % src0->ne[3] == 0);
|
||||
|
||||
GGML_ASSERT(ne2 == src0->ne[2]);
|
||||
GGML_ASSERT(ne2 == src1->ne[2]);
|
||||
GGML_ASSERT(ne3 == src0->ne[3]);
|
||||
GGML_ASSERT(ne3 == src1->ne[3]);
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
@@ -32,37 +33,19 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
|
||||
GGML_ASSERT(ne2 == 1);
|
||||
GGML_ASSERT(ne3 == 1);
|
||||
CUBLAS_CHECK(cublasSetStream(handle, stream));
|
||||
|
||||
const int64_t lda = nb01 / sizeof(float);
|
||||
const int64_t ldc = nb1 / sizeof(float);
|
||||
|
||||
const bool src1_T = ggml_is_transposed(src1);
|
||||
const cublasOperation_t src1_cublas_op = src1_T ? CUBLAS_OP_N : CUBLAS_OP_T;
|
||||
const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
|
||||
GGML_ASSERT( (src1_T ? nb11 : nb10) == sizeof(float));
|
||||
|
||||
// data strides in dimensions 2/3
|
||||
const size_t s02 = nb02 / sizeof(float);
|
||||
const size_t s03 = nb03 / sizeof(float);
|
||||
const size_t s12 = nb12 / sizeof(float);
|
||||
const size_t s13 = nb13 / sizeof(float);
|
||||
const size_t s2 = nb2 / sizeof(float);
|
||||
const size_t s3 = nb3 / sizeof(float);
|
||||
|
||||
// dps == dst per src0, used for group query attention
|
||||
const int64_t dps2 = ne2 / ne02;
|
||||
const int64_t dps3 = ne3 / ne03;
|
||||
|
||||
// TODO batched matrix multiplication
|
||||
for (int64_t i3 = 0; i3 < ne3; ++i3) {
|
||||
for (int64_t i2 = 0; i2 < ne2; ++i2) {
|
||||
CUBLAS_CHECK(
|
||||
cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op,
|
||||
ne0, ne1, ne01,
|
||||
&alpha, src0_d + (i3/dps3)*s03 + (i2/dps2)*s02, lda,
|
||||
src1_d + i3 *s13 + i2 *s12, ldb,
|
||||
&beta, dst_d + i3 *s3 + i2 *s2, ldc));
|
||||
}
|
||||
}
|
||||
CUBLAS_CHECK(
|
||||
cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op,
|
||||
ne0, ne1, ne01,
|
||||
&alpha, src0_d, ne00,
|
||||
src1_d, ldb,
|
||||
&beta, dst_d, ne0));
|
||||
}
|
||||
|
||||
@@ -39,9 +39,9 @@ static __device__ void rope_yarn(
|
||||
|
||||
template<bool forward, bool has_ff, typename T>
|
||||
static __global__ void rope_norm(
|
||||
const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
|
||||
const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors) {
|
||||
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
|
||||
const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float theta_scale, const float * __restrict__ freq_factors) {
|
||||
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
||||
|
||||
if (i0 >= ne0) {
|
||||
@@ -83,9 +83,9 @@ static __global__ void rope_norm(
|
||||
|
||||
template<bool forward, bool has_ff, typename T>
|
||||
static __global__ void rope_neox(
|
||||
const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
|
||||
const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors) {
|
||||
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
|
||||
const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float theta_scale, const float * __restrict__ freq_factors) {
|
||||
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
||||
|
||||
if (i0 >= ne0) {
|
||||
@@ -127,9 +127,9 @@ static __global__ void rope_neox(
|
||||
|
||||
template<bool forward, bool has_ff, typename T>
|
||||
static __global__ void rope_multi(
|
||||
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2,
|
||||
const int n_dims, const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, const mrope_sections sections) {
|
||||
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2,
|
||||
const int n_dims, const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float theta_scale, const float * __restrict__ freq_factors, const mrope_sections sections) {
|
||||
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
||||
|
||||
if (i0 >= ne0) {
|
||||
@@ -187,9 +187,9 @@ static __global__ void rope_multi(
|
||||
|
||||
template<bool forward, bool has_ff, typename T>
|
||||
static __global__ void rope_vision(
|
||||
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims,
|
||||
const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims,
|
||||
const float theta_scale, const float * freq_factors, const mrope_sections sections) {
|
||||
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims,
|
||||
const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims,
|
||||
const float theta_scale, const float * __restrict__ freq_factors, const mrope_sections sections) {
|
||||
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
||||
|
||||
if (i0 >= ne0) {
|
||||
@@ -234,9 +234,9 @@ static __global__ void rope_vision(
|
||||
|
||||
template<bool forward, typename T>
|
||||
static void rope_norm_cuda(
|
||||
const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr,
|
||||
const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
|
||||
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr,
|
||||
const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, cudaStream_t stream) {
|
||||
GGML_ASSERT(ne0 % 2 == 0);
|
||||
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
|
||||
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
||||
@@ -257,9 +257,9 @@ static void rope_norm_cuda(
|
||||
|
||||
template<bool forward, typename T>
|
||||
static void rope_neox_cuda(
|
||||
const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr,
|
||||
const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
|
||||
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr,
|
||||
const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, cudaStream_t stream) {
|
||||
GGML_ASSERT(ne0 % 2 == 0);
|
||||
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
|
||||
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
||||
@@ -280,9 +280,9 @@ static void rope_neox_cuda(
|
||||
|
||||
template<bool forward, typename T>
|
||||
static void rope_multi_cuda(
|
||||
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
|
||||
const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, cudaStream_t stream) {
|
||||
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
|
||||
const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, const mrope_sections sections, cudaStream_t stream) {
|
||||
GGML_ASSERT(ne0 % 2 == 0);
|
||||
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
|
||||
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
||||
@@ -303,9 +303,9 @@ static void rope_multi_cuda(
|
||||
|
||||
template<bool forward, typename T>
|
||||
static void rope_vision_cuda(
|
||||
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
|
||||
const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, cudaStream_t stream) {
|
||||
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
|
||||
const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
|
||||
const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, const mrope_sections sections, cudaStream_t stream) {
|
||||
GGML_ASSERT(ne0 % 2 == 0);
|
||||
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
|
||||
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
#include "common.cuh"
|
||||
#include "ggml.h"
|
||||
#include "softmax.cuh"
|
||||
#include <cstdint>
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__ float t2f32(T val) {
|
||||
@@ -13,26 +11,14 @@ __device__ float __forceinline__ t2f32<half>(half val) {
|
||||
return __half2float(val);
|
||||
}
|
||||
|
||||
// When ncols_template == 0 the bounds for the loops in this function are not known and can't be unrolled.
// As we want to keep pragma unroll for all other cases we suppress the clang transformation warning here.
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wpass-failed"
|
||||
#endif
|
||||
template <bool use_shared, int ncols_template, int block_size_template, typename T>
|
||||
static __global__ void soft_max_f32(
|
||||
const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y,
|
||||
const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
|
||||
template <bool vals_smem, int ncols_template, int block_size_template, typename T>
|
||||
static __global__ void soft_max_f32(const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
|
||||
const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int rowx = blockIdx.x;
|
||||
const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension
|
||||
|
||||
x += int64_t(rowx)*ncols;
|
||||
mask += int64_t(rowy)*ncols * (mask != nullptr);
|
||||
dst += int64_t(rowx)*ncols;
|
||||
|
||||
const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
|
||||
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
@@ -43,7 +29,7 @@ static __global__ void soft_max_f32(
|
||||
extern __shared__ float data_soft_max_f32[];
|
||||
float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
|
||||
// shared memory buffer to cache values between iterations:
|
||||
float * vals = use_shared ? buf_iw + WARP_SIZE : dst;
|
||||
float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + (int64_t)rowx*ncols;
|
||||
|
||||
float max_val = -INFINITY;
|
||||
|
||||
@@ -55,7 +41,10 @@ static __global__ void soft_max_f32(
|
||||
break;
|
||||
}
|
||||
|
||||
const float val = x[col]*scale + (mask ? slope*t2f32(mask[col]) : 0.0f);
|
||||
const int64_t ix = (int64_t)rowx*ncols + col;
|
||||
const int64_t iy = (int64_t)rowy*ncols + col;
|
||||
|
||||
const float val = x[ix]*scale + (mask ? slope*t2f32(mask[iy]) : 0.0f);
|
||||
|
||||
vals[col] = val;
|
||||
max_val = max(max_val, val);
|
||||
@@ -121,32 +110,8 @@ static __global__ void soft_max_f32(
|
||||
return;
|
||||
}
|
||||
|
||||
dst[col] = vals[col] * inv_sum;
|
||||
}
|
||||
}
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic pop
|
||||
#endif
|
||||
|
||||
static __global__ void soft_max_back_f32(
|
||||
const float * grad, const float * dstf, float * dst, const int ncols, const float scale) {
|
||||
const int tid = threadIdx.x;
|
||||
const int rowx = blockIdx.x;
|
||||
|
||||
grad += int64_t(rowx)*ncols;
|
||||
dstf += int64_t(rowx)*ncols;
|
||||
dst += int64_t(rowx)*ncols;
|
||||
|
||||
float dgf_dot = 0.0f; // dot product of dst from forward pass and gradients
|
||||
|
||||
for (int col = tid; col < ncols; col += WARP_SIZE) {
|
||||
dgf_dot += dstf[col]*grad[col];
|
||||
}
|
||||
|
||||
dgf_dot = warp_reduce_sum(dgf_dot);
|
||||
|
||||
for (int col = tid; col < ncols; col += WARP_SIZE) {
|
||||
dst[col] = scale * (grad[col] - dgf_dot) * dstf[col];
|
||||
const int64_t idst = (int64_t)rowx*ncols + col;
|
||||
dst[idst] = vals[col] * inv_sum;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -156,7 +121,7 @@ static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, cons
|
||||
while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
|
||||
const dim3 block_dims(nth, 1, 1);
|
||||
const dim3 block_nums(nrows_x, 1, 1);
|
||||
const size_t nbytes_shared = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
|
||||
const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
|
||||
static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
|
||||
|
||||
const uint32_t n_head = nrows_x/nrows_y;
|
||||
@@ -166,68 +131,50 @@ static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, cons
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
// FIXME: this limit could be raised by ~2-4x on Ampere or newer
|
||||
if (nbytes_shared < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
|
||||
if (shmem < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
|
||||
switch (ncols_x) {
|
||||
case 32:
|
||||
soft_max_f32<true, 32, 32><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
break;
|
||||
case 64:
|
||||
soft_max_f32<true, 64, 64><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
break;
|
||||
case 128:
|
||||
soft_max_f32<true, 128, 128><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
break;
|
||||
case 256:
|
||||
soft_max_f32<true, 256, 256><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
break;
|
||||
case 512:
|
||||
soft_max_f32<true, 512, 512><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
break;
|
||||
case 1024:
|
||||
soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
break;
|
||||
case 2048:
|
||||
soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
break;
|
||||
case 4096:
|
||||
soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
break;
|
||||
default:
|
||||
soft_max_f32<true, 0, 0><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
const size_t nbytes_shared_low = WARP_SIZE*sizeof(float);
|
||||
soft_max_f32<false, 0, 0><<<block_nums, block_dims, nbytes_shared_low, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
const size_t shmem_low = WARP_SIZE*sizeof(float);
|
||||
soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
|
||||
}
|
||||
}
|
||||
|
||||
static void soft_max_back_f32_cuda(
|
||||
const float * grad, const float * dstf, float * dst,
|
||||
const int ncols, const int nrows, const float scale, cudaStream_t stream) {
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
const dim3 block_nums(nrows, 1, 1);
|
||||
|
||||
soft_max_back_f32<<<block_nums, block_dims, 0, stream>>>(grad, dstf, dst, ncols, scale);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const void * src1_d = src1 ? (const void *) src1->data : nullptr;
|
||||
float * dst_d = (float *) dst->data;
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
const void * src1_d = src1 ? (const void *)src1->data : nullptr;
|
||||
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
@@ -242,42 +189,18 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
|
||||
memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float));
|
||||
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
|
||||
|
||||
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
|
||||
|
||||
if (use_f16) {
|
||||
soft_max_f32_cuda(src0_d, (const half *) src1_d, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
|
||||
const half * src1_dd = (const half *)src1_d;
|
||||
|
||||
soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
|
||||
} else {
|
||||
soft_max_f32_cuda(src0_d, (const float *) src1_d, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
|
||||
const float * src1_dd = (const float *)src1_d;
|
||||
|
||||
soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_soft_max_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0]; // grad
|
||||
const ggml_tensor * src1 = dst->src[1]; // forward pass output
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int64_t ncols = src0->ne[0];
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
|
||||
memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float));
|
||||
|
||||
GGML_ASSERT(max_bias == 0.0f);
|
||||
|
||||
soft_max_back_f32_cuda(src0_d, src1_d, dst_d, ncols, nrows, scale, stream);
|
||||
}
|
||||
|
||||
@@ -3,5 +3,3 @@
|
||||
#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
|
||||
|
||||
void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_soft_max_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -51,19 +51,6 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
|
||||
dst[i] = x[i] / (1.0f + expf(-x[i]));
|
||||
}
|
||||
|
||||
static __global__ void silu_back_f32(
|
||||
const float * grad, const float * xf, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float xfi = xf[i];
|
||||
const float s = 1.0f / (1.0f + expf(-xfi));
|
||||
dst[i] = grad[i] * s * (1.0f + xfi * (1.0f - s));
|
||||
}
|
||||
|
||||
static __global__ void tanh_f32(const float * x, float * dst, int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
if (i >= k) {
|
||||
@@ -186,11 +173,6 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
|
||||
silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void silu_back_f32_cuda(const float * grad, const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_SILU_BACK_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
|
||||
silu_back_f32<<<num_blocks, CUDA_SILU_BACK_BLOCK_SIZE, 0, stream>>>(grad, x, dst, k);
|
||||
}
|
||||
|
||||
static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
|
||||
tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
@@ -302,24 +284,6 @@ void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
silu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0]; // input from forward pass
|
||||
const ggml_tensor * src1 = dst->src[1]; // grads of forward pass output
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
silu_back_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(src0), stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
#define CUDA_STEP_BLOCK_SIZE 256
|
||||
#define CUDA_GELU_BLOCK_SIZE 256
|
||||
#define CUDA_SILU_BLOCK_SIZE 256
|
||||
#define CUDA_SILU_BACK_BLOCK_SIZE 256
|
||||
#define CUDA_TANH_BLOCK_SIZE 256
|
||||
#define CUDA_RELU_BLOCK_SIZE 256
|
||||
#define CUDA_SIGMOID_BLOCK_SIZE 256
|
||||
@@ -24,8 +23,6 @@ void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
ggml/src/ggml-cuda/vendors/hip.h
@@ -19,12 +19,6 @@
|
||||
#define CUBLAS_TF32_TENSOR_OP_MATH 0
|
||||
#define CUDA_R_16F HIPBLAS_R_16F
|
||||
#define CUDA_R_32F HIPBLAS_R_32F
|
||||
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
|
||||
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
|
||||
#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
|
||||
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
|
||||
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
|
||||
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
|
||||
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
|
||||
#define cublasCreate hipblasCreate
|
||||
@@ -80,21 +74,6 @@
|
||||
#define cudaMemGetInfo hipMemGetInfo
|
||||
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
||||
#define cudaSetDevice hipSetDevice
|
||||
#define cuDeviceGet hipDeviceGet
|
||||
#define CUdevice hipDevice_t
|
||||
#define CUdeviceptr hipDeviceptr_t
|
||||
#define cuMemUnmap hipMemUnmap
|
||||
#define CUmemAccessDesc hipMemAccessDesc
|
||||
#define cuMemAddressFree hipMemAddressFree
|
||||
#define cuMemRelease hipMemRelease
|
||||
#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
|
||||
#define cuMemCreate hipMemCreate
|
||||
#define cuMemAddressReserve hipMemAddressReserve
|
||||
#define cuMemMap hipMemMap
|
||||
#define cuMemSetAccess hipMemSetAccess
|
||||
#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
|
||||
#define CUmemAllocationProp hipMemAllocationProp
|
||||
#define cuDeviceGetAttribute hipDeviceGetAttribute
|
||||
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
||||
#define cudaStreamDestroy hipStreamDestroy
|
||||
#define cudaStreamFireAndForget hipStreamFireAndForget
|
||||
@@ -102,28 +81,6 @@
|
||||
#define cudaStreamPerThread hipStreamPerThread
|
||||
#define cudaStreamSynchronize hipStreamSynchronize
|
||||
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
|
||||
#define cudaGraphExec_t hipGraphExec_t
|
||||
#define cudaGraphNode_t hipGraphNode_t
|
||||
#define cudaKernelNodeParams hipKernelNodeParams
|
||||
#define cudaGraphExecDestroy hipGraphExecDestroy
|
||||
#define cudaGraphLaunch hipGraphLaunch
|
||||
#define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure
|
||||
#define cudaGraphExecUpdateResultInfo hipGraphExecUpdateResult
|
||||
#define cudaGraphNodeType hipGraphNodeType
|
||||
#define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
|
||||
#define cudaGraphInstantiate hipGraphInstantiate
|
||||
#define cudaStreamEndCapture hipStreamEndCapture
|
||||
#define cudaGraphDestroy hipGraphDestroy
|
||||
#define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
|
||||
#define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
|
||||
#define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams
|
||||
#define cudaGraphNodeGetType hipGraphNodeGetType
|
||||
#define cudaGraphGetNodes hipGraphGetNodes
|
||||
#define cudaGraphExecUpdate hipGraphExecUpdate
|
||||
#define cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed
|
||||
#define cudaStreamBeginCapture hipStreamBeginCapture
|
||||
#define cudaGraph_t hipGraph_t
|
||||
#define cudaStream_t hipStream_t
|
||||
#define cudaSuccess hipSuccess
|
||||
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
|
||||
|
||||
@@ -92,14 +92,6 @@ if (GGML_CUDA_NO_PEER_COPY)
|
||||
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
|
||||
endif()
|
||||
|
||||
if (GGML_HIP_GRAPHS)
|
||||
add_compile_definitions(GGML_HIP_GRAPHS)
|
||||
endif()
|
||||
|
||||
if (GGML_HIP_NO_VMM)
|
||||
add_compile_definitions(GGML_HIP_NO_VMM)
|
||||
endif()
|
||||
|
||||
if (CXX_IS_HIPCC)
|
||||
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
|
||||
target_link_libraries(ggml-hip PRIVATE hip::device)
|
||||
|
||||
@@ -19,10 +19,7 @@
|
||||
// max number of MTLCommandBuffer used to submit a graph for processing
|
||||
#define GGML_METAL_MAX_COMMAND_BUFFERS 8
|
||||
|
||||
// create residency sets only on macOS >= 15.0
|
||||
#if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
|
||||
#define GGML_METAL_HAS_RESIDENCY_SETS 1
|
||||
#endif
|
||||
#define UNUSED(x) (void)(x)
|
||||
|
||||
// globals
|
||||
|
||||
@@ -42,7 +39,6 @@ static struct ggml_backend_metal_device_context {
|
||||
|
||||
bool has_simdgroup_reduction;
|
||||
bool has_simdgroup_mm;
|
||||
bool has_residency_sets;
|
||||
bool has_bfloat;
|
||||
bool use_bfloat;
|
||||
|
||||
@@ -52,7 +48,6 @@ static struct ggml_backend_metal_device_context {
|
||||
/*.mtl_device_ref_count =*/ 0,
|
||||
/*.has_simdgroup_reduction =*/ false,
|
||||
/*.has_simdgroup_mm =*/ false,
|
||||
/*.has_residency_sets =*/ false,
|
||||
/*.has_bfloat =*/ false,
|
||||
/*.use_bfloat =*/ false,
|
||||
/*.name =*/ "",
|
||||
@@ -64,18 +59,12 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
|
||||
|
||||
if (ctx->mtl_device == nil) {
|
||||
ctx->mtl_device = MTLCreateSystemDefaultDevice();
|
||||
}
|
||||
|
||||
if (ctx->mtl_device) {
|
||||
ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
|
||||
ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
|
||||
|
||||
ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
|
||||
|
||||
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
|
||||
ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == NULL;
|
||||
#endif
|
||||
|
||||
ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
|
||||
ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
|
||||
|
||||
@@ -101,10 +90,8 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
|
||||
ctx->mtl_device_ref_count--;
|
||||
|
||||
if (ctx->mtl_device_ref_count == 0) {
|
||||
if (ctx->mtl_device) {
|
||||
[ctx->mtl_device release];
|
||||
ctx->mtl_device = nil;
|
||||
}
|
||||
[ctx->mtl_device release];
|
||||
ctx->mtl_device = nil;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -496,11 +483,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
|
||||
|
||||
ctx->queue = [device newCommandQueue];
|
||||
if (ctx->queue == nil) {
|
||||
GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
||||
|
||||
id<MTLLibrary> metal_library;
|
||||
@@ -667,7 +649,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
|
||||
GGML_LOG_INFO("%s: simdgroup reduction = %s\n", __func__, ctx_dev->has_simdgroup_reduction ? "true" : "false");
|
||||
GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm ? "true" : "false");
|
||||
GGML_LOG_INFO("%s: has residency sets = %s\n", __func__, ctx_dev->has_residency_sets ? "true" : "false");
|
||||
GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, ctx_dev->has_bfloat ? "true" : "false");
|
||||
GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, ctx_dev->use_bfloat ? "true" : "false");
|
||||
GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");
|
||||
@@ -1054,70 +1035,8 @@ struct ggml_backend_metal_buffer_context {
|
||||
// multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
|
||||
int n_buffers;
|
||||
struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
|
||||
|
||||
// optional MTLResidencySet
|
||||
id rset;
|
||||
};
|
||||
|
||||
// rset init
|
||||
static bool ggml_backend_metal_buffer_rset_init(
|
||||
struct ggml_backend_metal_buffer_context * ctx,
|
||||
struct ggml_backend_metal_device_context * ctx_dev,
|
||||
id<MTLDevice> device) {
|
||||
ctx->rset = nil;
|
||||
|
||||
if (!ctx_dev->has_residency_sets) {
|
||||
return true;
|
||||
}
|
||||
|
||||
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
|
||||
if (@available(macOS 15.0, *)) {
|
||||
MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
|
||||
desc.label = @"ggml_backend_metal";
|
||||
desc.initialCapacity = ctx->n_buffers;
|
||||
|
||||
NSError * error;
|
||||
ctx->rset = [device newResidencySetWithDescriptor:desc error:&error];
|
||||
if (error) {
|
||||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
[desc release];
|
||||
return false;
|
||||
}
|
||||
|
||||
[desc release];
|
||||
|
||||
for (int i = 0; i < ctx->n_buffers; i++) {
|
||||
[ctx->rset addAllocation:ctx->buffers[i].metal];
|
||||
}
|
||||
|
||||
[ctx->rset commit];
|
||||
[ctx->rset requestResidency];
|
||||
|
||||
return true;
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(ctx_dev);
|
||||
GGML_UNUSED(device);
|
||||
#endif
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// rset free
|
||||
static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
|
||||
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
|
||||
if (@available(macOS 15.0, *)) {
|
||||
if (ctx->rset) {
|
||||
[ctx->rset endResidency];
|
||||
[ctx->rset removeAllAllocations];
|
||||
[ctx->rset release];
|
||||
}
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(ctx);
|
||||
#endif
|
||||
}
|
||||
|
||||
// finds the Metal buffer that contains the tensor data on the GPU device
|
||||
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
|
||||
// Metal buffer based on the host memory pointer
|
||||
@@ -4257,8 +4176,6 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
||||
for (int i = 0; i < ctx->n_buffers; i++) {
|
||||
[ctx->buffers[i].metal release];
|
||||
}
|
||||
|
||||
ggml_backend_metal_buffer_rset_free(ctx);
|
||||
ggml_backend_metal_device_rel(buffer->buft->device->context);
|
||||
|
||||
if (ctx->owned) {
|
||||
@@ -4281,19 +4198,19 @@ static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||
memset((char *)tensor->data + offset, value, size);
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
memcpy((char *)tensor->data + offset, data, size);
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
memcpy(data, (const char *)tensor->data + offset, size);
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
UNUSED(buffer);
|
}

static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
@@ -4303,7 +4220,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
}

return false;

GGML_UNUSED(buffer);
UNUSED(buffer);
}

static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -4329,7 +4246,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "Metal";

GGML_UNUSED(buft);
UNUSED(buft);
}

static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
@@ -4353,8 +4270,8 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
}
#endif
#endif
GGML_UNUSED(device);
GGML_UNUSED(size_aligned);
UNUSED(device);
UNUSED(size_aligned);
}

static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -4367,8 +4284,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
size_aligned += (size_page - (size_aligned % size_page));
}

struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context;
id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);

ctx->all_data = ggml_metal_host_malloc(size_aligned);
ctx->all_size = size_aligned;
@@ -4391,14 +4307,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
free(ctx);
ggml_backend_metal_device_rel(ctx_dev);
return NULL;
}

if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
free(ctx);
ggml_backend_metal_device_rel(ctx_dev);
ggml_backend_metal_device_rel(buft->device->context);
return NULL;
}

@@ -4409,7 +4318,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba

static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return 32;
GGML_UNUSED(buft);
UNUSED(buft);
}

static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
@@ -4419,13 +4328,13 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty

return max_size;

GGML_UNUSED(buft);
UNUSED(buft);
}

static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return true;

GGML_UNUSED(buft);
UNUSED(buft);
}

ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
@@ -4448,7 +4357,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
return "Metal_Mapped";

GGML_UNUSED(buft);
UNUSED(buft);
}

static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
@@ -4491,8 +4400,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
size_aligned += (size_page - (size_aligned % size_page));
}

struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
id<MTLDevice> device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);

// the buffer fits into the max buffer size allowed by the device
if (size_aligned <= device.maxBufferLength) {
@@ -4545,13 +4453,6 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
}
}

if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
free(ctx);
ggml_backend_metal_device_rel(ctx_dev);
return NULL;
}

return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
}

@@ -4560,7 +4461,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
return "Metal";

GGML_UNUSED(backend);
UNUSED(backend);
}

static void ggml_backend_metal_free(ggml_backend_t backend) {
@@ -4865,13 +4766,6 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
}
}

if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
free(ctx);
ggml_backend_metal_device_rel(ctx_dev);
return NULL;
}

return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
}

@@ -4885,7 +4779,7 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml
return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
       buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;

GGML_UNUSED(dev);
UNUSED(dev);
}

static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
@@ -4416,6 +4416,7 @@ void kernel_mul_mv_q2_K_f32_impl(
device const half * dh = &x[ib].d;

for (int row = 0; row < N_DST; row++) {

float4 acc1 = {0.f, 0.f, 0.f, 0.f};
float4 acc2 = {0.f, 0.f, 0.f, 0.f};
for (int i = 0; i < 8; i += 2) {
@@ -4446,7 +4447,7 @@ void kernel_mul_mv_q2_K_f32_impl(

device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;
@@ -4612,7 +4613,7 @@ void kernel_mul_mv_q3_K_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

if (tiisg == 0) {
for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
for (int row = 0; row < 2; ++row) {
dst_f32[first_row + row] = sumf1[row];
}
}
@@ -4728,7 +4729,7 @@ void kernel_mul_mv_q4_K_f32_impl(

device float * dst_f32 = (device float *) dst + (int64_t)im*args.ne0*args.ne1 + (int64_t)r1*args.ne0;

for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;
@@ -4860,7 +4861,7 @@ void kernel_mul_mv_q5_K_f32_impl(

device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
for (int row = 0; row < 2; ++row) {
const float tot = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = tot;
@@ -4905,10 +4906,6 @@ void kernel_mul_mv_q6_K_f32_impl(

const int row = 2*r0 + sgitg;

if (row >= args.ne0) {
return;
}

const uint i12 = im%args.ne12;
const uint i13 = im/args.ne12;

@@ -5064,7 +5061,7 @@ void kernel_mul_mv_iq2_xxs_f32_impl(

device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum * 0.25f;
@@ -5182,7 +5179,7 @@ void kernel_mul_mv_iq2_xs_f32_impl(

device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum * 0.25f;
@@ -5292,7 +5289,7 @@ void kernel_mul_mv_iq3_xxs_f32_impl(

device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum * 0.5f;
@@ -5404,7 +5401,7 @@ void kernel_mul_mv_iq3_s_f32_impl(

device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;
@@ -5517,7 +5514,7 @@ void kernel_mul_mv_iq2_s_f32_impl(

device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum * 0.25f;
@@ -5617,7 +5614,7 @@ void kernel_mul_mv_iq1_s_f32_impl(

device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;
@@ -5712,7 +5709,7 @@ void kernel_mul_mv_iq1_m_f32_impl(

device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;
@@ -5802,7 +5799,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(

device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
for (int row = 0; row < 2 && first_row + row < args.ne01; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;
@@ -5891,7 +5888,7 @@ void kernel_mul_mv_iq4_xs_f32_impl(

device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
for (int row = 0; row < 2; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;
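The kernel hunks above toggle the per-row output loop between an unbounded form (row < N_DST) and a clamped form (row < N_DST && first_row + row < args.ne0). Below is a minimal standalone sketch of the clamped pattern in plain C++; write_rows_clamped, n_dst and ne0 are illustrative names only, not ggml API, and the point is just that the extra bound keeps the last partial block from storing past the end of the output row.

    // Sketch only: clamp the per-block output loop so a partial final block
    // never writes past ne0 elements. Names here are illustrative, not ggml API.
    #include <cstdio>
    #include <vector>

    static void write_rows_clamped(std::vector<float> & dst, int first_row, int n_dst, int ne0, const float * sums) {
        for (int row = 0; row < n_dst && first_row + row < ne0; ++row) {
            dst[first_row + row] = sums[row]; // last valid store is dst[ne0 - 1]
        }
    }

    int main() {
        const int ne0 = 10;
        std::vector<float> dst(ne0, 0.0f);
        const float sums[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        write_rows_clamped(dst, 8, 4, ne0, sums); // block of 4 rows, but only rows 8 and 9 exist
        std::printf("%g %g\n", dst[8], dst[9]);   // prints: 1 2
        return 0;
    }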
@@ -181,7 +181,7 @@ struct ggml_backend_rpc_context {

struct ggml_backend_rpc_buffer_context {
std::shared_ptr<socket_t> sock;
void * base_ptr;
std::unordered_map<ggml_backend_buffer_t, void *> base_cache;
uint64_t remote_ptr;
};

@@ -423,15 +423,16 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {

static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
if (ctx->base_ptr != nullptr) {
return ctx->base_ptr;
if (ctx->base_cache.find(buffer) != ctx->base_cache.end()) {
return ctx->base_cache[buffer];
}
rpc_msg_buffer_get_base_req request = {ctx->remote_ptr};
rpc_msg_buffer_get_base_rsp response;
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_GET_BASE, &request, sizeof(request), &response, sizeof(response));
GGML_ASSERT(status);
ctx->base_ptr = reinterpret_cast<void *>(response.base_ptr);
return ctx->base_ptr;
void * base_ptr = reinterpret_cast<void *>(response.base_ptr);
ctx->base_cache[buffer] = base_ptr;
return base_ptr;
}

static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
@@ -556,7 +557,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
if (response.remote_ptr != 0) {
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
ggml_backend_rpc_buffer_interface,
new ggml_backend_rpc_buffer_context{sock, nullptr, response.remote_ptr},
new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr},
response.remote_size);
return buffer;
} else {
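The RPC hunk above switches ggml_backend_rpc_buffer_get_base between a single cached base_ptr member and a per-buffer base_cache map; in both variants the remote base pointer is fetched over RPC only on the first call. A rough sketch of that memoized lookup follows, with hypothetical names: fetch_remote_base and buffer_ctx stand in for the RPC_CMD_BUFFER_GET_BASE round trip and the real context type.

    // Sketch only: memoize a remotely fetched base pointer per buffer handle.
    #include <cstdint>
    #include <cstdio>
    #include <unordered_map>

    struct buffer_ctx {
        std::unordered_map<const void *, void *> base_cache; // keyed by buffer handle
    };

    static void * fetch_remote_base(const void * /*buffer*/) {
        static std::uintptr_t next = 0x1000;   // placeholder for the server's reply
        return reinterpret_cast<void *>(next += 0x100);
    }

    static void * get_base_cached(buffer_ctx & ctx, const void * buffer) {
        auto it = ctx.base_cache.find(buffer);
        if (it != ctx.base_cache.end()) {
            return it->second;                 // hit: no network round trip
        }
        void * base = fetch_remote_base(buffer);
        ctx.base_cache.emplace(buffer, base);  // miss: ask once, then remember
        return base;
    }

    int main() {
        buffer_ctx ctx;
        int a, b;
        std::printf("%p %p %p\n", get_base_cached(ctx, &a), get_base_cached(ctx, &a), get_base_cached(ctx, &b));
        return 0;
    }

The single-member variant avoids a map lookup but caches only one pointer per context; the map variant lets one context serve several buffers.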
@@ -333,12 +333,8 @@ struct ggml_backend_sycl_context {
// pool
std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];

std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];

static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);

static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);

ggml_sycl_pool & pool(int device) {
if (pools[device] == nullptr) {
pools[device] = new_pool_for_device(stream(device,0), device);
@@ -349,15 +345,6 @@ struct ggml_backend_sycl_context {
ggml_sycl_pool & pool() {
return pool(device);
}

ggml_sycl_pool & host_pool(int device) {
if (host_pools[device] == nullptr) {
host_pools[device] = new_pool_for_host(stream(device, 0), device);
}
return *host_pools[device];
}

ggml_sycl_pool & host_pool() { return host_pool(device); }
};

// common device functions

@@ -82,14 +82,6 @@ inline std::string get_device_backend_and_type(const sycl::device &device) {
return device_type.str();
}

template <typename Ts> struct matrix_info_t {
oneapi::mkl::transpose transpose_info[2];
Ts value_info[2];
std::int64_t size_info[3];
std::int64_t ld_info[3];
std::int64_t groupsize_info;
};

namespace dpct
{
typedef sycl::queue *queue_ptr;
@@ -1735,13 +1727,26 @@ namespace dpct
};

template <class Ta, class Tb, class Tc, class Ts>
inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans,
int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b,
int ldb, const void * beta, void ** c, int ldc, int batch_size,
matrix_info_t<float> * matrix_info) {
inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
oneapi::mkl::transpose b_trans, int m, int n, int k,
const void *alpha, const void **a, int lda,
const void **b, int ldb, const void *beta, void **c,
int ldc, int batch_size)
{
struct matrix_info_t
{
oneapi::mkl::transpose transpose_info[2];
Ts value_info[2];
std::int64_t size_info[3];
std::int64_t ld_info[3];
std::int64_t groupsize_info;
};

Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);

matrix_info_t *matrix_info =
(matrix_info_t *)std::malloc(sizeof(matrix_info_t));
matrix_info->transpose_info[0] = a_trans;
matrix_info->transpose_info[1] = b_trans;
matrix_info->value_info[0] = alpha_value;
@@ -1758,18 +1763,23 @@ namespace dpct
sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q }, matrix_info->transpose_info,
matrix_info->transpose_info + 1, matrix_info->size_info, matrix_info->size_info + 1,
matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info),
reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1),
reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
matrix_info->size_info + 2, matrix_info->value_info, reinterpret_cast<const Ta **>(a),
matrix_info->ld_info, reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
matrix_info->value_info + 1, reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1,
&(matrix_info->groupsize_info));
#else
sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
q, matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info,
matrix_info->size_info + 1, matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info),
matrix_info->size_info + 1, matrix_info->size_info + 2, matrix_info->value_info,
reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1),
reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
matrix_info->ld_info + 1, matrix_info->value_info + 1, reinterpret_cast<Tc **>(c),
matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
#endif

q.submit([&](sycl::handler &cgh)
{
cgh.depends_on(e);
cgh.host_task([=] { std::free(matrix_info); }); });
}

template <class Ta, class Tb, class Tc, class Ts>
@@ -2412,11 +2422,25 @@ namespace dpct
/// \param [in] ldc Leading dimension of C.
/// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
/// \param [in] scaling_type Data type of the scaling factors.
inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda,
const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[],
library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type,
matrix_info_t<float> * matrix_info) {
inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
oneapi::mkl::transpose b_trans, int m, int n, int k,
const void *alpha, const void *a[],
library_data_t a_type, int lda, const void *b[],
library_data_t b_type, int ldb, const void *beta,
void *c[], library_data_t c_type, int ldc,
int batch_size, library_data_t scaling_type)
{
if (scaling_type == library_data_t::real_float &&
c_type == library_data_t::complex_float)
{
scaling_type = library_data_t::complex_float;
}
else if (scaling_type == library_data_t::real_double &&
c_type == library_data_t::complex_double)
{
scaling_type = library_data_t::complex_double;
}

std::uint64_t key =
detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
switch (key)
@@ -2425,24 +2449,48 @@ namespace dpct
library_data_t::real_float, library_data_t::real_float,
library_data_t::real_float, library_data_t::real_float):
{
detail::gemm_batch_impl<float, float, float, float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb,
beta, c, ldc, batch_size, matrix_info);
detail::gemm_batch_impl<float, float, float, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
break;
}
case detail::get_type_combination_id(
library_data_t::real_double, library_data_t::real_double,
library_data_t::real_double, library_data_t::real_double):
{
detail::gemm_batch_impl<double, double, double, double>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb,
beta, c, ldc, batch_size, matrix_info);
detail::gemm_batch_impl<double, double, double, double>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
break;
}
case detail::get_type_combination_id(
library_data_t::complex_float, library_data_t::complex_float,
library_data_t::complex_float, library_data_t::complex_float):
{
detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
std::complex<float>, std::complex<float>>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
break;
}
case detail::get_type_combination_id(
library_data_t::complex_double, library_data_t::complex_double,
library_data_t::complex_double, library_data_t::complex_double):
{
detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
std::complex<double>, std::complex<double>>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
break;
}
case detail::get_type_combination_id(
library_data_t::real_half, library_data_t::real_half,
library_data_t::real_half, library_data_t::real_half):
{
detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
a, lda, b, ldb, beta, c, ldc,
batch_size);
break;
}
#ifdef __INTEL_MKL__
@@ -2450,16 +2498,19 @@ namespace dpct
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_bfloat16, library_data_t::real_float):
{
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
oneapi::mkl::bfloat16, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
break;
}
case detail::get_type_combination_id(
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_float, library_data_t::real_float):
{
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
b, ldb, beta, c, ldc, batch_size);
break;
}
#endif
@@ -2471,9 +2522,10 @@ namespace dpct
dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
float beta_float =
dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t, float>(
q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc, batch_size,
matrix_info);
detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
float>(q, a_trans, b_trans, m, n, k, &alpha_float,
a, lda, b, ldb, &beta_float, c, ldc,
batch_size);
break;
}
case detail::get_type_combination_id(
@@ -2481,7 +2533,8 @@ namespace dpct
library_data_t::real_float, library_data_t::real_float):
{
detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
break;
}
case detail::get_type_combination_id(
@@ -2489,7 +2542,8 @@ namespace dpct
library_data_t::real_float, library_data_t::real_float):
{
detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
break;
}
case detail::get_type_combination_id(
@@ -2503,7 +2557,8 @@ namespace dpct
sycl::half alpha_half(alpha_value);
sycl::half beta_half(beta_value);
detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc, batch_size, matrix_info);
q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc,
batch_size);
break;
}
default:
@@ -1173,85 +1173,6 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
}
};

struct ggml_sycl_pool_host : public ggml_sycl_pool {
queue_ptr qptr;
int device;

inline static int counter{ 0 };

struct ggml_sycl_buffer {
void * ptr = nullptr;
size_t size = 0;
};

// Set arbitrarly to 64
static constexpr int MAX_POOL_SIZE{ 64 };
std::vector<ggml_sycl_buffer> buffer_pool = std::vector<ggml_sycl_buffer>(MAX_POOL_SIZE);
size_t pool_size = 0;

explicit ggml_sycl_pool_host(queue_ptr qptr_, int device_) : qptr(qptr_), device(device_) {}

~ggml_sycl_pool_host() {
for (int i = 0; i < MAX_POOL_SIZE; ++i) {
ggml_sycl_buffer & b = buffer_pool[i];
if (b.ptr != nullptr) {
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr)));
b.ptr = nullptr;
pool_size -= b.size;
b.size = 0;
}
}
counter = 0;
}

void * alloc(size_t size, size_t * actual_size) override {
if (counter == MAX_POOL_SIZE) {
ggml_sycl_buffer b = buffer_pool[0];
void * ptr = b.ptr;
*actual_size = b.size;
counter = 1;
return ptr;
}
ggml_sycl_buffer & b = buffer_pool[counter];

if (b.ptr == nullptr) {
void * ptr;

SYCL_CHECK(CHECK_TRY_ERROR(ptr = (void *) sycl::malloc_host(size, *qptr)));
if (!ptr) {
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on host\n", __func__, size);
return nullptr;
}
pool_size += size;
*actual_size = size;
counter = counter + 1;
return ptr;
} else {
++counter;
b.size = size;
return b.ptr;
}
}

void free(void * ptr, size_t size) override {
// if the pool is not completed add the pointer to it in place of the first nullptr found.
// Otherwise do nothing, pointers will be freed once the pool is deallocated.
for (int i = 0; i < MAX_POOL_SIZE; ++i) {
ggml_sycl_buffer & b = buffer_pool[i];
if (b.ptr == nullptr) {
b.ptr = ptr;
b.size = size;
return;
}
}
}
};

std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_host(queue_ptr qptr, int device) {
// return pool for the host to speed up memory management
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_host(qptr, device));
}

std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
// TBD: NO VMM support
// if (ggml_sycl_info().devices[device].vmm) {
@@ -3442,7 +3363,6 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,

ggml_sycl_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
ggml_sycl_pool_alloc<      void *> ptrs_dst(ctx.pool(), 1*ne23);
ggml_sycl_pool_alloc<matrix_info_t<float>> matrix_info(ctx.host_pool(), 1);

sycl::range<3> block_dims(1, ne12, ne13);
/*
@@ -3471,10 +3391,14 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
});
}
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
*main_stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
(const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
(const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, nb11 / nb10, beta,
(void **) (ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, cu_compute_type, matrix_info.get())));
*main_stream, oneapi::mkl::transpose::trans,
oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
(const void **)(ptrs_src.get() + 0 * ne23),
dpct::library_data_t::real_half, nb01 / nb00,
(const void **)(ptrs_src.get() + 1 * ne23),
dpct::library_data_t::real_half, nb11 / nb10, beta,
(void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
cu_compute_type)));
}
}
catch (sycl::exception const &exc) {
@@ -3878,6 +3802,10 @@ static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor
ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_diag_mask_inf);
}

static void ggml_sycl_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_soft_max);
}

static void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(ggml_is_contiguous(dst->src[0])); // TODO: this restriction is temporary until non-cont support is implemented
ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_rope);
@@ -4086,7 +4014,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
ggml_sycl_diag_mask_inf(ctx, dst);
break;
case GGML_OP_SOFT_MAX:
ggml_sycl_op_soft_max(ctx, dst);
ggml_sycl_soft_max(ctx, dst);
break;
case GGML_OP_ROPE:
ggml_sycl_rope(ctx, dst);
@@ -1,7 +1,7 @@
#include "softmax.hpp"
#include "norm.hpp"

template <bool vals_smem, int ncols_template, int block_size_template, typename T>
static void soft_max_f32(const float * x, const T * mask, float * dst, const int ncols_par,
template <bool vals_smem, int ncols_template, int block_size_template>
static void soft_max_f32(const float * x, const float * mask, float * dst, const int ncols_par,
                         const int nrows_y, const float scale, const float max_bias, const float m0,
                         const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
@@ -29,7 +29,7 @@ static void soft_max_f32(const float * x, const T * mask, float * dst, const int
        slope = sycl::pow(base, float(exp));
    }

    float *vals = vals_smem ? buf + sycl::max(nwarps, WARP_SIZE) : dst + rowx * ncols;
    float *vals = vals_smem ? buf + std::max(nwarps, WARP_SIZE) : dst + rowx * ncols;
    float max_val = -INFINITY;

    for (int col0 = 0; col0 < ncols; col0 += block_size) {
@@ -42,7 +42,7 @@ static void soft_max_f32(const float * x, const T * mask, float * dst, const int
        const int ix = rowx*ncols + col;
        const int iy = rowy*ncols + col;

        const float val = x[ix]*scale + (mask ? slope*static_cast<float>(mask[iy]) : 0.0f);
        const float val = x[ix]*scale + (mask ? slope*mask[iy] : 0.0f);

        vals[col] = val;
        max_val = sycl::max(max_val, val);
@@ -65,7 +65,7 @@ static void soft_max_f32(const float * x, const T * mask, float * dst, const int
        item_ct1.barrier(sycl::access::fence_space::local_space);
        max_val = buf[lane_id];
        for (size_t i = 1; i < nreduce; i += 1) {
            max_val = sycl::max(max_val, buf[lane_id + i * WARP_SIZE]);
            max_val = std::max(max_val, buf[lane_id + i * WARP_SIZE]);
        }
        max_val = warp_reduce_max(max_val, item_ct1);
    }
@@ -122,8 +122,8 @@ static void soft_max_f32(const float * x, const T * mask, float * dst, const int
    }
}

template <bool vals_smem, int ncols_template, int block_size_template, typename T>
static void soft_max_f32_submitter(const float * x, const T * mask, float * dst, const int ncols_par,
template <bool vals_smem, int ncols_template, int block_size_template>
static void soft_max_f32_submitter(const float * x, const float * mask, float * dst, const int ncols_par,
                                   const int nrows_y, const float scale, const float max_bias, const float m0,
                                   const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
                                   const size_t n_local_scratch, queue_ptr stream) {
@@ -141,8 +141,7 @@ static void soft_max_f32_submitter(const float * x, const T * mask, float * dst,
    });
}

template<typename T>
static void soft_max_f32_sycl(const float * x, const T * mask,
static void soft_max_f32_sycl(const float * x, const float * mask,
                              float * dst, const int ncols_x, const int nrows_x,
                              const int nrows_y, const float scale, const float max_bias,
                              queue_ptr stream, int device) {
@@ -224,16 +223,22 @@ static void soft_max_f32_sycl(const float * x, const T * mask,
    }
}

void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                           const ggml_tensor *src1, ggml_tensor *dst,
                           const float *src0_dd, const float *src1_dd,
                           float *dst_dd,
                           const queue_ptr &main_stream) {

    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    GGML_ASSERT(!dst->src[1] || dst->src[1]->type == GGML_TYPE_F16 || dst->src[1]->type == GGML_TYPE_F32); // src1 contains mask and it is optional
#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 support")
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional

    const int64_t ne00    = dst->src[0]->ne[0];
    const int64_t nrows_x = ggml_nrows(dst->src[0]);
    const int64_t nrows_y = dst->src[0]->ne[1];
    const int64_t ne00    = src0->ne[0];
    const int64_t nrows_x = ggml_nrows(src0);
    const int64_t nrows_y = src0->ne[1];

    float scale = 1.0f;
    float max_bias = 0.0f;
@@ -241,21 +246,6 @@ void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    memcpy(&scale, dst->op_params + 0, sizeof(float));
    memcpy(&max_bias, dst->op_params + 1, sizeof(float));

    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
    float * dst_dd = static_cast<float *>(dst->data);

    ggml_sycl_set_device(ctx.device);
    dpct::queue_ptr main_stream = ctx.stream();

    if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F16) {
        const sycl::half * src1_dd = static_cast<sycl::half *>(dst->src[1]->data);
        soft_max_f32_sycl<sycl::half>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias,
                                      main_stream, ctx.device);
    } else if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F32) {
        const float * src1_dd = static_cast<const float *>(dst->src[1]->data);
        soft_max_f32_sycl<float>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
    } else {
        /* mask unavailable */
        soft_max_f32_sycl<float>(src0_dd, nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
    }
    soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00,
                      nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
}

@@ -15,6 +15,10 @@

#include "common.hpp"

void ggml_sycl_op_soft_max(ggml_backend_sycl_context &ctx, ggml_tensor *dst);
void ggml_sycl_op_soft_max(ggml_backend_sycl_context &ctx, const ggml_tensor *src0,
                           const ggml_tensor *src1, ggml_tensor *dst,
                           const float *src0_dd, const float *src1_dd,
                           float *dst_dd,
                           const queue_ptr &main_stream);

#endif // GGML_SYCL_SOFTMAX_HPP
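The softmax hunks above move soft_max_f32 between a float-only mask and a version templated on the mask element type, casting each mask value to float before applying the ALiBi slope. The sketch below isolates just that templated-cast step; masked_logit is a made-up name and double merely stands in for the half-precision mask type.

    // Sketch only: the mask type is a template parameter and is cast to float
    // before the slope is applied; a null mask contributes nothing.
    #include <cstdio>

    template <typename T>
    static float masked_logit(float x, float scale, float slope, const T * mask, int iy) {
        return x * scale + (mask ? slope * static_cast<float>(mask[iy]) : 0.0f);
    }

    int main() {
        const float  mask_f32[2]   = {0.0f, -1.0f};
        const double mask_other[2] = {0.0, -1.0}; // stand-in for an fp16 mask
        std::printf("%g %g %g\n",
                    masked_logit(2.0f, 0.5f, 1.0f, mask_f32, 1),
                    masked_logit(2.0f, 0.5f, 1.0f, mask_other, 1),
                    masked_logit<float>(2.0f, 0.5f, 1.0f, nullptr, 0)); // prints: 0 0 1
        return 0;
    }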
@@ -29,6 +29,8 @@
|
||||
|
||||
#include "ggml-vulkan-shaders.hpp"
|
||||
|
||||
#define VK_API_VERSION VK_API_VERSION_1_2
|
||||
|
||||
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
|
||||
|
||||
#define VK_VENDOR_ID_AMD 0x1002
|
||||
@@ -85,10 +87,6 @@ struct vk_pipeline_struct {
|
||||
uint32_t parameter_count;
|
||||
std::array<uint32_t, 3> wg_denoms;
|
||||
uint32_t align;
|
||||
// set to true to request the pipeline is compiled after the dryrun
|
||||
bool needed {};
|
||||
// set to true when the shader has been compiled
|
||||
bool compiled {};
|
||||
};
|
||||
|
||||
typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
|
||||
@@ -190,11 +188,8 @@ struct vk_device_struct {
|
||||
bool mul_mat_id_m;
|
||||
bool mul_mat_id_s;
|
||||
|
||||
// set to true to indicate that some shaders need to be compiled after the dryrun
|
||||
bool need_compiles {};
|
||||
|
||||
vk_matmul_pipeline pipeline_matmul_f32 {};
|
||||
vk_matmul_pipeline pipeline_matmul_f32_f16 {};
|
||||
vk_matmul_pipeline pipeline_matmul_f32;
|
||||
vk_matmul_pipeline pipeline_matmul_f32_f16;
|
||||
vk_matmul_pipeline2 pipeline_matmul_f16;
|
||||
vk_matmul_pipeline2 pipeline_matmul_f16_f32;
|
||||
vk_pipeline pipeline_matmul_split_k_reduce;
|
||||
@@ -202,7 +197,7 @@ struct vk_device_struct {
|
||||
vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_COUNT];
|
||||
vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT];
|
||||
|
||||
vk_matmul_pipeline pipeline_matmul_id_f32 {};
|
||||
vk_matmul_pipeline pipeline_matmul_id_f32;
|
||||
vk_matmul_pipeline2 pipeline_matmul_id_f16;
|
||||
vk_matmul_pipeline2 pipeline_matmul_id_f16_f32;
|
||||
|
||||
@@ -233,8 +228,6 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_repeat_f32;
|
||||
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
|
||||
vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16;
|
||||
vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT];
|
||||
vk_pipeline pipeline_norm_f32;
|
||||
vk_pipeline pipeline_group_norm_f32;
|
||||
vk_pipeline pipeline_rms_norm_f32;
|
||||
@@ -391,13 +384,10 @@ struct vk_flash_attn_push_constants {
|
||||
uint32_t nev3;
|
||||
uint32_t nem1;
|
||||
|
||||
uint32_t nb01;
|
||||
uint32_t nb02;
|
||||
uint32_t nb03;
|
||||
uint32_t nb11;
|
||||
uint32_t nb12;
|
||||
uint32_t nb13;
|
||||
uint32_t nb21;
|
||||
uint32_t nb22;
|
||||
uint32_t nb23;
|
||||
uint32_t nb31;
|
||||
@@ -783,6 +773,13 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||
GGML_ASSERT(parameter_count > 0);
|
||||
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
|
||||
|
||||
pipeline = std::make_shared<vk_pipeline_struct>();
|
||||
pipeline->name = name;
|
||||
pipeline->parameter_count = parameter_count;
|
||||
pipeline->push_constant_size = push_constant_size;
|
||||
pipeline->wg_denoms = wg_denoms;
|
||||
pipeline->align = align;
|
||||
|
||||
vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
|
||||
pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
|
||||
|
||||
@@ -865,7 +862,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||
}
|
||||
|
||||
pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
|
||||
pipeline->compiled = true;
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> guard(device->mutex);
|
||||
@@ -876,6 +872,12 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||
std::lock_guard<std::mutex> guard(compile_count_mutex);
|
||||
assert(compile_count > 0);
|
||||
compile_count--;
|
||||
|
||||
// "Progress bar" for shader compiles
|
||||
static uint32_t total_compile_count = 0;
|
||||
if ((total_compile_count++ % 10) == 0) {
|
||||
std::cerr << ".";
|
||||
}
|
||||
}
|
||||
compile_count_cond.notify_all();
|
||||
}
|
||||
@@ -901,10 +903,6 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
|
||||
static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) {
|
||||
VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
|
||||
device->pipeline_descriptor_set_requirements[pipeline->name] += n;
|
||||
if (!pipeline->compiled) {
|
||||
pipeline->needed = true;
|
||||
device->need_compiles = true;
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) {
|
||||
@@ -1387,6 +1385,8 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
|
||||
static void ggml_vk_load_shaders(vk_device& device) {
|
||||
VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
|
||||
|
||||
std::cerr << "ggml_vulkan: Compiling shaders";
|
||||
|
||||
// some shaders have a minimum subgroup size
|
||||
const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u);
|
||||
const uint32_t subgroup_size_32 = std::max(device->subgroup_size, 32u);
|
||||
@@ -1524,33 +1524,15 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
}
|
||||
}
|
||||
|
||||
if (!device->pipeline_matmul_f32) {
|
||||
device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
}
|
||||
if (!device->pipeline_matmul_f32_f16) {
|
||||
device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
}
|
||||
if (!device->pipeline_matmul_id_f32) {
|
||||
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
}
|
||||
device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
|
||||
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
||||
|
||||
std::vector<std::future<void>> compiles;
|
||||
auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint,
|
||||
uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
|
||||
uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
|
||||
|
||||
if (!pipeline) {
|
||||
pipeline = std::make_shared<vk_pipeline_struct>();
|
||||
pipeline->name = name;
|
||||
pipeline->parameter_count = parameter_count;
|
||||
pipeline->push_constant_size = push_constant_size;
|
||||
pipeline->wg_denoms = wg_denoms;
|
||||
pipeline->align = align;
|
||||
}
|
||||
|
||||
if (!pipeline->needed || pipeline->compiled) {
|
||||
return;
|
||||
}
|
||||
{
|
||||
// wait until fewer than N compiles are in progress
|
||||
uint32_t N = std::max(1u, std::thread::hardware_concurrency());
|
||||
@@ -1627,7 +1609,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \
|
||||
CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \
|
||||
|
||||
CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3)
|
||||
CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3)
|
||||
|
||||
CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3)
|
||||
CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
|
||||
@@ -1640,18 +1626,21 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
|
||||
|
||||
CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, 4)
|
||||
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
|
||||
#undef CREATE_MM
|
||||
#undef CREATE_MM2
|
||||
} else
|
||||
@@ -1688,31 +1677,31 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
||||
|
||||
if (device->coopmat_acc_f16_support) {
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
} else {
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||
}
|
||||
|
||||
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
|
||||
@@ -1722,31 +1711,31 @@ static void ggml_vk_load_shaders(vk_device& device) {
CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);

if (device->coopmat_acc_f16_support) {
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);

CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
} else {
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);

CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
}
}
#undef CREATE_MM2
@@ -1976,20 +1965,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);

ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_0], "cpy_q4_0_f32", cpy_q4_0_f32_len, cpy_q4_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_1], "cpy_q4_1_f32", cpy_q4_1_f32_len, cpy_q4_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_0], "cpy_q5_0_f32", cpy_q5_0_f32_len, cpy_q5_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_1], "cpy_q5_1_f32", cpy_q5_1_f32_len, cpy_q5_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q8_0], "cpy_q8_0_f32", cpy_q8_0_f32_len, cpy_q8_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_IQ4_NL], "cpy_iq4_nl_f32", cpy_iq4_nl_f32_len, cpy_iq4_nl_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);

ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
ggml_vk_create_pipeline(device, device->pipeline_add_f32_norepeat, "add_f32_norepeat", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {1}, 1);
ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
@@ -2027,7 +2002,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_tanh_f32, "tanh_f32", tanh_f32_len, tanh_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);

ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {1, 512, 1}, {}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);

ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_wg512, "soft_max_f32_wg512", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1);
@@ -2065,7 +2040,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
for (auto &c : compiles) {
c.wait();
}
device->need_compiles = false;
std::cerr << "Done!" << std::endl;
}

static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props);
@@ -2293,14 +2268,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
}
#endif

VkPhysicalDeviceMaintenance4Features maint4_features {};
maint4_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_FEATURES;
if (maintenance4_support) {
last_struct->pNext = (VkBaseOutStructure *)&maint4_features;
last_struct = (VkBaseOutStructure *)&maint4_features;
device_extensions.push_back("VK_KHR_maintenance4");
}

vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);

device->fp16 = device->fp16 && vk12_features.shaderFloat16;
@@ -2676,14 +2643,7 @@ void ggml_vk_instance_init() {

vk_instance_initialized = true;

uint32_t api_version = vk::enumerateInstanceVersion();

if (api_version < VK_API_VERSION_1_2) {
std::cerr << "ggml_vulkan: Error: Vulkan 1.2 required." << std::endl;
GGML_ABORT("fatal error");
}

vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, api_version };
vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };

const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
@@ -2993,7 +2953,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
}
}

GGML_ASSERT(src1_type == GGML_TYPE_F32 || (ctx->device->coopmat2 && src1_type == GGML_TYPE_F16));
GGML_ASSERT(src1_type == GGML_TYPE_F32);

switch (src0_type) {
case GGML_TYPE_Q4_0:
@@ -3729,33 +3689,6 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_cpy_f16_f16;
}
}
if (src->type == GGML_TYPE_F32) {
switch (to) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_IQ4_NL:
return ctx->device->pipeline_cpy_f32_quant[to];
default:
break;
}
}

if (to == GGML_TYPE_F32) {
switch (src->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_IQ4_NL:
return ctx->device->pipeline_cpy_quant_f32[src->type];
default:
break;
}
}

std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
GGML_ABORT("fatal error");
@@ -3833,9 +3766,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
src1_uma = d_Qy != nullptr;
}

// Reformat and convert to fp16 if non-contiguous, or for coopmat2 for better perf
const bool x_non_contig = (ctx->device->coopmat2 && src0->type == GGML_TYPE_F32) ||
!ggml_vk_dim01_contiguous(src0);
const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
// Reformat and convert to fp16 if src1 is non-contiguous, or for coopmat2 for better perf
const bool y_non_contig = (ctx->device->coopmat2 && src1->type == GGML_TYPE_F32) ||
!ggml_vk_dim01_contiguous(src1);
@@ -4415,11 +4347,8 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
ids_uma = d_ids != nullptr;
}

// Reformat and convert to fp16 if non-contiguous, or for coopmat2 for better perf
const bool x_non_contig = (ctx->device->coopmat2 && src0->type == GGML_TYPE_F32) ||
!ggml_vk_dim01_contiguous(src0);
const bool y_non_contig = (ctx->device->coopmat2 && src1->type == GGML_TYPE_F32) ||
!ggml_vk_dim01_contiguous(src1);
const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);

const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;

@@ -4429,8 +4358,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;

if (qx_needs_dequant) {
// Fall back to dequant + f16 mulmat
mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, GGML_TYPE_F16, y_f32_kernel ? GGML_TYPE_F32 : GGML_TYPE_F16, (ggml_prec)dst->op_params[0]);
GGML_ABORT("fatal error");
}

// Not implemented
@@ -4838,14 +4766,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
}
assert(pipelines);

const uint32_t q_stride = (uint32_t)(nbq1 / ggml_type_size(q->type));
const uint32_t k_stride = (uint32_t)(nbk1 / ggml_type_size(k->type));
const uint32_t v_stride = (uint32_t)(nbv1 / ggml_type_size(v->type));

bool aligned = (KV % pipelines[1]->align) == 0 &&
// the "aligned" shader variant will forcibly align strides, for performance
(q_stride & 7) == 0 && (k_stride & 7) == 0 && (v_stride & 7) == 0;

bool aligned = (KV % pipelines[1]->align) == 0;
vk_pipeline pipeline = pipelines[aligned];
assert(pipeline);

@@ -4881,15 +4802,15 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx

if (ctx->device->uma) {
ggml_vk_host_get(ctx->device, q->data, d_Q, q_buf_offset);
ggml_vk_host_get(ctx->device, k->data, d_K, k_buf_offset);
ggml_vk_host_get(ctx->device, v->data, d_V, v_buf_offset);
ggml_vk_host_get(ctx->device, dst->data, d_D, d_buf_offset);
ggml_vk_host_get(ctx->device, k->data, d_K, q_buf_offset);
ggml_vk_host_get(ctx->device, v->data, d_V, q_buf_offset);
ggml_vk_host_get(ctx->device, dst->data, d_D, q_buf_offset);
Q_uma = d_Q != nullptr;
K_uma = d_K != nullptr;
V_uma = d_V != nullptr;
D_uma = d_D != nullptr;
if (mask) {
ggml_vk_host_get(ctx->device, mask->data, d_M, m_buf_offset);
ggml_vk_host_get(ctx->device, mask->data, d_M, q_buf_offset);
M_uma = d_M != nullptr;
}
}
@@ -4927,18 +4848,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
}
}

const vk_flash_attn_push_constants pc = { N, KV,
(uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3,
(uint32_t)neq2, (uint32_t)neq3,
(uint32_t)nek2, (uint32_t)nek3,
(uint32_t)nev2, (uint32_t)nev3,
nem1,
q_stride, (uint32_t)nbq2, (uint32_t)nbq3,
k_stride, (uint32_t)nbk2, (uint32_t)nbk3,
v_stride, (uint32_t)nbv2, (uint32_t)nbv3,
nbm1,
scale, max_bias, logit_softcap,
mask != nullptr, n_head_log2, m0, m1 };
const vk_flash_attn_push_constants pc = { N, KV, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, (uint32_t)neq2, (uint32_t)neq3, (uint32_t)nek2, (uint32_t)nek3, (uint32_t)nev2, (uint32_t)nev3, nem1, (uint32_t)nbq2, (uint32_t)nbq3, (uint32_t)nbk2, (uint32_t)nbk3, (uint32_t)nbv2, (uint32_t)nbv3, nbm1, scale, max_bias, logit_softcap, mask != nullptr, n_head_log2, m0, m1 };
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
{
vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
@@ -5250,7 +5160,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
}
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
std::cerr << "), " << ggml_op_name(op) << ", " << (dryrun ? "dryrun" : "") << ")");
GGML_ASSERT(op == GGML_OP_GET_ROWS || op == GGML_OP_CPY || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT
GGML_ASSERT(dst->buffer != nullptr);
const uint64_t ne00 = src0->ne[0];
@@ -7671,9 +7581,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
}
if (ctx->device->need_compiles) {
ggml_vk_load_shaders(ctx->device);
}
ggml_vk_preallocate_buffers(ctx);
ggml_pipeline_allocate_descriptor_sets(ctx->device);
@@ -7998,36 +7905,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
{
ggml_type src0_type = op->src[0]->type;
ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;

if (src0_type == GGML_TYPE_F32) {
switch (src1_type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_IQ4_NL:
return true;
default:
break;
}
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
return true;
}
if (src1_type == GGML_TYPE_F32) {
switch (src0_type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_IQ4_NL:
return true;
default:
break;
}
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
return true;
}

if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
return true;
}
@@ -8718,7 +8601,6 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
ggml_tensor * src0 = tensor->src[0];
ggml_tensor * src1 = tensor->src[1];
ggml_tensor * src2 = tensor->src[2];
ggml_tensor * src3 = tensor->src[3];

void * tensor_data = tensor->data;

@@ -8781,9 +8663,6 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
if (src2 != nullptr) {
std::cerr << "src2=" << src2 << " src2->name=" << src2->name << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
}
if (src3 != nullptr) {
std::cerr << "src3=" << src3 << " src3->name=" << src3->name << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl;
}
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
std::cerr << std::endl << "Result:" << std::endl;
ggml_vk_print_tensor_area(tensor, tensor_data, i0, i1, i2, i3);
@@ -8828,9 +8707,6 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
if (src2 != nullptr) {
std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
}
if (src3 != nullptr) {
std::cerr << "src3=" << src3 << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl;
}
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
std::cerr << std::endl << "Result:" << std::endl;
ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
@@ -8853,9 +8729,6 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
if (src2 != nullptr) {
std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
}
if (src3 != nullptr) {
std::cerr << "src3=" << src3 << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl;
}
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
std::cerr << std::endl << "Result:" << std::endl;
ggml_vk_print_tensor_area(tensor, tensor_data, first_error[0], first_error[1], first_error[2], first_error[3]);
@@ -1,51 +0,0 @@
#version 450

#include "types.comp"
#include "generic_unary_head.comp"
#include "dequant_funcs.comp"

#if defined(DATA_A_IQ4_NL)
// 16 invocations needed for init_iq4nl_shmem
layout(local_size_x = 16, local_size_y = 1, local_size_z = 1) in;
#else
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
#endif

void main() {
#if defined(DATA_A_IQ4_NL)
init_iq4nl_shmem();
if (gl_LocalInvocationIndex.x != 0) {
return;
}
#endif

const uint idx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x * QUANT_K;

if (idx >= p.ne) {
return;
}

uint dst_idx = get_doffset() + dst_idx(idx);
uint src_idx = src0_idx_quant(idx, QUANT_K);

const uint a_offset = 0;
const uint ib = src_idx;
const vec2 dm = get_dm(ib, a_offset);

[[unroll]] for (int j = 0; j < QUANT_K; j += 4) {
vec4 v = dequantize4(ib, j / QUANT_R, a_offset);
v = v * dm.x + vec4(dm.y);

#if QUANT_R == 2
data_d[dst_idx + j/2 + 0] = v[0];
data_d[dst_idx + j/2 + QUANT_K/2 + 0] = v[1];
data_d[dst_idx + j/2 + 1] = v[2];
data_d[dst_idx + j/2 + QUANT_K/2 + 1] = v[3];
#else
data_d[dst_idx + j + 0] = v[0];
data_d[dst_idx + j + 1] = v[1];
data_d[dst_idx + j + 2] = v[2];
data_d[dst_idx + j + 3] = v[3];
#endif
}
}
Some files were not shown because too many files have changed in this diff