mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-30 16:47:31 +03:00
Compare commits
1 Commits
b5412
...
gg/metal-m
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
16843dba33 |
22
.github/actions/get-tag-name/action.yml
vendored
22
.github/actions/get-tag-name/action.yml
vendored
@@ -1,22 +0,0 @@
|
||||
name: "Determine tag name"
|
||||
description: "Determine the tag name to use for a release"
|
||||
outputs:
|
||||
name:
|
||||
description: "The name of the tag"
|
||||
value: ${{ steps.tag.outputs.name }}
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
67
.github/actions/windows-setup-cuda/action.yml
vendored
67
.github/actions/windows-setup-cuda/action.yml
vendored
@@ -1,67 +0,0 @@
|
||||
name: "Windows - Setup CUDA Toolkit"
|
||||
description: "Setup CUDA Toolkit for Windows"
|
||||
inputs:
|
||||
cuda_version:
|
||||
description: "CUDA toolkit version"
|
||||
required: true
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Install Cuda Toolkit 11.7
|
||||
if: ${{ inputs.cuda_version == '11.7' }}
|
||||
shell: pwsh
|
||||
run: |
|
||||
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
|
||||
choco install unzip -y
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
|
||||
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
- name: Install Cuda Toolkit 12.4
|
||||
if: ${{ inputs.cuda_version == '12.4' }}
|
||||
shell: pwsh
|
||||
run: |
|
||||
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
|
||||
choco install unzip -y
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
|
||||
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
@@ -5,10 +5,6 @@ inputs:
|
||||
description: 'CURL version'
|
||||
required: false
|
||||
default: '8.6.0_6'
|
||||
architecture:
|
||||
description: 'Architecture of the libcurl to download'
|
||||
required: false
|
||||
default: 'win64'
|
||||
outputs:
|
||||
curl_path:
|
||||
description: "Path to the downloaded libcurl"
|
||||
@@ -22,9 +18,8 @@ runs:
|
||||
shell: powershell
|
||||
env:
|
||||
CURL_VERSION: ${{ inputs.curl_version }}
|
||||
ARCHITECTURE: ${{ inputs.architecture }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-${env:ARCHITECTURE}-mingw.zip"
|
||||
curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
|
||||
mkdir $env:RUNNER_TEMP/libcurl
|
||||
tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
|
||||
echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT
|
||||
|
||||
91
.github/workflows/build-linux-cross.yml
vendored
91
.github/workflows/build-linux-cross.yml
vendored
@@ -140,94 +140,3 @@ jobs:
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-24-ppc64el-cpu-cross:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Setup PowerPC64le
|
||||
run: |
|
||||
sudo dpkg --add-architecture ppc64el
|
||||
|
||||
# Add arch-specific repositories for non-amd64 architectures
|
||||
cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
|
||||
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
|
||||
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
|
||||
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
|
||||
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
|
||||
EOF
|
||||
|
||||
sudo apt-get update || true ;# Prevent failure due to missing URLs.
|
||||
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
gcc-14-powerpc64le-linux-gnu \
|
||||
g++-14-powerpc64le-linux-gnu \
|
||||
libcurl4-openssl-dev:ppc64el
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cmake -B build -DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=Linux \
|
||||
-DCMAKE_SYSTEM_PROCESSOR=ppc64 \
|
||||
-DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
|
||||
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
|
||||
-DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-24-ppc64el-vulkan-cross:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Setup PowerPC64le
|
||||
run: |
|
||||
sudo dpkg --add-architecture ppc64el
|
||||
|
||||
# Add arch-specific repositories for non-amd64 architectures
|
||||
cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
|
||||
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
|
||||
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
|
||||
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
|
||||
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
|
||||
EOF
|
||||
|
||||
sudo apt-get update || true ;# Prevent failure due to missing URLs.
|
||||
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
glslc \
|
||||
gcc-14-powerpc64le-linux-gnu \
|
||||
g++-14-powerpc64le-linux-gnu \
|
||||
libvulkan-dev:ppc64el \
|
||||
libcurl4-openssl-dev:ppc64el
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cmake -B build -DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_VULKAN=ON \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=Linux \
|
||||
-DCMAKE_SYSTEM_PROCESSOR=ppc64 \
|
||||
-DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
|
||||
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
|
||||
-DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
793
.github/workflows/build.yml
vendored
793
.github/workflows/build.yml
vendored
@@ -2,19 +2,30 @@ name: CI
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
inputs:
|
||||
create_release:
|
||||
description: 'Create new release'
|
||||
required: true
|
||||
type: boolean
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
|
||||
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
|
||||
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
# Fine-grant permission
|
||||
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
|
||||
permissions:
|
||||
contents: write # for creating release
|
||||
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
@@ -29,6 +40,8 @@ jobs:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
@@ -61,6 +74,33 @@ jobs:
|
||||
cd build
|
||||
ctest -L 'main|curl' --verbose --timeout 900
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
|
||||
name: llama-bin-macos-arm64.zip
|
||||
|
||||
macOS-latest-cmake-x64:
|
||||
runs-on: macos-13
|
||||
|
||||
@@ -68,6 +108,8 @@ jobs:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
@@ -101,6 +143,33 @@ jobs:
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
|
||||
name: llama-bin-macos-x64.zip
|
||||
|
||||
ubuntu-cpu-cmake:
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -116,6 +185,8 @@ jobs:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
@@ -154,6 +225,33 @@ jobs:
|
||||
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
|
||||
name: llama-bin-ubuntu-${{ matrix.build }}.zip
|
||||
|
||||
ubuntu-latest-cmake-sanitizer:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -280,6 +378,8 @@ jobs:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
@@ -307,7 +407,34 @@ jobs:
|
||||
run: |
|
||||
cd build
|
||||
# This is using llvmpipe and runs slower than other backends
|
||||
ctest -L main --verbose --timeout 3600
|
||||
ctest -L main --verbose --timeout 2700
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
|
||||
name: llama-bin-ubuntu-vulkan-x64.zip
|
||||
|
||||
ubuntu-22-cmake-hip:
|
||||
runs-on: ubuntu-22.04
|
||||
@@ -644,7 +771,7 @@ jobs:
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-msys2
|
||||
variant: ccache
|
||||
variant: sccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Setup ${{ matrix.sys }}
|
||||
@@ -687,29 +814,39 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'cpu-x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
|
||||
- build: 'noavx-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
|
||||
- build: 'avx2-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON'
|
||||
- build: 'avx-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF'
|
||||
- build: 'avx512-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON'
|
||||
- build: 'openblas-x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'kompute-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
|
||||
- build: 'vulkan-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON'
|
||||
- build: 'llvm-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
|
||||
- build: 'msvc-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
|
||||
- build: 'llvm-arm64-opencl-adreno'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
|
||||
# - build: 'kompute-x64'
|
||||
# defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-${{ matrix.build }}
|
||||
variant: ccache
|
||||
variant: sccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Clone Kompute submodule
|
||||
@@ -785,26 +922,68 @@ jobs:
|
||||
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
|
||||
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
|
||||
|
||||
- name: Check AVX512F support
|
||||
id: check_avx512f
|
||||
if: ${{ matrix.build == 'avx512-x64' }}
|
||||
continue-on-error: true
|
||||
run: |
|
||||
cd build
|
||||
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
|
||||
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
|
||||
$cl = $(join-path $msvc 'bin\Hostx64\x64\cl.exe')
|
||||
echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c
|
||||
& $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
|
||||
.\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
if: ${{ matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' }}
|
||||
# not all machines have native AVX-512
|
||||
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main -C Release --verbose --timeout 900
|
||||
|
||||
# TODO: disabled for now, consider adding tests for all CPU variants instead
|
||||
# - name: Test (Intel SDE)
|
||||
# id: cmake_test_sde
|
||||
# if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
|
||||
# run: |
|
||||
# curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
|
||||
# # for some weird reason windows tar doesn't like sde tar.xz
|
||||
# 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
|
||||
# 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
|
||||
# $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
|
||||
# cd build
|
||||
# $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
|
||||
# & $sde -future -- ctest -L main -C Release --verbose --timeout 900
|
||||
- name: Test (Intel SDE)
|
||||
id: cmake_test_sde
|
||||
if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
|
||||
# for some weird reason windows tar doesn't like sde tar.xz
|
||||
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
|
||||
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
|
||||
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
|
||||
cd build
|
||||
$env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
|
||||
& $sde -future -- ctest -L main -C Release --verbose --timeout 900
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
|
||||
name: llama-bin-win-${{ matrix.build }}.zip
|
||||
|
||||
ubuntu-latest-cmake-cuda:
|
||||
runs-on: ubuntu-latest
|
||||
@@ -814,6 +993,8 @@ jobs:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install dependencies
|
||||
env:
|
||||
@@ -845,23 +1026,77 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.4', '11.7']
|
||||
build: ['cuda']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-cuda-${{ matrix.cuda }}
|
||||
variant: ccache
|
||||
key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
|
||||
variant: sccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install Cuda Toolkit
|
||||
uses: ./.github/actions/windows-setup-cuda
|
||||
with:
|
||||
cuda_version: ${{ matrix.cuda }}
|
||||
- name: Install Cuda Toolkit 11.7
|
||||
if: ${{ matrix.cuda == '11.7' }}
|
||||
run: |
|
||||
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
|
||||
choco install unzip -y
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
|
||||
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
- name: Install Cuda Toolkit 12.4
|
||||
if: ${{ matrix.cuda == '12.4' }}
|
||||
run: |
|
||||
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
|
||||
choco install unzip -y
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
|
||||
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
@@ -882,8 +1117,6 @@ jobs:
|
||||
cmake -S . -B build -G "Ninja Multi-Config" ^
|
||||
-DLLAMA_BUILD_SERVER=ON ^
|
||||
-DGGML_NATIVE=OFF ^
|
||||
-DGGML_BACKEND_DL=ON ^
|
||||
-DGGML_CPU_ALL_VARIANTS=ON ^
|
||||
-DGGML_CUDA=ON ^
|
||||
-DGGML_RPC=ON ^
|
||||
-DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include"
|
||||
@@ -891,6 +1124,51 @@ jobs:
|
||||
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
|
||||
cmake --build build --config Release
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
|
||||
name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
run: |
|
||||
echo "Cuda install location: ${{ env.CUDA_PATH }}"
|
||||
$dst='.\build\bin\cudart\'
|
||||
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
|
||||
|
||||
- name: Upload Cuda runtime
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
windows-latest-cmake-sycl:
|
||||
runs-on: windows-latest
|
||||
|
||||
@@ -906,12 +1184,14 @@ jobs:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-sycl
|
||||
variant: ccache
|
||||
variant: sccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install
|
||||
@@ -924,6 +1204,52 @@ jobs:
|
||||
id: cmake_build
|
||||
run: examples/sycl/win-build-sycl.bat
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Build the release package
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||
|
||||
echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload the release package
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
|
||||
name: llama-bin-win-sycl-x64.zip
|
||||
|
||||
windows-latest-cmake-hip:
|
||||
if: ${{ github.event.inputs.create_release != 'true' }}
|
||||
runs-on: windows-latest
|
||||
@@ -981,12 +1307,110 @@ jobs:
|
||||
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
|
||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
# TODO: reuse windows-latest-cmake-hip instead of duplicating this job
|
||||
windows-latest-cmake-hip-release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
gpu_target: [gfx1100, gfx1101, gfx1030]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Clone rocWMMA repository
|
||||
id: clone_rocwmma
|
||||
run: |
|
||||
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-hip-release
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install
|
||||
id: depends
|
||||
run: |
|
||||
$ErrorActionPreference = "Stop"
|
||||
write-host "Downloading AMD HIP SDK Installer"
|
||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||
write-host "Installing AMD HIP SDK"
|
||||
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
|
||||
write-host "Completed AMD HIP SDK installation"
|
||||
|
||||
- name: Verify ROCm
|
||||
id: verify
|
||||
run: |
|
||||
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
||||
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
||||
cmake -G "Unix Makefiles" -B build -S . `
|
||||
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
|
||||
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
|
||||
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
|
||||
-DCMAKE_BUILD_TYPE=Release `
|
||||
-DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON `
|
||||
-DGGML_HIP=ON `
|
||||
-DGGML_RPC=ON `
|
||||
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
|
||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||
md "build\bin\rocblas\library\"
|
||||
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
|
||||
ios-xcode-build:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
@@ -1013,6 +1437,32 @@ jobs:
|
||||
- name: Build Xcode project
|
||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
|
||||
name: llama-${{ steps.tag.outputs.name }}-xcframework
|
||||
|
||||
android-build:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -1040,8 +1490,283 @@ jobs:
|
||||
- name: Build
|
||||
run: |
|
||||
cd examples/llama.android
|
||||
|
||||
./gradlew build --no-daemon
|
||||
|
||||
release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
needs:
|
||||
- ubuntu-cpu-cmake
|
||||
- ubuntu-22-cmake-vulkan
|
||||
- windows-latest-cmake
|
||||
- windows-2019-cmake-cuda
|
||||
- windows-latest-cmake-sycl
|
||||
- windows-latest-cmake-hip-release
|
||||
- macOS-latest-cmake-arm64
|
||||
- macOS-latest-cmake-x64
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: release
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Download artifacts
|
||||
id: download-artifact
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: ./artifact
|
||||
|
||||
- name: Move artifacts
|
||||
id: move_artifacts
|
||||
run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
|
||||
|
||||
- name: Create release
|
||||
id: create_release
|
||||
uses: ggml-org/action-create-release@v1
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
with:
|
||||
tag_name: ${{ steps.tag.outputs.name }}
|
||||
|
||||
- name: Upload release
|
||||
id: upload_release
|
||||
uses: actions/github-script@v3
|
||||
with:
|
||||
github-token: ${{secrets.GITHUB_TOKEN}}
|
||||
script: |
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
const release_id = '${{ steps.create_release.outputs.id }}';
|
||||
for (let file of await fs.readdirSync('./artifact/release')) {
|
||||
if (path.extname(file) === '.zip') {
|
||||
console.log('uploadReleaseAsset', file);
|
||||
await github.repos.uploadReleaseAsset({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
release_id: release_id,
|
||||
name: file,
|
||||
data: await fs.readFileSync(`./artifact/release/${file}`)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
# ubuntu-latest-gcc:
|
||||
# runs-on: ubuntu-latest
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [Debug, Release]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get install build-essential
|
||||
# sudo apt-get install cmake
|
||||
#
|
||||
# - name: Configure
|
||||
# run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
#
|
||||
# - name: Build
|
||||
# run: |
|
||||
# make
|
||||
#
|
||||
# ubuntu-latest-clang:
|
||||
# runs-on: ubuntu-latest
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [Debug, Release]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get install build-essential
|
||||
# sudo apt-get install cmake
|
||||
#
|
||||
# - name: Configure
|
||||
# run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
|
||||
#
|
||||
# - name: Build
|
||||
# run: |
|
||||
# make
|
||||
#
|
||||
# ubuntu-latest-gcc-sanitized:
|
||||
# runs-on: ubuntu-latest
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get install build-essential
|
||||
# sudo apt-get install cmake
|
||||
#
|
||||
# - name: Configure
|
||||
# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
#
|
||||
# - name: Build
|
||||
# run: |
|
||||
# make
|
||||
#
|
||||
# windows:
|
||||
# runs-on: windows-latest
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [Release]
|
||||
# arch: [Win32, x64]
|
||||
# include:
|
||||
# - arch: Win32
|
||||
# s2arc: x86
|
||||
# - arch: x64
|
||||
# s2arc: x64
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Add msbuild to PATH
|
||||
# uses: microsoft/setup-msbuild@v1
|
||||
#
|
||||
# - name: Configure
|
||||
# run: >
|
||||
# cmake -S . -B ./build -A ${{ matrix.arch }}
|
||||
# -DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
#
|
||||
# - name: Build
|
||||
# run: |
|
||||
# cd ./build
|
||||
# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
|
||||
#
|
||||
# - name: Upload binaries
|
||||
# uses: actions/upload-artifact@v4
|
||||
# with:
|
||||
# name: llama-bin-${{ matrix.arch }}
|
||||
# path: build/bin/${{ matrix.build }}
|
||||
#
|
||||
# windows-blas:
|
||||
# runs-on: windows-latest
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [Release]
|
||||
# arch: [Win32, x64]
|
||||
# blas: [ON]
|
||||
# include:
|
||||
# - arch: Win32
|
||||
# obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
|
||||
# s2arc: x86
|
||||
# - arch: x64
|
||||
# obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
|
||||
# s2arc: x64
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Add msbuild to PATH
|
||||
# uses: microsoft/setup-msbuild@v1
|
||||
#
|
||||
# - name: Fetch OpenBLAS
|
||||
# if: matrix.blas == 'ON'
|
||||
# run: |
|
||||
# C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
|
||||
# 7z x blas.zip -oblas -y
|
||||
# copy blas/include/cblas.h .
|
||||
# copy blas/include/openblas_config.h .
|
||||
# echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
|
||||
#
|
||||
# - name: Configure
|
||||
# run: >
|
||||
# cmake -S . -B ./build -A ${{ matrix.arch }}
|
||||
# -DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
# -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }}
|
||||
# -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
|
||||
#
|
||||
# - name: Build
|
||||
# run: |
|
||||
# cd ./build
|
||||
# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
|
||||
#
|
||||
# - name: Copy libopenblas.dll
|
||||
# if: matrix.blas == 'ON'
|
||||
# run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
|
||||
#
|
||||
# - name: Upload binaries
|
||||
# if: matrix.blas == 'ON'
|
||||
# uses: actions/upload-artifact@v4
|
||||
# with:
|
||||
# name: llama-blas-bin-${{ matrix.arch }}
|
||||
# path: build/bin/${{ matrix.build }}
|
||||
#
|
||||
# emscripten:
|
||||
# runs-on: ubuntu-latest
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [Release]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
# wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
|
||||
# tar -xvf master.tar.gz
|
||||
# emsdk-master/emsdk update
|
||||
# emsdk-master/emsdk install latest
|
||||
# emsdk-master/emsdk activate latest
|
||||
#
|
||||
# - name: Configure
|
||||
# run: echo "tmp"
|
||||
#
|
||||
# - name: Build
|
||||
# run: |
|
||||
# pushd emsdk-master
|
||||
# source ./emsdk_env.sh
|
||||
# popd
|
||||
# emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
# make
|
||||
|
||||
openEuler-latest-cmake-cann:
|
||||
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
|
||||
defaults:
|
||||
|
||||
7
.github/workflows/docker.yml
vendored
7
.github/workflows/docker.yml
vendored
@@ -36,13 +36,10 @@ jobs:
|
||||
matrix:
|
||||
config:
|
||||
# Multi-stage build
|
||||
# Note: the arm64 images are failing, which prevents the amd64 images from being built
|
||||
# https://github.com/ggml-org/llama.cpp/issues/11888
|
||||
#- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
|
||||
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
|
||||
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
|
||||
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
|
||||
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
|
||||
|
||||
716
.github/workflows/release.yml
vendored
716
.github/workflows/release.yml
vendored
@@ -1,716 +0,0 @@
|
||||
name: Create Release
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
inputs:
|
||||
create_release:
|
||||
description: 'Create new release'
|
||||
required: true
|
||||
type: boolean
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"
|
||||
|
||||
jobs:
|
||||
macOS-arm64:
|
||||
runs-on: macos-14
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-cmake-arm64
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
continue-on-error: true
|
||||
run: |
|
||||
brew update
|
||||
brew install curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DGGML_RPC=ON \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
|
||||
name: llama-bin-macos-arm64.zip
|
||||
|
||||
macOS-x64:
|
||||
runs-on: macos-13
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-cmake-x64
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
continue-on-error: true
|
||||
run: |
|
||||
brew update
|
||||
brew install curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_METAL=OFF \
|
||||
-DGGML_RPC=ON
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
|
||||
name: llama-bin-macos-x64.zip
|
||||
|
||||
ubuntu-22-cpu:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'x64'
|
||||
os: ubuntu-22.04
|
||||
- build: 'arm64'
|
||||
os: ubuntu-22.04-arm
|
||||
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-cpu-cmake
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
|
||||
name: llama-bin-ubuntu-${{ matrix.build }}.zip
|
||||
|
||||
ubuntu-22-vulkan:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-22-cmake-vulkan
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
|
||||
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DGGML_VULKAN=ON \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
|
||||
name: llama-bin-ubuntu-vulkan-x64.zip
|
||||
|
||||
windows:
|
||||
runs-on: windows-latest
|
||||
|
||||
env:
|
||||
OPENBLAS_VERSION: 0.3.23
|
||||
VULKAN_VERSION: 1.4.309.0
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'cpu-x64'
|
||||
arch: 'x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
|
||||
#- build: 'openblas-x64'
|
||||
# arch: 'x64'
|
||||
# defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'vulkan-x64'
|
||||
arch: 'x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
|
||||
- build: 'cpu-arm64'
|
||||
arch: 'arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF'
|
||||
- build: 'opencl-adreno-arm64'
|
||||
arch: 'arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-${{ matrix.build }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Download OpenBLAS
|
||||
id: get_openblas
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
|
||||
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
|
||||
mkdir $env:RUNNER_TEMP/openblas
|
||||
tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
|
||||
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
|
||||
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
|
||||
$lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
|
||||
& $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'vulkan-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
|
||||
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
||||
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
|
||||
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: Install OpenCL Headers and Libs
|
||||
id: install_opencl
|
||||
if: ${{ matrix.build == 'opencl-adreno-arm64' }}
|
||||
run: |
|
||||
git clone https://github.com/KhronosGroup/OpenCL-Headers
|
||||
cd OpenCL-Headers
|
||||
cmake -B build `
|
||||
-DBUILD_TESTING=OFF `
|
||||
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
|
||||
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
|
||||
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
|
||||
cmake --build build --target install
|
||||
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
|
||||
cd OpenCL-ICD-Loader
|
||||
cmake -B build-arm64-release `
|
||||
-A arm64 `
|
||||
-DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
|
||||
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
|
||||
cmake --build build-arm64-release --target install --config release
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
with:
|
||||
architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cmake -S . -B build ${{ matrix.defines }} `
|
||||
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
- name: Add libopenblas.dll
|
||||
id: add_libopenblas_dll
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
|
||||
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
|
||||
name: llama-bin-win-${{ matrix.build }}.zip
|
||||
|
||||
windows-cuda:
|
||||
runs-on: windows-2019
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.4', '11.7']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-cuda-${{ matrix.cuda }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install Cuda Toolkit
|
||||
uses: ./.github/actions/windows-setup-cuda
|
||||
with:
|
||||
cuda_version: ${{ matrix.cuda }}
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
cmake -S . -B build -G "Ninja Multi-Config" ^
|
||||
-DGGML_NATIVE=OFF ^
|
||||
-DGGML_BACKEND_DL=ON ^
|
||||
-DGGML_CPU_ALL_VARIANTS=ON ^
|
||||
-DGGML_CUDA=ON ^
|
||||
-DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
|
||||
${{ env.CMAKE_ARGS }}
|
||||
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
|
||||
cmake --build build --config Release
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
name: llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
run: |
|
||||
echo "Cuda install location: ${{ env.CUDA_PATH }}"
|
||||
$dst='.\build\bin\cudart\'
|
||||
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
7z a cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip $dst\*
|
||||
|
||||
- name: Upload Cuda runtime
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
name: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
|
||||
|
||||
windows-sycl:
|
||||
runs-on: windows-latest
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-sycl
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
# TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: examples/sycl/win-build-sycl.bat
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Build the release package
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||
|
||||
echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload the release package
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
|
||||
name: llama-bin-win-sycl-x64.zip
|
||||
|
||||
windows-hip:
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
gpu_target: [gfx1100, gfx1101, gfx1030]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Clone rocWMMA repository
|
||||
id: clone_rocwmma
|
||||
run: |
|
||||
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cmake-hip-release
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install
|
||||
id: depends
|
||||
run: |
|
||||
$ErrorActionPreference = "Stop"
|
||||
write-host "Downloading AMD HIP SDK Installer"
|
||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||
write-host "Installing AMD HIP SDK"
|
||||
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
|
||||
write-host "Completed AMD HIP SDK installation"
|
||||
|
||||
- name: Verify ROCm
|
||||
id: verify
|
||||
run: |
|
||||
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
||||
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
||||
cmake -G "Unix Makefiles" -B build -S . `
|
||||
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
|
||||
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
|
||||
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
|
||||
-DCMAKE_BUILD_TYPE=Release `
|
||||
-DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON `
|
||||
-DGGML_HIP=ON `
|
||||
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" `
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||
md "build\bin\rocblas\library\"
|
||||
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
|
||||
|
||||
ios-xcode-build:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
- name: xcodebuild for swift package
|
||||
id: xcodebuild
|
||||
run: |
|
||||
./build-xcframework.sh
|
||||
|
||||
- name: Build Xcode project
|
||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
|
||||
name: llama-${{ steps.tag.outputs.name }}-xcframework
|
||||
|
||||
release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
|
||||
# Fine-grant permission
|
||||
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
|
||||
permissions:
|
||||
contents: write # for creating release
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
needs:
|
||||
- ubuntu-22-cpu
|
||||
- ubuntu-22-vulkan
|
||||
- windows
|
||||
- windows-cuda
|
||||
- windows-sycl
|
||||
- windows-hip
|
||||
- macOS-arm64
|
||||
- macOS-x64
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Download artifacts
|
||||
id: download-artifact
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: ./artifact
|
||||
|
||||
- name: Move artifacts
|
||||
id: move_artifacts
|
||||
run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
|
||||
|
||||
- name: Create release
|
||||
id: create_release
|
||||
uses: ggml-org/action-create-release@v1
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
with:
|
||||
tag_name: ${{ steps.tag.outputs.name }}
|
||||
|
||||
- name: Upload release
|
||||
id: upload_release
|
||||
uses: actions/github-script@v3
|
||||
with:
|
||||
github-token: ${{secrets.GITHUB_TOKEN}}
|
||||
script: |
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
const release_id = '${{ steps.create_release.outputs.id }}';
|
||||
for (let file of await fs.readdirSync('./artifact/release')) {
|
||||
if (path.extname(file) === '.zip') {
|
||||
console.log('uploadReleaseAsset', file);
|
||||
await github.repos.uploadReleaseAsset({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
release_id: release_id,
|
||||
name: file,
|
||||
data: await fs.readFileSync(`./artifact/release/${file}`)
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -252,3 +252,20 @@ configure_file(cmake/llama.pc.in
|
||||
|
||||
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
|
||||
|
||||
#
|
||||
# copy the license files
|
||||
#
|
||||
|
||||
# Check if running in GitHub Actions
|
||||
if(DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
|
||||
message(STATUS "Running inside GitHub Actions - copying license files")
|
||||
|
||||
# Copy all files from licenses/ to build/bin/
|
||||
file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
|
||||
foreach(LICENSE_FILE ${LICENSE_FILES})
|
||||
get_filename_component(FILENAME ${LICENSE_FILE} NAME)
|
||||
configure_file(${LICENSE_FILE} "${CMAKE_BINARY_DIR}/bin/${FILENAME}" COPYONLY)
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
|
||||
@@ -38,6 +38,15 @@
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"name": "arm64-windows-msvc", "hidden": true,
|
||||
"architecture": { "value": "arm64", "strategy": "external" },
|
||||
"toolset": { "value": "host=x64", "strategy": "external" },
|
||||
"cacheVariables": {
|
||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"name": "arm64-windows-llvm", "hidden": true,
|
||||
"architecture": { "value": "arm64", "strategy": "external" },
|
||||
@@ -64,6 +73,10 @@
|
||||
{ "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
|
||||
{ "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
|
||||
|
||||
{ "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
||||
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
|
||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
|
||||
|
||||
{ "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
|
||||
{ "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
|
||||
{ "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
|
||||
|
||||
43
Makefile
43
Makefile
@@ -1187,6 +1187,11 @@ llama-cli: tools/main/main.cpp \
|
||||
@echo '==== Run ./llama-cli -h for help. ===='
|
||||
@echo
|
||||
|
||||
llama-infill: examples/infill/infill.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-run: tools/run/run.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
@@ -1389,36 +1394,36 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
libllava.a: tools/mtmd/llava.cpp \
|
||||
tools/mtmd/llava.h \
|
||||
tools/mtmd/clip.cpp \
|
||||
tools/mtmd/clip.h \
|
||||
libllava.a: tools/llava/llava.cpp \
|
||||
tools/llava/llava.h \
|
||||
tools/llava/clip.cpp \
|
||||
tools/llava/clip.h \
|
||||
common/stb_image.h \
|
||||
common/base64.hpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
|
||||
|
||||
llama-llava-cli: tools/mtmd/llava-cli.cpp \
|
||||
tools/mtmd/llava.cpp \
|
||||
tools/mtmd/llava.h \
|
||||
tools/mtmd/clip.cpp \
|
||||
tools/mtmd/clip.h \
|
||||
llama-llava-cli: tools/llava/llava-cli.cpp \
|
||||
tools/llava/llava.cpp \
|
||||
tools/llava/llava.h \
|
||||
tools/llava/clip.cpp \
|
||||
tools/llava/clip.h \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
|
||||
|
||||
llama-minicpmv-cli: tools/mtmd/minicpmv-cli.cpp \
|
||||
tools/mtmd/llava.cpp \
|
||||
tools/mtmd/llava.h \
|
||||
tools/mtmd/clip.cpp \
|
||||
tools/mtmd/clip.h \
|
||||
llama-minicpmv-cli: tools/llava/minicpmv-cli.cpp \
|
||||
tools/llava/llava.cpp \
|
||||
tools/llava/llava.h \
|
||||
tools/llava/clip.cpp \
|
||||
tools/llava/clip.h \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
|
||||
|
||||
llama-qwen2vl-cli: tools/mtmd/qwen2vl-cli.cpp \
|
||||
tools/mtmd/llava.cpp \
|
||||
tools/mtmd/llava.h \
|
||||
tools/mtmd/clip.cpp \
|
||||
tools/mtmd/clip.h \
|
||||
llama-qwen2vl-cli: tools/llava/qwen2vl-cli.cpp \
|
||||
tools/llava/llava.cpp \
|
||||
tools/llava/llava.h \
|
||||
tools/llava/clip.cpp \
|
||||
tools/llava/clip.h \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
|
||||
|
||||
|
||||
12
README.md
12
README.md
@@ -16,9 +16,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
||||
|
||||
## Hot topics
|
||||
|
||||
- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
|
||||
- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
|
||||
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
|
||||
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141]((https://github.com/ggml-org/llama.cpp/pull/13141))), `libllava` will be deprecated
|
||||
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
|
||||
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
|
||||
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
|
||||
@@ -572,11 +571,4 @@ automatically. For example:
|
||||
$ echo "source ~/.llama-completion.bash" >> ~/.bashrc
|
||||
```
|
||||
|
||||
## Dependencies
|
||||
|
||||
- [yhirose/cpp-httplib](https://github.com/yhirose/cpp-httplib) - Single-header HTTP server, used by `llama-server` - MIT license
|
||||
- [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
|
||||
- [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
|
||||
- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
|
||||
- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
|
||||
- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
|
||||
## References
|
||||
|
||||
@@ -117,7 +117,6 @@ setup_framework_structure() {
|
||||
# Copy all required headers (common for all platforms)
|
||||
cp include/llama.h ${header_path}
|
||||
cp ggml/include/ggml.h ${header_path}
|
||||
cp ggml/include/ggml-opt.h ${header_path}
|
||||
cp ggml/include/ggml-alloc.h ${header_path}
|
||||
cp ggml/include/ggml-backend.h ${header_path}
|
||||
cp ggml/include/ggml-metal.h ${header_path}
|
||||
|
||||
6
cmake/arm64-windows-msvc.cmake
Normal file
6
cmake/arm64-windows-msvc.cmake
Normal file
@@ -0,0 +1,6 @@
|
||||
set( CMAKE_SYSTEM_NAME Windows )
|
||||
set( CMAKE_SYSTEM_PROCESSOR arm64 )
|
||||
|
||||
set( target arm64-pc-windows-msvc )
|
||||
set( CMAKE_C_COMPILER_TARGET ${target} )
|
||||
set( CMAKE_CXX_COMPILER_TARGET ${target} )
|
||||
@@ -3,3 +3,9 @@ set( CMAKE_SYSTEM_PROCESSOR x86_64 )
|
||||
|
||||
set( CMAKE_C_COMPILER clang )
|
||||
set( CMAKE_CXX_COMPILER clang++ )
|
||||
|
||||
set( arch_c_flags "-march=native" )
|
||||
|
||||
set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" )
|
||||
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )
|
||||
|
||||
|
||||
@@ -73,8 +73,6 @@ add_library(${TARGET} STATIC
|
||||
minja/minja.hpp
|
||||
ngram-cache.cpp
|
||||
ngram-cache.h
|
||||
regex-partial.cpp
|
||||
regex-partial.h
|
||||
sampling.cpp
|
||||
sampling.h
|
||||
speculative.cpp
|
||||
@@ -121,8 +119,8 @@ if (LLAMA_LLGUIDANCE)
|
||||
|
||||
ExternalProject_Add(llguidance_ext
|
||||
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
|
||||
# v0.7.20 (+ fix to build on GCC 15):
|
||||
GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
|
||||
# v0.7.10:
|
||||
GIT_TAG 0309d2a6bf40abda35344a362edc71e06d5009f8
|
||||
PREFIX ${CMAKE_BINARY_DIR}/llguidance
|
||||
SOURCE_DIR ${LLGUIDANCE_SRC}
|
||||
BUILD_IN_SOURCE TRUE
|
||||
@@ -146,27 +144,3 @@ endif ()
|
||||
target_include_directories(${TARGET} PUBLIC .)
|
||||
target_compile_features (${TARGET} PUBLIC cxx_std_17)
|
||||
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
|
||||
|
||||
|
||||
#
|
||||
# copy the license files
|
||||
#
|
||||
|
||||
# Check if running in GitHub Actions
|
||||
if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
|
||||
message(STATUS "Running inside GitHub Actions - copying license files")
|
||||
|
||||
# Copy all files from licenses/ to build/bin/
|
||||
file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
|
||||
foreach(LICENSE_FILE ${LICENSE_FILES})
|
||||
get_filename_component(FILENAME ${LICENSE_FILE} NAME)
|
||||
add_custom_command(
|
||||
POST_BUILD
|
||||
TARGET ${TARGET}
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different
|
||||
"${LICENSE_FILE}"
|
||||
"$<TARGET_FILE_DIR:llama>/${FILENAME}"
|
||||
COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
|
||||
message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
@@ -40,7 +40,7 @@ using json = nlohmann::ordered_json;
|
||||
|
||||
std::initializer_list<enum llama_example> mmproj_examples = {
|
||||
LLAMA_EXAMPLE_LLAVA,
|
||||
LLAMA_EXAMPLE_SERVER,
|
||||
// TODO: add LLAMA_EXAMPLE_SERVER when it's ready
|
||||
};
|
||||
|
||||
static std::string read_file(const std::string & fname) {
|
||||
@@ -1283,7 +1283,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.use_color = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
||||
add_opt(common_arg(
|
||||
{"-t", "--threads"}, "N",
|
||||
string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
|
||||
@@ -1416,7 +1416,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
add_opt(common_arg(
|
||||
{"-n", "--predict", "--n-predict"}, "N",
|
||||
string_format(
|
||||
ex == LLAMA_EXAMPLE_MAIN
|
||||
ex == LLAMA_EXAMPLE_MAIN || ex == LLAMA_EXAMPLE_INFILL
|
||||
? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
|
||||
: "number of tokens to predict (default: %d, -1 = infinity)",
|
||||
params.n_predict),
|
||||
@@ -1655,7 +1655,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.input_prefix = value;
|
||||
params.enable_chat_template = false;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
||||
add_opt(common_arg(
|
||||
{"--in-suffix"}, "STRING",
|
||||
"string to suffix after user inputs with (default: empty)",
|
||||
@@ -1663,7 +1663,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.input_suffix = value;
|
||||
params.enable_chat_template = false;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
||||
add_opt(common_arg(
|
||||
{"--no-warmup"},
|
||||
"skip warming up the model with an empty run",
|
||||
@@ -1680,7 +1680,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.spm_infill = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
|
||||
add_opt(common_arg(
|
||||
{"--samplers"}, "SAMPLERS",
|
||||
string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
|
||||
@@ -2097,6 +2097,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.cache_type_v = kv_cache_type_from_str(value);
|
||||
}
|
||||
).set_env("LLAMA_ARG_CACHE_TYPE_V"));
|
||||
add_opt(common_arg(
|
||||
{"--perplexity", "--all-logits"},
|
||||
string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.logits_all = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
||||
add_opt(common_arg(
|
||||
{"--hellaswag"},
|
||||
"compute HellaSwag score over random tasks from datafile supplied with -f",
|
||||
@@ -2204,33 +2211,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj"}, "FILE",
|
||||
"path to a multimodal projector file. see tools/mtmd/README.md\n"
|
||||
"note: if -hf is used, this argument can be omitted",
|
||||
"path to a multimodal projector file. see tools/llava/README.md",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.mmproj.path = value;
|
||||
}
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
|
||||
).set_examples(mmproj_examples));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj-url"}, "URL",
|
||||
"URL to a multimodal projector file. see tools/mtmd/README.md",
|
||||
"URL to a multimodal projector file. see tools/llava/README.md",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.mmproj.url = value;
|
||||
}
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
|
||||
).set_examples(mmproj_examples));
|
||||
add_opt(common_arg(
|
||||
{"--no-mmproj"},
|
||||
"explicitly disable multimodal projector, useful when using -hf",
|
||||
[](common_params & params) {
|
||||
params.no_mmproj = true;
|
||||
}
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
|
||||
).set_examples(mmproj_examples));
|
||||
add_opt(common_arg(
|
||||
{"--no-mmproj-offload"},
|
||||
"do not offload multimodal projector to GPU",
|
||||
[](common_params & params) {
|
||||
params.mmproj_use_gpu = false;
|
||||
}
|
||||
).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
|
||||
).set_examples(mmproj_examples));
|
||||
add_opt(common_arg(
|
||||
{"--image"}, "FILE",
|
||||
"path to an image file. use with multimodal models. Specify multiple times for batching",
|
||||
@@ -2437,13 +2443,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--no-op-offload"},
|
||||
string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.no_op_offload = true;
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--lora"}, "FNAME",
|
||||
"path to LoRA adapter (can be repeated to use multiple adapters)",
|
||||
@@ -2635,13 +2634,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.i_chunk = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||
add_opt(common_arg(
|
||||
{"--parse-special"},
|
||||
string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.parse_special = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||
add_opt(common_arg(
|
||||
{"-pps"},
|
||||
string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
|
||||
@@ -2900,7 +2892,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.simple_io = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
||||
add_opt(common_arg(
|
||||
{"--positive-file"}, "FNAME",
|
||||
string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
|
||||
|
||||
250
common/chat.cpp
250
common/chat.cpp
@@ -6,15 +6,6 @@
|
||||
|
||||
#include <optional>
|
||||
|
||||
static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
|
||||
auto time = std::chrono::system_clock::to_time_t(now);
|
||||
auto local_time = *std::localtime(&time);
|
||||
std::ostringstream ss;
|
||||
ss << std::put_time(&local_time, format.c_str());
|
||||
auto res = ss.str();
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef minja::chat_template common_chat_template;
|
||||
|
||||
struct common_chat_templates {
|
||||
@@ -33,7 +24,6 @@ struct templates_params {
|
||||
std::string grammar;
|
||||
bool add_generation_prompt = true;
|
||||
bool extract_reasoning = true;
|
||||
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
||||
};
|
||||
|
||||
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
|
||||
@@ -135,9 +125,7 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
|
||||
msgs.push_back(msg);
|
||||
}
|
||||
} catch (const std::exception & e) {
|
||||
// @ngxson : disable otherwise it's bloating the API response
|
||||
// printf("%s\n", std::string("; messages = ") + messages.dump(2));
|
||||
throw std::runtime_error("Failed to parse messages: " + std::string(e.what()));
|
||||
throw std::runtime_error("Failed to parse messages: " + std::string(e.what()) + "; messages = " + messages.dump(2));
|
||||
}
|
||||
|
||||
return msgs;
|
||||
@@ -949,83 +937,78 @@ static void expect_tool_parameters(const std::string & name, const json & parame
|
||||
}
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_params_init_llama_3_x(const common_chat_template & tmpl, const struct templates_params & inputs, bool allow_python_tag_builtin_tools) {
|
||||
static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const common_chat_template & tmpl, const struct templates_params & inputs, bool allow_python_tag_builtin_tools) {
|
||||
auto builtin_tools = json::array();
|
||||
common_chat_params data;
|
||||
if (!inputs.tools.is_null()) {
|
||||
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
std::vector<std::string> tool_rules;
|
||||
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
std::vector<std::string> tool_rules;
|
||||
|
||||
auto handle_builtin_tool = [&](const std::string & name, const json & parameters) {
|
||||
if (name == "wolfram_alpha" || name == "web_search" || name == "brave_search") {
|
||||
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py
|
||||
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
|
||||
expect_tool_parameters(name, parameters, {"query"});
|
||||
} else if (name == "python" || name == "code_interpreter") {
|
||||
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py
|
||||
expect_tool_parameters(name, parameters, {"code"});
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<std::string> kvs;
|
||||
for (const auto & [key, value] : parameters.at("properties").items()) {
|
||||
kvs.push_back("\"" + key + "=\" " + builder.add_schema(name + "-args-" + key, value)); // NOLINT
|
||||
}
|
||||
|
||||
tool_rules.push_back(
|
||||
builder.add_rule(
|
||||
name + "-call",
|
||||
"\"<|python_tag|>" + name + ".call(\" " + string_join(kvs, " \", \" ") + " \")\""));
|
||||
builtin_tools.push_back(name);
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
std::string name = function.at("name");
|
||||
auto parameters = function.at("parameters");
|
||||
builder.resolve_refs(parameters);
|
||||
|
||||
// https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/remote/tool_runtime
|
||||
if (allow_python_tag_builtin_tools) {
|
||||
handle_builtin_tool(name, parameters);
|
||||
}
|
||||
tool_rules.push_back(
|
||||
builder.add_rule(
|
||||
name + "-call",
|
||||
"\"{\" space "
|
||||
"( \"\\\"type\\\"\" space \":\" space \"\\\"function\\\"\" space \",\" space )? "
|
||||
" \"\\\"name\\\"\" space \":\" space \"\\\"" + name + "\\\"\" space \",\" space "
|
||||
" \"\\\"parameters\\\"\" space \":\" space " + builder.add_schema(name + "-args", parameters) + " "
|
||||
"\"}\" space"));
|
||||
});
|
||||
// Small models may hallucinate function names so we match anything (*at the start*) that looks like the JSON of a function call, regardless of the name.
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
|
||||
"\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"", // + name + "\"[\\s\\S]*",
|
||||
});
|
||||
if (!builtin_tools.empty()) {
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
|
||||
data.preserved_tokens.push_back("<|python_tag|>");
|
||||
auto handle_builtin_tool = [&](const std::string & name, const json & parameters) {
|
||||
if (name == "wolfram_alpha" || name == "web_search" || name == "brave_search") {
|
||||
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py
|
||||
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
|
||||
expect_tool_parameters(name, parameters, {"query"});
|
||||
} else if (name == "python" || name == "code_interpreter") {
|
||||
// https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py
|
||||
expect_tool_parameters(name, parameters, {"code"});
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
// Allow a few empty lines on top of the usual constrained json schema space rule.
|
||||
builder.add_rule("root", string_join(tool_rules, " | "));
|
||||
data.additional_stops.push_back("<|eom_id|>");
|
||||
|
||||
std::vector<std::string> kvs;
|
||||
for (const auto & [key, value] : parameters.at("properties").items()) {
|
||||
kvs.push_back("\"" + key + "=\" " + builder.add_schema(name + "-args-" + key, value)); // NOLINT
|
||||
}
|
||||
|
||||
tool_rules.push_back(
|
||||
builder.add_rule(
|
||||
name + "-call",
|
||||
"\"<|python_tag|>" + name + ".call(\" " + string_join(kvs, " \", \" ") + " \")\""));
|
||||
builtin_tools.push_back(name);
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
std::string name = function.at("name");
|
||||
auto parameters = function.at("parameters");
|
||||
builder.resolve_refs(parameters);
|
||||
|
||||
// https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/remote/tool_runtime
|
||||
if (allow_python_tag_builtin_tools) {
|
||||
handle_builtin_tool(name, parameters);
|
||||
}
|
||||
tool_rules.push_back(
|
||||
builder.add_rule(
|
||||
name + "-call",
|
||||
"\"{\" space "
|
||||
"( \"\\\"type\\\"\" space \":\" space \"\\\"function\\\"\" space \",\" space )? "
|
||||
" \"\\\"name\\\"\" space \":\" space \"\\\"" + name + "\\\"\" space \",\" space "
|
||||
" \"\\\"parameters\\\"\" space \":\" space " + builder.add_schema(name + "-args", parameters) + " "
|
||||
"\"}\" space"));
|
||||
});
|
||||
data.format = allow_python_tag_builtin_tools && !builtin_tools.empty()
|
||||
? COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS
|
||||
: COMMON_CHAT_FORMAT_LLAMA_3_X;
|
||||
} else {
|
||||
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
||||
}
|
||||
// Small models may hallucinate function names so we match anything (*at the start*) that looks like the JSON of a function call, regardless of the name.
|
||||
data.grammar_triggers.push_back({
|
||||
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
|
||||
"\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"", // + name + "\"[\\s\\S]*",
|
||||
});
|
||||
if (!builtin_tools.empty()) {
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
|
||||
data.preserved_tokens.push_back("<|python_tag|>");
|
||||
}
|
||||
// Allow a few empty lines on top of the usual constrained json schema space rule.
|
||||
builder.add_rule("root", string_join(tool_rules, " | "));
|
||||
});
|
||||
data.additional_stops.push_back("<|eom_id|>");
|
||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
|
||||
{"date_string", format_time(inputs.now, "%d %b %Y")},
|
||||
{"tools_in_user_message", false},
|
||||
{"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
|
||||
});
|
||||
data.format = allow_python_tag_builtin_tools && !builtin_tools.empty()
|
||||
? COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS
|
||||
: COMMON_CHAT_FORMAT_LLAMA_3_X;
|
||||
return data;
|
||||
}
|
||||
static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) {
|
||||
@@ -1165,7 +1148,7 @@ static common_chat_params common_chat_params_init_firefunction_v2(const common_c
|
||||
LOG_DBG("%s\n", __func__);
|
||||
common_chat_params data;
|
||||
data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
|
||||
{"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
|
||||
{"datetime", "Jan 29 2025 13:00:00 GMT"},
|
||||
{"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
|
||||
});
|
||||
if (inputs.tools.is_array() && !inputs.tools.empty()) {
|
||||
@@ -1300,59 +1283,55 @@ static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & in
|
||||
static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||
// https://github.com/MeetKai/functionary/blob/main/tests/prompt_test_v3-llama3.1.txt
|
||||
common_chat_params data;
|
||||
json tools = inputs.tools.is_null() ? inputs.tools : json::array();
|
||||
std::string python_code_argument_name;
|
||||
auto has_raw_python = false;
|
||||
|
||||
if (!inputs.tools.is_null()) {
|
||||
std::string python_code_argument_name;
|
||||
auto has_raw_python = false;
|
||||
|
||||
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
std::vector<std::string> tool_rules;
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
const auto & parameters = function.at("parameters");
|
||||
std::string name = function.at("name");
|
||||
if (name == "python" || name == "ipython") {
|
||||
if (!parameters.contains("type")) {
|
||||
throw std::runtime_error("Missing type in python tool");
|
||||
}
|
||||
has_raw_python = true;
|
||||
const auto & type = parameters.at("type");
|
||||
if (type == "object") {
|
||||
auto properties = parameters.at("properties");
|
||||
for (auto it = properties.begin(); it != properties.end(); ++it) {
|
||||
if (it.value().at("type") == "string") {
|
||||
if (!python_code_argument_name.empty()) {
|
||||
throw std::runtime_error("Multiple string arguments found in python tool");
|
||||
}
|
||||
python_code_argument_name = it.key();
|
||||
}
|
||||
}
|
||||
if (python_code_argument_name.empty()) {
|
||||
throw std::runtime_error("No string argument found in python tool");
|
||||
}
|
||||
} else if (type != "string") {
|
||||
throw std::runtime_error("Invalid type in python tool: " + type.dump());
|
||||
}
|
||||
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
std::vector<std::string> tool_rules;
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
const auto & parameters = function.at("parameters");
|
||||
std::string name = function.at("name");
|
||||
if (name == "python" || name == "ipython") {
|
||||
if (!parameters.contains("type")) {
|
||||
throw std::runtime_error("Missing type in python tool");
|
||||
}
|
||||
has_raw_python = true;
|
||||
const auto & type = parameters.at("type");
|
||||
if (type == "object") {
|
||||
auto properties = parameters.at("properties");
|
||||
for (auto it = properties.begin(); it != properties.end(); ++it) {
|
||||
if (it.value().at("type") == "string") {
|
||||
if (!python_code_argument_name.empty()) {
|
||||
throw std::runtime_error("Multiple string arguments found in python tool");
|
||||
}
|
||||
python_code_argument_name = it.key();
|
||||
}
|
||||
}
|
||||
if (python_code_argument_name.empty()) {
|
||||
throw std::runtime_error("No string argument found in python tool");
|
||||
}
|
||||
} else if (type != "string") {
|
||||
throw std::runtime_error("Invalid type in python tool: " + type.dump());
|
||||
}
|
||||
tool_rules.push_back(builder.add_rule(name + "-call", "\"<function=" + name + ">\" " + builder.add_schema(name + "-args", parameters) + " \"</function>\" space"));
|
||||
});
|
||||
if (has_raw_python) {
|
||||
tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*"));
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
|
||||
data.preserved_tokens.push_back("<|python_tag|>");
|
||||
}
|
||||
auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space";
|
||||
builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<function="});
|
||||
tool_rules.push_back(builder.add_rule(name + "-call", "\"<function=" + name + ">\" " + builder.add_schema(name + "-args", parameters) + " \"</function>\" space"));
|
||||
});
|
||||
data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1;
|
||||
} else {
|
||||
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
|
||||
}
|
||||
if (has_raw_python) {
|
||||
tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*"));
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"});
|
||||
data.preserved_tokens.push_back("<|python_tag|>");
|
||||
}
|
||||
auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space";
|
||||
builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call);
|
||||
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<function="});
|
||||
});
|
||||
|
||||
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
|
||||
// TODO: if (has_raw_python)
|
||||
data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1;
|
||||
return data;
|
||||
}
|
||||
static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::string & input) {
|
||||
@@ -1612,7 +1591,6 @@ static common_chat_params common_chat_templates_apply_jinja(
|
||||
params.extract_reasoning = inputs.extract_reasoning;
|
||||
params.tool_choice = inputs.tool_choice;
|
||||
params.grammar = inputs.grammar;
|
||||
params.now = inputs.now;
|
||||
if (!inputs.json_schema.empty()) {
|
||||
params.json_schema = json::parse(inputs.json_schema);
|
||||
}
|
||||
@@ -1664,21 +1642,21 @@ static common_chat_params common_chat_templates_apply_jinja(
|
||||
return common_chat_params_init_firefunction_v2(tmpl, params);
|
||||
}
|
||||
|
||||
// Plain handler (no tools)
|
||||
if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
|
||||
return common_chat_params_init_without_tools(tmpl, params);
|
||||
}
|
||||
|
||||
// Functionary v3.1 (w/ tools)
|
||||
if (src.find("<|start_header_id|>") != std::string::npos
|
||||
&& src.find("<function=") != std::string::npos) {
|
||||
return common_chat_params_init_functionary_v3_1_llama_3_1(tmpl, params);
|
||||
}
|
||||
|
||||
// Llama 3.1, 3.2, 3.3 (also requires date_string so using it even w/o tools)
|
||||
// Llama 3.1, 3.2, 3.3 (w/ tools)
|
||||
if (src.find("<|start_header_id|>ipython<|end_header_id|>") != std::string::npos) {
|
||||
auto allow_python_tag_builtin_tools = src.find("<|python_tag|>") != std::string::npos;
|
||||
return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
|
||||
}
|
||||
|
||||
// Plain handler (no tools)
|
||||
if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
|
||||
return common_chat_params_init_without_tools(tmpl, params);
|
||||
return common_chat_params_init_llama_3_1_tool_calls(tmpl, params, allow_python_tag_builtin_tools);
|
||||
}
|
||||
|
||||
// Mistral Nemo (w/ tools)
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
#include <chrono>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
@@ -72,7 +71,6 @@ struct common_chat_templates_inputs {
|
||||
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
|
||||
bool parallel_tool_calls = false;
|
||||
bool extract_reasoning = true;
|
||||
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
||||
};
|
||||
|
||||
struct common_chat_params {
|
||||
|
||||
@@ -443,25 +443,6 @@ void string_replace_all(std::string & s, const std::string & search, const std::
|
||||
s = std::move(builder);
|
||||
}
|
||||
|
||||
bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
|
||||
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
|
||||
}
|
||||
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
|
||||
if (!str.empty() && !stop.empty()) {
|
||||
const char text_last_char = str.back();
|
||||
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
|
||||
if (stop[char_index] == text_last_char) {
|
||||
const auto current_partial = stop.substr(0, char_index + 1);
|
||||
if (string_ends_with(str, current_partial)) {
|
||||
return str.size() - char_index - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return std::string::npos;
|
||||
}
|
||||
|
||||
std::string regex_escape(const std::string & s) {
|
||||
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
|
||||
return std::regex_replace(s, special_chars, "\\$0");
|
||||
@@ -1115,6 +1096,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
||||
cparams.n_threads = params.cpuparams.n_threads;
|
||||
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
|
||||
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
|
||||
cparams.logits_all = params.logits_all;
|
||||
cparams.embeddings = params.embedding;
|
||||
cparams.rope_scaling_type = params.rope_scaling_type;
|
||||
cparams.rope_freq_base = params.rope_freq_base;
|
||||
@@ -1132,7 +1114,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
||||
cparams.offload_kqv = !params.no_kv_offload;
|
||||
cparams.flash_attn = params.flash_attn;
|
||||
cparams.no_perf = params.no_perf;
|
||||
cparams.op_offload = !params.no_op_offload;
|
||||
|
||||
if (params.reranking) {
|
||||
cparams.embeddings = true;
|
||||
@@ -1584,20 +1565,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
|
||||
const int64_t ne_datapoint = llama_n_ctx(ctx);
|
||||
const int64_t ndata = (tokens.size() - ne_datapoint - 1) / stride;
|
||||
ggml_opt_dataset_t result = ggml_opt_dataset_init(
|
||||
GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);
|
||||
|
||||
llama_token * data = (llama_token *) ggml_opt_dataset_data(result)->data;
|
||||
llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
|
||||
|
||||
for (int64_t idata = 0; idata < ndata; ++idata) {
|
||||
memcpy(data + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
|
||||
memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
|
||||
@@ -67,6 +66,7 @@ enum llama_example {
|
||||
LLAMA_EXAMPLE_COMMON,
|
||||
LLAMA_EXAMPLE_SPECULATIVE,
|
||||
LLAMA_EXAMPLE_MAIN,
|
||||
LLAMA_EXAMPLE_INFILL,
|
||||
LLAMA_EXAMPLE_EMBEDDING,
|
||||
LLAMA_EXAMPLE_PERPLEXITY,
|
||||
LLAMA_EXAMPLE_RETRIEVAL,
|
||||
@@ -96,7 +96,6 @@ enum common_sampler_type {
|
||||
COMMON_SAMPLER_TYPE_XTC = 8,
|
||||
COMMON_SAMPLER_TYPE_INFILL = 9,
|
||||
COMMON_SAMPLER_TYPE_PENALTIES = 10,
|
||||
COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
|
||||
};
|
||||
|
||||
// dimensionality reduction methods, used by cvector-generator
|
||||
@@ -162,7 +161,6 @@ struct common_params_sampling {
|
||||
std::vector<enum common_sampler_type> samplers = {
|
||||
COMMON_SAMPLER_TYPE_PENALTIES,
|
||||
COMMON_SAMPLER_TYPE_DRY,
|
||||
COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
|
||||
COMMON_SAMPLER_TYPE_TOP_K,
|
||||
COMMON_SAMPLER_TYPE_TYPICAL_P,
|
||||
COMMON_SAMPLER_TYPE_TOP_P,
|
||||
@@ -325,6 +323,7 @@ struct common_params {
|
||||
bool ctx_shift = true; // context shift on inifinite text generation
|
||||
|
||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||
bool logits_all = false; // return logits for all tokens in the batch
|
||||
bool use_mmap = true; // use mmap for faster loads
|
||||
bool use_mlock = false; // use mlock to keep model in memory
|
||||
bool verbose_prompt = false; // print prompt tokens before generation
|
||||
@@ -333,7 +332,6 @@ struct common_params {
|
||||
bool no_kv_offload = false; // disable KV offloading
|
||||
bool warmup = true; // warmup run
|
||||
bool check_tensors = false; // validate tensor data
|
||||
bool no_op_offload = false; // globally disable offload host tensor operations to device
|
||||
|
||||
bool single_turn = false; // single turn chat conversation
|
||||
|
||||
@@ -342,7 +340,7 @@ struct common_params {
|
||||
|
||||
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
|
||||
|
||||
// multimodal models (see tools/mtmd)
|
||||
// multimodal models (see tools/llava)
|
||||
struct common_params_model mmproj;
|
||||
bool mmproj_use_gpu = true; // use GPU for multimodal model
|
||||
bool no_mmproj = false; // explicitly disable multimodal model
|
||||
@@ -411,7 +409,6 @@ struct common_params {
|
||||
|
||||
bool process_output = false; // collect data for the output tensor
|
||||
bool compute_ppl = true; // whether to compute perplexity
|
||||
bool parse_special = false; // whether to parse special tokens during imatrix tokenization
|
||||
|
||||
// cvector-generator params
|
||||
int n_pca_batch = 100;
|
||||
@@ -504,9 +501,10 @@ static bool string_starts_with(const std::string & str,
|
||||
return str.rfind(prefix, 0) == 0;
|
||||
}
|
||||
|
||||
// While we wait for C++20's std::string::ends_with...
|
||||
bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
|
||||
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
|
||||
static bool string_ends_with(const std::string & str,
|
||||
const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
|
||||
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
|
||||
}
|
||||
|
||||
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
||||
void string_process_escapes(std::string & input);
|
||||
@@ -666,9 +664,3 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
|
||||
const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
||||
|
||||
}
|
||||
|
||||
//
|
||||
// training utils
|
||||
//
|
||||
|
||||
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
|
||||
|
||||
@@ -189,7 +189,6 @@ static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab)
|
||||
/* .tokenize_fn = */ llama_sampler_llg_tokenize_fn,
|
||||
/* .use_approximate_greedy_tokenize_fn = */ false,
|
||||
/* .tokenize_user_data = */ vocab,
|
||||
/* .slices = */ nullptr,
|
||||
};
|
||||
|
||||
char error_buffer[1024];
|
||||
|
||||
@@ -13,12 +13,10 @@
|
||||
#include <chrono>
|
||||
#include <cstddef>
|
||||
#include <cstdio>
|
||||
#include <ctime>
|
||||
#include <exception>
|
||||
#include <iomanip>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
@@ -395,8 +393,8 @@ class chat_template {
|
||||
|
||||
for (const auto & message_ : adjusted_messages) {
|
||||
auto message = message_;
|
||||
if (!message.contains("role") || (!message.contains("content") && !message.contains("tool_calls"))) {
|
||||
throw std::runtime_error("message must have 'role' and one of 'content' or 'tool_calls' fields: " + message.dump());
|
||||
if (!message.contains("role") || !message.contains("content")) {
|
||||
throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
|
||||
}
|
||||
std::string role = message.at("role");
|
||||
|
||||
@@ -417,6 +415,7 @@ class chat_template {
|
||||
}
|
||||
}
|
||||
if (polyfill_tool_calls) {
|
||||
auto content = message.at("content");
|
||||
auto tool_calls = json::array();
|
||||
for (const auto & tool_call : message.at("tool_calls")) {
|
||||
if (tool_call.at("type") != "function") {
|
||||
@@ -435,11 +434,8 @@ class chat_template {
|
||||
auto obj = json {
|
||||
{"tool_calls", tool_calls},
|
||||
};
|
||||
if (message.contains("content")) {
|
||||
auto content = message.at("content");
|
||||
if (!content.is_null() && !content.empty()) {
|
||||
obj["content"] = content;
|
||||
}
|
||||
if (!content.is_null() && !content.empty()) {
|
||||
obj["content"] = content;
|
||||
}
|
||||
message["content"] = obj.dump(2);
|
||||
message.erase("tool_calls");
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cmath>
|
||||
#include <exception>
|
||||
#include <functional>
|
||||
@@ -234,7 +233,7 @@ public:
|
||||
}
|
||||
} else if (is_object()) {
|
||||
if (!index.is_hashable())
|
||||
throw std::runtime_error("Unhashable type: " + index.dump());
|
||||
throw std::runtime_error("Unashable type: " + index.dump());
|
||||
auto it = object_->find(index.primitive_);
|
||||
if (it == object_->end())
|
||||
throw std::runtime_error("Key not found: " + index.dump());
|
||||
@@ -253,7 +252,7 @@ public:
|
||||
auto index = key.get<int>();
|
||||
return array_->at(index < 0 ? array_->size() + index : index);
|
||||
} else if (object_) {
|
||||
if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
|
||||
if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
|
||||
auto it = object_->find(key.primitive_);
|
||||
if (it == object_->end()) return Value();
|
||||
return it->second;
|
||||
@@ -262,7 +261,7 @@ public:
|
||||
}
|
||||
void set(const Value& key, const Value& value) {
|
||||
if (!object_) throw std::runtime_error("Value is not an object: " + dump());
|
||||
if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
|
||||
if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
|
||||
(*object_)[key.primitive_] = value;
|
||||
}
|
||||
Value call(const std::shared_ptr<Context> & context, ArgumentsValue & args) const {
|
||||
@@ -399,7 +398,7 @@ public:
|
||||
}
|
||||
return false;
|
||||
} else if (object_) {
|
||||
if (!value.is_hashable()) throw std::runtime_error("Unhashable type: " + value.dump());
|
||||
if (!value.is_hashable()) throw std::runtime_error("Unashable type: " + value.dump());
|
||||
return object_->find(value.primitive_) != object_->end();
|
||||
} else {
|
||||
throw std::runtime_error("contains can only be called on arrays and objects: " + dump());
|
||||
@@ -417,7 +416,7 @@ public:
|
||||
return const_cast<Value*>(this)->at(index);
|
||||
}
|
||||
Value& at(const Value & index) {
|
||||
if (!index.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
|
||||
if (!index.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
|
||||
if (is_array()) return array_->at(index.get<int>());
|
||||
if (is_object()) return object_->at(index.primitive_);
|
||||
throw std::runtime_error("Value is not an array or object: " + dump());
|
||||
@@ -677,8 +676,8 @@ public:
|
||||
class VariableExpr : public Expression {
|
||||
std::string name;
|
||||
public:
|
||||
VariableExpr(const Location & loc, const std::string& n)
|
||||
: Expression(loc), name(n) {}
|
||||
VariableExpr(const Location & location, const std::string& n)
|
||||
: Expression(location), name(n) {}
|
||||
std::string get_name() const { return name; }
|
||||
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
|
||||
if (!context->contains(name)) {
|
||||
@@ -1201,9 +1200,9 @@ public:
|
||||
|
||||
class SliceExpr : public Expression {
|
||||
public:
|
||||
std::shared_ptr<Expression> start, end, step;
|
||||
SliceExpr(const Location & loc, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e, std::shared_ptr<Expression> && st = nullptr)
|
||||
: Expression(loc), start(std::move(s)), end(std::move(e)), step(std::move(st)) {}
|
||||
std::shared_ptr<Expression> start, end;
|
||||
SliceExpr(const Location & loc, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e)
|
||||
: Expression(loc), start(std::move(s)), end(std::move(e)) {}
|
||||
Value do_evaluate(const std::shared_ptr<Context> &) const override {
|
||||
throw std::runtime_error("SliceExpr not implemented");
|
||||
}
|
||||
@@ -1220,35 +1219,18 @@ public:
|
||||
if (!index) throw std::runtime_error("SubscriptExpr.index is null");
|
||||
auto target_value = base->evaluate(context);
|
||||
if (auto slice = dynamic_cast<SliceExpr*>(index.get())) {
|
||||
auto len = target_value.size();
|
||||
auto wrap = [len](int64_t i) -> int64_t {
|
||||
if (i < 0) {
|
||||
return i + len;
|
||||
}
|
||||
return i;
|
||||
};
|
||||
int64_t step = slice->step ? slice->step->evaluate(context).get<int64_t>() : 1;
|
||||
if (!step) {
|
||||
throw std::runtime_error("slice step cannot be zero");
|
||||
}
|
||||
int64_t start = slice->start ? wrap(slice->start->evaluate(context).get<int64_t>()) : (step < 0 ? len - 1 : 0);
|
||||
int64_t end = slice->end ? wrap(slice->end->evaluate(context).get<int64_t>()) : (step < 0 ? -1 : len);
|
||||
auto start = slice->start ? slice->start->evaluate(context).get<int64_t>() : 0;
|
||||
auto end = slice->end ? slice->end->evaluate(context).get<int64_t>() : (int64_t) target_value.size();
|
||||
if (target_value.is_string()) {
|
||||
std::string s = target_value.get<std::string>();
|
||||
|
||||
std::string result;
|
||||
if (start < end && step == 1) {
|
||||
result = s.substr(start, end - start);
|
||||
} else {
|
||||
for (int64_t i = start; step > 0 ? i < end : i > end; i += step) {
|
||||
result += s[i];
|
||||
}
|
||||
}
|
||||
return result;
|
||||
|
||||
if (start < 0) start = s.size() + start;
|
||||
if (end < 0) end = s.size() + end;
|
||||
return s.substr(start, end - start);
|
||||
} else if (target_value.is_array()) {
|
||||
if (start < 0) start = target_value.size() + start;
|
||||
if (end < 0) end = target_value.size() + end;
|
||||
auto result = Value::array();
|
||||
for (int64_t i = start; step > 0 ? i < end : i > end; i += step) {
|
||||
for (auto i = start; i < end; ++i) {
|
||||
result.push_back(target_value.at(i));
|
||||
}
|
||||
return result;
|
||||
@@ -1323,8 +1305,6 @@ public:
|
||||
if (name == "iterable") return l.is_iterable();
|
||||
if (name == "sequence") return l.is_array();
|
||||
if (name == "defined") return !l.is_null();
|
||||
if (name == "true") return l.to_bool();
|
||||
if (name == "false") return !l.to_bool();
|
||||
throw std::runtime_error("Unknown type for 'is' operator: " + name);
|
||||
};
|
||||
auto value = eval();
|
||||
@@ -1540,10 +1520,6 @@ public:
|
||||
vargs.expectArgs("endswith method", {1, 1}, {0, 0});
|
||||
auto suffix = vargs.args[0].get<std::string>();
|
||||
return suffix.length() <= str.length() && std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
|
||||
} else if (method->get_name() == "startswith") {
|
||||
vargs.expectArgs("startswith method", {1, 1}, {0, 0});
|
||||
auto prefix = vargs.args[0].get<std::string>();
|
||||
return prefix.length() <= str.length() && std::equal(prefix.begin(), prefix.end(), str.begin());
|
||||
} else if (method->get_name() == "title") {
|
||||
vargs.expectArgs("title method", {0, 0}, {0, 0});
|
||||
auto res = str;
|
||||
@@ -2106,37 +2082,28 @@ private:
|
||||
|
||||
while (it != end && consumeSpaces() && peekSymbols({ "[", "." })) {
|
||||
if (!consumeToken("[").empty()) {
|
||||
std::shared_ptr<Expression> index;
|
||||
auto slice_loc = get_location();
|
||||
std::shared_ptr<Expression> start, end, step;
|
||||
bool has_first_colon = false, has_second_colon = false;
|
||||
|
||||
if (!peekSymbols({ ":" })) {
|
||||
start = parseExpression();
|
||||
}
|
||||
|
||||
if (!consumeToken(":").empty()) {
|
||||
has_first_colon = true;
|
||||
if (!peekSymbols({ ":", "]" })) {
|
||||
end = parseExpression();
|
||||
}
|
||||
std::shared_ptr<Expression> index;
|
||||
if (!consumeToken(":").empty()) {
|
||||
has_second_colon = true;
|
||||
if (!peekSymbols({ "]" })) {
|
||||
step = parseExpression();
|
||||
auto slice_end = parseExpression();
|
||||
index = std::make_shared<SliceExpr>(slice_end->location, nullptr, std::move(slice_end));
|
||||
} else {
|
||||
auto slice_start = parseExpression();
|
||||
if (!consumeToken(":").empty()) {
|
||||
consumeSpaces();
|
||||
if (peekSymbols({ "]" })) {
|
||||
index = std::make_shared<SliceExpr>(slice_start->location, std::move(slice_start), nullptr);
|
||||
} else {
|
||||
auto slice_end = parseExpression();
|
||||
index = std::make_shared<SliceExpr>(slice_start->location, std::move(slice_start), std::move(slice_end));
|
||||
}
|
||||
} else {
|
||||
index = std::move(slice_start);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!index) throw std::runtime_error("Empty index in subscript");
|
||||
if (consumeToken("]").empty()) throw std::runtime_error("Expected closing bracket in subscript");
|
||||
|
||||
if ((has_first_colon || has_second_colon) && (start || end || step)) {
|
||||
index = std::make_shared<SliceExpr>(slice_loc, std::move(start), std::move(end), std::move(step));
|
||||
} else {
|
||||
index = std::move(start);
|
||||
}
|
||||
if (!index) throw std::runtime_error("Empty index in subscript");
|
||||
if (consumeToken("]").empty()) throw std::runtime_error("Expected closing bracket in subscript");
|
||||
|
||||
value = std::make_shared<SubscriptExpr>(value->location, std::move(value), std::move(index));
|
||||
value = std::make_shared<SubscriptExpr>(value->location, std::move(value), std::move(index));
|
||||
} else if (!consumeToken(".").empty()) {
|
||||
auto identifier = parseIdentifier();
|
||||
if (!identifier) throw std::runtime_error("Expected identifier in subscript");
|
||||
|
||||
@@ -1,204 +0,0 @@
|
||||
#include "regex-partial.h"
|
||||
#include "common.h"
|
||||
#include <functional>
|
||||
#include <optional>
|
||||
|
||||
common_regex::common_regex(const std::string & pattern) :
|
||||
pattern(pattern),
|
||||
rx(pattern),
|
||||
rx_reversed_partial(regex_to_reversed_partial_regex(pattern)) {}
|
||||
|
||||
common_regex_match common_regex::search(const std::string & input, size_t pos, bool as_match) const {
|
||||
std::smatch match;
|
||||
if (pos > input.size()) {
|
||||
throw std::runtime_error("Position out of bounds");
|
||||
}
|
||||
auto start = input.begin() + pos;
|
||||
auto found = as_match
|
||||
? std::regex_match(start, input.end(), match, rx)
|
||||
: std::regex_search(start, input.end(), match, rx);
|
||||
if (found) {
|
||||
common_regex_match res;
|
||||
res.type = COMMON_REGEX_MATCH_TYPE_FULL;
|
||||
for (size_t i = 0; i < match.size(); ++i) {
|
||||
auto begin = pos + match.position(i);
|
||||
res.groups.emplace_back(begin, begin + match.length(i));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
std::match_results<std::string::const_reverse_iterator> srmatch;
|
||||
if (std::regex_match(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial)) {
|
||||
auto group = srmatch[1].str();
|
||||
if (group.length() != 0) {
|
||||
auto it = srmatch[1].second.base();
|
||||
// auto position = static_cast<size_t>(std::distance(input.begin(), it));
|
||||
if ((!as_match) || it == input.begin()) {
|
||||
common_regex_match res;
|
||||
res.type = COMMON_REGEX_MATCH_TYPE_PARTIAL;
|
||||
const size_t begin = std::distance(input.begin(), it);
|
||||
const size_t end = input.size();
|
||||
if (begin == std::string::npos || end == std::string::npos || begin > end) {
|
||||
throw std::runtime_error("Invalid range");
|
||||
}
|
||||
res.groups.push_back({begin, end});
|
||||
return res;
|
||||
}
|
||||
}
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
/*
|
||||
Transforms a regex pattern to a partial match pattern that operates on a reversed input string to find partial final matches of the original pattern.
|
||||
|
||||
Ideally we'd like to use boost::match_partial (https://beta.boost.org/doc/libs/1_59_0/libs/regex/doc/html/boost_regex/partial_matches.html)
|
||||
to see if a string ends with a partial regex match, but but it's not in std::regex yet.
|
||||
Instead, we'll the regex into a partial match regex operating as a full match on the reverse iterators of the input.
|
||||
|
||||
- /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:(?:d)?c)?b)?a).*
|
||||
- /a|b/ -> (a|b).*
|
||||
- /a*?/ -> error, could match ""
|
||||
- /a*b/ -> ((?:b)?a*+).* (final repetitions become eager)
|
||||
- /.*?ab/ -> ((?:b)?a).* (merge .*)
|
||||
- /a.*?b/ -> ((?:b)?.*?a).* (keep reluctant matches)
|
||||
- /a(bc)d/ -> ((?:(?:d)?(?:(?:c)?b))?a).*
|
||||
- /a(bc|de)/ -> ((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a).*
|
||||
- /ab{2,4}c/ -> abbb?b?c -> ((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a).*
|
||||
|
||||
The regex will match a reversed string fully, and the end of the first (And only) capturing group will indicate the reversed start of the original partial pattern
|
||||
(i.e. just where the final .* starts in the inverted pattern; all other groups are turned into non-capturing groups, and reluctant quantifiers are ignored)
|
||||
*/
|
||||
std::string regex_to_reversed_partial_regex(const std::string & pattern) {
|
||||
auto it = pattern.begin();
|
||||
const auto end = pattern.end();
|
||||
|
||||
std::function<std::string()> process = [&]() {
|
||||
std::vector<std::vector<std::string>> alternatives(1);
|
||||
std::vector<std::string> * sequence = &alternatives.back();
|
||||
|
||||
while (it != end) {
|
||||
if (*it == '[') {
|
||||
auto start = it;
|
||||
++it;
|
||||
while (it != end) {
|
||||
if ((*it == '\\') && (++it != end)) {
|
||||
++it;
|
||||
} else if ((it != end) && (*it == ']')) {
|
||||
break;
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
if (it == end) {
|
||||
throw std::runtime_error("Unmatched '[' in pattern");
|
||||
}
|
||||
++it;
|
||||
sequence->push_back(std::string(start, it));
|
||||
} else if (*it == '*' || *it == '?' || *it == '+') {
|
||||
if (sequence->empty()) {
|
||||
throw std::runtime_error("Quantifier without preceding element");
|
||||
}
|
||||
sequence->back() += *it;
|
||||
auto is_star = *it == '*';
|
||||
++it;
|
||||
if (is_star) {
|
||||
if (*it == '?') {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
} else if (*it == '{') {
|
||||
if (sequence->empty()) {
|
||||
throw std::runtime_error("Repetition without preceding element");
|
||||
}
|
||||
++it;
|
||||
auto start = it;
|
||||
while (it != end && *it != '}') {
|
||||
++it;
|
||||
}
|
||||
if (it == end) {
|
||||
throw std::runtime_error("Unmatched '{' in pattern");
|
||||
}
|
||||
auto parts = string_split(std::string(start, it), ",");
|
||||
++it;
|
||||
if (parts.size() > 2) {
|
||||
throw std::runtime_error("Invalid repetition range in pattern");
|
||||
}
|
||||
|
||||
auto parseOptInt = [&](const std::string & s, const std::optional<int> & def = std::nullopt) -> std::optional<int> {
|
||||
if (s.empty()) {
|
||||
return def;
|
||||
}
|
||||
return std::stoi(s);
|
||||
};
|
||||
auto min = parseOptInt(parts[0], 0);
|
||||
auto max = parts.size() == 1 ? min : parseOptInt(parts[1]);
|
||||
if (min && max && *max < *min) {
|
||||
throw std::runtime_error("Invalid repetition range in pattern");
|
||||
}
|
||||
// Brutal but... let's repeat at least min times, then ? for the delta between min & max (or * for unbounded)
|
||||
auto part = sequence->back();
|
||||
sequence->pop_back();
|
||||
for (int i = 0; i < *min; i++) {
|
||||
sequence->push_back(part);
|
||||
}
|
||||
if (max) {
|
||||
for (int i = *min; i < *max; i++) {
|
||||
sequence->push_back(part + "?");
|
||||
}
|
||||
} else {
|
||||
sequence->push_back(part + "*");
|
||||
}
|
||||
} else if (*it == '(') {
|
||||
++it;
|
||||
if (it != end && *it == '?' && (it + 1 != end) && *(it + 1) == ':') {
|
||||
it += 2;
|
||||
}
|
||||
auto sub = process();
|
||||
if (*it != ')') {
|
||||
throw std::runtime_error("Unmatched '(' in pattern");
|
||||
}
|
||||
++it;
|
||||
auto & part = sequence->emplace_back("(?:");
|
||||
part += sub;
|
||||
part += ")";
|
||||
} else if (*it == ')') {
|
||||
break;
|
||||
} else if (*it == '|') {
|
||||
++it;
|
||||
alternatives.emplace_back();
|
||||
sequence = &alternatives.back();
|
||||
} else if (*it == '\\' && (++it != end)) {
|
||||
auto str = std::string("\\") + *it;
|
||||
sequence->push_back(str);
|
||||
++it;
|
||||
} else if (it != end) {
|
||||
sequence->push_back(std::string(1, *it));
|
||||
++it;
|
||||
}
|
||||
}
|
||||
|
||||
// /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:d)?c)?b)?a).*
|
||||
// if n(=4) parts, opening n-1(=3) non-capturing groups after the 1 capturing group
|
||||
// We'll do the outermost capturing group and final .* in the enclosing function.
|
||||
std::vector<std::string> res_alts;
|
||||
for (const auto & parts : alternatives) {
|
||||
auto & res = res_alts.emplace_back();
|
||||
for (size_t i = 0; i < parts.size() - 1; i++) {
|
||||
res += "(?:";
|
||||
}
|
||||
for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
|
||||
res += *it;
|
||||
if (it != parts.rend() - 1) {
|
||||
res += ")?";
|
||||
}
|
||||
}
|
||||
}
|
||||
return string_join(res_alts, "|");
|
||||
};
|
||||
auto res = process();
|
||||
if (it != end) {
|
||||
throw std::runtime_error("Unmatched '(' in pattern");
|
||||
}
|
||||
|
||||
return "(" + res + ")[\\s\\S]*";
|
||||
}
|
||||
@@ -1,56 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <regex>
|
||||
#include <string>
|
||||
|
||||
enum common_regex_match_type {
|
||||
COMMON_REGEX_MATCH_TYPE_NONE,
|
||||
COMMON_REGEX_MATCH_TYPE_PARTIAL,
|
||||
COMMON_REGEX_MATCH_TYPE_FULL,
|
||||
};
|
||||
|
||||
struct common_string_range {
|
||||
size_t begin;
|
||||
size_t end;
|
||||
common_string_range(size_t begin, size_t end) : begin(begin), end(end) {
|
||||
if (begin > end) {
|
||||
throw std::runtime_error("Invalid range");
|
||||
}
|
||||
}
|
||||
// prevent default ctor
|
||||
common_string_range() = delete;
|
||||
bool empty() const {
|
||||
return begin == end;
|
||||
}
|
||||
bool operator==(const common_string_range & other) const {
|
||||
return begin == other.begin && end == other.end;
|
||||
}
|
||||
};
|
||||
|
||||
struct common_regex_match {
|
||||
common_regex_match_type type = COMMON_REGEX_MATCH_TYPE_NONE;
|
||||
std::vector<common_string_range> groups;
|
||||
|
||||
bool operator==(const common_regex_match & other) const {
|
||||
return type == other.type && groups == other.groups;
|
||||
}
|
||||
bool operator!=(const common_regex_match & other) const {
|
||||
return !(*this == other);
|
||||
}
|
||||
};
|
||||
|
||||
class common_regex {
|
||||
std::string pattern;
|
||||
std::regex rx;
|
||||
std::regex rx_reversed_partial;
|
||||
|
||||
public:
|
||||
explicit common_regex(const std::string & pattern);
|
||||
|
||||
common_regex_match search(const std::string & input, size_t pos, bool as_match = false) const;
|
||||
|
||||
const std::string & str() const { return pattern; }
|
||||
};
|
||||
|
||||
// For testing only (pretty print of failures).
|
||||
std::string regex_to_reversed_partial_regex(const std::string & pattern);
|
||||
@@ -1,7 +1,6 @@
|
||||
#include "sampling.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <unordered_map>
|
||||
@@ -230,48 +229,51 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
params.logit_bias.data()));
|
||||
|
||||
if (params.mirostat == 0) {
|
||||
for (const auto & cnstr : params.samplers) {
|
||||
switch (cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_DRY:
|
||||
{
|
||||
std::vector<const char *> c_breakers;
|
||||
c_breakers.reserve(params.dry_sequence_breakers.size());
|
||||
for (const auto & str : params.dry_sequence_breakers) {
|
||||
c_breakers.push_back(str.c_str());
|
||||
}
|
||||
if (params.top_n_sigma >= 0) {
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp (params.temp));
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
|
||||
} else {
|
||||
for (const auto & cnstr : params.samplers) {
|
||||
switch (cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_DRY:
|
||||
{
|
||||
std::vector<const char *> c_breakers;
|
||||
c_breakers.reserve(params.dry_sequence_breakers.size());
|
||||
for (const auto & str : params.dry_sequence_breakers) {
|
||||
c_breakers.push_back(str.c_str());
|
||||
}
|
||||
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||
}
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_K:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_MIN_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_XTC:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_INFILL:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_PENALTIES:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown sampler type");
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||
}
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_K:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_MIN_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_XTC:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_INFILL:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_PENALTIES:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown sampler type");
|
||||
}
|
||||
}
|
||||
}
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
|
||||
@@ -473,7 +475,6 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
|
||||
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
|
||||
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
|
||||
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
|
||||
case COMMON_SAMPLER_TYPE_XTC: return 'x';
|
||||
@@ -489,7 +490,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
|
||||
case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
|
||||
case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
|
||||
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
|
||||
case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
|
||||
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
|
||||
@@ -504,7 +504,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
||||
{ "dry", COMMON_SAMPLER_TYPE_DRY },
|
||||
{ "top_k", COMMON_SAMPLER_TYPE_TOP_K },
|
||||
{ "top_p", COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
||||
{ "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
|
||||
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||
@@ -518,7 +517,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
||||
std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
|
||||
{ "top-k", COMMON_SAMPLER_TYPE_TOP_K },
|
||||
{ "top-p", COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
||||
{ "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
@@ -535,16 +533,14 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
||||
auto sampler = sampler_canonical_name_map.find(name);
|
||||
if (sampler != sampler_canonical_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
continue;
|
||||
}
|
||||
if (allow_alt_names) {
|
||||
sampler = sampler_alt_name_map.find(name);
|
||||
if (sampler != sampler_alt_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
continue;
|
||||
} else {
|
||||
if (allow_alt_names) {
|
||||
sampler = sampler_alt_name_map.find(name);
|
||||
if (sampler != sampler_alt_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
}
|
||||
}
|
||||
}
|
||||
LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
|
||||
}
|
||||
|
||||
return samplers;
|
||||
@@ -556,7 +552,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
|
||||
@@ -571,8 +566,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
|
||||
const auto sampler = sampler_name_map.find(c);
|
||||
if (sampler != sampler_name_map.end()) {
|
||||
samplers.push_back(sampler->second);
|
||||
} else {
|
||||
LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -426,11 +426,7 @@ class ModelBase:
|
||||
logger.warning(f"Failed to load model config from {dir_model}: {e}")
|
||||
logger.warning("Trying to load config.json instead")
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
config = json.load(f)
|
||||
if "llm_config" in config:
|
||||
# rename for InternVL
|
||||
config["text_config"] = config["llm_config"]
|
||||
return config
|
||||
return json.load(f)
|
||||
|
||||
@classmethod
|
||||
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
|
||||
@@ -798,9 +794,6 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":
|
||||
# ref: https://huggingface.co/mistral-community/pixtral-12b
|
||||
res = "pixtral"
|
||||
if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
|
||||
# ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
|
||||
res = "seed-coder"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -1395,10 +1388,10 @@ class BaichuanModel(TextModel):
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
head_count = self.hparams["num_attention_heads"]
|
||||
@@ -1519,10 +1512,10 @@ class XverseModel(TextModel):
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
@@ -1785,12 +1778,6 @@ class LlamaModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.LLAMA
|
||||
undo_permute = True
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
# fix for SmolVLM2, missing `num_attention_heads` in config.json
|
||||
if self.hf_arch == "VLlama3ForCausalLM":
|
||||
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
|
||||
|
||||
def set_vocab(self):
|
||||
try:
|
||||
self._set_vocab_sentencepiece()
|
||||
@@ -1835,10 +1822,10 @@ class LlamaModel(TextModel):
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
|
||||
@staticmethod
|
||||
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
||||
@@ -2069,9 +2056,6 @@ class Llama4Model(LlamaModel):
|
||||
self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
||||
if name.startswith("language_model."):
|
||||
name = name.replace("language_model.", "")
|
||||
|
||||
# split the gate_up into gate and up
|
||||
if "gate_up_proj" in name:
|
||||
name_up = name.replace("gate_up_proj", "up_proj.weight")
|
||||
@@ -2216,10 +2200,10 @@ class DeciModel(TextModel):
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
|
||||
@staticmethod
|
||||
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
||||
@@ -2459,10 +2443,10 @@ class MiniCPMModel(TextModel):
|
||||
logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
|
||||
self.gguf_writer.add_logit_scale(logit_scale)
|
||||
logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "longrope":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
|
||||
logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
|
||||
if self.hparams.get("rope_scaling") is not None:
|
||||
if self.hparams["rope_scaling"].get("type") == "longrope":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
|
||||
logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
@@ -2607,20 +2591,15 @@ class Qwen2Model(TextModel):
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self._try_set_pooling_type()
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "yarn":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if self.hf_arch == "Qwen2Model":
|
||||
name = f"model.{name}" # map to Qwen2ForCausalLM tensors
|
||||
if "language_model." in name:
|
||||
name = name.replace("language_model.", "") # for InternVL
|
||||
if name.startswith("mlp") or name.startswith("vision_model"):
|
||||
# skip visual tensors
|
||||
return []
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@@ -2724,62 +2703,6 @@ class Qwen2VLVisionModel(VisionModel):
|
||||
return [] # skip other tensors
|
||||
|
||||
|
||||
@ModelBase.register("InternVisionModel")
|
||||
class InternVisionModel(VisionModel):
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.INTERNVL)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
|
||||
# hidden_act
|
||||
if hparams["hidden_act"] == "silu":
|
||||
self.gguf_writer.add_vision_use_silu(True)
|
||||
elif hparams["hidden_act"] == "gelu":
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
else:
|
||||
raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}")
|
||||
# downsample_ratio
|
||||
downsample_ratio = self.global_config.get("downsample_ratio")
|
||||
assert downsample_ratio is not None
|
||||
self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
del bid, name, n_dims # unused
|
||||
if ".patch_embd." in new_name:
|
||||
return gguf.GGMLQuantizationType.F16
|
||||
if ".position_embd." in new_name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
return False
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
if name.startswith("vision_model") or name.startswith("mlp"):
|
||||
# process visual tensors
|
||||
# correct name
|
||||
if name.startswith("vision_model"):
|
||||
name = "vision_tower." + name
|
||||
if (".ls" in name or "position_embedding" in name) and not name.endswith(".weight"):
|
||||
name += ".weight"
|
||||
# split QKV tensors if needed
|
||||
if ".qkv." in name:
|
||||
if data_torch.ndim == 2: # weight
|
||||
c3, _ = data_torch.shape
|
||||
else: # bias
|
||||
c3 = data_torch.shape[0]
|
||||
assert c3 % 3 == 0
|
||||
c = c3 // 3
|
||||
wq = data_torch[:c]
|
||||
wk = data_torch[c: c * 2]
|
||||
wv = data_torch[c * 2:]
|
||||
return [
|
||||
(self.map_tensor_name(name.replace("attn.qkv", "self_attn.q_proj")), wq),
|
||||
(self.map_tensor_name(name.replace("attn.qkv", "self_attn.k_proj")), wk),
|
||||
(self.map_tensor_name(name.replace("attn.qkv", "self_attn.v_proj")), wv),
|
||||
]
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
return [] # skip other tensors
|
||||
|
||||
|
||||
@ModelBase.register("WavTokenizerDec")
|
||||
class WavTokenizerDecModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
|
||||
@@ -2832,13 +2755,6 @@ class Qwen2MoeModel(TextModel):
|
||||
if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
|
||||
self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
|
||||
logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
|
||||
# YaRN is not enabled by default
|
||||
# To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
@@ -3106,7 +3022,7 @@ class Phi3MiniModel(TextModel):
|
||||
|
||||
scale = max_pos_embds / orig_max_pos_embds
|
||||
|
||||
rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
|
||||
rope_scaling_type = rope_scaling.get('type', '').lower()
|
||||
if len(rope_scaling_type) == 0:
|
||||
raise KeyError('Missing the required key rope_scaling.type')
|
||||
|
||||
@@ -3418,10 +3334,10 @@ class InternLM2Model(TextModel):
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
num_heads = self.hparams["num_attention_heads"]
|
||||
@@ -3431,11 +3347,6 @@ class InternLM2Model(TextModel):
|
||||
head_dim = n_embd // num_heads
|
||||
num_groups = num_heads // q_per_kv
|
||||
|
||||
name = name.replace("language_model.", "") # InternVL
|
||||
if name.startswith("mlp") or name.startswith("vision_model"):
|
||||
# skip visual tensors
|
||||
return []
|
||||
|
||||
if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
|
||||
qkv = data_torch
|
||||
|
||||
@@ -3501,18 +3412,14 @@ class InternLM3Model(TextModel):
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
n_head = self.hparams["num_attention_heads"]
|
||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
name = name.replace("language_model.", "") # InternVL
|
||||
if name.startswith("mlp") or name.startswith("vision_model"):
|
||||
# skip visual tensors
|
||||
return []
|
||||
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
||||
@@ -3995,16 +3902,6 @@ class Gemma3VisionModel(VisionModel):
|
||||
# default values below are taken from HF tranformers code
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
# calculate proj_scale_factor (used by tinygemma3 test model)
|
||||
image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
|
||||
n_per_side = int(image_seq_length ** 0.5)
|
||||
image_size = self.hparams["image_size"]
|
||||
patch_size = self.hparams["patch_size"]
|
||||
proj_scale_factor = (image_size // patch_size) // n_per_side
|
||||
if proj_scale_factor > 0 and proj_scale_factor != 4:
|
||||
# we only need to write this if it's not the default value
|
||||
# in this case, we are converting a test model
|
||||
self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
del bid, new_name, n_dims # unused
|
||||
@@ -4018,9 +3915,6 @@ class Gemma3VisionModel(VisionModel):
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
if "vision_model.head." in name:
|
||||
return [] # skip redundant tensors for tinygemma3
|
||||
|
||||
if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
|
||||
or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
|
||||
# process vision tensors
|
||||
@@ -4946,12 +4840,12 @@ class DeepseekV2Model(TextModel):
|
||||
|
||||
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
|
||||
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "yarn":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
||||
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
@@ -5443,11 +5337,11 @@ class Glm4Model(TextModel):
|
||||
super().set_gguf_parameters()
|
||||
rope_dim = self.hparams["head_dim"]
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "yarn":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
||||
|
||||
|
||||
@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
|
||||
@@ -5680,10 +5574,10 @@ class ExaoneModel(TextModel):
|
||||
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
|
||||
rotary_factor = rotary_factor if rotary_factor is not None else 1.0
|
||||
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
|
||||
if hparams["rope_scaling"].get("type") == "linear":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
||||
@@ -5749,20 +5643,11 @@ class GraniteModel(LlamaModel):
|
||||
logger.info("gguf: (granite) logits_scale = %s", logits_scale)
|
||||
|
||||
|
||||
@ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM")
|
||||
@ModelBase.register("GraniteMoeForCausalLM")
|
||||
class GraniteMoeModel(GraniteModel):
|
||||
"""Conversion for IBM's GraniteMoeForCausalLM"""
|
||||
model_arch = gguf.MODEL_ARCH.GRANITE_MOE
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
"""GraniteMoeShared uses GraniteMoe parameters plus the following:
|
||||
- shared_intermediate_size
|
||||
"""
|
||||
super().set_gguf_parameters()
|
||||
if shared_feed_forward_length := self.hparams.get("shared_intermediate_size"):
|
||||
self.gguf_writer.add_expert_shared_feed_forward_length(shared_feed_forward_length)
|
||||
logger.info("gguf: (granitemoeshared) shared_feed_forward_length = %s", shared_feed_forward_length)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
"""In modeling_granitemoe, the JetMoe implementation of parallel experts
|
||||
is used. This essentially merges w1 and w3 into a single tensor with 2x
|
||||
@@ -5773,21 +5658,12 @@ class GraniteMoeModel(GraniteModel):
|
||||
if name.endswith("block_sparse_moe.input_linear.weight"):
|
||||
ffn_dim = self.hparams["intermediate_size"]
|
||||
assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
|
||||
gate, up = data_torch.split(ffn_dim, dim=-2)
|
||||
gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
|
||||
return [
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
|
||||
]
|
||||
|
||||
if name.endswith("shared_mlp.input_linear.weight"):
|
||||
ffn_dim = self.hparams["shared_intermediate_size"]
|
||||
assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
|
||||
gate, up = data_torch.split(ffn_dim, dim=-2)
|
||||
return [
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
|
||||
]
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@@ -5804,13 +5680,7 @@ class BailingMoeModel(TextModel):
|
||||
rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
else:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
|
||||
|
||||
@@ -116,7 +116,6 @@ models = [
|
||||
{"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
|
||||
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
|
||||
{"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
|
||||
{"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -731,7 +731,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||
| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
|
||||
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
|
||||
| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
|
||||
| GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. |
|
||||
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
|
||||
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
|
||||
|
||||
@@ -742,7 +741,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
|
||||
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
|
||||
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
|
||||
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
|
||||
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
|
||||
|
||||
|
||||
|
||||
@@ -1,77 +0,0 @@
|
||||
# Multimodal
|
||||
|
||||
llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools support this feature:
|
||||
- [llama-mtmd-cli](../tools/mtmd/README.md)
|
||||
- [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API
|
||||
|
||||
To enable it, can use use one of the 2 methods below:
|
||||
|
||||
- Use `-hf` option with a supported model (see a list of pre-quantized model below)
|
||||
- To load a model using `-hf` while disabling multimodal, use `--no-mmproj`
|
||||
- To load a model using `-hf` while using a custom mmproj file, use `--mmproj local_file.gguf`
|
||||
- Use `-m model.gguf` option with `--mmproj file.gguf` to specify text and multimodal projector respectively
|
||||
|
||||
By default, multimodal projector will be offloaded to GPU. To disable this, add `--no-mmproj-offload`
|
||||
|
||||
For example:
|
||||
|
||||
```sh
|
||||
# simple usage with CLI
|
||||
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF
|
||||
|
||||
# simple usage with server
|
||||
llama-server -hf ggml-org/gemma-3-4b-it-GGUF
|
||||
|
||||
# using local file
|
||||
llama-server -m gemma-3-4b-it-Q4_K_M.gguf --mmproj mmproj-gemma-3-4b-it-Q4_K_M.gguf
|
||||
|
||||
# no GPU offload
|
||||
llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
|
||||
```
|
||||
|
||||
## Pre-quantized models
|
||||
|
||||
These are ready-to-use models, most of them come with `Q4_K_M` quantization by default. They can be found at the Hugging Face page of the ggml-org: https://huggingface.co/ggml-org
|
||||
|
||||
Replaces the `(tool_name)` with the name of binary you want to use. For example, `llama-mtmd-cli` or `llama-server`
|
||||
|
||||
NOTE: some models may require large context window, for example: `-c 8192`
|
||||
|
||||
```sh
|
||||
# Gemma 3
|
||||
(tool_name) -hf ggml-org/gemma-3-4b-it-GGUF
|
||||
(tool_name) -hf ggml-org/gemma-3-12b-it-GGUF
|
||||
(tool_name) -hf ggml-org/gemma-3-27b-it-GGUF
|
||||
|
||||
# SmolVLM
|
||||
(tool_name) -hf ggml-org/SmolVLM-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/SmolVLM-256M-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/SmolVLM-500M-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/SmolVLM2-2.2B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/SmolVLM2-256M-Video-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/SmolVLM2-500M-Video-Instruct-GGUF
|
||||
|
||||
# Pixtral 12B
|
||||
(tool_name) -hf ggml-org/pixtral-12b-GGUF
|
||||
|
||||
# Qwen 2 VL
|
||||
(tool_name) -hf ggml-org/Qwen2-VL-2B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/Qwen2-VL-7B-Instruct-GGUF
|
||||
|
||||
# Qwen 2.5 VL
|
||||
(tool_name) -hf ggml-org/Qwen2.5-VL-3B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/Qwen2.5-VL-7B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/Qwen2.5-VL-32B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/Qwen2.5-VL-72B-Instruct-GGUF
|
||||
|
||||
# Mistral Small 3.1 24B (IQ2_M quantization)
|
||||
(tool_name) -hf ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF
|
||||
|
||||
# InternVL 2.5 and 3
|
||||
(tool_name) -hf ggml-org/InternVL2_5-1B-GGUF
|
||||
(tool_name) -hf ggml-org/InternVL2_5-4B-GGUF
|
||||
(tool_name) -hf ggml-org/InternVL3-1B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/InternVL3-2B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/InternVL3-8B-Instruct-GGUF
|
||||
(tool_name) -hf ggml-org/InternVL3-14B-Instruct-GGUF
|
||||
```
|
||||
@@ -33,13 +33,13 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
|
||||
2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
|
||||
|
||||
```sh
|
||||
python ./tools/mtmd/llava_surgery.py -m path/to/MobileVLM-1.7B
|
||||
python ./tools/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
|
||||
```
|
||||
|
||||
3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./tools/mtmd/convert_image_encoder_to_gguf.py \
|
||||
python ./tools/llava/convert_image_encoder_to_gguf.py \
|
||||
-m path/to/clip-vit-large-patch14-336 \
|
||||
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
|
||||
--output-dir path/to/MobileVLM-1.7B \
|
||||
@@ -47,7 +47,7 @@ python ./tools/mtmd/convert_image_encoder_to_gguf.py \
|
||||
```
|
||||
|
||||
```sh
|
||||
python ./tools/mtmd/convert_image_encoder_to_gguf.py \
|
||||
python ./tools/llava/convert_image_encoder_to_gguf.py \
|
||||
-m path/to/clip-vit-large-patch14-336 \
|
||||
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
|
||||
--output-dir path/to/MobileVLM-1.7B_V2 \
|
||||
@@ -69,10 +69,10 @@ Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directo
|
||||
|
||||
## Android compile and run
|
||||
### compile
|
||||
refer to `tools/mtmd/android/build_64.sh`
|
||||
refer to `tools/llava/android/build_64.sh`
|
||||
```sh
|
||||
mkdir tools/mtmd/android/build_64
|
||||
cd tools/mtmd/android/build_64
|
||||
mkdir tools/llava/android/build_64
|
||||
cd tools/llava/android/build_64
|
||||
../build_64.sh
|
||||
```
|
||||
### run on Android
|
||||
|
||||
@@ -25,13 +25,13 @@ git clone https://huggingface.co/THUDM/glm-edge-v-5b or https://huggingface.co/T
|
||||
2. Use `glmedge-surgery.py` to split the GLMV-EDGE model to LLM and multimodel projector constituents:
|
||||
|
||||
```sh
|
||||
python ./tools/mtmd/glmedge-surgery.py -m ../model_path
|
||||
python ./tools/llava/glmedge-surgery.py -m ../model_path
|
||||
```
|
||||
|
||||
4. Use `glmedge-convert-image-encoder-to-gguf.py` to convert the GLMV-EDGE image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./tools/mtmd/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
|
||||
python ./tools/llava/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
|
||||
```
|
||||
|
||||
5. Use `examples/convert_hf_to_gguf.py` to convert the LLM part of GLMV-EDGE to GGUF:
|
||||
|
||||
@@ -37,19 +37,19 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
|
||||
2. Install the required Python packages:
|
||||
|
||||
```sh
|
||||
pip install -r tools/mtmd/requirements.txt
|
||||
pip install -r tools/llava/requirements.txt
|
||||
```
|
||||
|
||||
3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
|
||||
|
||||
```sh
|
||||
python ./tools/mtmd/llava_surgery.py -m ../llava-v1.5-7b
|
||||
python ./tools/llava/llava_surgery.py -m ../llava-v1.5-7b
|
||||
```
|
||||
|
||||
4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./tools/mtmd/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
|
||||
python ./tools/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
|
||||
```
|
||||
|
||||
5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
|
||||
@@ -69,12 +69,12 @@ git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
|
||||
2) Install the required Python packages:
|
||||
|
||||
```sh
|
||||
pip install -r tools/mtmd/requirements.txt
|
||||
pip install -r tools/llava/requirements.txt
|
||||
```
|
||||
|
||||
3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
|
||||
```console
|
||||
python tools/mtmd/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
|
||||
python tools/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
|
||||
```
|
||||
- you will find a llava.projector and a llava.clip file in your model directory
|
||||
|
||||
@@ -88,7 +88,7 @@ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.jso
|
||||
|
||||
5) Create the visual gguf model:
|
||||
```console
|
||||
python ./tools/mtmd/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
|
||||
python ./tools/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
|
||||
```
|
||||
- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
|
||||
|
||||
|
||||
@@ -29,8 +29,8 @@ cmake --build build --config Release
|
||||
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us)
|
||||
|
||||
```bash
|
||||
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-o-2_6
|
||||
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
|
||||
python ./tools/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6
|
||||
python ./tools/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
|
||||
|
||||
# quantize int4 version
|
||||
|
||||
@@ -28,8 +28,8 @@ cmake --build build --config Release
|
||||
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
|
||||
|
||||
```bash
|
||||
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
|
||||
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
|
||||
python ./tools/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
|
||||
python ./tools/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
|
||||
|
||||
# quantize int4 version
|
||||
|
||||
@@ -28,8 +28,8 @@ cmake --build build --config Release
|
||||
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
|
||||
|
||||
```bash
|
||||
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-V-2_6
|
||||
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
|
||||
python ./tools/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
|
||||
python ./tools/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
|
||||
|
||||
# quantize int4 version
|
||||
|
||||
@@ -21,6 +21,7 @@ else()
|
||||
add_subdirectory(gguf-hash)
|
||||
add_subdirectory(gguf)
|
||||
add_subdirectory(gritlm)
|
||||
add_subdirectory(infill)
|
||||
add_subdirectory(lookahead)
|
||||
add_subdirectory(lookup)
|
||||
add_subdirectory(parallel)
|
||||
@@ -32,7 +33,6 @@ else()
|
||||
add_subdirectory(speculative)
|
||||
add_subdirectory(speculative-simple)
|
||||
add_subdirectory(gen-docs)
|
||||
add_subdirectory(training)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
# these examples use the backends directly and cannot be built with dynamic loading
|
||||
|
||||
@@ -35,14 +35,23 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
|
||||
|
||||
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
|
||||
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
||||
const struct llama_model * model = llama_get_model(ctx);
|
||||
|
||||
// clear previous kv_cache values (irrelevant for embeddings)
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
// run model
|
||||
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||
if (llama_encode(ctx, batch) < 0) {
|
||||
LOG_ERR("%s : failed to encode\n", __func__);
|
||||
if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
|
||||
// encoder-only model
|
||||
if (llama_encode(ctx, batch) < 0) {
|
||||
LOG_ERR("%s : failed to encode\n", __func__);
|
||||
}
|
||||
} else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
|
||||
// decoder-only model
|
||||
if (llama_decode(ctx, batch) < 0) {
|
||||
LOG_ERR("%s : failed to decode\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
|
||||
5
examples/infill/CMakeLists.txt
Normal file
5
examples/infill/CMakeLists.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
set(TARGET llama-infill)
|
||||
add_executable(${TARGET} infill.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
47
examples/infill/README.md
Normal file
47
examples/infill/README.md
Normal file
@@ -0,0 +1,47 @@
|
||||
# llama.cpp/example/infill
|
||||
|
||||
This example shows how to use the infill mode with Code Llama models supporting infill mode.
|
||||
Currently the 7B and 13B models support infill mode.
|
||||
|
||||
Infill supports most of the options available in the main example.
|
||||
|
||||
For further information have a look at the main README.md in llama.cpp/example/main/README.md
|
||||
|
||||
## Common Options
|
||||
|
||||
In this section, we cover the most commonly used options for running the `infill` program with the LLaMA models:
|
||||
|
||||
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
|
||||
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
|
||||
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
|
||||
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
|
||||
- `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
|
||||
|
||||
## Input Prompts
|
||||
|
||||
The `infill` program provides several ways to interact with the LLaMA models using input prompts:
|
||||
|
||||
- `--in-prefix PROMPT_BEFORE_CURSOR`: Provide the prefix directly as a command-line option.
|
||||
- `--in-suffix PROMPT_AFTER_CURSOR`: Provide the suffix directly as a command-line option.
|
||||
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
|
||||
|
||||
## Interaction
|
||||
|
||||
The `infill` program offers a seamless way to interact with LLaMA models, allowing users to receive real-time infill suggestions. The interactive mode can be triggered using `--interactive`, and `--interactive-first`
|
||||
|
||||
### Interaction Options
|
||||
|
||||
- `-i, --interactive`: Run the program in interactive mode, allowing users to get real time code suggestions from model.
|
||||
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
|
||||
- `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
|
||||
|
||||
### Example
|
||||
|
||||
Download a model that supports infill, for example CodeLlama:
|
||||
```console
|
||||
scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
|
||||
```
|
||||
|
||||
```bash
|
||||
./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
|
||||
```
|
||||
590
examples/infill/infill.cpp
Normal file
590
examples/infill/infill.cpp
Normal file
@@ -0,0 +1,590 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "console.h"
|
||||
#include "sampling.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
#include <signal.h>
|
||||
#include <unistd.h>
|
||||
#elif defined (_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <signal.h>
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
static llama_context ** g_ctx;
|
||||
static llama_model ** g_model;
|
||||
static common_sampler ** g_smpl;
|
||||
static common_params * g_params;
|
||||
static std::vector<llama_token> * g_input_tokens;
|
||||
static std::ostringstream * g_output_ss;
|
||||
static std::vector<llama_token> * g_output_tokens;
|
||||
|
||||
static bool is_interacting = false;
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
static void sigint_handler(int signo) {
|
||||
if (signo == SIGINT) {
|
||||
if (!is_interacting) {
|
||||
is_interacting = true;
|
||||
} else {
|
||||
console::cleanup();
|
||||
LOG("\n");
|
||||
common_perf_print(*g_ctx, *g_smpl);
|
||||
|
||||
// make sure all logs are flushed
|
||||
LOG("Interrupted by user\n");
|
||||
common_log_pause(common_log_main());
|
||||
|
||||
_exit(130);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
g_params = ¶ms;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
common_init();
|
||||
|
||||
auto & sparams = params.sampling;
|
||||
|
||||
console::init(params.simple_io, params.use_color);
|
||||
atexit([]() { console::cleanup(); });
|
||||
|
||||
if (params.logits_all) {
|
||||
LOG_ERR("\n************\n");
|
||||
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
||||
LOG_ERR("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.embedding) {
|
||||
LOG_ERR("\n************\n");
|
||||
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
||||
LOG_ERR("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.n_ctx != 0 && params.n_ctx < 8) {
|
||||
LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
|
||||
params.n_ctx = 8;
|
||||
}
|
||||
|
||||
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
|
||||
LOG_ERR("\n************\n");
|
||||
LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
|
||||
LOG_ERR("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.rope_freq_base != 0.0) {
|
||||
LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
||||
}
|
||||
|
||||
if (params.rope_freq_scale != 0.0) {
|
||||
LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||
}
|
||||
|
||||
LOG_INF("%s: llama backend init\n", __func__);
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
llama_model * model = nullptr;
|
||||
llama_context * ctx = nullptr;
|
||||
common_sampler * smpl = nullptr;
|
||||
|
||||
g_model = &model;
|
||||
g_ctx = &ctx;
|
||||
g_smpl = &smpl;
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||
common_init_result llama_init = common_init_from_params(params);
|
||||
|
||||
model = llama_init.model.get();
|
||||
ctx = llama_init.context.get();
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
const int n_ctx_train = llama_model_n_ctx_train(model);
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
LOG_DBG("n_ctx: %d\n", n_ctx);
|
||||
|
||||
if (n_ctx > n_ctx_train) {
|
||||
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
||||
}
|
||||
const bool add_bos = llama_vocab_get_add_bos(vocab);
|
||||
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
|
||||
|
||||
std::vector<llama_token> embd_inp;
|
||||
std::vector<llama_token> embd_end;
|
||||
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
|
||||
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
|
||||
|
||||
GGML_ASSERT(llama_vocab_fim_pre(vocab) >= 0);
|
||||
GGML_ASSERT(llama_vocab_fim_suf(vocab) >= 0);
|
||||
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
|
||||
|
||||
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
||||
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
||||
if (add_bos) {
|
||||
embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
|
||||
}
|
||||
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
||||
|
||||
const llama_token middle_token = llama_vocab_fim_mid(vocab);
|
||||
if (middle_token >= 0) {
|
||||
embd_inp.push_back(middle_token);
|
||||
}
|
||||
|
||||
LOG_DBG("add_bos: %d\n", add_bos);
|
||||
LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
|
||||
LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
|
||||
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
|
||||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
embd_inp.push_back(llama_vocab_bos(vocab));
|
||||
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
|
||||
}
|
||||
|
||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// number of tokens to keep when resetting context
|
||||
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
|
||||
params.n_keep = (int)embd_inp.size();
|
||||
}
|
||||
|
||||
LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
|
||||
LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
|
||||
|
||||
// enable interactive mode if interactive start is specified
|
||||
if (params.interactive_first) {
|
||||
params.interactive = true;
|
||||
}
|
||||
|
||||
if (params.verbose_prompt) {
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||
LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
|
||||
if (params.n_keep > 0) {
|
||||
LOG_INF("%s: static prompt based on n_keep: '", __func__);
|
||||
for (int i = 0; i < params.n_keep; i++) {
|
||||
LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
LOG_CNT("'\n");
|
||||
}
|
||||
LOG_INF("\n");
|
||||
}
|
||||
|
||||
if (params.interactive) {
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = sigint_handler;
|
||||
sigemptyset (&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
#elif defined (_WIN32)
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
|
||||
LOG_INF("%s: interactive mode on.\n", __func__);
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG_INF("Input prefix with BOS\n");
|
||||
}
|
||||
|
||||
if (!params.input_prefix.empty()) {
|
||||
LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
}
|
||||
|
||||
if (!params.input_suffix.empty()) {
|
||||
LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
}
|
||||
}
|
||||
smpl = common_sampler_init(model, sparams);
|
||||
|
||||
LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
|
||||
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
|
||||
LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
|
||||
|
||||
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
|
||||
LOG_INF("\n");
|
||||
LOG_INF("\n##### Infill mode #####\n\n");
|
||||
if (params.interactive) {
|
||||
const char *control_message;
|
||||
if (params.multiline_input) {
|
||||
control_message = " - To return control to LLaMA, end your input with '\\'.\n"
|
||||
" - To return control without starting a new line, end your input with '/'.\n";
|
||||
} else {
|
||||
control_message = " - Press Return to return control to LLaMA.\n"
|
||||
" - To return control without starting a new line, end your input with '/'.\n"
|
||||
" - If you want to submit another line, end your input with '\\'.\n";
|
||||
}
|
||||
LOG_INF("== Running in interactive mode. ==\n");
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
LOG_INF( " - Press Ctrl+C to interject at any time.\n");
|
||||
#endif
|
||||
LOG_INF( "%s\n", control_message);
|
||||
|
||||
is_interacting = params.interactive_first;
|
||||
}
|
||||
|
||||
bool input_echo = true;
|
||||
|
||||
int n_past = 0;
|
||||
int n_remain = params.n_predict;
|
||||
int n_consumed = 0;
|
||||
|
||||
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
||||
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
||||
std::ostringstream output_ss; g_output_ss = &output_ss;
|
||||
|
||||
// the first thing we will do is to output the prompt, so set color accordingly
|
||||
console::set_display(console::prompt);
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
|
||||
while (n_remain != 0 || params.interactive) {
|
||||
// predict
|
||||
if (!embd.empty()) {
|
||||
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
|
||||
// --prompt or --file which uses the same value.
|
||||
int max_embd_size = n_ctx - 4;
|
||||
|
||||
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
|
||||
if ((int) embd.size() > max_embd_size) {
|
||||
const int skipped_tokens = (int) embd.size() - max_embd_size;
|
||||
embd.resize(max_embd_size);
|
||||
|
||||
console::set_display(console::error);
|
||||
LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||
console::set_display(console::reset);
|
||||
}
|
||||
|
||||
// infinite text generation via context swapping
|
||||
// if we run out of context:
|
||||
// - take the n_keep first tokens from the original prompt (via n_past)
|
||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||
if (n_past + (int) embd.size() > n_ctx) {
|
||||
if (params.n_predict == -2) {
|
||||
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||
break;
|
||||
}
|
||||
|
||||
const int n_left = n_past - params.n_keep - 1;
|
||||
const int n_discard = n_left/2;
|
||||
|
||||
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
LOG_DBG("after swap: n_past = %d\n", n_past);
|
||||
|
||||
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
|
||||
|
||||
}
|
||||
|
||||
// evaluate tokens in batches
|
||||
// embd is typically prepared beforehand to fit within a batch, but not always
|
||||
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
|
||||
int n_eval = (int) embd.size() - i;
|
||||
if (n_eval > params.n_batch) {
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
|
||||
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
n_past += n_eval;
|
||||
|
||||
LOG_DBG("n_past = %d\n", n_past);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
embd.clear();
|
||||
|
||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||
const llama_token id = common_sampler_sample(smpl, ctx, -1);
|
||||
|
||||
common_sampler_accept(smpl, id, true);
|
||||
|
||||
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
|
||||
|
||||
embd.push_back(id);
|
||||
|
||||
// echo this to console
|
||||
input_echo = true;
|
||||
|
||||
// decrement remaining sampling budget
|
||||
--n_remain;
|
||||
|
||||
LOG_DBG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
// some user input remains from prompt or interaction, forward it to processing
|
||||
LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||
while ((int) embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
|
||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||
// for the prompt, we don't apply grammar rules
|
||||
common_sampler_accept(smpl, embd_inp[n_consumed], false);
|
||||
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.n_batch) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// display text
|
||||
if (input_echo) {
|
||||
for (auto id : embd) {
|
||||
const std::string token_str = common_token_to_piece(ctx, id);
|
||||
LOG("%s", token_str.c_str());
|
||||
|
||||
if (embd.size() > 1) {
|
||||
input_tokens.push_back(id);
|
||||
} else {
|
||||
output_tokens.push_back(id);
|
||||
output_ss << token_str;
|
||||
}
|
||||
}
|
||||
}
|
||||
// reset color to default if we there is no pending user input
|
||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||
console::set_display(console::reset);
|
||||
}
|
||||
|
||||
// if not currently processing queued inputs;
|
||||
if ((int) embd_inp.size() <= n_consumed) {
|
||||
// deal with eot token in infill mode
|
||||
if ((common_sampler_last(smpl) == llama_vocab_eot(vocab) || is_interacting) && params.interactive){
|
||||
if (is_interacting && !params.interactive_first) {
|
||||
// print an eot token
|
||||
LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
|
||||
}
|
||||
LOG("\n");
|
||||
console::set_display(console::user_input);
|
||||
std::string buffer;
|
||||
std::string line;
|
||||
bool another_line=true;
|
||||
// set a new prefix via stdin
|
||||
do {
|
||||
another_line = console::readline(line, params.multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
// check if we got an empty line, if so we use the old input
|
||||
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
params.input_prefix = buffer;
|
||||
}
|
||||
buffer.clear();
|
||||
// set a new suffix via stdin
|
||||
do {
|
||||
another_line = console::readline(line, params.multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
// check if we got an empty line
|
||||
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
params.input_suffix = buffer;
|
||||
}
|
||||
buffer.clear();
|
||||
// done taking input, reset color
|
||||
console::set_display(console::reset);
|
||||
|
||||
if (params.escape) {
|
||||
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
|
||||
string_process_escapes(params.input_prefix);
|
||||
string_process_escapes(params.input_suffix);
|
||||
}
|
||||
|
||||
// tokenize new prefix and suffix
|
||||
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
|
||||
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
|
||||
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
|
||||
|
||||
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
||||
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
||||
if (add_bos) {
|
||||
embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
|
||||
}
|
||||
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
||||
|
||||
if (middle_token >= 0) {
|
||||
embd_inp.push_back(middle_token);
|
||||
}
|
||||
|
||||
embd.clear();
|
||||
n_remain = params.n_predict;
|
||||
n_past = 0;
|
||||
n_consumed = 0;
|
||||
is_interacting = false;
|
||||
}
|
||||
// deal with end of generation tokens in interactive mode
|
||||
else if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
|
||||
LOG_DBG("found EOS token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
|
||||
is_interacting = true;
|
||||
LOG("\n");
|
||||
console::set_display(console::user_input);
|
||||
}
|
||||
}
|
||||
|
||||
if (n_past > 0 && is_interacting && !params.interactive) {
|
||||
LOG_DBG("waiting for user input\n");
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG_DBG("adding input prefix BOS token\n");
|
||||
embd_inp.push_back(llama_vocab_bos(vocab));
|
||||
}
|
||||
|
||||
std::string buffer;
|
||||
if (!params.input_prefix.empty()) {
|
||||
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
buffer += params.input_prefix;
|
||||
LOG("%s", buffer.c_str());
|
||||
}
|
||||
|
||||
std::string line;
|
||||
bool another_line = true;
|
||||
do {
|
||||
another_line = console::readline(line, params.multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
|
||||
// done taking input, reset color
|
||||
console::set_display(console::reset);
|
||||
|
||||
// Add tokens to embd only if the input buffer is non-empty
|
||||
// Entering a empty line lets the user pass control back
|
||||
if (buffer.length() > 1) {
|
||||
// append input suffix if any
|
||||
if (!params.input_suffix.empty()) {
|
||||
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
buffer += params.input_suffix;
|
||||
LOG("%s", params.input_suffix.c_str());
|
||||
}
|
||||
|
||||
LOG_DBG("buffer: '%s'\n", buffer.c_str());
|
||||
|
||||
const size_t original_size = embd_inp.size();
|
||||
|
||||
const auto line_inp = common_tokenize(ctx, buffer, false);
|
||||
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
|
||||
|
||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||
|
||||
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
||||
const llama_token token = embd_inp[i];
|
||||
output_tokens.push_back(token);
|
||||
output_ss << common_token_to_piece(ctx, token);
|
||||
}
|
||||
|
||||
n_remain -= line_inp.size();
|
||||
LOG_DBG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
LOG_DBG("empty line, passing control back\n");
|
||||
}
|
||||
|
||||
input_echo = false; // do not echo this again
|
||||
}
|
||||
|
||||
if (n_past > 0) {
|
||||
if (is_interacting) {
|
||||
common_sampler_reset(smpl);
|
||||
}
|
||||
is_interacting = false;
|
||||
}
|
||||
}
|
||||
|
||||
// end of generation
|
||||
if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !params.interactive) {
|
||||
break;
|
||||
}
|
||||
|
||||
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
|
||||
// We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
|
||||
if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
|
||||
n_remain = params.n_predict;
|
||||
is_interacting = true;
|
||||
}
|
||||
}
|
||||
if (!params.interactive && n_remain <= 0) {
|
||||
LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
|
||||
}
|
||||
|
||||
LOG("\n");
|
||||
common_perf_print(ctx, smpl);
|
||||
|
||||
common_sampler_free(smpl);
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,5 +0,0 @@
|
||||
set(TARGET llama-finetune)
|
||||
add_executable(${TARGET} finetune.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
@@ -1,17 +0,0 @@
|
||||
# llama.cpp/examples/training
|
||||
|
||||
This directory contains examples related to language model training using llama.cpp/GGML.
|
||||
So far finetuning is technically functional (for FP32 models and limited hardware setups) but the code is very much WIP.
|
||||
Finetuning of Stories 260K and LLaMA 3.2 1b seems to work with 24 GB of memory.
|
||||
**For CPU training, compile llama.cpp without any additional backends such as CUDA.**
|
||||
**For CUDA training, use the maximum number of GPU layers.**
|
||||
|
||||
Proof of concept:
|
||||
|
||||
``` sh
|
||||
export model_name=llama_3.2-1b && export quantization=f32
|
||||
./build/bin/finetune --file wikitext-2-raw/wiki.test.raw -ngl 999 --model models/${model_name}-${quantization}.gguf -c 512 -b 512 -ub 512
|
||||
./build/bin/perplexity --file wikitext-2-raw/wiki.test.raw -ngl 999 --model finetuned-model.gguf
|
||||
```
|
||||
|
||||
The perplexity value of the finetuned model should be lower after training on the test set for 2 epochs.
|
||||
@@ -1,96 +0,0 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <vector>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
params.escape = false;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.use_mmap) {
|
||||
LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__);
|
||||
params.use_mmap = false;
|
||||
}
|
||||
if (params.cache_type_k != GGML_TYPE_F32) {
|
||||
LOG_INF("%s: force changing k cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
|
||||
params.cache_type_k = GGML_TYPE_F32;
|
||||
}
|
||||
if (params.cache_type_v != GGML_TYPE_F32) {
|
||||
LOG_INF("%s: force changing v cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
|
||||
params.cache_type_v = GGML_TYPE_F32;
|
||||
}
|
||||
|
||||
common_init();
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
common_init_result llama_init = common_init_from_params(params);
|
||||
llama_model_ptr & model = llama_init.model;
|
||||
llama_context_ptr & ctx = llama_init.context;
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
constexpr float val_split = 0.05f;
|
||||
|
||||
std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
|
||||
ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);
|
||||
|
||||
struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr);
|
||||
optimizer_params.adamw.alpha = 1e-7f; // learning rate
|
||||
|
||||
struct llama_opt_params lopt_params {
|
||||
/*n_ctx_train =*/ 0,
|
||||
/*param_filter =*/ llama_opt_param_filter_all,
|
||||
/*param_filter_ud =*/ nullptr,
|
||||
/*get_opt_pars =*/ ggml_opt_get_constant_optimizer_params,
|
||||
/*get_opt_pars_ud =*/ &optimizer_params,
|
||||
};
|
||||
llama_opt_init(ctx.get(), model.get(), lopt_params);
|
||||
|
||||
const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - val_split);
|
||||
|
||||
ggml_opt_result_t result_train = ggml_opt_result_init();
|
||||
ggml_opt_result_t result_eval = ggml_opt_result_init();
|
||||
|
||||
for (int epoch = 0; epoch < 2; ++epoch) {
|
||||
llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
|
||||
ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
ggml_opt_result_reset(result_train);
|
||||
ggml_opt_result_reset(result_eval);
|
||||
}
|
||||
ggml_opt_result_free(result_train);
|
||||
ggml_opt_result_free(result_eval);
|
||||
|
||||
llama_model_save_to_file(model.get(), "finetuned-model.gguf");
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -193,7 +193,6 @@ option(GGML_RPC "ggml: use RPC"
|
||||
option(GGML_SYCL "ggml: use SYCL" OFF)
|
||||
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
|
||||
option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
|
||||
option(GGML_SYCL_DNN "ggml: enable oneDNN in the SYCL backend" ON)
|
||||
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
|
||||
"ggml: sycl target device")
|
||||
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
|
||||
@@ -367,8 +366,6 @@ if (MSVC)
|
||||
/wd4005 # Macro redefinition
|
||||
/wd4244 # Conversion from one type to another type, possible loss of data
|
||||
/wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data
|
||||
/wd4996 # Disable POSIX deprecation warnings
|
||||
/wd4702 # Unreachable code warnings
|
||||
)
|
||||
function(disable_msvc_warnings target_name)
|
||||
if(TARGET ${target_name})
|
||||
|
||||
@@ -38,7 +38,7 @@ extern "C" {
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
|
||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
|
||||
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
||||
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
|
||||
|
||||
@@ -59,7 +59,7 @@ extern "C" {
|
||||
GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
@@ -248,7 +248,7 @@ extern "C" {
|
||||
// preferrably to run on the same backend as the buffer
|
||||
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
||||
|
||||
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
|
||||
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
|
||||
|
||||
// initialize buffers from a max size graph (optional)
|
||||
reserve_graph = build_graph(sched, max_batch_size);
|
||||
@@ -289,7 +289,7 @@ extern "C" {
|
||||
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
|
||||
|
||||
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
|
||||
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
|
||||
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
|
||||
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
||||
|
||||
// Initialize backend buffers from a measure graph
|
||||
|
||||
@@ -37,16 +37,13 @@ extern "C" {
|
||||
// ====== Dataset ======
|
||||
|
||||
GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
|
||||
enum ggml_type type_data, // the type for the internal data tensor
|
||||
enum ggml_type type_label, // the type for the internal labels tensor
|
||||
int64_t ne_datapoint, // number of elements per datapoint
|
||||
int64_t ne_label, // number of elements per label
|
||||
int64_t ndata, // total number of datapoints/labels
|
||||
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
|
||||
int64_t ne_datapoint, // number of elements per datapoint
|
||||
int64_t ne_label, // number of elements per label
|
||||
int64_t ndata, // total number of datapoints/labels
|
||||
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
|
||||
GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
|
||||
|
||||
// get underlying tensors that store the data
|
||||
GGML_API int64_t ggml_opt_dataset_ndata (ggml_opt_dataset_t dataset);
|
||||
GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
|
||||
GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata]
|
||||
|
||||
@@ -59,19 +56,13 @@ extern "C" {
|
||||
struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch]
|
||||
struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch]
|
||||
int64_t ibatch);
|
||||
GGML_API void ggml_opt_dataset_get_batch_host(
|
||||
ggml_opt_dataset_t dataset,
|
||||
void * data_batch,
|
||||
size_t nb_data_batch,
|
||||
void * labels_batch,
|
||||
int64_t ibatch);
|
||||
|
||||
// ====== Model / Context ======
|
||||
|
||||
enum ggml_opt_build_type {
|
||||
GGML_OPT_BUILD_TYPE_FORWARD = 10,
|
||||
GGML_OPT_BUILD_TYPE_GRAD = 20,
|
||||
GGML_OPT_BUILD_TYPE_OPT = 30,
|
||||
GGML_OPT_BUILD_TYPE_FORWARD,
|
||||
GGML_OPT_BUILD_TYPE_GRAD,
|
||||
GGML_OPT_BUILD_TYPE_OPT,
|
||||
};
|
||||
|
||||
// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
|
||||
@@ -90,22 +81,20 @@ extern "C" {
|
||||
// userdata can be used to pass arbitrary data
|
||||
typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
|
||||
|
||||
// returns the default optimizer params (constant, hard-coded values)
|
||||
// returns the default optimizer params (constant)
|
||||
// userdata is not used
|
||||
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
|
||||
|
||||
// casts userdata to ggml_opt_optimizer_params and returns it
|
||||
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata);
|
||||
|
||||
// parameters for initializing a new optimization context
|
||||
struct ggml_opt_params {
|
||||
ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
|
||||
|
||||
// by default the forward graph needs to be reconstructed for each eval
|
||||
// if ctx_compute, inputs, and outputs are set the graphs are instead allocated statically
|
||||
struct ggml_context * ctx_compute;
|
||||
struct ggml_tensor * inputs;
|
||||
struct ggml_tensor * outputs;
|
||||
struct ggml_context * ctx_compute; // created in user code, holds non-static tensors
|
||||
|
||||
// the forward graph is defined by inputs and outputs
|
||||
// those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts
|
||||
struct ggml_tensor * inputs;
|
||||
struct ggml_tensor * outputs;
|
||||
|
||||
enum ggml_opt_loss_type loss_type;
|
||||
enum ggml_opt_build_type build_type;
|
||||
@@ -118,9 +107,12 @@ extern "C" {
|
||||
|
||||
// get parameters for an optimization context with defaults set where possible
|
||||
// parameters for which no sensible defaults exist are supplied as arguments to this function
|
||||
GGML_API struct ggml_opt_params ggml_opt_default_params(
|
||||
ggml_backend_sched_t backend_sched,
|
||||
enum ggml_opt_loss_type loss_type);
|
||||
GGML_API ggml_opt_params ggml_opt_default_params(
|
||||
ggml_backend_sched_t backend_sched,
|
||||
struct ggml_context * ctx_compute,
|
||||
struct ggml_tensor * inputs,
|
||||
struct ggml_tensor * outputs,
|
||||
enum ggml_opt_loss_type loss_type);
|
||||
|
||||
GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
|
||||
GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
|
||||
@@ -129,7 +121,6 @@ extern "C" {
|
||||
GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
|
||||
|
||||
// get underlying tensors that store data
|
||||
// if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
|
||||
GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor
|
||||
GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
|
||||
GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against
|
||||
@@ -137,12 +128,11 @@ extern "C" {
|
||||
GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs
|
||||
GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
|
||||
|
||||
// get the gradient accumulator for a node from the forward graph
|
||||
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
|
||||
|
||||
// ====== Optimization Result ======
|
||||
|
||||
GGML_API ggml_opt_result_t ggml_opt_result_init(void);
|
||||
GGML_API ggml_opt_result_t ggml_opt_result_init();
|
||||
GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
|
||||
GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
|
||||
|
||||
@@ -154,20 +144,11 @@ extern "C" {
|
||||
|
||||
// ====== Computation ======
|
||||
|
||||
// if not using static graphs, this function must be called prior to ggml_opt_alloc
|
||||
GGML_API void ggml_opt_prepare_alloc(
|
||||
ggml_opt_context_t opt_ctx,
|
||||
struct ggml_context * ctx_compute,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * inputs,
|
||||
struct ggml_tensor * outputs);
|
||||
// do forward pass, increment result if not NULL
|
||||
GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
|
||||
|
||||
// allocate the next graph for evaluation, either forward or forward + backward
|
||||
// must be called exactly once prior to calling ggml_opt_eval
|
||||
GGML_API void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward);
|
||||
|
||||
// do forward pass, increment result if not NULL, do backward pass if allocated
|
||||
GGML_API void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
|
||||
// do forward pass, increment result if not NULL, do backward pass
|
||||
GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
|
||||
|
||||
// ############################################################################
|
||||
// ## The high-level functions start here. They do not depend on any private ##
|
||||
@@ -219,9 +200,9 @@ extern "C" {
|
||||
// fit model defined by inputs and outputs to dataset
|
||||
GGML_API void ggml_opt_fit(
|
||||
ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs
|
||||
struct ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
|
||||
struct ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
|
||||
struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
|
||||
ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
|
||||
ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
|
||||
ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
|
||||
ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
|
||||
enum ggml_opt_loss_type loss_type, // loss to minimize
|
||||
ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
|
||||
|
||||
@@ -673,15 +673,11 @@ extern "C" {
|
||||
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
||||
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
||||
|
||||
// returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
|
||||
GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
|
||||
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
|
||||
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
|
||||
|
||||
// returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
|
||||
GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor);
|
||||
|
||||
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
|
||||
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
|
||||
|
||||
@@ -768,7 +764,7 @@ extern "C" {
|
||||
// Tensor flags
|
||||
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_param(struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
|
||||
|
||||
//
|
||||
@@ -938,7 +934,7 @@ extern "C" {
|
||||
GGML_API struct ggml_tensor * ggml_repeat_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// concat a and b along dim
|
||||
// used in stable-diffusion
|
||||
@@ -2049,14 +2045,15 @@ extern "C" {
|
||||
|
||||
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_build_backward_expand(
|
||||
struct ggml_context * ctx, // context for gradient computation
|
||||
struct ggml_cgraph * cgraph,
|
||||
struct ggml_tensor ** grad_accs);
|
||||
struct ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation)
|
||||
struct ggml_context * ctx_compute, // context for gradient computation
|
||||
struct ggml_cgraph * cgraph,
|
||||
bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
|
||||
|
||||
// graph allocation in a context
|
||||
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
||||
GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
|
||||
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads);
|
||||
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
||||
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
|
||||
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|
||||
|
||||
@@ -214,7 +214,7 @@ add_library(ggml
|
||||
target_link_libraries(ggml PUBLIC ggml-base)
|
||||
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
target_link_libraries(ggml PRIVATE dl)
|
||||
target_link_libraries(ggml PRIVATE dl stdc++fs)
|
||||
endif()
|
||||
|
||||
function(ggml_add_backend_library backend)
|
||||
|
||||
@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||
return SIZE_MAX;
|
||||
}
|
||||
|
||||
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
|
||||
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
||||
// get_alloc_size is optional, defaults to ggml_nbytes
|
||||
if (buft->iface.get_alloc_size) {
|
||||
size_t size = buft->iface.get_alloc_size(buft, tensor);
|
||||
@@ -152,7 +152,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
|
||||
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
|
||||
}
|
||||
|
||||
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
|
||||
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
||||
}
|
||||
|
||||
@@ -674,8 +674,6 @@ struct ggml_backend_sched {
|
||||
char * context_buffer;
|
||||
size_t context_buffer_size;
|
||||
|
||||
bool op_offload;
|
||||
|
||||
int debug;
|
||||
};
|
||||
|
||||
@@ -768,7 +766,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
||||
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
|
||||
// check if a backend with higher prio wants to offload the op
|
||||
if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
|
||||
if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
|
||||
for (int b = 0; b < src_backend_id; b++) {
|
||||
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
|
||||
SET_CAUSE(tensor, "1.off");
|
||||
@@ -1111,7 +1109,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
|
||||
const int node_backend_id = tensor_backend_id(node);
|
||||
|
||||
assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
|
||||
assert(node_backend_id != -1); // all nodes should be assigned by now
|
||||
|
||||
// check if we should start a new split based on the sources of the current node
|
||||
bool need_new_split = false;
|
||||
@@ -1454,8 +1452,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
ggml_backend_buffer_type_t * bufts,
|
||||
int n_backends,
|
||||
size_t graph_size,
|
||||
bool parallel,
|
||||
bool op_offload) {
|
||||
bool parallel) {
|
||||
GGML_ASSERT(n_backends > 0);
|
||||
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
||||
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
@@ -1500,7 +1497,6 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
}
|
||||
|
||||
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
|
||||
sched->op_offload = op_offload;
|
||||
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
|
||||
@@ -385,9 +385,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
|
||||
# Fetch KleidiAI sources:
|
||||
include(FetchContent)
|
||||
set(KLEIDIAI_COMMIT_TAG "v1.6.0")
|
||||
set(KLEIDIAI_COMMIT_TAG "v1.5.0")
|
||||
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
|
||||
set(KLEIDIAI_ARCHIVE_MD5 "75b4ad68f25ab673dcc01065e5a0b05f")
|
||||
set(KLEIDIAI_ARCHIVE_MD5 "ea22e1aefb800e9bc8c74d91633cc58e")
|
||||
|
||||
if (POLICY CMP0135)
|
||||
cmake_policy(SET CMP0135 NEW)
|
||||
@@ -428,7 +428,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
${KLEIDIAI_SRC}/kai/ukernels/
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
|
||||
|
||||
set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
|
||||
@@ -439,19 +438,17 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
|
||||
string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)
|
||||
|
||||
set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP})
|
||||
set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS})
|
||||
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
|
||||
|
||||
if (NOT DOTPROD_ENABLED MATCHES -1)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
|
||||
endif()
|
||||
|
||||
if (NOT I8MM_ENABLED MATCHES -1)
|
||||
@@ -459,13 +456,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
endif()
|
||||
|
||||
if (NOT SME_ENABLED MATCHES -1)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
|
||||
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c)
|
||||
set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c)
|
||||
list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c)
|
||||
set(PRIVATE_ARCH_FLAGS "${PRIVATE_ARCH_FLAGS}+sve+sve2")
|
||||
endif()
|
||||
|
||||
set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
|
||||
|
||||
@@ -72,6 +72,8 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#pragma GCC diagnostic ignored "-Woverlength-strings"
|
||||
#elif defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
@@ -20,6 +20,12 @@
|
||||
#define GROUP_MAX_EPS_IQ1_M 1e-7f
|
||||
#define GROUP_MAX_EPS_IQ1_S 1e-12f
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid warnings for hundreds of casts
|
||||
// we should just be careful :)
|
||||
#pragma warning(disable: 4244 4267)
|
||||
#endif
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
|
||||
@@ -6590,118 +6596,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
#elif defined(__VXE__) || defined(__VXE2__)
|
||||
uint32_t aux[3];
|
||||
uint32_t utmp[4];
|
||||
|
||||
const int32x4_t v_z = vec_splat_s32(0);
|
||||
const uint8x16_t v_3m = vec_splat_u8(0x03);
|
||||
|
||||
const uint8x16_t v_0c = vec_splat_u8(1);
|
||||
const uint8x16_t v_1c = vec_sl(v_0c, 1);
|
||||
const uint8x16_t v_2c = vec_sl(v_0c, 2);
|
||||
const uint8x16_t v_3c = vec_sl(v_0c, 3);
|
||||
|
||||
uint8x16_t q3h[4];
|
||||
uint8x16_t q3b[2];
|
||||
int8x16_t q3bytes[4];
|
||||
int8x16_t q8bytes[4];
|
||||
uint8x16_t qhbits[2];
|
||||
|
||||
float sum = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
|
||||
const uint8_t * restrict x0l = x[i].qs;
|
||||
const uint8_t * restrict x0h = x[i].hmask;
|
||||
const int8_t * restrict y0 = y[i].qs;
|
||||
|
||||
qhbits[0] = vec_xl(0 , x0h);
|
||||
qhbits[1] = vec_xl(16, x0h);
|
||||
|
||||
int32_t isum = 0;
|
||||
|
||||
memcpy(aux, x[i].scales, 12);
|
||||
utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
|
||||
utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
|
||||
utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
|
||||
utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
|
||||
|
||||
int8_t * scale = (int8_t *)utmp;
|
||||
for (int j = 0; j < 16; ++j) scale[j] -= 32;
|
||||
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
int32x4_t isum0, isum1, isum2, isum3;
|
||||
|
||||
q3b[0] = vec_xl(0 , x0l);
|
||||
q3b[1] = vec_xl(16, x0l);
|
||||
x0l += 32;
|
||||
|
||||
q8bytes[0] = vec_xl(0 , y0);
|
||||
q8bytes[1] = vec_xl(16 , y0);
|
||||
q8bytes[2] = vec_xl(32 , y0);
|
||||
q8bytes[3] = vec_xl(48 , y0);
|
||||
q8bytes[4] = vec_xl(64 , y0);
|
||||
q8bytes[5] = vec_xl(80 , y0);
|
||||
q8bytes[6] = vec_xl(96 , y0);
|
||||
q8bytes[7] = vec_xl(112, y0);
|
||||
y0 += 128;
|
||||
|
||||
q3h[0] = vec_sl(vec_andc(v_0c, qhbits[0]), 2);
|
||||
q3h[1] = vec_sl(vec_andc(v_0c, qhbits[1]), 2);
|
||||
q3h[2] = vec_sl(vec_andc(v_1c, qhbits[0]), 1);
|
||||
q3h[3] = vec_sl(vec_andc(v_1c, qhbits[1]), 1);
|
||||
|
||||
q3bytes[0] = vec_sub((int8x16_t)vec_and(q3b[0], v_3m), (int8x16_t)q3h[0]);
|
||||
q3bytes[1] = vec_sub((int8x16_t)vec_and(q3b[1], v_3m), (int8x16_t)q3h[1]);
|
||||
q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 2), v_3m), (int8x16_t)q3h[2]);
|
||||
q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 2), v_3m), (int8x16_t)q3h[3]);
|
||||
|
||||
isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[0]);
|
||||
isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[1]);
|
||||
isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[2]);
|
||||
isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[3]);
|
||||
|
||||
isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
|
||||
isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
|
||||
isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
|
||||
isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
|
||||
|
||||
scale += 4;
|
||||
|
||||
q3h[0] = vec_andc(v_2c, qhbits[0]);
|
||||
q3h[1] = vec_andc(v_2c, qhbits[1]);
|
||||
q3h[2] = vec_sr(vec_andc(v_3c, qhbits[0]), 1);
|
||||
q3h[3] = vec_sr(vec_andc(v_3c, qhbits[1]), 1);
|
||||
|
||||
q3bytes[0] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 4), v_3m), (int8x16_t)q3h[0]);
|
||||
q3bytes[1] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 4), v_3m), (int8x16_t)q3h[1]);
|
||||
q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 6), v_3m), (int8x16_t)q3h[2]);
|
||||
q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 6), v_3m), (int8x16_t)q3h[3]);
|
||||
|
||||
isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[4]);
|
||||
isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[5]);
|
||||
isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
|
||||
isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
|
||||
|
||||
isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
|
||||
isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
|
||||
isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
|
||||
isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
|
||||
|
||||
scale += 4;
|
||||
|
||||
if (j == 0) {
|
||||
qhbits[0] = vec_sr(qhbits[0], 4);
|
||||
qhbits[1] = vec_sr(qhbits[1], 4);
|
||||
}
|
||||
}
|
||||
|
||||
sum += d * isum;
|
||||
}
|
||||
|
||||
*s = sum;
|
||||
#else
|
||||
// scalar version
|
||||
// This function is written like this so the compiler can manage to vectorize most of it
|
||||
@@ -8519,11 +8414,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
#ifdef __ARM_FEATURE_MATMUL_INT8
|
||||
assert((nrc == 2) || (nrc == 1));
|
||||
#else
|
||||
assert(nrc == 1);
|
||||
#endif
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
@@ -8534,197 +8425,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
const int nb = n / QK_K;
|
||||
|
||||
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
if (nrc == 2) {
|
||||
const block_q6_K * GGML_RESTRICT x0 = x;
|
||||
const block_q6_K * GGML_RESTRICT x1 = (const block_q6_K *) ((const uint8_t *)vx + bx);
|
||||
const block_q8_K * GGML_RESTRICT y0 = y;
|
||||
const block_q8_K * GGML_RESTRICT y1 = (const block_q8_K *) ((const uint8_t *)vy + by);
|
||||
|
||||
float32x4_t vfsum = vdupq_n_f32(0.0f);
|
||||
|
||||
for (int i = 0; i < nb; ++i, ++x0, ++x1, ++y0, ++y1) {
|
||||
const uint8_t * GGML_RESTRICT ql0 = x0->ql;
|
||||
const uint8_t * GGML_RESTRICT ql1 = x1->ql;
|
||||
const uint8_t * GGML_RESTRICT qh0 = x0->qh;
|
||||
const uint8_t * GGML_RESTRICT qh1 = x1->qh;
|
||||
const int8_t * GGML_RESTRICT qy0 = y0->qs;
|
||||
const int8_t * GGML_RESTRICT qy1 = y1->qs;
|
||||
|
||||
const uint8x16_t mone = vdupq_n_u8(0x30);
|
||||
const uint8x16_t m4b = vdupq_n_u8(0x0f);
|
||||
|
||||
int32x4_t visum = vdupq_n_s32(0);
|
||||
|
||||
// process 8 blocks per iteration, totally 16 blocks
|
||||
for (int j = 0; j < 2; ++j, qh0 += 32, ql0 += 64, qh1 += 32, ql1 += 64) {
|
||||
int8x16_t vx0[8], vx1[8];
|
||||
|
||||
// de-quantize vx0[8]
|
||||
{
|
||||
const uint8x16x2_t qh_bits = vld1q_u8_x2(qh0);
|
||||
const uint8x16x4_t ql_bits = vld1q_u8_x4(ql0);
|
||||
|
||||
uint8x16_t q6h_0 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 4));
|
||||
uint8x16_t q6h_1 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 4));
|
||||
uint8x16_t q6h_2 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 2));
|
||||
uint8x16_t q6h_3 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 2));
|
||||
|
||||
vx0[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[0], m4b), q6h_0));
|
||||
vx0[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[1], m4b), q6h_1));
|
||||
vx0[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[2], m4b), q6h_2));
|
||||
vx0[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[3], m4b), q6h_3));
|
||||
|
||||
q6h_0 = vandq_u8(mone, qh_bits.val[0]);
|
||||
q6h_1 = vandq_u8(mone, qh_bits.val[1]);
|
||||
q6h_2 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[0], 2));
|
||||
q6h_3 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[1], 2));
|
||||
|
||||
vx0[4] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[0], 4), q6h_0));
|
||||
vx0[5] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[1], 4), q6h_1));
|
||||
vx0[6] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[2], 4), q6h_2));
|
||||
vx0[7] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[3], 4), q6h_3));
|
||||
}
|
||||
|
||||
// de-quantize vx1[8]
|
||||
{
|
||||
const uint8x16x2_t qh_bits = vld1q_u8_x2(qh1);
|
||||
const uint8x16x4_t ql_bits = vld1q_u8_x4(ql1);
|
||||
|
||||
uint8x16_t q6h_0 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 4));
|
||||
uint8x16_t q6h_1 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 4));
|
||||
uint8x16_t q6h_2 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 2));
|
||||
uint8x16_t q6h_3 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 2));
|
||||
|
||||
vx1[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[0], m4b), q6h_0));
|
||||
vx1[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[1], m4b), q6h_1));
|
||||
vx1[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[2], m4b), q6h_2));
|
||||
vx1[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[3], m4b), q6h_3));
|
||||
|
||||
q6h_0 = vandq_u8(mone, qh_bits.val[0]);
|
||||
q6h_1 = vandq_u8(mone, qh_bits.val[1]);
|
||||
q6h_2 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[0], 2));
|
||||
q6h_3 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[1], 2));
|
||||
|
||||
vx1[4] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[0], 4), q6h_0));
|
||||
vx1[5] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[1], 4), q6h_1));
|
||||
vx1[6] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[2], 4), q6h_2));
|
||||
vx1[7] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[3], 4), q6h_3));
|
||||
}
|
||||
|
||||
// process 16 elements (one block with same scale) per iteration
|
||||
// - vx = concat(ql, qh) - 32
|
||||
// - r1,r2,r3,r4 = smmla(vx, vy)
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
const int blk = j * 8 + k;
|
||||
|
||||
const int8x16_t vy0 = vld1q_s8(qy0);
|
||||
const int8x16_t vy1 = vld1q_s8(qy1);
|
||||
qy0 += 16;
|
||||
qy1 += 16;
|
||||
|
||||
const int32x4_t block_scale = {
|
||||
x0->scales[blk],
|
||||
x0->scales[blk],
|
||||
x1->scales[blk],
|
||||
x1->scales[blk],
|
||||
};
|
||||
|
||||
// calculate four results at once with outer product
|
||||
const int8x16_t vx_l = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(vx0[k]), vreinterpretq_s64_s8(vx1[k])));
|
||||
const int8x16_t vx_h = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(vx0[k]), vreinterpretq_s64_s8(vx1[k])));
|
||||
const int8x16_t vy_l = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(vy0), vreinterpretq_s64_s8(vy1)));
|
||||
const int8x16_t vy_h = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(vy0), vreinterpretq_s64_s8(vy1)));
|
||||
int32x4_t vr = vdupq_n_s32(0);
|
||||
vr = vmmlaq_s32(vr, vx_l, vy_l);
|
||||
vr = vmmlaq_s32(vr, vx_h, vy_h);
|
||||
|
||||
// apply block scale, will NOT overflow
|
||||
// block_scale * sum_256(int6*int8) <= 2^(8+8+6+8) = 30 bits
|
||||
visum = vmlaq_s32(visum, vr, block_scale);
|
||||
}
|
||||
}
|
||||
|
||||
// adjust bias, apply superblock scale
|
||||
{
|
||||
int32_t bias[4];
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
const svbool_t pg16_8 = svptrue_pat_b16(SV_VL8);
|
||||
const svbool_t pg8_8 = svptrue_pat_b8(SV_VL8);
|
||||
const svint16_t y0_q8sums_0 = svld1_s16(pg16_8, y0->bsums);
|
||||
const svint16_t y0_q8sums_1 = svld1_s16(pg16_8, y0->bsums + 8);
|
||||
const svint16_t y1_q8sums_0 = svld1_s16(pg16_8, y1->bsums);
|
||||
const svint16_t y1_q8sums_1 = svld1_s16(pg16_8, y1->bsums + 8);
|
||||
const svint16_t x0_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x0->scales));
|
||||
const svint16_t x0_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x0->scales + 8));
|
||||
const svint16_t x1_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x1->scales));
|
||||
const svint16_t x1_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x1->scales + 8));
|
||||
const svint64_t zero = svdup_n_s64(0);
|
||||
bias[0] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x0_q6scales_0),
|
||||
svdot_s64(zero, y0_q8sums_1, x0_q6scales_1)));
|
||||
bias[1] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x0_q6scales_0),
|
||||
svdot_s64(zero, y1_q8sums_1, x0_q6scales_1)));
|
||||
bias[2] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x1_q6scales_0),
|
||||
svdot_s64(zero, y0_q8sums_1, x1_q6scales_1)));
|
||||
bias[3] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x1_q6scales_0),
|
||||
svdot_s64(zero, y1_q8sums_1, x1_q6scales_1)));
|
||||
#else
|
||||
// NEON doesn't support int16 dot product, fallback to separated mul and add
|
||||
const int16x8x2_t q8sums0 = vld1q_s16_x2(y0->bsums);
|
||||
const int16x8x2_t q8sums1 = vld1q_s16_x2(y1->bsums);
|
||||
|
||||
int8x16_t scales_s8 = vld1q_s8(x0->scales);
|
||||
const int16x8x2_t q6scales0 = {{vmovl_s8(vget_low_s8(scales_s8)), vmovl_s8(vget_high_s8(scales_s8))}};
|
||||
scales_s8 = vld1q_s8(x1->scales);
|
||||
const int16x8x2_t q6scales1 = {{vmovl_s8(vget_low_s8(scales_s8)), vmovl_s8(vget_high_s8(scales_s8))}};
|
||||
|
||||
int32x4_t prod;
|
||||
prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[0]), vget_low_s16 (q6scales0.val[0])),
|
||||
vmull_s16(vget_high_s16(q8sums0.val[0]), vget_high_s16(q6scales0.val[0]))),
|
||||
vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[1]), vget_low_s16 (q6scales0.val[1])),
|
||||
vmull_s16(vget_high_s16(q8sums0.val[1]), vget_high_s16(q6scales0.val[1]))));
|
||||
bias[0] = vaddvq_s32(prod);
|
||||
prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[0]), vget_low_s16 (q6scales0.val[0])),
|
||||
vmull_s16(vget_high_s16(q8sums1.val[0]), vget_high_s16(q6scales0.val[0]))),
|
||||
vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[1]), vget_low_s16 (q6scales0.val[1])),
|
||||
vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales0.val[1]))));
|
||||
bias[1] = vaddvq_s32(prod);
|
||||
prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[0]), vget_low_s16 (q6scales1.val[0])),
|
||||
vmull_s16(vget_high_s16(q8sums0.val[0]), vget_high_s16(q6scales1.val[0]))),
|
||||
vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[1]), vget_low_s16 (q6scales1.val[1])),
|
||||
vmull_s16(vget_high_s16(q8sums0.val[1]), vget_high_s16(q6scales1.val[1]))));
|
||||
bias[2] = vaddvq_s32(prod);
|
||||
prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[0]), vget_low_s16 (q6scales1.val[0])),
|
||||
vmull_s16(vget_high_s16(q8sums1.val[0]), vget_high_s16(q6scales1.val[0]))),
|
||||
vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[1]), vget_low_s16 (q6scales1.val[1])),
|
||||
vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales1.val[1]))));
|
||||
bias[3] = vaddvq_s32(prod);
|
||||
|
||||
#endif
|
||||
const int32x4_t vibias = vmulq_n_s32(vld1q_s32(bias), 32);
|
||||
|
||||
const float32x4_t superblock_scale = {
|
||||
GGML_FP16_TO_FP32(x0->d) * y0->d,
|
||||
GGML_FP16_TO_FP32(x0->d) * y1->d,
|
||||
GGML_FP16_TO_FP32(x1->d) * y0->d,
|
||||
GGML_FP16_TO_FP32(x1->d) * y1->d,
|
||||
};
|
||||
|
||||
visum = vsubq_s32(visum, vibias);
|
||||
vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale);
|
||||
}
|
||||
}
|
||||
|
||||
// vfsum = ABCD -> ACBD
|
||||
// AC -> s, BD -> (s+bs)
|
||||
vfsum = vzip1q_f32(vfsum, vextq_f32(vfsum, vfsum, 2));
|
||||
vst1_f32(s, vget_low_f32 (vfsum));
|
||||
vst1_f32(s + bs, vget_high_f32(vfsum));
|
||||
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
const int vector_length = ggml_cpu_get_sve_cnt()*8;
|
||||
float sum = 0;
|
||||
|
||||
@@ -50,6 +50,19 @@
|
||||
#include "llamafile/sgemm.h"
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid hundreds of casts
|
||||
// we should just be careful :)
|
||||
#pragma warning(disable: 4244 4267)
|
||||
|
||||
// disable POSIX deprecation warnings
|
||||
// these functions are never going away, anyway
|
||||
#pragma warning(disable: 4996)
|
||||
|
||||
// unreachable code because of multiple instances of code after GGML_ABORT
|
||||
#pragma warning(disable: 4702)
|
||||
#endif
|
||||
|
||||
// Note: once we move threading into a separate C++ file
|
||||
// will use std::hardware_destructive_interference_size instead of hardcoding it here
|
||||
// and we'll use C++ attribute syntax.
|
||||
@@ -282,11 +295,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||
.from_float = quantize_row_q6_K,
|
||||
.vec_dot = ggml_vec_dot_q6_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||
.nrows = 2,
|
||||
#else
|
||||
.nrows = 1,
|
||||
#endif
|
||||
},
|
||||
[GGML_TYPE_IQ2_XXS] = {
|
||||
.from_float = NULL,
|
||||
|
||||
@@ -11,26 +11,24 @@
|
||||
#include <vector>
|
||||
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
# include "ggml-cpu-hbm.h"
|
||||
#include "ggml-cpu-hbm.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CPU_KLEIDIAI
|
||||
# include "kleidiai/kleidiai.h"
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
#else
|
||||
# include <unistd.h>
|
||||
#include "kleidiai/kleidiai.h"
|
||||
#endif
|
||||
|
||||
#if defined(__APPLE__)
|
||||
# include <sys/sysctl.h>
|
||||
# include <sys/types.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
// ggml-backend interface
|
||||
@@ -72,10 +70,8 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_ty
|
||||
}
|
||||
|
||||
static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
|
||||
for (auto * extra : ggml_backend_cpu_get_extra_buffers_type()) {
|
||||
if (extra && extra == buft) {
|
||||
return true;
|
||||
}
|
||||
for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
|
||||
if (extra && extra == buft) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@@ -334,18 +330,9 @@ static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t d
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
#ifdef _WIN32
|
||||
MEMORYSTATUSEX status;
|
||||
status.dwLength = sizeof(status);
|
||||
GlobalMemoryStatusEx(&status);
|
||||
*total = status.ullTotalPhys;
|
||||
*free = status.ullAvailPhys;
|
||||
#else
|
||||
long pages = sysconf(_SC_PHYS_PAGES);
|
||||
long page_size = sysconf(_SC_PAGE_SIZE);
|
||||
*total = pages * page_size;
|
||||
*free = *total;
|
||||
#endif
|
||||
// TODO
|
||||
*free = 0;
|
||||
*total = 0;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
@@ -4,22 +4,16 @@
|
||||
|
||||
// KleidiAI micro-kernels
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p_qsi4c32p_interface.h"
|
||||
#include "kai_lhs_quant_pack_qsi8d32p_f32.h"
|
||||
#include "kai_lhs_quant_pack_qsi8d32p_f32_neon.h"
|
||||
#include "kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h"
|
||||
#include "kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h"
|
||||
#include "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h"
|
||||
#include "kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.h"
|
||||
|
||||
#include "kai_lhs_pack_bf16p2vlx2_f32_sme.h"
|
||||
#include "kai_lhs_quant_pack_qsi8d32p_f32.h"
|
||||
#include "kai_lhs_quant_pack_qsi8d32p_f32_neon.h"
|
||||
|
||||
#include "kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.h"
|
||||
#include "kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h"
|
||||
#include "kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h"
|
||||
|
||||
#include "kai_common.h"
|
||||
|
||||
#include "kernels.h"
|
||||
@@ -67,53 +61,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_SME,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
{
|
||||
/* SME GEMM */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_lhs_offset = */ kai_get_lhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .run_kernel = */ kai_run_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
},
|
||||
/* SME GEMV */
|
||||
/* .kern_info = */ {
|
||||
/* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_mr = */ kai_get_mr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_nr = */ kai_get_nr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_kr = */ kai_get_kr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_sr = */ kai_get_sr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_lhs_offset = */ kai_get_lhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
/* .run_kernel = */ kai_run_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
|
||||
},
|
||||
/* .lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_pack_bf16p2vlx2_f32_sme,
|
||||
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_pack_bf16p2vlx2_f32_sme,
|
||||
/* .packed_size = */ kai_get_lhs_packed_size_lhs_pack_bf16p2vlx2_f32_sme,
|
||||
/* .pack_func = */ kai_run_lhs_pack_bf16p2vlx2_f32_sme,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
|
||||
/* .pack_func = */ kai_run_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_SME,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_F16,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
#if defined(__APPLE__)
|
||||
@@ -158,9 +105,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
@@ -204,9 +148,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
#else
|
||||
@@ -251,9 +192,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
@@ -297,33 +235,12 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .pack_func = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
},
|
||||
/* .required_cpu = */ CPU_FEATURE_DOTPROD,
|
||||
/* .lhs_type = */ GGML_TYPE_F32,
|
||||
/* .rhs_type = */ GGML_TYPE_Q4_0,
|
||||
/* .op_type = */ GGML_TYPE_F32,
|
||||
},
|
||||
#endif
|
||||
#endif
|
||||
};
|
||||
|
||||
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor) {
|
||||
ggml_kleidiai_kernels * kernel = nullptr;
|
||||
|
||||
if (tensor->op == GGML_OP_MUL_MAT && tensor->src[0] != nullptr && tensor->src[1] != nullptr) {
|
||||
for (size_t i = 0; i < NELEMS(gemm_gemv_kernels); ++i) {
|
||||
if ((cpu_features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu &&
|
||||
gemm_gemv_kernels[i].lhs_type == tensor->src[1]->type &&
|
||||
gemm_gemv_kernels[i].rhs_type == tensor->src[0]->type &&
|
||||
gemm_gemv_kernels[i].op_type == tensor->type) {
|
||||
kernel = &gemm_gemv_kernels[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return kernel;
|
||||
}
|
||||
|
||||
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features) {
|
||||
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature features) {
|
||||
ggml_kleidiai_kernels * kernels = nullptr;
|
||||
|
||||
for (size_t i = 0; i < NELEMS(gemm_gemv_kernels); ++i) {
|
||||
|
||||
@@ -4,10 +4,6 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <variant>
|
||||
#include "ggml.h"
|
||||
|
||||
enum cpu_feature {
|
||||
CPU_FEATURE_NONE = 0,
|
||||
CPU_FEATURE_DOTPROD = 1,
|
||||
@@ -30,53 +26,26 @@ struct kernel_info {
|
||||
size_t (*get_nr)(void);
|
||||
size_t (*get_kr)(void);
|
||||
size_t (*get_sr)(void);
|
||||
std::variant<
|
||||
std::function<size_t(size_t n_idx, size_t k, size_t bl)>,
|
||||
std::function<size_t(size_t m_idx, size_t k)>
|
||||
> get_lhs_offset;
|
||||
std::variant<
|
||||
std::function<size_t(size_t n_idx, size_t k, size_t bl)>,
|
||||
std::function<size_t(size_t n_idx, size_t k)>
|
||||
> get_rhs_packed_offset;
|
||||
size_t (*get_lhs_offset)(size_t m_idx, size_t k, size_t bl);
|
||||
size_t (*get_rhs_packed_offset)(size_t n_idx, size_t k, size_t bl);
|
||||
size_t (*get_dst_offset)(size_t m_idx, size_t n_idx, size_t stride);
|
||||
size_t (*get_dst_size)(size_t m, size_t n);
|
||||
std::variant<
|
||||
std::function<void(size_t m, size_t n, size_t k, size_t bl, const void* lhs_packed, const void* rhs_packed,
|
||||
float* dst, size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max)>,
|
||||
std::function<void(size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed, void* dst, size_t dst_stride_row,
|
||||
size_t dst_stride_col, float clamp_min, float clamp_max)>
|
||||
> run_kernel;
|
||||
void (*run_kernel)(size_t m, size_t n, size_t k, size_t bl, const void* lhs_packed, const void* rhs_packed,
|
||||
float* dst, size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max);
|
||||
};
|
||||
|
||||
struct lhs_packing_info {
|
||||
size_t (*get_offset)(size_t m_idx, size_t lhs_stride);
|
||||
std::variant<
|
||||
std::function<size_t(size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr)>,
|
||||
std::function<size_t(size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr)>
|
||||
> get_packed_offset;
|
||||
std::variant<
|
||||
std::function<size_t(size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr)>,
|
||||
std::function<size_t(size_t m, size_t k, size_t mr, size_t kr, size_t sr)>
|
||||
> packed_size;
|
||||
std::variant<
|
||||
std::function<void(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs,
|
||||
size_t lhs_stride, void* lhs_packed)>,
|
||||
std::function<void(size_t m, size_t k, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const void* lhs, size_t lhs_stride,
|
||||
void* lhs_packed)>
|
||||
> pack_func;
|
||||
size_t (*get_packed_offset)(size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
|
||||
size_t (*packed_size)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
|
||||
void (*pack_func)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs,
|
||||
size_t lhs_stride, void* lhs_packed);
|
||||
};
|
||||
|
||||
struct rhs_packing_info {
|
||||
std::variant<
|
||||
std::function<size_t(size_t n, size_t k, size_t nr, size_t kr, size_t bl)>,
|
||||
std::function<size_t(size_t n, size_t k)>
|
||||
> packed_size;
|
||||
std::variant<
|
||||
std::function<void(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl, const uint8_t* rhs,
|
||||
const float* bias, void* rhs_packed, size_t extra_bytes, const struct kai_rhs_pack_qs4cxs1s0_param* params)>,
|
||||
std::function<void(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs,
|
||||
const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params)>
|
||||
> pack_func;
|
||||
size_t (*packed_size)(size_t n, size_t k, size_t nr, size_t kr, size_t bl);
|
||||
void (*pack_func)(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl, const uint8_t* rhs,
|
||||
const float* bias, void* rhs_packed, size_t extra_bytes, const struct kai_rhs_pack_qs4cxs1s0_param* params);
|
||||
};
|
||||
|
||||
struct ggml_kleidiai_kernels {
|
||||
@@ -86,10 +55,6 @@ struct ggml_kleidiai_kernels {
|
||||
rhs_packing_info rhs_info;
|
||||
|
||||
cpu_feature required_cpu;
|
||||
ggml_type lhs_type;
|
||||
ggml_type rhs_type;
|
||||
ggml_type op_type;
|
||||
};
|
||||
|
||||
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor);
|
||||
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features);
|
||||
ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features);
|
||||
|
||||
@@ -3,9 +3,7 @@
|
||||
//
|
||||
#include <arm_neon.h>
|
||||
#include <assert.h>
|
||||
#include <atomic>
|
||||
#include <cfloat>
|
||||
#include <stdexcept>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#if defined(__linux__)
|
||||
@@ -36,9 +34,8 @@
|
||||
#include "ggml-common.h"
|
||||
|
||||
struct ggml_kleidiai_context {
|
||||
cpu_feature features;
|
||||
ggml_kleidiai_kernels * kernels;
|
||||
} static ctx = { CPU_FEATURE_NONE, NULL };
|
||||
} static ctx = { NULL };
|
||||
|
||||
static void init_kleidiai_context(void) {
|
||||
|
||||
@@ -50,18 +47,18 @@ static void init_kleidiai_context(void) {
|
||||
const char *env_var = getenv("GGML_KLEIDIAI_SME");
|
||||
int sme_enabled = 0;
|
||||
|
||||
ctx.features = (ggml_cpu_has_dotprod() ? CPU_FEATURE_DOTPROD : CPU_FEATURE_NONE) |
|
||||
(ggml_cpu_has_matmul_int8() ? CPU_FEATURE_I8MM : CPU_FEATURE_NONE) |
|
||||
(ggml_cpu_has_sve() ? CPU_FEATURE_SVE : CPU_FEATURE_NONE);
|
||||
cpu_feature features = (ggml_cpu_has_dotprod() ? CPU_FEATURE_DOTPROD : CPU_FEATURE_NONE) |
|
||||
(ggml_cpu_has_matmul_int8() ? CPU_FEATURE_I8MM : CPU_FEATURE_NONE) |
|
||||
(ggml_cpu_has_sve() ? CPU_FEATURE_SVE : CPU_FEATURE_NONE);
|
||||
|
||||
if (env_var) {
|
||||
sme_enabled = atoi(env_var);
|
||||
}
|
||||
|
||||
if (sme_enabled != 0) {
|
||||
ctx.features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
|
||||
features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
|
||||
}
|
||||
ctx.kernels = ggml_kleidiai_select_kernels_q4_0(ctx.features);
|
||||
ctx.kernels = ggml_kleidiai_select_kernels(features);
|
||||
}
|
||||
ggml_critical_section_end();
|
||||
}
|
||||
@@ -71,275 +68,95 @@ static inline int64_t ggml_ne(const ggml_tensor * tensor, int dim) {
|
||||
return tensor->ne[dim];
|
||||
}
|
||||
|
||||
template<typename Ret, typename Variant, typename... Args>
|
||||
static Ret variant_call(const Variant & var, Args&&... args) {
|
||||
return std::visit([&](auto&& func) -> Ret {
|
||||
if constexpr (std::is_invocable_r_v<Ret, decltype(func), Args...>) {
|
||||
return func(std::forward<Args>(args)...);
|
||||
} else {
|
||||
throw std::runtime_error("Invalid function type in variant_call");
|
||||
}
|
||||
}, var);
|
||||
}
|
||||
|
||||
namespace ggml::cpu::kleidiai {
|
||||
|
||||
static size_t round_down(size_t x, size_t y) {
|
||||
return y == 0 ? x : x - (x % y);
|
||||
}
|
||||
|
||||
static void transpose_f32kxn_f16nxk(size_t n, size_t k, float * dst, const uint16_t * src, size_t rhs_stride) {
|
||||
size_t src_stride = rhs_stride / sizeof(uint16_t);
|
||||
size_t dst_stride = n;
|
||||
|
||||
for (size_t k_idx = 0; k_idx < k; ++k_idx) {
|
||||
for (size_t n_idx = 0; n_idx < n; ++n_idx) {
|
||||
uint16_t v = *(src + k_idx + n_idx * src_stride);
|
||||
*(dst + n_idx + k_idx * dst_stride) = kai_cast_f32_f16(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
|
||||
ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, op);
|
||||
GGML_ASSERT(kernels);
|
||||
kernel_info * kernel = op->src[1]->ne[1] == 1 ? &kernels->gemv : &kernels->gemm;
|
||||
GGML_ASSERT(ctx.kernels);
|
||||
kernel_info * kernel = op->src[1]->ne[1] == 1 ? &ctx.kernels->gemv : &ctx.kernels->gemm;
|
||||
|
||||
size_t k = op->src[0]->ne[0];
|
||||
size_t n = op->src[0]->ne[1];
|
||||
size_t m = op->src[1]->ne[1];
|
||||
|
||||
size_t mr = kernel->get_mr();
|
||||
size_t kr = kernel->get_kr();
|
||||
size_t sr = kernel->get_sr();
|
||||
|
||||
if (kernels->rhs_type == GGML_TYPE_Q4_0) {
|
||||
size = variant_call<size_t>(kernels->lhs_info.packed_size, m, k, QK4_0, mr, kr, sr);
|
||||
} else if (kernels->rhs_type == GGML_TYPE_F16) {
|
||||
size = variant_call<size_t>(kernels->lhs_info.packed_size, m, k, mr, kr, sr) +
|
||||
variant_call<size_t>(kernels->rhs_info.packed_size, n, k) +
|
||||
k * n * sizeof(float) + n * sizeof(float);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
size = ctx.kernels->lhs_info.packed_size(m, k, QK4_0, mr, kr, sr);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * dst) override {
|
||||
if (dst->op == GGML_OP_MUL_MAT) {
|
||||
if (dst->src[0]->type == GGML_TYPE_Q4_0) {
|
||||
return compute_forward_q4_0(params, dst);
|
||||
} else if (dst->src[0]->type == GGML_TYPE_F16) {
|
||||
return compute_forward_kv_cache(params, dst);
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
bool compute_forward_kv_cache(ggml_compute_params * params, struct ggml_tensor * dst) {
|
||||
static std::atomic_flag first_to_arrive = ATOMIC_FLAG_INIT;
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
GGML_ASSERT(ctx.kernels);
|
||||
kernel_info * kernel = src1->ne[1] == 1 ? &ctx.kernels->gemv : &ctx.kernels->gemm;
|
||||
lhs_packing_info * lhs_info = &ctx.kernels->lhs_info;
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
GGML_ASSERT(kernel);
|
||||
|
||||
ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
|
||||
GGML_ASSERT(kernels);
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
kernel_info * kernel = src1->ne[1] == 1 ? &kernels->gemv : &kernels->gemm;
|
||||
GGML_ASSERT(kernel);
|
||||
const size_t k = ne00;
|
||||
const size_t m = ne11;
|
||||
const size_t n = ne01;
|
||||
|
||||
const int nth = params->nth;
|
||||
const int ith = params->ith;
|
||||
const size_t n_step = kernel->get_n_step();
|
||||
const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
|
||||
const size_t n_start = ith * num_n_per_thread;
|
||||
|
||||
const int64_t lhs_batch_size0 = ne12;
|
||||
const int64_t rhs_batch_size0 = ne02;
|
||||
const int64_t batch_size = rhs_batch_size0;
|
||||
|
||||
const int64_t r = lhs_batch_size0 / rhs_batch_size0;
|
||||
|
||||
const int64_t m = ne11 * r;
|
||||
const int64_t n = ne01;
|
||||
const int64_t k = ne00;
|
||||
|
||||
const size_t lhs_stride = src1->nb[1];
|
||||
const size_t rhs_stride = src0->nb[1];
|
||||
const size_t dst_stride = dst->nb[1];
|
||||
|
||||
const int64_t mr = static_cast<int64_t>(kernel->get_mr());
|
||||
const int64_t nr = static_cast<int64_t>(kernel->get_nr());
|
||||
const int64_t kr = static_cast<int64_t>(kernel->get_kr());
|
||||
const int64_t sr = static_cast<int64_t>(kernel->get_sr());
|
||||
|
||||
const size_t lhs_packed_size = variant_call<size_t>(kernels->lhs_info.packed_size, m, k, mr, kr, sr);
|
||||
const size_t rhs_packed_size = variant_call<size_t>(kernels->rhs_info.packed_size, n, k);
|
||||
const size_t kxn_size = k * n * sizeof(float);
|
||||
const size_t bias_size = n * sizeof(float);
|
||||
|
||||
const size_t wsize_required = lhs_packed_size + rhs_packed_size + kxn_size + bias_size;
|
||||
GGML_ASSERT(wsize_required <= params->wsize);
|
||||
|
||||
uint8_t * lhs_packed = static_cast<uint8_t *>(params->wdata);
|
||||
uint8_t * rhs_packed = lhs_packed + lhs_packed_size;
|
||||
uint8_t * rhs_kxn = rhs_packed + rhs_packed_size;
|
||||
uint8_t * bias = rhs_kxn + kxn_size;
|
||||
|
||||
for (int64_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
|
||||
const uint8_t * lhs_batch = static_cast<const uint8_t *>(src1->data) + batch_idx * m * lhs_stride;
|
||||
const uint8_t * rhs_batch = static_cast<const uint8_t *>(src0->data) + batch_idx * n * rhs_stride;
|
||||
uint8_t * dst_batch = static_cast<uint8_t *>(dst->data) + batch_idx * m * dst_stride;
|
||||
|
||||
// LHS packing
|
||||
{
|
||||
const int64_t m_roundup_mr = kai_roundup(m, mr);
|
||||
const int64_t num_threads = KAI_MIN(m_roundup_mr / mr, nth);
|
||||
|
||||
if (ith < num_threads) {
|
||||
const int64_t num_m_per_thread0 = round_down(m_roundup_mr / num_threads, mr);
|
||||
const int64_t num_m_per_threadN_1 = m - (num_threads - 1) * num_m_per_thread0;
|
||||
|
||||
const int64_t m_start = ith * num_m_per_thread0;
|
||||
const int64_t num_m_per_thread = (ith == num_threads - 1) ? num_m_per_threadN_1 : num_m_per_thread0;
|
||||
|
||||
const size_t lhs_offset = variant_call<size_t>(kernels->gemm.get_lhs_offset, m_start, lhs_stride);
|
||||
const size_t lhs_packed_offset = variant_call<size_t>(kernels->lhs_info.get_packed_offset, m_start, k, mr, kr, sr);
|
||||
|
||||
const void * src_ptr = static_cast<const uint8_t *>(lhs_batch) + lhs_offset;
|
||||
void * dst_ptr = static_cast<uint8_t *>(lhs_packed) + lhs_packed_offset;
|
||||
|
||||
variant_call<void>(kernels->lhs_info.pack_func, num_m_per_thread, k, mr, kr, sr, 0, src_ptr, lhs_stride, dst_ptr);
|
||||
}
|
||||
size_t n_to_process = num_n_per_thread;
|
||||
if ((n_start + n_to_process) > n) {
|
||||
n_to_process = n - n_start;
|
||||
}
|
||||
|
||||
// RHS packing
|
||||
if (first_to_arrive.test_and_set(std::memory_order_acquire) == false) {
|
||||
// First thread to reach this point handles RHS packing
|
||||
memset(bias, 0, n * sizeof(float));
|
||||
transpose_f32kxn_f16nxk(n, k, reinterpret_cast<float *>(rhs_kxn),
|
||||
reinterpret_cast<const uint16_t *>(rhs_batch), rhs_stride);
|
||||
const uint8_t * lhs = static_cast<const uint8_t *>(src1->data);
|
||||
uint8_t * lhs_packed = (uint8_t*)params->wdata;
|
||||
const uint8_t * rhs_packed = static_cast<const uint8_t *>(src0->data);
|
||||
|
||||
variant_call<void>(kernels->rhs_info.pack_func, 1, n, k, nr, kr, sr, n * sizeof(float),
|
||||
rhs_kxn, bias, nullptr, rhs_packed, 0, nullptr);
|
||||
size_t mr = kernel->get_mr();
|
||||
size_t kr = kernel->get_kr();
|
||||
size_t sr = kernel->get_sr();
|
||||
|
||||
// Calculate number of columns to be processed per thread
|
||||
const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
|
||||
const size_t m_start = ith * num_m_per_thread;
|
||||
size_t m_to_process = num_m_per_thread;
|
||||
if ((m_start + m_to_process) > m) {
|
||||
m_to_process = m - m_start;
|
||||
}
|
||||
|
||||
if(m_start < m) {
|
||||
// Transform LHS
|
||||
const size_t src_stride = src1->nb[1];
|
||||
const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
|
||||
const size_t lhs_packed_offset = lhs_info->get_packed_offset(m_start, k, QK4_0, mr, kr, sr);
|
||||
void * lhs_packed_ptr = static_cast<void *>(lhs_packed + lhs_packed_offset);
|
||||
|
||||
lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
||||
first_to_arrive.clear(std::memory_order_release);
|
||||
// Perform the operation
|
||||
const size_t dst_stride = dst->nb[1];
|
||||
const size_t lhs_packed_offset = lhs_info->get_packed_offset(0, k, QK4_0, mr, kr, sr);
|
||||
const size_t rhs_packed_offset = kernel->get_rhs_packed_offset(n_start, k, QK4_0);
|
||||
const size_t dst_offset = kernel->get_dst_offset(0, n_start, dst_stride);
|
||||
const void * rhs_ptr = static_cast<const void *>(rhs_packed + rhs_packed_offset);
|
||||
const void* lhs_ptr = (const void*)((const char *)lhs_packed + lhs_packed_offset);
|
||||
float *dst_ptr = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);
|
||||
|
||||
// Perform the matmul
|
||||
{
|
||||
const int64_t m_to_process = m;
|
||||
const int64_t m_start = 0;
|
||||
|
||||
const int64_t n_step = static_cast<int64_t>(kernel->get_n_step());
|
||||
const int64_t num_threads = KAI_MIN(n / n_step, nth);
|
||||
|
||||
if (ith < num_threads) {
|
||||
const int64_t num_n_per_thread0 = round_down(n / num_threads, n_step);
|
||||
const int64_t num_n_per_threadN_1 = n - (num_threads - 1) * num_n_per_thread0;
|
||||
|
||||
const int64_t n_start = ith * num_n_per_thread0;
|
||||
const int64_t n_to_process = (ith == num_threads - 1) ? num_n_per_threadN_1 : num_n_per_thread0;
|
||||
|
||||
const size_t lhs_packed_offset = variant_call<size_t>(kernel->get_lhs_offset, m_start, k);
|
||||
const size_t rhs_packed_offset = variant_call<size_t>(kernel->get_rhs_packed_offset, n_start, k);
|
||||
const size_t dst_offset = kernel->get_dst_offset(m_start, n_start, dst_stride);
|
||||
|
||||
const void * lhs_ptr = lhs_packed + lhs_packed_offset;
|
||||
const void * rhs_ptr = rhs_packed + rhs_packed_offset;
|
||||
float * dst_ptr = reinterpret_cast<float *>(dst_batch + dst_offset);
|
||||
|
||||
variant_call<void>(kernel->run_kernel, m_to_process, n_to_process, k, lhs_ptr, rhs_ptr, dst_ptr, dst_stride, sizeof(float), -FLT_MAX, FLT_MAX);
|
||||
}
|
||||
}
|
||||
|
||||
if (batch_idx != batch_size - 1) {
|
||||
// This barrier is necessary when the batch size is larger than 1. While processing a batch,
|
||||
// the work data buffer (params->wdata) is used as temporary storage which means that only
|
||||
// a single batch can be processed at any given time. No barrier is needed for the last
|
||||
// batch since GGML inserts a barrier between the execution of every operator.
|
||||
ggml_barrier(params->threadpool);
|
||||
}
|
||||
kernel->run_kernel(m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr,
|
||||
dst_stride, sizeof(float), -FLT_MAX, FLT_MAX);
|
||||
return true;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool compute_forward_q4_0(struct ggml_compute_params * params, struct ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
|
||||
GGML_ASSERT(kernels);
|
||||
|
||||
kernel_info * kernel = src1->ne[1] == 1 ? &kernels->gemv : &kernels->gemm;
|
||||
lhs_packing_info * lhs_info = &kernels->lhs_info;
|
||||
|
||||
GGML_ASSERT(kernel);
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const size_t k = ne00;
|
||||
const size_t m = ne11;
|
||||
const size_t n = ne01;
|
||||
|
||||
size_t mr = kernel->get_mr();
|
||||
size_t kr = kernel->get_kr();
|
||||
size_t sr = kernel->get_sr();
|
||||
|
||||
const uint8_t * lhs = static_cast<const uint8_t *>(src1->data);
|
||||
uint8_t * lhs_packed = (uint8_t*)params->wdata;
|
||||
const uint8_t * rhs_packed = static_cast<const uint8_t *>(src0->data);
|
||||
|
||||
const size_t n_step = kernel->get_n_step();
|
||||
const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
|
||||
const size_t n_start = ith * num_n_per_thread;
|
||||
|
||||
size_t n_to_process = num_n_per_thread;
|
||||
if ((n_start + n_to_process) > n) {
|
||||
n_to_process = n - n_start;
|
||||
}
|
||||
|
||||
// Calculate number of columns to be processed per thread
|
||||
const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
|
||||
const size_t m_start = ith * num_m_per_thread;
|
||||
size_t m_to_process = num_m_per_thread;
|
||||
if ((m_start + m_to_process) > m) {
|
||||
m_to_process = m - m_start;
|
||||
}
|
||||
|
||||
if (m_start < m) {
|
||||
// Transform LHS
|
||||
const size_t src_stride = src1->nb[1];
|
||||
const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
|
||||
const size_t lhs_packed_offset = variant_call<size_t>(lhs_info->get_packed_offset, m_start, k, QK4_0, mr, kr, sr);
|
||||
void * lhs_packed_ptr = static_cast<void *>(lhs_packed + lhs_packed_offset);
|
||||
|
||||
variant_call<void>(lhs_info->pack_func, m_to_process, k, QK4_0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
||||
// Perform the operation
|
||||
const size_t dst_stride = dst->nb[1];
|
||||
const size_t lhs_packed_offset = variant_call<size_t>(lhs_info->get_packed_offset, 0, k, QK4_0, mr, kr, sr);
|
||||
const size_t rhs_packed_offset = variant_call<size_t>(kernel->get_rhs_packed_offset, n_start, k, QK4_0);
|
||||
const size_t dst_offset = kernel->get_dst_offset(0, n_start, dst_stride);
|
||||
const void * rhs_ptr = static_cast<const void *>(rhs_packed + rhs_packed_offset);
|
||||
const void* lhs_ptr = (const void*)((const char *)lhs_packed + lhs_packed_offset);
|
||||
float *dst_ptr = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);
|
||||
|
||||
variant_call<void>(kernel->run_kernel, m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
|
||||
sizeof(float), -FLT_MAX, FLT_MAX);
|
||||
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public:
|
||||
@@ -352,13 +169,13 @@ public:
|
||||
size_t sr = ctx.kernels->gemm.get_sr();
|
||||
|
||||
#ifndef NDEBUG
|
||||
const size_t repacked_size = variant_call<size_t>(ctx.kernels->rhs_info.packed_size, n, k, nr, kr, QK4_0);
|
||||
const size_t repacked_size = ctx.kernels->rhs_info.packed_size(n, k, nr, kr, QK4_0);
|
||||
GGML_ASSERT(repacked_size <= data_size && "repacked size larger than the packed size!");
|
||||
#endif
|
||||
struct kai_rhs_pack_qs4cxs1s0_param params;
|
||||
params.lhs_zero_point = 1;
|
||||
params.rhs_zero_point = 8;
|
||||
variant_call<void>(ctx.kernels->rhs_info.pack_func, 1, n, k, nr, kr, sr, QK4_0, (const uint8_t*)data, nullptr, tensor->data, 0, ¶ms);
|
||||
ctx.kernels->rhs_info.pack_func(1, n, k, nr, kr, sr, QK4_0, (const uint8_t *)data, NULL, tensor->data, 0, ¶ms);
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -372,7 +189,7 @@ static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struc
|
||||
}
|
||||
} // namespace ggml::cpu::kleidiai
|
||||
|
||||
static enum ggml_status ggml_backend_cpu_kleidiai_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
GGML_API enum ggml_status ggml_backend_cpu_kleidiai_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
tensor->extra = (void *) ggml::cpu::kleidiai::get_tensor_traits(buffer, tensor);
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
@@ -421,11 +238,12 @@ static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alignment(ggml_backend_b
|
||||
namespace ggml::cpu::kleidiai {
|
||||
class extra_buffer_type : ggml::cpu::extra_buffer_type {
|
||||
bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
|
||||
if (op->op == GGML_OP_MUL_MAT &&
|
||||
op->src[0]->type == GGML_TYPE_Q4_0 &&
|
||||
op->src[0]->buffer &&
|
||||
(ggml_n_dims(op->src[0]) == 2) &&
|
||||
op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels) {
|
||||
if ( op->op == GGML_OP_MUL_MAT &&
|
||||
op->src[0]->type == GGML_TYPE_Q4_0 &&
|
||||
op->src[0]->buffer &&
|
||||
(ggml_n_dims(op->src[0]) == 2) &&
|
||||
op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels
|
||||
) {
|
||||
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
|
||||
return false;
|
||||
}
|
||||
@@ -442,19 +260,6 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
|
||||
if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
|
||||
return (ggml::cpu::tensor_traits *) op->src[0]->extra;
|
||||
}
|
||||
else if (ggml_kleidiai_select_kernels(ctx.features, op) &&
|
||||
op->src[0]->op == GGML_OP_VIEW &&
|
||||
(op->src[1]->op == GGML_OP_PERMUTE || op->src[1]->op == GGML_OP_SOFT_MAX) &&
|
||||
op->src[1]->ne[1] > 1) {
|
||||
if ((op->src[0]->nb[0] != 2) ||
|
||||
(op->src[1]->nb[0] != 4) ||
|
||||
(op->src[0]->nb[1] * op->src[0]->ne[1] != op->src[0]->nb[2]) ||
|
||||
(op->src[1]->nb[1] * op->src[1]->ne[1] != op->src[1]->nb[2])) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return ggml::cpu::kleidiai::get_tensor_traits(NULL, NULL);
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@@ -8,6 +8,19 @@
|
||||
|
||||
#include <float.h>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid hundreds of casts
|
||||
// we should just be careful :)
|
||||
#pragma warning(disable: 4244 4267)
|
||||
|
||||
// disable POSIX deprecation warnings
|
||||
// these functions are never going away, anyway
|
||||
#pragma warning(disable: 4996)
|
||||
|
||||
// unreachable code because of multiple instances of code after GGML_ABORT
|
||||
#pragma warning(disable: 4702)
|
||||
#endif
|
||||
|
||||
// ggml_compute_forward_dup
|
||||
|
||||
static void ggml_compute_forward_dup_same_cont(
|
||||
|
||||
@@ -2,6 +2,12 @@
|
||||
|
||||
#include <cassert>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid hundreds of casts
|
||||
// we should just be careful :)
|
||||
#pragma warning(disable: 4244 4267)
|
||||
#endif
|
||||
|
||||
// precomputed gelu table for f16 (128 KB)
|
||||
ggml_fp16_t ggml_table_gelu_f16[1 << 16];
|
||||
|
||||
|
||||
@@ -12,30 +12,12 @@ if (CUDAToolkit_FOUND)
|
||||
# 61 == Pascal, __dp4a instruction (per-byte integer dot product)
|
||||
# 70 == V100, FP16 tensor cores
|
||||
# 75 == Turing, int8 tensor cores
|
||||
# 80 == Ampere, asynchronous data loading, faster tensor core instructions
|
||||
# 86 == RTX 3000, needs CUDA v11.1
|
||||
# 89 == RTX 4000, needs CUDA v11.8
|
||||
#
|
||||
# XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run
|
||||
# XX-real == compile CUDA code as device code for this specific architecture
|
||||
# no suffix == compile as both PTX and device code
|
||||
#
|
||||
# The default behavior for a non-native is to build virtual architectures as needed to cover all features needed
|
||||
# for best performance and to also build real architectures for the most commonly used GPUs.
|
||||
if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
|
||||
set(CMAKE_CUDA_ARCHITECTURES "native")
|
||||
elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real")
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real")
|
||||
endif()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75;80")
|
||||
else()
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
|
||||
set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real")
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real")
|
||||
endif()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "50;61;70;75;80")
|
||||
endif()
|
||||
endif()
|
||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||
@@ -118,7 +100,7 @@ if (CUDAToolkit_FOUND)
|
||||
|
||||
set(CUDA_CXX_FLAGS "")
|
||||
|
||||
set(CUDA_FLAGS -use_fast_math -extended-lambda)
|
||||
set(CUDA_FLAGS -use_fast_math)
|
||||
|
||||
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
|
||||
# Options are:
|
||||
|
||||
@@ -1,61 +1,47 @@
|
||||
#include "acc.cuh"
|
||||
|
||||
static __global__ void acc_f32(const float * x, const float * y, float * dst, const int64_t ne,
|
||||
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
|
||||
const int64_t s11, const int64_t s12, const int64_t s13, const int64_t offset) {
|
||||
const int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
|
||||
const int ne10, const int ne11, const int ne12,
|
||||
const int nb1, const int nb2, int offset) {
|
||||
const int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
int64_t src1_idx = i - offset;
|
||||
|
||||
int64_t tmp = src1_idx;
|
||||
const int64_t i13 = tmp / s13;
|
||||
tmp -= i13 * s13;
|
||||
const int64_t i12 = tmp / s12;
|
||||
tmp -= i12 * s12;
|
||||
const int64_t i11 = tmp / s11;
|
||||
tmp -= i11 * s11;
|
||||
const int64_t i10 = tmp;
|
||||
|
||||
float val = x[i];
|
||||
if (src1_idx >= 0 && i10 < ne10 && i11 < ne11 && i12 < ne12 && i13 < ne13) {
|
||||
val += y[((i13*ne12 + i12) * ne11 + i11) * ne10 + i10];
|
||||
int src1_idx = i - offset;
|
||||
int oz = src1_idx / nb2;
|
||||
int oy = (src1_idx - (oz * nb2)) / nb1;
|
||||
int ox = src1_idx % nb1;
|
||||
if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
|
||||
dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
|
||||
} else {
|
||||
dst[i] = x[i];
|
||||
}
|
||||
dst[i] = val;
|
||||
}
|
||||
|
||||
static void acc_f32_cuda(const float * x, const float * y, float * dst, const int64_t n_elements,
|
||||
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
|
||||
const int64_t s1, const int64_t s2, const int64_t s3, const int64_t offset, cudaStream_t stream) {
|
||||
const int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
|
||||
acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, ne13, s1, s2, s3, offset);
|
||||
static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
|
||||
const int ne10, const int ne11, const int ne12,
|
||||
const int nb1, const int nb2, const int offset, cudaStream_t stream) {
|
||||
int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
|
||||
acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
const float * src1_d = (const float *)src1->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
GGML_ASSERT(dst->nb[0] == ggml_element_size(dst));
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(dst));
|
||||
int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
|
||||
int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
|
||||
// int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
|
||||
int offset = dst->op_params[3] / 4; // offset in bytes
|
||||
|
||||
const int64_t s1 = dst->op_params[0] / sizeof(float);
|
||||
const int64_t s2 = dst->op_params[1] / sizeof(float);
|
||||
const int64_t s3 = dst->op_params[2] / sizeof(float);
|
||||
const int64_t offset = dst->op_params[3] / sizeof(float);
|
||||
|
||||
acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], s1, s2, s3, offset, stream);
|
||||
acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
|
||||
}
|
||||
|
||||
@@ -130,6 +130,10 @@ static int ggml_cuda_highest_compiled_arch(const int arch) {
|
||||
|
||||
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#define GGML_CUDA_MAX_STREAMS 8
|
||||
|
||||
[[noreturn]]
|
||||
@@ -296,25 +300,6 @@ static __device__ void no_device_code(
|
||||
#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
|
||||
#endif // __CUDA_ARCH__
|
||||
|
||||
// The compiler is always able to unroll loops if they contain continue expressions.
|
||||
// In such cases loop unrolling can still be achieved via recursion:
|
||||
template <int n>
|
||||
struct ggml_cuda_unroll {
|
||||
template <typename Func, typename... Args>
|
||||
__device__ void operator()(const Func & f, Args... args) const {
|
||||
f(n - 1, args...);
|
||||
ggml_cuda_unroll<n - 1>{}(f, args...);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct ggml_cuda_unroll<1> {
|
||||
template <typename Func, typename... Args>
|
||||
__device__ void operator()(const Func & f, Args... args) const {
|
||||
f(0, args...);
|
||||
}
|
||||
};
|
||||
|
||||
template<int width = WARP_SIZE>
|
||||
static __device__ __forceinline__ int warp_reduce_sum(int x) {
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
|
||||
@@ -2,17 +2,6 @@
|
||||
|
||||
#include "common.cuh"
|
||||
|
||||
|
||||
static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(void * generic_ptr) {
|
||||
#ifdef CP_ASYNC_AVAILABLE
|
||||
return __cvta_generic_to_shared(generic_ptr);
|
||||
#else
|
||||
GGML_UNUSED(generic_ptr);
|
||||
NO_DEVICE_CODE;
|
||||
return 0;
|
||||
#endif // CP_ASYNC_AVAILABLE
|
||||
}
|
||||
|
||||
// Copies data from global to shared memory, cg == cache global.
|
||||
// Both the src and dst pointers must be aligned to 16 bit.
|
||||
// Shared memory uses 32 bit addressing, the pointer is passed as unsigned int.
|
||||
|
||||
@@ -516,7 +516,7 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
|
||||
nullptr;
|
||||
}
|
||||
|
||||
template<int D, int ncols1, int ncols2> // D == head size
|
||||
template<int D, int ncols1, int ncols2, int KQ_stride> // D == head size
|
||||
__launch_bounds__(D, 1)
|
||||
static __global__ void flash_attn_stream_k_fixup(
|
||||
float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne11) {
|
||||
@@ -665,27 +665,23 @@ static void on_no_fattn_vec_case(const int D) {
|
||||
fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
|
||||
GGML_ABORT("fatal error");
|
||||
} else {
|
||||
fprintf(stderr, "Unsupported KV type combination for head_size %d.\n", D);
|
||||
fprintf(stderr, "Unsupported KV type combination for head_size 256.\n");
|
||||
fprintf(stderr, "Only f16 is supported.\n");
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
template <int DV, int ncols1, int ncols2>
|
||||
template <int D, int ncols1, int ncols2, int KQ_stride>
|
||||
void launch_fattn(
|
||||
ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, const int nwarps, const size_t nbytes_shared,
|
||||
const int KQ_row_granularity, const bool need_f16_K, const bool need_f16_V, const bool stream_k, const int warp_size = WARP_SIZE
|
||||
) {
|
||||
constexpr int ncols = ncols1 * ncols2;
|
||||
|
||||
const bool is_mla = DV == 512; // TODO better parameterization
|
||||
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
|
||||
GGML_ASSERT(V || is_mla);
|
||||
|
||||
const ggml_tensor * mask = dst->src[3];
|
||||
|
||||
ggml_tensor * KQV = dst;
|
||||
@@ -693,13 +689,9 @@ void launch_fattn(
|
||||
GGML_ASSERT(Q->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(KQV->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT( Q->nb[0] == ggml_element_size(Q));
|
||||
GGML_ASSERT( K->nb[0] == ggml_element_size(K));
|
||||
GGML_ASSERT(!V || V->nb[0] == ggml_element_size(V));
|
||||
|
||||
GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
|
||||
"the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
|
||||
"the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
|
||||
|
||||
GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding.");
|
||||
|
||||
@@ -721,13 +713,12 @@ void launch_fattn(
|
||||
size_t nb12 = K->nb[2];
|
||||
size_t nb13 = K->nb[3];
|
||||
|
||||
const char * V_data = V ? (const char *) V->data : nullptr;
|
||||
size_t nb21 = V ? V->nb[1] : nb11;
|
||||
size_t nb22 = V ? V->nb[2] : nb12;
|
||||
size_t nb23 = V ? V->nb[3] : nb13;
|
||||
const char * V_data = (const char *) V->data;
|
||||
size_t nb21 = V->nb[1];
|
||||
size_t nb22 = V->nb[2];
|
||||
size_t nb23 = V->nb[3];
|
||||
|
||||
if (need_f16_K && K->type != GGML_TYPE_F16) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(K));
|
||||
K_f16.alloc(ggml_nelements(K));
|
||||
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type);
|
||||
to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream);
|
||||
@@ -741,8 +732,7 @@ void launch_fattn(
|
||||
nb13 = nb13*bs*sizeof(half)/ts;
|
||||
}
|
||||
|
||||
if (V && need_f16_V && V->type != GGML_TYPE_F16) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(V));
|
||||
if (need_f16_V && V->type != GGML_TYPE_F16) {
|
||||
V_f16.alloc(ggml_nelements(V));
|
||||
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
|
||||
to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream);
|
||||
@@ -762,13 +752,10 @@ void launch_fattn(
|
||||
const int ntiles_total = ntiles_x * (Q->ne[2] / ncols2) * Q->ne[3];
|
||||
|
||||
const dim3 block_dim(warp_size, nwarps, 1);
|
||||
int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
|
||||
CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
|
||||
|
||||
dim3 blocks_num;
|
||||
if (stream_k) {
|
||||
// For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup.
|
||||
const int max_blocks = max_blocks_per_sm*nsm;
|
||||
const int max_blocks = 2*nsm;
|
||||
const int tiles_nwaves = (ntiles_total + max_blocks - 1) / max_blocks;
|
||||
const int tiles_efficiency_percent = 100 * ntiles_total / (max_blocks*tiles_nwaves);
|
||||
|
||||
@@ -780,11 +767,14 @@ void launch_fattn(
|
||||
blocks_num.y = 1;
|
||||
blocks_num.z = 1;
|
||||
|
||||
dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float));
|
||||
dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + D) * sizeof(float));
|
||||
} else {
|
||||
GGML_ASSERT(K->ne[1] % KQ_row_granularity == 0);
|
||||
const int ntiles_KQ = K->ne[1] / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size.
|
||||
|
||||
int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
|
||||
CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
|
||||
|
||||
// parallel_blocks should be at least large enough to achieve max. occupancy for a single wave:
|
||||
parallel_blocks = std::max((nsm * max_blocks_per_sm) / ntiles_total, 1);
|
||||
|
||||
@@ -861,19 +851,19 @@ void launch_fattn(
|
||||
|
||||
if (stream_k) {
|
||||
if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
|
||||
const dim3 block_dim_combine(DV, 1, 1);
|
||||
const dim3 block_dim_combine(D, 1, 1);
|
||||
const dim3 blocks_num_combine = {blocks_num.x, ncols1, ncols2};
|
||||
|
||||
flash_attn_stream_k_fixup<DV, ncols1, ncols2>
|
||||
flash_attn_stream_k_fixup<D, ncols1, ncols2, KQ_stride>
|
||||
<<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
|
||||
((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], K->ne[1]);
|
||||
}
|
||||
} else if (parallel_blocks > 1) {
|
||||
const dim3 block_dim_combine(DV, 1, 1);
|
||||
const dim3 block_dim_combine(D, 1, 1);
|
||||
const dim3 blocks_num_combine(Q->ne[1], 1, blocks_num.z);
|
||||
const size_t nbytes_shared_combine = parallel_blocks*sizeof(float2);
|
||||
|
||||
flash_attn_combine_results<DV>
|
||||
flash_attn_combine_results<D>
|
||||
<<<blocks_num_combine, block_dim_combine, nbytes_shared_combine, main_stream>>>
|
||||
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks);
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -307,7 +307,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
constexpr int nwarps = 8;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
launch_fattn<D, cols_per_block, 1, -1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false);
|
||||
} break;
|
||||
case 128: {
|
||||
@@ -315,7 +315,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
constexpr int nwarps = 8;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
launch_fattn<D, cols_per_block, 1, -1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false);
|
||||
} break;
|
||||
default: {
|
||||
|
||||
@@ -318,7 +318,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
constexpr int nwarps = 8;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, use_logit_softcap>;
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
launch_fattn<D, cols_per_block, 1, -1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false);
|
||||
} break;
|
||||
case 128: {
|
||||
@@ -326,7 +326,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
constexpr int nwarps = 8;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, use_logit_softcap>;
|
||||
launch_fattn<D, cols_per_block, 1>
|
||||
launch_fattn<D, cols_per_block, 1, -1>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false);
|
||||
} break;
|
||||
default: {
|
||||
|
||||
@@ -168,7 +168,6 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
KQ[j*D + tid] = -HALF_MAX_HALF;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
half2 VKQ[ncols] = {{0.0f, 0.0f}};
|
||||
|
||||
@@ -316,7 +315,7 @@ void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx,
|
||||
constexpr bool need_f16_K = D != 128;
|
||||
constexpr bool need_f16_V = D != 128 && D != 64;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
|
||||
launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
|
||||
}
|
||||
|
||||
template <int D, ggml_type type_K, ggml_type type_V>
|
||||
|
||||
@@ -310,7 +310,7 @@ void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx,
|
||||
constexpr bool need_f16_K = D != 128;
|
||||
constexpr bool need_f16_V = D != 128 && D != 64;
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
|
||||
launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
|
||||
}
|
||||
|
||||
template <int D, ggml_type type_K, ggml_type type_V>
|
||||
|
||||
@@ -490,7 +490,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
|
||||
fattn_kernel = flash_attn_ext_f16<
|
||||
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), KQ_acc_t, use_logit_softcap>;
|
||||
}
|
||||
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, 0, FATTN_KQ_STRIDE, true, true, false, warp_size);
|
||||
launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, 0, FATTN_KQ_STRIDE, true, true, false, warp_size);
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
@@ -8,33 +8,58 @@
|
||||
#include "fattn-wmma-f16.cuh"
|
||||
#include "fattn.cuh"
|
||||
|
||||
template <int DKQ, int DV, int ncols2>
|
||||
template <int D, int ncols2>
|
||||
static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
if constexpr (ncols2 <= 8) {
|
||||
if (Q->ne[1] <= 8/ncols2) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 8/ncols2, ncols2>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
if (Q->ne[1] <= 8/ncols2) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_case<D, 8/ncols2, ncols2>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 16/ncols2) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 16/ncols2, ncols2>(ctx, dst);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_case<D, 16/ncols2, ncols2>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_TURING || Q->ne[1] <= 32/ncols2) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 32/ncols2, ncols2>(ctx, dst);
|
||||
if (Q->ne[1] <= 32/ncols2) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_case<D, 32/ncols2, ncols2>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 64/ncols2, ncols2>(ctx, dst);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_case<D, 64/ncols2, ncols2>(ctx, dst);
|
||||
}
|
||||
|
||||
template <int DKQ, int DV>
|
||||
static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
template <int ncols2>
|
||||
static void ggml_cuda_flash_attn_ext_mma_f16_switch_hs(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1< 64, ncols2>(ctx, dst);
|
||||
break;
|
||||
case 80:
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1< 80, ncols2>(ctx, dst);
|
||||
break;
|
||||
case 96:
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1< 96, ncols2>(ctx, dst);
|
||||
break;
|
||||
case 112:
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<112, ncols2>(ctx, dst);
|
||||
break;
|
||||
case 128:
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<128, ncols2>(ctx, dst);
|
||||
break;
|
||||
case 256:
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<256, ncols2>(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
@@ -43,79 +68,27 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con
|
||||
float max_bias = 0.0f;
|
||||
memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));
|
||||
|
||||
const bool use_gqa_opt = mask && max_bias == 0.0f;
|
||||
const float use_gqa_opt = mask && max_bias == 0.0f;
|
||||
|
||||
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
|
||||
const int gqa_ratio = Q->ne[2] / K->ne[2];
|
||||
|
||||
if (use_gqa_opt && gqa_ratio % 8 == 0) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 8>(ctx, dst);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_hs<8>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (use_gqa_opt && gqa_ratio % 4 == 0) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 4>(ctx, dst);
|
||||
if (use_gqa_opt && gqa_ratio == 4) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_hs<4>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (use_gqa_opt && gqa_ratio % 2 == 0) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
|
||||
if (use_gqa_opt && gqa_ratio == 2) {
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_hs<2>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 1>(ctx, dst);
|
||||
}
|
||||
|
||||
static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * KQV = dst;
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
const ggml_tensor * mask = dst->src[3];
|
||||
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
GGML_ASSERT(V->ne[0] == 64);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2< 64, 64>(ctx, dst);
|
||||
break;
|
||||
case 80:
|
||||
GGML_ASSERT(V->ne[0] == 80);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2< 80, 80>(ctx, dst);
|
||||
break;
|
||||
case 96:
|
||||
GGML_ASSERT(V->ne[0] == 96);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2< 96, 96>(ctx, dst);
|
||||
break;
|
||||
case 112:
|
||||
GGML_ASSERT(V->ne[0] == 112);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<112, 112>(ctx, dst);
|
||||
break;
|
||||
case 128:
|
||||
GGML_ASSERT(V->ne[0] == 128);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<128, 128>(ctx, dst);
|
||||
break;
|
||||
case 256:
|
||||
GGML_ASSERT(V->ne[0] == 256);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<256, 256>(ctx, dst);
|
||||
break;
|
||||
case 576: {
|
||||
// For Deepseek, go straight to the ncols1 switch to avoid compiling unnecessary kernels.
|
||||
GGML_ASSERT(V->ne[0] == 512);
|
||||
float max_bias = 0.0f;
|
||||
memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));
|
||||
|
||||
const bool use_gqa_opt = mask && max_bias == 0.0f;
|
||||
GGML_ASSERT(use_gqa_opt);
|
||||
|
||||
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
|
||||
const int gqa_ratio = Q->ne[2] / K->ne[2];
|
||||
GGML_ASSERT(gqa_ratio % 16 == 0);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_hs<1>(ctx, dst);
|
||||
}
|
||||
|
||||
#define FATTN_VEC_F16_CASE(D, type_K, type_V) \
|
||||
@@ -326,7 +299,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations
|
||||
const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
|
||||
const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && cc < GGML_CUDA_CC_ADA_LOVELACE && !mma_needs_data_conversion;
|
||||
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0;
|
||||
const bool can_use_vector_kernel = Q->ne[0] % (2*warp_size) == 0;
|
||||
if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
|
||||
if (prec == GGML_PREC_DEFAULT) {
|
||||
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
|
||||
|
||||
@@ -10,11 +10,10 @@ static __global__ void k_get_rows(
|
||||
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
|
||||
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
|
||||
|
||||
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
|
||||
const int i00 = (blockIdx.y * blockDim.x + threadIdx.x)*2;
|
||||
const int i10 = blockIdx.x;
|
||||
const int i11 = blockIdx.z / ne12;
|
||||
const int i12 = blockIdx.z % ne12;
|
||||
const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
|
||||
const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
|
||||
const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
|
||||
|
||||
if (i00 >= ne00) {
|
||||
return;
|
||||
@@ -47,11 +46,10 @@ static __global__ void k_get_rows_float(
|
||||
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
|
||||
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
|
||||
|
||||
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
|
||||
const int i00 = blockIdx.y * blockDim.x + threadIdx.x;
|
||||
const int i10 = blockIdx.x;
|
||||
const int i11 = blockIdx.z / ne12;
|
||||
const int i12 = blockIdx.z % ne12;
|
||||
const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
|
||||
const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
|
||||
|
||||
if (i00 >= ne00) {
|
||||
return;
|
||||
@@ -96,8 +94,8 @@ static void get_rows_cuda_q(
|
||||
const size_t nb1, const size_t nb2, const size_t nb3,
|
||||
cudaStream_t stream) {
|
||||
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
|
||||
const int block_num_y = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
|
||||
const dim3 block_nums(ne10, block_num_y, ne11*ne12);
|
||||
const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
|
||||
const dim3 block_nums(block_num_x, ne10, ne11*ne12);
|
||||
|
||||
// strides in elements
|
||||
// const size_t s0 = nb0 / sizeof(dst_t);
|
||||
@@ -129,8 +127,8 @@ static void get_rows_cuda_float(
|
||||
const size_t nb1, const size_t nb2, const size_t nb3,
|
||||
cudaStream_t stream) {
|
||||
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
|
||||
const int block_num_y = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
|
||||
const dim3 block_nums(ne10, block_num_y, ne11*ne12);
|
||||
const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
|
||||
const dim3 block_nums(block_num_x, ne10, ne11*ne12);
|
||||
|
||||
// strides in elements
|
||||
// const size_t s0 = nb0 / sizeof(dst_t);
|
||||
|
||||
@@ -555,8 +555,8 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer
|
||||
|
||||
if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
|
||||
// initialize padding to 0 to avoid possible NaN values
|
||||
const size_t original_size = ggml_nbytes(tensor);
|
||||
const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
||||
size_t original_size = ggml_nbytes(tensor);
|
||||
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
||||
|
||||
if (padded_size > original_size) {
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
@@ -679,7 +679,6 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
|
||||
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
||||
GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor));
|
||||
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
||||
}
|
||||
}
|
||||
@@ -801,7 +800,6 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff
|
||||
|
||||
static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
|
||||
|
||||
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
||||
@@ -853,7 +851,6 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
|
||||
|
||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
||||
|
||||
@@ -892,7 +889,6 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
|
||||
|
||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
||||
|
||||
@@ -974,7 +970,6 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buf
|
||||
|
||||
static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
|
||||
|
||||
size_t total_size = 0;
|
||||
|
||||
@@ -1536,8 +1531,6 @@ static void ggml_cuda_op_mul_mat(
|
||||
|
||||
// If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
|
||||
if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
|
||||
GGML_ASSERT(!src0->view_src);
|
||||
const size_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
|
||||
const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
|
||||
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data, 0, nbytes_padding, stream));
|
||||
@@ -1909,19 +1902,13 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
||||
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
|
||||
|
||||
// If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
|
||||
// But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
|
||||
// Therefore, in such cases use cuBLAS.
|
||||
const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
|
||||
&& ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
|
||||
|
||||
bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
|
||||
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
|
||||
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
||||
bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
|
||||
bool use_mul_mat_q = ggml_is_quantized(src0->type)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
||||
|
||||
bool any_gpus_with_slow_fp16 = false;
|
||||
@@ -2075,11 +2062,9 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
}
|
||||
|
||||
ggml_tensor src0_slice = *src0;
|
||||
src0_slice.ne[2] = 1;
|
||||
src0_slice.nb[3] = src0_slice.nb[2];
|
||||
src0_slice.op = GGML_OP_VIEW;
|
||||
src0_slice.view_src = dst->src[0]; // non-const pointer to src0
|
||||
src0_slice.data = (char *) src0->data + i02*nb02;
|
||||
src0_slice.ne[2] = 1;
|
||||
src0_slice.nb[3] = src0_slice.nb[2];
|
||||
src0_slice.data = (char *) src0->data + i02*nb02;
|
||||
|
||||
ggml_tensor src1_slice;
|
||||
memset(&src1_slice, 0, sizeof(src1_slice));
|
||||
@@ -3221,16 +3206,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
return false;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
|
||||
const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
|
||||
if (!new_mma_available(cc)) {
|
||||
return false;
|
||||
}
|
||||
const int gqa_ratio = op->src[0]->ne[2] / op->src[1]->ne[2];
|
||||
return op->src[1]->ne[0] == 576 && op->src[2]->ne[0] == 512 && op->src[3] && gqa_ratio % 16 == 0;
|
||||
// different head sizes of K and V are not supported yet
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] == 192) {
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] == 576) {
|
||||
// DeepSeek MLA
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[3] != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -89,17 +89,6 @@ void ggml_cuda_mul_mat_q(
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
// If src0 is a temporary compute buffer, clear any potential padding.
|
||||
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
|
||||
const size_t size_data = ggml_nbytes(src0);
|
||||
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
|
||||
if (size_alloc > size_data) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
|
||||
GGML_ASSERT(!src0->view_src);
|
||||
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
||||
|
||||
const int64_t s01 = src0->nb[1] / ts_src0;
|
||||
@@ -122,7 +111,6 @@ void ggml_cuda_mul_mat_q(
|
||||
const int64_t s13 = src1->nb[3] / ts_src1;
|
||||
quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type,
|
||||
ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
|
||||
@@ -130,7 +118,7 @@ void ggml_cuda_mul_mat_q(
|
||||
|
||||
const mmq_args args = {
|
||||
src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d,
|
||||
ne00, ne01, ne1, s01, ne11, s1,
|
||||
ne00, ne01, ne1, s01, s1,
|
||||
ne02, ne12, s02, s12, s2,
|
||||
ne03, ne13, s03, s13, s3,
|
||||
use_stream_k};
|
||||
@@ -206,7 +194,6 @@ void ggml_cuda_mul_mat_q(
|
||||
const int64_t s13 = src1->nb[2] / ts_src1;
|
||||
quantize_mmq_q8_1_cuda(src1_d, ids_src1_dev, src1_q8_1.get(), src0->type,
|
||||
ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
|
||||
@@ -215,7 +202,7 @@ void ggml_cuda_mul_mat_q(
|
||||
// Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
|
||||
const mmq_args args = {
|
||||
src0_d, src0->type, (const int *) src1_q8_1.ptr, ids_dst_dev, expert_bounds_dev, dst_d,
|
||||
ne00, ne01, ne_get_rows, s01, ne_get_rows, s1,
|
||||
ne00, ne01, ne_get_rows, s01, s1,
|
||||
ne02, ne02, s02, s12, s2,
|
||||
ne03, ne13, s03, s13, s3,
|
||||
use_stream_k};
|
||||
@@ -254,7 +241,7 @@ void ggml_cuda_op_mul_mat_q(
|
||||
ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
|
||||
const mmq_args args = {
|
||||
src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
|
||||
ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst,
|
||||
ne00, row_diff, src1_ncols, stride01, nrows_dst,
|
||||
1, 1, 0, 0, 0,
|
||||
1, 1, 0, 0, 0,
|
||||
use_stream_k};
|
||||
|
||||
@@ -2522,7 +2522,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check, bool fixup>
|
||||
static __device__ __forceinline__ void mul_mat_q_process_tile(
|
||||
const char * __restrict__ x, const int offset_x, const int * __restrict__ y,
|
||||
const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup,
|
||||
const int stride_row_x, const int ncols_y, const int stride_col_dst,
|
||||
const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst,
|
||||
const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) {
|
||||
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
@@ -2606,7 +2606,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
||||
static __global__ void mul_mat_q(
|
||||
const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
|
||||
const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
|
||||
const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
|
||||
const int ncols_x, const int nrows_x, const int ncols_y, const int stride_row_x, const int stride_col_dst,
|
||||
const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
|
||||
|
||||
@@ -2619,8 +2619,8 @@ static __global__ void mul_mat_q(
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
|
||||
const int ntx = (ncols_dst + mmq_x - 1) / mmq_x; // Number of tiles x
|
||||
const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
|
||||
const int ntx = (ncols_y + mmq_x - 1) / mmq_x; // Number of tiles x
|
||||
const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
|
||||
|
||||
// Initialize the ids for writing back data with just the index.
|
||||
// For regular matrix multiplications this is never changed.
|
||||
@@ -2636,7 +2636,6 @@ static __global__ void mul_mat_q(
|
||||
|
||||
ids_dst_shared[j] = j;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
|
||||
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
|
||||
@@ -2648,8 +2647,8 @@ static __global__ void mul_mat_q(
|
||||
|
||||
// Defaults for regular matrix multiplication:
|
||||
int col_low = 0;
|
||||
int col_high = ncols_dst;
|
||||
int col_diff = ncols_dst;
|
||||
int col_high = ncols_y;
|
||||
int col_diff = ncols_y;
|
||||
int offset_y = wt*stride_sample_y + zt*stride_channel_y;
|
||||
int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
|
||||
|
||||
@@ -2665,7 +2664,6 @@ static __global__ void mul_mat_q(
|
||||
return;
|
||||
}
|
||||
|
||||
// __syncthreads(); // There is no previous tile that could cause a race condition.
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
|
||||
const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
|
||||
@@ -2676,7 +2674,6 @@ static __global__ void mul_mat_q(
|
||||
|
||||
ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
|
||||
@@ -2689,7 +2686,7 @@ static __global__ void mul_mat_q(
|
||||
|
||||
constexpr bool fixup = false;
|
||||
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
|
||||
tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
|
||||
return;
|
||||
}
|
||||
@@ -2720,8 +2717,8 @@ static __global__ void mul_mat_q(
|
||||
|
||||
// Defaults for regular matrix multiplication:
|
||||
int col_low = 0;
|
||||
int col_high = ncols_dst;
|
||||
int col_diff = ncols_dst;
|
||||
int col_high = ncols_y;
|
||||
int col_diff = ncols_y;
|
||||
int offset_y = wt*stride_sample_y + zt*stride_channel_y;
|
||||
int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
|
||||
|
||||
@@ -2743,7 +2740,6 @@ static __global__ void mul_mat_q(
|
||||
continue;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
|
||||
const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
|
||||
@@ -2754,7 +2750,6 @@ static __global__ void mul_mat_q(
|
||||
|
||||
ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
|
||||
@@ -2767,7 +2762,7 @@ static __global__ void mul_mat_q(
|
||||
|
||||
constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
|
||||
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
|
||||
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
|
||||
|
||||
kbc += blocks_per_ne00;
|
||||
@@ -2792,8 +2787,8 @@ static __global__ void mul_mat_q(
|
||||
|
||||
// Defaults for regular matrix multiplication:
|
||||
int col_low = 0;
|
||||
int col_high = ncols_dst;
|
||||
int col_diff = ncols_dst;
|
||||
int col_high = ncols_y;
|
||||
int col_diff = ncols_y;
|
||||
int offset_y = wt*stride_sample_y + zt*stride_channel_y;
|
||||
int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst;
|
||||
|
||||
@@ -2810,7 +2805,6 @@ static __global__ void mul_mat_q(
|
||||
}
|
||||
|
||||
// The memory layout for the fixup buffer is always contiguous, therefore reset ids:
|
||||
__syncthreads();
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
|
||||
const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
|
||||
@@ -2821,7 +2815,6 @@ static __global__ void mul_mat_q(
|
||||
|
||||
ids_dst_shared[j] = j;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
|
||||
@@ -2834,7 +2827,7 @@ static __global__ void mul_mat_q(
|
||||
|
||||
constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
|
||||
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, nrows_x, ncols_y, stride_row_x, stride_col_dst,
|
||||
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
|
||||
}
|
||||
|
||||
@@ -2842,7 +2835,7 @@ static __global__ void mul_mat_q(
|
||||
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
||||
static __global__ void mul_mat_q_stream_k_fixup(
|
||||
const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile,
|
||||
const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_col_dst,
|
||||
const int ncols_x, const int nrows_x, const int ncols_y, const int stride_col_dst,
|
||||
const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst) {
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
@@ -2851,8 +2844,8 @@ static __global__ void mul_mat_q_stream_k_fixup(
|
||||
|
||||
float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f};
|
||||
|
||||
const int ntx = (ncols_dst + mmq_x - 1) / mmq_x;
|
||||
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
const int ntx = (ncols_y + mmq_x - 1) / mmq_x;
|
||||
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
|
||||
const int bidx0 = blockIdx.x;
|
||||
|
||||
@@ -2925,8 +2918,8 @@ static __global__ void mul_mat_q_stream_k_fixup(
|
||||
const int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y;
|
||||
dst += offset_dst;
|
||||
|
||||
const int i_max = nrows_x - it*mmq_y - 1;
|
||||
const int j_max = ncols_dst - jt*mmq_x - 1;
|
||||
const int i_max = nrows_x - it*mmq_y - 1;
|
||||
const int j_max = ncols_y - jt*mmq_x - 1;
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
|
||||
@@ -2958,7 +2951,6 @@ static __global__ void mul_mat_q_stream_k_fixup(
|
||||
for (int j = threadIdx.y*WARP_SIZE + threadIdx.x; j < mmq_x; j += nwarps*WARP_SIZE) {
|
||||
ids_dst_shared[j] = ids_dst[col_low + j];
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
const int offset_dst = it*mmq_y;
|
||||
dst += offset_dst;
|
||||
@@ -2989,7 +2981,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
|
||||
|
||||
struct mmq_args {
|
||||
const char * x; ggml_type type_x; const int * y; const int32_t * ids_dst; const int32_t * expert_bounds; float * dst;
|
||||
int64_t ncols_x; int64_t nrows_x; int64_t ncols_dst; int64_t stride_row_x; int64_t ncols_y; int64_t nrows_dst;
|
||||
int64_t ncols_x; int64_t nrows_x; int64_t ncols_y; int64_t stride_row_x; int64_t nrows_dst;
|
||||
int64_t nchannels_x; int64_t nchannels_y; int64_t stride_channel_x; int64_t stride_channel_y; int64_t stride_channel_dst;
|
||||
int64_t nsamples_x; int64_t nsamples_y; int64_t stride_sample_x; int64_t stride_sample_y; int64_t stride_sample_dst;
|
||||
bool use_stream_k;
|
||||
@@ -3025,8 +3017,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
|
||||
const int nty = (args.nrows_x + mmq_y - 1) / mmq_y;
|
||||
const int ntx = (args.ncols_dst + mmq_x - 1) / mmq_x;
|
||||
const int nty = (args.nrows_x + mmq_y - 1) / mmq_y;
|
||||
const int ntx = (args.ncols_y + mmq_x - 1) / mmq_x;
|
||||
const int ntzw = args.nchannels_y * args.nsamples_y;
|
||||
const dim3 block_nums_xy_tiling(nty, ntx, ntzw);
|
||||
|
||||
@@ -3040,14 +3032,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
constexpr bool need_check = false;
|
||||
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
|
||||
} else {
|
||||
constexpr bool need_check = true;
|
||||
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
|
||||
}
|
||||
@@ -3068,7 +3060,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
|
||||
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
|
||||
|
||||
@@ -3077,14 +3069,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
}
|
||||
|
||||
mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y,
|
||||
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst);
|
||||
} else {
|
||||
constexpr bool need_check = true;
|
||||
|
||||
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
args.ncols_x, args.nrows_x, args.ncols_y, args.stride_row_x, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst);
|
||||
|
||||
@@ -3093,7 +3085,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
}
|
||||
|
||||
mul_mat_q_stream_k_fixup<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_y,
|
||||
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -513,17 +513,6 @@ void ggml_cuda_mul_mat_vec_q(
|
||||
const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
// If src0 is a temporary compute buffer, clear any potential padding.
|
||||
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
|
||||
const size_t size_data = ggml_nbytes(src0);
|
||||
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
|
||||
if (size_alloc > size_data) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
|
||||
GGML_ASSERT(!src0->view_src);
|
||||
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);
|
||||
ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1);
|
||||
{
|
||||
|
||||
@@ -56,13 +56,13 @@ static __global__ void quantize_mmq_q8_1(
|
||||
constexpr int vals_per_scale = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 64 : 32;
|
||||
constexpr int vals_per_sum = ds_layout == MMQ_Q8_1_DS_LAYOUT_D2S6 ? 16 : 32;
|
||||
|
||||
const int64_t i0 = ((int64_t)blockDim.x*blockIdx.y + threadIdx.x)*4;
|
||||
const int64_t i0 = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*4;
|
||||
|
||||
if (i0 >= ne0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t i1 = blockIdx.x;
|
||||
const int64_t i1 = blockIdx.y;
|
||||
const int64_t i2 = blockIdx.z % ne2;
|
||||
const int64_t i3 = blockIdx.z / ne2;
|
||||
|
||||
@@ -75,8 +75,8 @@ static __global__ void quantize_mmq_q8_1(
|
||||
|
||||
block_q8_1_mmq * y = (block_q8_1_mmq *) vy;
|
||||
|
||||
const int64_t ib0 = blockIdx.z*((int64_t)gridDim.x*gridDim.y*blockDim.x/QK8_1); // first block of channel
|
||||
const int64_t ib = ib0 + (i0 / (4*QK8_1))*ne1 + blockIdx.x; // block index in channel
|
||||
const int64_t ib0 = blockIdx.z*((int64_t)gridDim.y*gridDim.x*blockDim.x/QK8_1); // first block of channel
|
||||
const int64_t ib = ib0 + (i0 / (4*QK8_1))*ne1 + blockIdx.y; // block index in channel
|
||||
const int64_t iqs = i0 % (4*QK8_1); // quant index in block
|
||||
|
||||
// Load 4 floats per thread and calculate max. abs. value between them:
|
||||
@@ -163,12 +163,10 @@ void quantize_mmq_q8_1_cuda(
|
||||
const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
|
||||
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
|
||||
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
GGML_ASSERT(ne0 % (4*QK8_1) == 0);
|
||||
|
||||
// ne1 tends to assume the highest values, therefore use it as the "x" dimension of the CUDA grid:
|
||||
const int64_t block_num_y = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
|
||||
const dim3 num_blocks(ne1, block_num_y, ne2*ne3);
|
||||
const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
|
||||
const dim3 num_blocks(block_num_x, ne1, ne2*ne3);
|
||||
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE_MMQ, 1, 1);
|
||||
switch (mmq_get_q8_1_ds_layout(type_src0)) {
|
||||
case MMQ_Q8_1_DS_LAYOUT_D4:
|
||||
|
||||
@@ -31,7 +31,7 @@ void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 1, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 1, 8);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 16, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 16, 1);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 16, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 16, 2);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 16, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 16, 4);
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 2, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 2, 4);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 2, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 2, 8);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 32, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 32, 1);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 32, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 32, 2);
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 4, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 4, 2);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 4, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 4, 4);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 4, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 4, 8);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 64, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 64, 1);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 8, 1);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 8, 1);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 8, 2);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 8, 2);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 8, 4);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 8, 4);
|
||||
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(64, 64, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 80, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 96, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 112, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 128, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 256, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(64, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(80, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(96, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(112, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(128, 8, 8);
|
||||
DECL_FATTN_MMA_F16_CASE(256, 8, 8);
|
||||
|
||||
@@ -18,7 +18,7 @@ SOURCE_FATTN_MMA_START = """// This file has been autogenerated by generate_cu_f
|
||||
|
||||
"""
|
||||
|
||||
SOURCE_FATTN_MMA_CASE = "DECL_FATTN_MMA_F16_CASE({head_size_kq}, {head_size_v}, {ncols1}, {ncols2});\n"
|
||||
SOURCE_FATTN_MMA_CASE = "DECL_FATTN_MMA_F16_CASE({head_size}, {ncols1}, {ncols2});\n"
|
||||
|
||||
TYPES_MMQ = [
|
||||
"GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
|
||||
@@ -57,21 +57,18 @@ for vkq_size in [16, 32]:
|
||||
with open(f"fattn-vec-f{vkq_size}-instance-hs{head_size}-{get_short_name(type_k)}-{get_short_name(type_v)}.cu", "w") as f:
|
||||
f.write(SOURCE_FATTN_VEC.format(vkq_size=vkq_size, head_size=head_size, type_k=type_k, type_v=type_v))
|
||||
|
||||
for ncols in [8, 16, 32, 64]:
|
||||
for ncols2 in [1, 2, 4, 8, 16]:
|
||||
if ncols2 > ncols:
|
||||
continue
|
||||
for ncols in [8, 16, 32, 64, 128]:
|
||||
for ncols2 in [1, 2, 4, 8]:
|
||||
ncols1 = ncols // ncols2
|
||||
if ncols == 128:
|
||||
continue # Too much register pressure.
|
||||
with open(f"fattn-mma-f16-instance-ncols1_{ncols1}-ncols2_{ncols2}.cu", "w") as f:
|
||||
f.write(SOURCE_FATTN_MMA_START)
|
||||
|
||||
for head_size_kq in [64, 80, 96, 112, 128, 256, 576]:
|
||||
if head_size_kq != 576 and ncols2 == 16:
|
||||
continue
|
||||
if head_size_kq == 576 and ncols2 != 16:
|
||||
continue
|
||||
head_size_v = head_size_kq if head_size_kq != 576 else 512
|
||||
f.write(SOURCE_FATTN_MMA_CASE.format(ncols1=ncols1, ncols2=ncols2, head_size_kq=head_size_kq, head_size_v=head_size_v))
|
||||
for head_size in [64, 80, 96, 112, 128, 256]:
|
||||
if ncols == 128 and head_size == 256:
|
||||
continue # Needs too much shared memory.
|
||||
f.write(SOURCE_FATTN_MMA_CASE.format(ncols1=ncols1, ncols2=ncols2, head_size=head_size))
|
||||
|
||||
for type in TYPES_MMQ:
|
||||
with open(f"mmq-instance-{get_short_name(type)}.cu", "w") as f:
|
||||
|
||||
@@ -207,10 +207,6 @@ typedef struct {
|
||||
float attn_factor;
|
||||
float beta_fast;
|
||||
float beta_slow;
|
||||
int32_t sect_0;
|
||||
int32_t sect_1;
|
||||
int32_t sect_2;
|
||||
int32_t sect_3;
|
||||
} ggml_metal_kargs_rope;
|
||||
|
||||
typedef struct {
|
||||
@@ -303,42 +299,21 @@ typedef struct {
|
||||
} ggml_metal_kargs_mul_mv_ext;
|
||||
|
||||
typedef struct {
|
||||
int32_t ne10;
|
||||
int32_t ne11; // n_expert_used (bcast)
|
||||
uint64_t nb11;
|
||||
uint64_t nb12;
|
||||
int32_t neh11; // n_tokens
|
||||
uint64_t nbh11;
|
||||
int32_t ne20; // n_expert_used
|
||||
uint64_t nb21;
|
||||
} ggml_metal_kargs_mul_mm_id_map0;
|
||||
|
||||
typedef struct {
|
||||
int32_t ne20; // n_expert_used
|
||||
int32_t neh0;
|
||||
int32_t neh1;
|
||||
uint64_t nbh1;
|
||||
uint64_t nbh2;
|
||||
int32_t ne0;
|
||||
uint64_t nb1;
|
||||
uint64_t nb2;
|
||||
} ggml_metal_kargs_mul_mm_id_map1;
|
||||
|
||||
typedef struct {
|
||||
int32_t nei0;
|
||||
int32_t nei1;
|
||||
uint64_t nbi1;
|
||||
int32_t ne00;
|
||||
int32_t ne02;
|
||||
uint64_t nb01;
|
||||
uint64_t nb02;
|
||||
uint64_t nb03;
|
||||
int32_t neh12;
|
||||
uint64_t nbh10;
|
||||
uint64_t nbh11;
|
||||
uint64_t nbh12;
|
||||
uint64_t nbh13;
|
||||
int32_t neh0;
|
||||
int32_t neh1;
|
||||
int16_t r2;
|
||||
int16_t r3;
|
||||
int32_t ne11;
|
||||
int32_t ne12;
|
||||
int32_t ne13;
|
||||
uint64_t nb10;
|
||||
uint64_t nb11;
|
||||
uint64_t nb12;
|
||||
int32_t ne0;
|
||||
int32_t ne1;
|
||||
} ggml_metal_kargs_mul_mm_id;
|
||||
|
||||
typedef struct {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user