Compare commits

..

3 Commits

Author SHA1 Message Date
Georgi Gerganov
072e0a4d3b scipts : add LICENSE and gen-authors.sh to sync 2024-04-09 09:19:33 +03:00
Georgi Gerganov
0e0d4e821f authors : update 2024-04-09 09:17:25 +03:00
Georgi Gerganov
805d705032 license : add AUTHORS 2024-03-31 10:37:39 +03:00
123 changed files with 7858 additions and 10905 deletions

View File

@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
apt-get install -y build-essential python3 python3-pip git
COPY requirements.txt requirements.txt
COPY requirements requirements
@@ -28,8 +28,6 @@ COPY . .
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV LLAMA_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1
RUN make

View File

@@ -40,11 +40,6 @@ ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
RUN make
ENTRYPOINT ["/app/.devops/tools.sh"]

View File

@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
apt-get install -y build-essential python3 python3-pip git
COPY requirements.txt requirements.txt
COPY requirements requirements
@@ -15,9 +15,6 @@ WORKDIR /app
COPY . .
ENV LLAMA_CURL=1
RUN make
ENV LC_ALL=C.utf8

View File

@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential git libcurl4-openssl-dev
apt-get install -y build-essential git
WORKDIR /app
@@ -22,16 +22,11 @@ COPY . .
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV LLAMA_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1
RUN make
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
COPY --from=build /app/server /server
ENTRYPOINT [ "/server" ]

View File

@@ -4,7 +4,7 @@ FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
apt-get install -y git
WORKDIR /app
@@ -16,14 +16,11 @@ RUN mkdir build && \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build . --config Release --target server
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
COPY --from=build /app/build/bin/server /server
ENV LC_ALL=C.utf8

View File

@@ -40,11 +40,6 @@ ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
RUN make
ENTRYPOINT [ "/app/server" ]

View File

@@ -11,16 +11,12 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
apt update -y && \
apt-get install -y vulkan-sdk
# Install cURL
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
# Build it
WORKDIR /app
COPY . .
RUN mkdir build && \
cd build && \
cmake .. -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
cmake .. -DLLAMA_VULKAN=1 && \
cmake --build . --config Release --target server
# Clean up

View File

@@ -3,21 +3,16 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential git libcurl4-openssl-dev
apt-get install -y build-essential git
WORKDIR /app
COPY . .
ENV LLAMA_CURL=1
RUN make
FROM ubuntu:$UBUNTU_VERSION as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
COPY --from=build /app/server /server
ENV LC_ALL=C.utf8

View File

@@ -24,15 +24,15 @@ on:
push:
branches:
- master
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
pull_request_target:
types: [opened, synchronize, reopened]
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
schedule:
- cron: '04 2 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
@@ -42,21 +42,11 @@ jobs:
RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
N_USERS: 8
DURATION: 10m
strategy:
matrix:
model: [phi-2]
ftype: [q4_0, q8_0, f16]
include:
- model: phi-2
ftype: q4_0
pr_comment_enabled: "true"
if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
@@ -79,18 +69,12 @@ jobs:
sleep 0.1
done
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version: '1.21'
- name: Install k6 and xk6-sse
- name: Install k6
id: k6_installation
run: |
cd examples/server/bench
go install go.k6.io/xk6/cmd/xk6@latest
xk6 build master \
--with github.com/phymbert/xk6-sse
wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
tar xzf k6*.tar.gz --strip-components=1
- name: Build
id: cmake_build
@@ -124,7 +108,7 @@ jobs:
cd examples/server/bench
source venv/bin/activate
python bench.py \
BENCH_K6_BIN_PATH=./k6 python bench.py \
--runner-label ${{ env.RUNNER_LABEL }} \
--name ${{ github.job }} \
--branch ${{ github.head_ref || github.ref_name }} \
@@ -132,7 +116,7 @@ jobs:
--scenario script.js \
--duration ${{ github.event.inputs.duration || env.DURATION }} \
--hf-repo ggml-org/models \
--hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
--hf-file phi-2/ggml-model-q4_0.gguf \
--model-path-prefix /models \
--parallel ${{ env.N_USERS }} \
-ngl 33 \
@@ -150,7 +134,7 @@ jobs:
- uses: actions/upload-artifact@v4
with:
name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
name: benchmark-results
compression-level: 9
path: |
examples/server/bench/*.jpg
@@ -162,7 +146,7 @@ jobs:
with:
authToken: ${{secrets.GITHUB_TOKEN}}
sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
context: bench-server-baseline
description: |
${{ env.BENCH_RESULTS }}
state: 'success'
@@ -219,26 +203,21 @@ jobs:
- name: Comment PR
uses: mshick/add-pr-comment@v2
id: comment_pr
if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
if: ${{ github.event.pull_request != '' }}
with:
message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
message: |
<p align="center">
📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
</p>
- Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
- Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
- ${{ env.BENCH_GRAPH_XLABEL }}
<details>
<summary>Expand details for performance related PR only</summary>
- Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
- Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
- ${{ env.BENCH_GRAPH_XLABEL }}
<summary>Time series</summary>
<p align="center">

View File

@@ -16,7 +16,7 @@ on:
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
@@ -31,7 +31,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Dependencies
id: depends
@@ -52,7 +52,7 @@ jobs:
id: cmake_test
run: |
cd build
ctest -L 'main|curl' --verbose --timeout 900
ctest -L main --verbose --timeout 900
- name: Determine tag name
id: tag
@@ -76,10 +76,10 @@ jobs:
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
name: llama-bin-macos-arm64.zip
path: |
llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
macOS-latest-cmake-x64:
runs-on: macos-latest
@@ -87,7 +87,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Dependencies
id: depends
@@ -101,9 +101,7 @@ jobs:
sysctl -a
mkdir build
cd build
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@@ -134,10 +132,10 @@ jobs:
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
name: llama-bin-macos-x64.zip
path: |
llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
ubuntu-focal-make:
runs-on: ubuntu-20.04
@@ -148,7 +146,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Dependencies
id: depends
@@ -160,7 +158,7 @@ jobs:
with:
node-version: "20"
- uses: actions/setup-python@v5
- uses: actions/setup-python@v4
with:
python-version: "3.11"
@@ -183,7 +181,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Dependencies
id: depends
@@ -205,27 +203,27 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
sudo apt-get install build-essential
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
cmake .. -DLLAMA_FATAL_WARNINGS=ON
cmake --build . --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L 'main|curl' --verbose --timeout 900
ctest -L main --verbose --timeout 900
- name: Test llama2c conversion
id: llama2c_test
@@ -251,7 +249,7 @@ jobs:
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v4
# uses: actions/checkout@v3
#
# - name: Dependencies
# id: depends
@@ -285,7 +283,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Dependencies
id: depends
@@ -313,7 +311,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Dependencies
id: depends
@@ -359,7 +357,7 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Build
id: cmake_build
@@ -400,7 +398,7 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Build
id: cmake_build
@@ -420,7 +418,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Dependencies
id: depends
@@ -451,7 +449,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Dependencies
id: depends
@@ -595,7 +593,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0
@@ -725,10 +723,10 @@ jobs:
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
name: llama-bin-win-${{ matrix.build }}-x64.zip
path: |
llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
windows-latest-cmake-cuda:
runs-on: windows-latest
@@ -741,7 +739,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0
@@ -781,10 +779,10 @@ jobs:
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
path: |
llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
- name: Copy and pack Cuda runtime
run: |
@@ -795,10 +793,10 @@ jobs:
- name: Upload Cuda runtime
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
path: |
cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
windows-latest-cmake-sycl:
runs-on: windows-latest
@@ -814,7 +812,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0
@@ -846,17 +844,17 @@ jobs:
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v3
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip
path: |
llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
ios-xcode-build:
runs-on: macos-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
@@ -866,7 +864,7 @@ jobs:
steps:
- name: Clone
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Set up JDK
uses: actions/setup-java@v3
@@ -889,7 +887,7 @@ jobs:
# runs-on: macos-12
# steps:
# - name: Clone
# uses: actions/checkout@v4
# uses: actions/checkout@v3
#
# - name: Build
# uses: cross-platform-actions/action@v0.19.0
@@ -920,7 +918,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0
@@ -939,13 +937,7 @@ jobs:
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v4
with:
path: ./artifact
- name: Move artifacts
id: move_artifacts
run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
uses: actions/download-artifact@v3
- name: Create release
id: create_release
@@ -964,7 +956,7 @@ jobs:
const path = require('path');
const fs = require('fs');
const release_id = '${{ steps.create_release.outputs.id }}';
for (let file of await fs.readdirSync('./artifact/release')) {
for (let file of await fs.readdirSync('./artifact')) {
if (path.extname(file) === '.zip') {
console.log('uploadReleaseAsset', file);
await github.repos.uploadReleaseAsset({
@@ -972,7 +964,7 @@ jobs:
repo: context.repo.repo,
release_id: release_id,
name: file,
data: await fs.readFileSync(`./artifact/release/${file}`)
data: await fs.readFileSync(`./artifact/${file}`)
});
}
}
@@ -986,7 +978,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
# uses: actions/checkout@v3
#
# - name: Dependencies
# run: |
@@ -1010,7 +1002,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
# uses: actions/checkout@v3
#
# - name: Dependencies
# run: |
@@ -1034,7 +1026,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
# uses: actions/checkout@v3
#
# - name: Dependencies
# run: |
@@ -1064,7 +1056,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
# uses: actions/checkout@v3
#
# - name: Add msbuild to PATH
# uses: microsoft/setup-msbuild@v1
@@ -1080,7 +1072,7 @@ jobs:
# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
#
# - name: Upload binaries
# uses: actions/upload-artifact@v4
# uses: actions/upload-artifact@v1
# with:
# name: llama-bin-${{ matrix.arch }}
# path: build/bin/${{ matrix.build }}
@@ -1103,7 +1095,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
# uses: actions/checkout@v3
#
# - name: Add msbuild to PATH
# uses: microsoft/setup-msbuild@v1
@@ -1135,7 +1127,7 @@ jobs:
#
# - name: Upload binaries
# if: matrix.blas == 'ON'
# uses: actions/upload-artifact@v4
# uses: actions/upload-artifact@v1
# with:
# name: llama-blas-bin-${{ matrix.arch }}
# path: build/bin/${{ matrix.build }}
@@ -1149,7 +1141,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
# uses: actions/checkout@v3
#
# - name: Dependencies
# run: |

View File

@@ -6,7 +6,7 @@ env:
GGML_N_THREADS: 1
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
@@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-20.04
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Dependencies
run: |

View File

@@ -16,7 +16,7 @@ on:
- master
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
@@ -46,7 +46,7 @@ jobs:
- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
steps:
- name: Check out the repo
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
@@ -91,12 +91,6 @@ jobs:
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Downcase github.repository_owner
run: |
echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
env:
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
- name: Build and push Docker image (versioned)
if: github.event_name == 'push'
uses: docker/build-push-action@v4
@@ -104,7 +98,7 @@ jobs:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}
- name: Build and push Docker image (tagged)
@@ -113,5 +107,5 @@ jobs:
context: .
push: ${{ github.event_name == 'push' }}
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
file: ${{ matrix.config.dockerfile }}

View File

@@ -15,13 +15,13 @@ on:
- master
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
editorconfig:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- uses: editorconfig-checker/action-editorconfig-checker@main
- run: editorconfig-checker

View File

@@ -24,9 +24,9 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v5
uses: actions/setup-python@v2
with:
python-version: '3.9.x'
- name: Install dependencies

View File

@@ -18,7 +18,7 @@ on:
paths: ['**/*.nix', 'flake.lock']
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:

View File

@@ -9,7 +9,7 @@ on:
types: [opened, synchronize, reopened]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:

View File

@@ -17,7 +17,7 @@ on:
- 'requirements/*.txt'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
@@ -26,9 +26,9 @@ jobs:
name: check-requirements
steps:
- name: Check out source repository
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Set up Python environment
uses: actions/setup-python@v5
uses: actions/setup-python@v4
with:
python-version: "3.11"
- name: Run check-requirements.sh script

View File

@@ -3,7 +3,7 @@ name: flake8 Lint
on: [push, pull_request]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
@@ -12,9 +12,9 @@ jobs:
name: Lint
steps:
- name: Check out source repository
uses: actions/checkout@v4
uses: actions/checkout@v3
- name: Set up Python environment
uses: actions/setup-python@v5
uses: actions/setup-python@v4
with:
python-version: "3.11"
- name: flake8 Lint

View File

@@ -4,10 +4,6 @@ name: Server
on:
workflow_dispatch: # allows manual triggering
inputs:
sha:
description: 'Commit SHA1 to build'
required: false
type: string
slow_tests:
description: 'Run slow tests'
required: true
@@ -15,15 +11,15 @@ on:
push:
branches:
- master
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
pull_request_target:
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
schedule:
- cron: '2 4 * * *'
- cron: '0 0 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
@@ -48,45 +44,25 @@ jobs:
options: --cpus 4
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Dependencies
id: depends
run: |
apt-get update
apt-get -y install \
build-essential \
xxd \
git \
cmake \
python3-pip \
curl \
wget \
language-pack-en \
libcurl4-openssl-dev
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Verify server deps
id: verify_server_deps
run: |
git config --global --add safe.directory $(realpath .)
cd examples/server
git ls-files --others --modified
git status
./deps.sh
git status
not_ignored_files="$(git ls-files --others --modified)"
echo "Modified files: ${not_ignored_files}"
if [ -n "${not_ignored_files}" ]; then
echo "Repository is dirty or server deps are not built as expected"
echo "${not_ignored_files}"
exit 1
fi
- name: Build
id: cmake_build
run: |
@@ -126,7 +102,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
fetch-depth: 0

View File

@@ -7,7 +7,7 @@ on:
- master
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
@@ -18,7 +18,7 @@ jobs:
runs-on: [ubuntu-latest, macos-latest, windows-latest]
runs-on: ${{ matrix.runs-on }}
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
with:
submodules: recursive
fetch-depth: 0

1
.gitignore vendored
View File

@@ -48,7 +48,6 @@ models-mnt
/convert-llama2c-to-ggml
/embd-input-test
/embedding
/eval-callback
/gguf
/gguf-llama-simple
/gguf-split

View File

@@ -1,7 +1,7 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
# Binaries only useful for tests
@@ -10,7 +10,7 @@ TEST_TARGETS = \
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
tests/test-json-schema-to-grammar tests/test-grammar-integration
tests/test-json-schema-to-grammar
# Code coverage output files
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -646,7 +646,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])'
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
ifndef CUDA_DOCKER_ARCH
ifndef CUDA_POWER_ARCH
$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
endif # CUDA_POWER_ARCH
endif # CUDA_DOCKER_ARCH
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
@@ -688,7 +688,7 @@ llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml
$(CXX) $(CXXFLAGS) -c $< -o $@
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
common.o: common/common.cpp $(COMMON_H_DEPS)
$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -756,7 +756,7 @@ batched: examples/batched/batched.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -788,7 +788,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -800,10 +800,6 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -871,10 +867,6 @@ passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift
(cd examples/batched.swift; make build)
@@ -922,10 +914,6 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

View File

@@ -2,44 +2,6 @@
import PackageDescription
var sources = [
"ggml.c",
"llama.cpp",
"unicode.cpp",
"unicode-data.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",
]
var resources: [Resource] = []
var linkerSettings: [LinkerSetting] = []
var cSettings: [CSetting] = [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.unsafeFlags(["-fno-objc-arc"]),
// NOTE: NEW_LAPACK will required iOS version 16.4+
// We should consider add this in the future when we drop support for iOS 14
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
// .define("ACCELERATE_NEW_LAPACK"),
// .define("ACCELERATE_LAPACK_ILP64")
]
#if canImport(Darwin)
sources.append("ggml-metal.m")
resources.append(.process("ggml-metal.metal"))
linkerSettings.append(.linkedFramework("Accelerate"))
cSettings.append(
contentsOf: [
.define("GGML_USE_ACCELERATE"),
.define("GGML_USE_METAL")
]
)
#endif
#if os(Linux)
cSettings.append(.define("_GNU_SOURCE"))
#endif
let package = Package(
name: "llama",
platforms: [
@@ -66,11 +28,34 @@ let package = Package(
"ggml-cuda.h",
"Makefile"
],
sources: sources,
resources: resources,
sources: [
"ggml.c",
"llama.cpp",
"unicode.cpp",
"unicode-data.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",
"ggml-metal.m",
],
resources: [
.process("ggml-metal.metal")
],
publicHeadersPath: "spm-headers",
cSettings: cSettings,
linkerSettings: linkerSettings
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.define("GGML_USE_ACCELERATE"),
.unsafeFlags(["-fno-objc-arc"]),
.define("GGML_USE_METAL"),
// NOTE: NEW_LAPACK will required iOS version 16.4+
// We should consider add this in the future when we drop support for iOS 14
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
// .define("ACCELERATE_NEW_LAPACK"),
// .define("ACCELERATE_LAPACK_ILP64")
],
linkerSettings: [
.linkedFramework("Accelerate")
]
)
],
cxxLanguageStandard: .cxx11

View File

@@ -3,14 +3,14 @@
- [Background](#background)
- [News](#news)
- [OS](#os)
- [Hardware](#hardware)
- [Supported Devices](#supported-devices)
- [Docker](#docker)
- [Linux](#linux)
- [Windows](#windows)
- [Environment Variable](#environment-variable)
- [Known Issue](#known-issues)
- [Q&A](#qa)
- [TODO](#todo)
- [Known Issue](#known-issue)
- [Q&A](#q&a)
- [Todo](#todo)
## Background
@@ -24,20 +24,19 @@
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
### Llama.cpp + SYCL
This SYCL "backend" follows the same design found in other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. The oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).
The llama.cpp SYCL backend supports:
- Intel GPUs.
- Nvidia GPUs.
When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
*Upcoming support: AMD GPUs*.
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
When targetting **Intel CPUs**, it is recommended to use llama.cpp for [x86_64](README.md#intel-onemkl) approach.
## News
- 2024.4
- Support data types: GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M.
- 2024.3
- Release binary files of Windows.
- A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
- New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
- Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
@@ -54,48 +53,57 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
## OS
| OS | Status | Verified |
|---------|---------|------------------------------------|
| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39 |
| Windows | Support | Windows 11 |
|OS|Status|Verified|
|-|-|-|
|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
|Windows|Support|Windows 11|
## Hardware
## Supported devices
### Intel GPU
### Intel GPUs
**Verified devices**
The oneAPI Math Kernel Library, which the oneAPI base-toolkit includes, supports intel GPUs. In order to make it "visible", simply run the following:
```sh
source /opt/intel/oneapi/setvars.sh
```
| Intel GPU | Status | Verified Model |
|-------------------------------|---------|---------------------------------------|
| Intel Data Center Max Series | Support | Max 1550, 1100 |
| Intel Data Center Flex Series | Support | Flex 170 |
| Intel Arc Series | Support | Arc 770, 730M |
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |
- **Tested devices**
|Intel GPU| Status | Verified Model|
|-|-|-|
|Intel Data Center Max Series| Support| Max 1550|
|Intel Data Center Flex Series| Support| Flex 170|
|Intel Arc Series| Support| Arc 770, 730M|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
*Notes:*
- **Memory**
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
- Device memory can be a limitation when running a large model on an intel GPU. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPUs and 4.0GB for discrete GPUs.
- **Execution Unit (EU)**
- If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use.
- If the iGPU has less than 80 EUs *(Execution Unit)*, the inference speed will likely be too slow for practical use.
### Other Vendor GPU
### Nvidia GPUs
The BLAS acceleration on Nvidia GPUs through oneAPI can be obtained using the Nvidia plugins for oneAPI and the cuBLAS backend of the upstream oneMKL library. Details and instructions on how to setup the runtime and library can be found in [this section](#i-setup-environment)
**Verified devices**
- **Tested devices**
|Nvidia GPU| Status | Verified Model|
|-|-|-|
|Ampere Series| Support| A100, A4000|
|Ampere Series *(Mobile)*| Support| RTX 40 Series|
*Notes:*
- Support for Nvidia targets through oneAPI is currently limited to Linux platforms.
- Please make sure the native oneAPI MKL *(dedicated to intel CPUs and GPUs)* is not "visible" at this stage to properly setup and use the built-from-source oneMKL with cuBLAS backend in llama.cpp for Nvidia GPUs.
| Nvidia GPU | Status | Verified Model |
|--------------------------|---------|----------------|
| Ampere Series | Support | A100, A4000 |
| Ampere Series *(Mobile)* | Support | RTX 40 Series |
## Docker
The docker build option is currently limited to *intel GPU* targets.
### Build image
```sh
# Using FP16
@@ -161,11 +169,30 @@ Platform #0: Intel(R) OpenCL HD Graphics
- **Nvidia GPU**
In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.
In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cublas)-* are installed.
Installation can be verified by running the following:
```sh
nvidia-smi
```
Please make sure at least one CUDA device is available, which can be displayed like this *(here an A100-40GB Nvidia GPU)*:
```
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03 Driver Version: 535.54.03 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA A100-PCIE-40GB On | 00000000:8D:00.0 Off | 0 |
| N/A 36C P0 57W / 250W | 4MiB / 40960MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
```
2. **Install Intel® oneAPI Base toolkit**
- **For Intel GPU**
- **Base installation**
The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
@@ -177,10 +204,10 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
- **Adding support to Nvidia GPUs**
**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
**oneAPI**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
**oneMKL**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
```sh
git clone https://github.com/oneapi-src/oneMKL
@@ -212,7 +239,7 @@ When targeting an intel GPU, the user should expect one or more level-zero devic
- **Nvidia GPU**
Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
Similarly, user targetting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix]
[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
@@ -230,14 +257,10 @@ source /opt/intel/oneapi/setvars.sh
mkdir -p build && cd build
# Option 1: Use FP16 for better performance in long-prompt inference
cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# Or without "--build", run "make" next
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# Option 2: Use FP32 by default
cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build all binary
cmake --build . --config Release -j -v
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
```
#### Nvidia GPU
@@ -252,21 +275,17 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
mkdir -p build && cd build
# Option 1: Use FP16 for better performance in long-prompt inference
cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# Option 2: Use FP32 by default
cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build all binary
cmake --build . --config Release -j -v
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
```
### III. Run the inference
1. Retrieve and prepare model
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
2. Enable oneAPI running environment
@@ -295,10 +314,10 @@ found 6 SYCL devices:
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
```
| Attribute | Note |
|------------------------|-------------------------------------------------------------|
| compute capability 1.3 | Level-zero driver/runtime, recommended |
| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-zero driver/runtime, recommended |
|compute capability 3.0|OpenCL driver/runtime, slower than level-zero in most cases|
4. Launch inference
@@ -307,10 +326,10 @@ There are two device selection modes:
- Single device: Use one device target specified by the user.
- Multiple devices: Automatically select the devices with the same largest Max compute-units.
| Device selection | Parameter |
|------------------|----------------------------------------|
| Single device | --split-mode none --main-gpu DEVICE_ID |
| Multiple devices | --split-mode layer (default) |
|Device selection|Parameter|
|-|-|
|Single device|--split-mode none --main-gpu DEVICE_ID |
|Multiple devices|--split-mode layer (default)|
Examples:
@@ -339,6 +358,7 @@ Otherwise, you can run the script:
*Notes:*
- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `/bin/main` if faced with the issue.
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
```sh
@@ -419,7 +439,7 @@ cd build
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
make -j
make
```
Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
@@ -467,10 +487,10 @@ found 6 SYCL devices:
```
| Attribute | Note |
|------------------------|-----------------------------------------------------------|
| compute capability 1.3 | Level-zero running time, recommended |
| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-zero running time, recommended |
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
4. Launch inference
@@ -480,10 +500,10 @@ There are two device selection modes:
- Single device: Use one device assigned by user.
- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
| Device selection | Parameter |
|------------------|----------------------------------------|
| Single device | --split-mode none --main-gpu DEVICE_ID |
| Multiple devices | --split-mode layer (default) |
|Device selection|Parameter|
|-|-|
|Single device|--split-mode none --main-gpu DEVICE_ID |
|Multiple devices|--split-mode layer (default)|
Examples:
@@ -506,6 +526,7 @@ Otherwise, run the following wrapper script:
Note:
- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `main.exe` if faced with the issue.
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
```sh
@@ -520,23 +541,29 @@ use 1 SYCL GPUs: [0] with Max compute units:512
#### Build
| Name | Value | Function |
|--------------------|-----------------------------------|---------------------------------------------|
| LLAMA_SYCL | ON (mandatory) | Enable build with SYCL code path. |
| LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. |
| LLAMA_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
|Name|Value|Function|
|-|-|-|
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path.|
|LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA|Set the SYCL target device type.|
|LLAMA_SYCL_F16|OFF *(default)* \|ON *(optional)*|Enable FP16 build with SYCL code path.|
|CMAKE_C_COMPILER|icx|Set *icx* compiler for SYCL code path.|
|CMAKE_CXX_COMPILER|icpx *(Linux)*, icx *(Windows)*|Set `icpx/icx` compiler for SYCL code path.|
#### Runtime
| Name | Value | Function |
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
|Name|Value|Function|
|-|-|-|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|
## Known Issues
- Hanging during startup
llama.cpp uses *mmap* as the default mode for reading the model file and copying it to the GPU. In some systems, `memcpy` might behave abnormally and therefore hang.
- **Solution**: add `--no-mmap` or `--mmap 0` flag to the `main` executable.
- `Split-mode:[row]` is not supported.
## Q&A
@@ -548,7 +575,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
- General compiler error:
- Remove **build** folder or try a clean-build.
- Remove build folder or try a clean-build.
- I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux.
@@ -565,6 +592,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512
### **GitHub contribution**:
Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
## TODO
## Todo
- Support row layer split for multiple card runs.

View File

@@ -10,7 +10,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Recent API changes
- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
@@ -19,13 +18,12 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Hot topics
- **MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387**
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
- Multi-GPU pipeline parallelizm support https://github.com/ggerganov/llama.cpp/pull/6017
- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
- Support loading sharded model, using `gguf-split` CLI https://github.com/ggerganov/llama.cpp/pull/6187
----
@@ -94,7 +92,6 @@ Typically finetunes of the base models below are supported as well.
- [x] LLaMA 2 🦙🦙
- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
- [X] Falcon
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
@@ -120,10 +117,6 @@ Typically finetunes of the base models below are supported as well.
- [x] [Mamba](https://github.com/state-spaces/mamba)
- [x] [Xverse](https://huggingface.co/models?search=xverse)
- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
**Multimodal models:**
@@ -147,7 +140,6 @@ Typically finetunes of the base models below are supported as well.
- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
@@ -185,10 +177,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [Msty](https://msty.app) (proprietary)
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
- [Dot](https://github.com/alexpinel/Dot) (GPL)
- [MindMac](https://mindmac.app) (proprietary)
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
---
@@ -486,20 +475,20 @@ Building the program with BLAS support may lead to some performance improvements
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
| Option | Legal values | Default | Description |
|--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
| Option | Legal values | Default | Description |
|--------------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
- #### hipBLAS
This provides BLAS acceleration on HIP-supported AMD GPUs.
Make sure to have ROCm installed.
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
- Using `make`:
```bash
@@ -516,7 +505,7 @@ Building the program with BLAS support may lead to some performance improvements
- Using `make` (example for target gfx1030, build with 16 CPU threads):
```bash
make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gxf1030
```
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
@@ -524,7 +513,7 @@ Building the program with BLAS support may lead to some performance improvements
set PATH=%HIP_PATH%\bin;%PATH%
mkdir build
cd build
cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release ..
cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..
cmake --build .
```
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
@@ -535,11 +524,11 @@ Building the program with BLAS support may lead to some performance improvements
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
| Option | Legal values | Default | Description |
|-------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| Option | Legal values | Default | Description |
|-------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- #### CLBlast
@@ -747,11 +736,11 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
| Model | Original size | Quantized size (Q4_0) |
|------:|--------------:|----------------------:|
| 7B | 13 GB | 3.9 GB |
| 13B | 24 GB | 7.8 GB |
| 30B | 60 GB | 19.5 GB |
| 65B | 120 GB | 38.5 GB |
|------:|--------------:|-----------------------:|
| 7B | 13 GB | 3.9 GB |
| 13B | 24 GB | 7.8 GB |
| 30B | 60 GB | 19.5 GB |
| 65B | 120 GB | 38.5 GB |
### Quantization
@@ -759,7 +748,7 @@ Several quantization methods are supported. They differ in the resulting model d
*(outdated)*
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
| 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
| 7B | file size | 13.0G | 3.5G | 3.9G | 4.3G | 4.7G | 6.7G |

View File

@@ -1,67 +0,0 @@
# Security Policy
- [**Using llama.cpp securely**](#using-llamacpp-securely)
- [Untrusted models](#untrusted-models)
- [Untrusted inputs](#untrusted-inputs)
- [Data privacy](#data-privacy)
- [Untrusted environments or networks](#untrusted-environments-or-networks)
- [Multi-Tenant environments](#multi-tenant-environments)
- [**Reporting a vulnerability**](#reporting-a-vulnerability)
## Using llama.cpp securely
### Untrusted models
Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources.
*Always execute untrusted models within a secure, isolated environment such as a sandbox* (e.g., containers, virtual machines). This helps protect your system from potentially malicious code.
> [!NOTE]
> The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance.
### Untrusted inputs
Some models accept various input formats (text, images, audio, etc.). The libraries converting these inputs have varying security levels, so it's crucial to isolate the model and carefully pre-process inputs to mitigate script injection risks.
For maximum security when handling untrusted inputs, you may need to employ the following:
* Sandboxing: Isolate the environment where the inference happens.
* Pre-analysis: Check how the model performs by default when exposed to prompt injection (e.g. using [fuzzing for prompt injection](https://github.com/FonduAI/awesome-prompt-injection?tab=readme-ov-file#tools)). This will give you leads on how hard you will have to work on the next topics.
* Updates: Keep both LLaMA C++ and your libraries updated with the latest security patches.
* Input Sanitation: Before feeding data to the model, sanitize inputs rigorously. This involves techniques such as:
* Validation: Enforce strict rules on allowed characters and data types.
* Filtering: Remove potentially malicious scripts or code fragments.
* Encoding: Convert special characters into safe representations.
* Verification: Run tooling that identifies potential script injections (e.g. [models that detect prompt injection attempts](https://python.langchain.com/docs/guides/safety/hugging_face_prompt_injection)).
### Data privacy
To protect sensitive data from potential leaks or unauthorized access, it is crucial to sandbox the model execution. This means running the model in a secure, isolated environment, which helps mitigate many attack vectors.
### Untrusted environments or networks
If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value
* Encrypt your data if sending it over the network.
### Multi-Tenant environments
If you intend to run multiple models in parallel with shared memory, it is your responsibility to ensure the models do not interact or access each other's data. The primary areas of concern are tenant isolation, resource allocation, model sharing and hardware attacks.
1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity.
2. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.
3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
4. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
## Reporting a vulnerability
Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.
<!-- normal version -->
However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.

View File

@@ -128,14 +128,14 @@ pub fn build(b: *std.build.Builder) !void {
const clip = make.obj("clip", "examples/llava/clip.cpp");
const llava = make.obj("llava", "examples/llava/llava.cpp");
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}

View File

@@ -153,52 +153,6 @@ function gg_sum_ctest_release {
gg_printf '```\n'
}
# test_scripts_debug
function gg_run_test_scripts_debug {
cd ${SRC}
set -e
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
set +e
}
function gg_sum_test_scripts_debug {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs test scripts in debug mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
gg_printf '```\n'
gg_printf '\n'
}
# test_scripts_release
function gg_run_test_scripts_release {
cd ${SRC}
set -e
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
set +e
}
function gg_sum_test_scripts_release {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs test scripts in release mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
gg_printf '```\n'
gg_printf '\n'
}
function gg_get_model {
local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
@@ -688,9 +642,6 @@ test $ret -eq 0 && gg_run ctest_release
if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run embd_bge_small
test $ret -eq 0 && gg_run test_scripts_debug
test $ret -eq 0 && gg_run test_scripts_release
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ]; then
test $ret -eq 0 && gg_run open_llama_3b_v2

View File

@@ -47,6 +47,9 @@ if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
set(TARGET json-schema-to-grammar)
add_library(${TARGET} OBJECT json-schema-to-grammar.cpp json-schema-to-grammar.h)
set(TARGET common)
add_library(${TARGET} STATIC
@@ -60,7 +63,6 @@ add_library(${TARGET} STATIC
grammar-parser.h
grammar-parser.cpp
json.hpp
json-schema-to-grammar.cpp
train.h
train.cpp
ngram-cache.h

View File

@@ -1,6 +1,4 @@
#include "common.h"
#include "json.hpp"
#include "json-schema-to-grammar.h"
#include "llama.h"
#include <algorithm>
@@ -18,7 +16,6 @@
#include <unordered_set>
#include <vector>
#include <cinttypes>
#include <codecvt>
#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>
@@ -30,6 +27,7 @@
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <codecvt>
#include <locale>
#include <windows.h>
#include <fcntl.h>
@@ -70,8 +68,6 @@
#define LLAMA_CURL_MAX_HEADER_LENGTH 256
#endif // LLAMA_USE_CURL
using json = nlohmann::ordered_json;
int32_t get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
@@ -1152,14 +1148,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
);
return true;
}
if (arg == "-j" || arg == "--json-schema") {
if (++i >= argc) {
invalid_param = true;
return true;
}
sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
return true;
}
if (arg == "--override-kv") {
if (++i >= argc) {
invalid_param = true;
@@ -1365,9 +1353,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
printf(" --grammar-file FNAME file to read grammar from\n");
printf(" -j SCHEMA, --json-schema SCHEMA\n");
printf(" JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n");
printf(" For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n");
printf(" --cfg-negative-prompt PROMPT\n");
printf(" negative prompt to use for guidance. (default: empty)\n");
printf(" --cfg-negative-prompt-file FNAME\n");
@@ -1515,77 +1500,6 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
GGML_UNREACHABLE();
}
// Validate if a filename is safe to use
// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
bool validate_file_name(const std::string & filename) {
if (!filename.length()) {
// Empty filename invalid
return false;
}
if (filename.length() > 255) {
// Limit at common largest possible filename on Linux filesystems
// to avoid unnecessary further validation
// (On systems with smaller limits it will be caught by the OS)
return false;
}
std::u32string filename_utf32;
try {
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
filename_utf32 = converter.from_bytes(filename);
// If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
// or invalid encodings were encountered. Reject such attempts
std::string filename_reencoded = converter.to_bytes(filename_utf32);
if (filename_reencoded != filename) {
return false;
}
} catch (const std::exception &) {
return false;
}
// Check for forbidden codepoints:
// - Control characters
// - Unicode equivalents of illegal characters
// - UTF-16 surrogate pairs
// - UTF-8 replacement character
// - Byte order mark (BOM)
// - Illegal characters: / \ : * ? " < > |
for (char32_t c : filename_utf32) {
if (c <= 0x1F // Control characters (C0)
|| c == 0x7F // Control characters (DEL)
|| (c >= 0x80 && c <= 0x9F) // Control characters (C1)
|| c == 0xFF0E // Fullwidth Full Stop (period equivalent)
|| c == 0x2215 // Division Slash (forward slash equivalent)
|| c == 0x2216 // Set Minus (backslash equivalent)
|| (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
|| c == 0xFFFD // Replacement Character (UTF-8)
|| c == 0xFEFF // Byte Order Mark (BOM)
|| c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
|| c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
return false;
}
}
// Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
// Unicode and other whitespace is not affected, only 0x20 space
if (filename.front() == ' ' || filename.back() == ' ' || filename.back() == '.') {
return false;
}
// Reject any ".." (currently stricter than necessary, it should be fine to just check for == ".." instead)
if (filename.find("..") != std::string::npos) {
return false;
}
// Reject "."
if (filename == ".") {
return false;
}
return true;
}
//
// String utils
//
@@ -1760,8 +1674,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
cparams.pooling_type = params.pooling_type;
cparams.defrag_thold = params.defrag_thold;
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.offload_kqv = !params.no_kv_offload;
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
@@ -2016,6 +1928,11 @@ struct llama_model * llama_load_model_from_url(
return NULL;
}
if (!curl) {
fprintf(stderr, "%s: error initializing libcurl\n", __func__);
return NULL;
}
if (!llama_download_file(curl, model_url, path_model)) {
return NULL;
}
@@ -2209,7 +2126,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
}
if (params.warmup) {
{
LOG("warming up the model with an empty run\n");
std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
@@ -2229,23 +2146,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
std::vector<llama_token> llama_tokenize(
const struct llama_context * ctx,
const std::string & text,
bool add_special,
bool parse_special) {
return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
bool add_bos,
bool special) {
return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
}
std::vector<llama_token> llama_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_special,
bool parse_special) {
bool add_bos,
bool special) {
// upper limit for the number of tokens
int n_tokens = text.length() + 2 * add_special;
int n_tokens = text.length() + add_bos;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);

View File

@@ -80,9 +80,6 @@ struct gpt_params {
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
@@ -159,7 +156,6 @@ struct gpt_params {
bool infill = false; // use infill mode
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run
std::string cache_type_k = "f16"; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V
@@ -183,8 +179,6 @@ std::string gpt_random_prompt(std::mt19937 & rng);
void process_escapes(std::string& input);
bool validate_file_name(const std::string & filename);
//
// String utils
//
@@ -227,14 +221,14 @@ void llama_batch_add(
std::vector<llama_token> llama_tokenize(
const struct llama_context * ctx,
const std::string & text,
bool add_special,
bool parse_special = false);
bool add_bos,
bool special = false);
std::vector<llama_token> llama_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_special,
bool parse_special = false);
bool add_bos,
bool special = false);
// tokenizes a token into a piece
// should work similar to Python's `tokenizer.id_to_piece`

View File

@@ -11,101 +11,35 @@
using json = nlohmann::ordered_json;
template <typename Iterator>
static std::string join(Iterator begin, Iterator end, const std::string & separator);
static std::string repeat(const std::string & str, size_t n);
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
if (separator_rule.empty()) {
if (min_items == 0 && max_items == 1) {
return item_rule + "?";
} else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
return item_rule + "+";
}
}
std::string result;
if (min_items > 0) {
if (item_rule_is_literal && separator_rule.empty()) {
result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
} else {
std::vector<std::string> items(min_items, item_rule);
result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
}
}
std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
if (up_to_n == 0) {
return "";
} else if (up_to_n == 1) {
return "(" + content + ")?";
} else if (!separator_rule.empty() && !prefix_with_sep) {
return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
} else {
std::string res = repeat("(" + content + " ", up_to_n);
// strip trailing space
res = res.substr(0, res.length() - 1);
res += repeat(")?", up_to_n);
return res;
}
};
if (min_items > 0 && max_items != min_items) {
result += " ";
}
if (max_items != std::numeric_limits<int>::max()) {
result += opt_repetitions(max_items - min_items, min_items > 0);
} else {
std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
if (min_items == 0 && !separator_rule.empty()) {
result = "(" + item_rule + " " + item_operator + "*)?";
} else {
result += item_operator + "*";
}
}
return result;
}
const std::string SPACE_RULE = "\" \"?";
struct BuiltinRule {
std::string content;
std::vector<std::string> deps;
std::unordered_map<std::string, std::string> PRIMITIVE_RULES = {
{"boolean", "(\"true\" | \"false\") space"},
{"number", "(\"-\"? ([0-9] | [1-9] [0-9]*)) (\".\" [0-9]+)? ([eE] [-+]? [0-9]+)? space"},
{"integer", "(\"-\"? ([0-9] | [1-9] [0-9]*)) space"},
{"value", "object | array | string | number | boolean"},
{"object", "\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space"},
{"array", "\"[\" space ( value (\",\" space value)* )? \"]\" space"},
{"uuid", "\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space"},
{"string", " \"\\\"\" (\n"
" [^\"\\\\] |\n"
" \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])\n"
" )* \"\\\"\" space"},
{"null", "\"null\" space"}
};
std::vector<std::string> OBJECT_RULE_NAMES = {"object", "array", "string", "number", "boolean", "null", "value"};
const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
{"boolean", {"(\"true\" | \"false\") space", {}}},
{"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
{"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
{"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
{"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
{"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
{"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
{"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
{"null", {"\"null\" space", {}}},
};
std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
{"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
{"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
{"date-time", {"date \"T\" time", {"date", "time"}}},
{"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
{"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
{"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}}
std::unordered_map<std::string, std::string> DATE_RULES = {
{"date", "[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )"},
{"time", "([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )"},
{"date-time", "date \"T\" time"},
{"date-string", "\"\\\"\" date \"\\\"\" space"},
{"time-string", "\"\\\"\" time \"\\\"\" space"},
{"date-time-string", "\"\\\"\" date-time \"\\\"\" space"}
};
static bool is_reserved_name(const std::string & name) {
@@ -113,7 +47,7 @@ static bool is_reserved_name(const std::string & name) {
if (RESERVED_NAMES.empty()) {
RESERVED_NAMES.insert("root");
for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
for (const auto &p : DATE_RULES) RESERVED_NAMES.insert(p.first);
}
return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
}
@@ -258,7 +192,7 @@ private:
if (_dotall) {
rule = "[\\U00000000-\\U0010FFFF]";
} else {
rule = "[^\\x0A\\x0D]";
rule = "[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]";
}
return _add_rule("dot", rule);
};
@@ -374,21 +308,47 @@ private:
auto &sub = last.first;
auto sub_is_literal = last.second;
if (!sub_is_literal) {
std::string & sub_id = sub_rule_ids[sub];
if (sub_id.empty()) {
sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub);
if (min_times == 0 && max_times == std::numeric_limits<int>::max()) {
sub += "*";
} else if (min_times == 0 && max_times == 1) {
sub += "?";
} else if (min_times == 1 && max_times == std::numeric_limits<int>::max()) {
sub += "+";
} else {
if (!sub_is_literal) {
std::string & sub_id = sub_rule_ids[sub];
if (sub_id.empty()) {
sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub);
}
sub = sub_id;
}
sub = sub_id;
std::string result;
if (sub_is_literal && min_times > 0) {
result = "\"" + repeat(sub.substr(1, sub.length() - 2), min_times) + "\"";
} else {
for (int j = 0; j < min_times; j++) {
if (j > 0) {
result += " ";
}
result += sub;
}
}
if (min_times > 0 && min_times < max_times) {
result += " ";
}
if (max_times == std::numeric_limits<int>::max()) {
result += sub + "*";
} else {
for (int j = min_times; j < max_times; j++) {
if (j > min_times) {
result += " ";
}
result += sub + "?";
}
}
seq.back().first = result;
seq.back().second = false;
}
seq.back().first = build_repetition(
sub_is_literal ? "\"" + sub + "\"" : sub,
min_times,
max_times,
"",
sub_is_literal
);
seq.back().second = false;
} else {
std::string literal;
auto is_non_literal = [&](char c) {
@@ -464,7 +424,7 @@ private:
if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
std::string kv_rule = _add_rule(sub_name + "-kv", _add_rule("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
prop_kv_rule_names["*"] = kv_rule;
optional_props.push_back("*");
}
@@ -526,25 +486,6 @@ private:
return rule;
}
std::string _add_primitive(const std::string & name, const BuiltinRule & rule) {
auto n = _add_rule(name, rule.content);
for (const auto & dep : rule.deps) {
BuiltinRule dep_rule;
auto it = PRIMITIVE_RULES.find(dep);
if (it == PRIMITIVE_RULES.end()) {
it = STRING_FORMAT_RULES.find(dep);
if (it == STRING_FORMAT_RULES.end()) {
_errors.push_back("Rule " + dep + " not known");
continue;
}
}
if (_rules.find(dep) == _rules.end()) {
_add_primitive(dep, it->second);
}
}
return n;
}
public:
SchemaConverter(
const std::function<json(const std::string &)> & fetch_json,
@@ -706,33 +647,49 @@ public:
return _add_rule(rule_name, rule);
} else {
std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
std::string list_item_operator = "( \",\" space " + item_rule_name + " )";
std::string successive_items;
int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0;
json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();
return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : -1;
if (min_items > 0) {
successive_items += repeat(list_item_operator, min_items - 1);
min_items--;
}
if (max_items >= 0 && max_items > min_items) {
successive_items += repeat(list_item_operator + "?", max_items - min_items - 1);
} else {
successive_items += list_item_operator + "*";
}
std::string rule;
if (min_items == 0) {
rule = "\"[\" space ( " + item_rule_name + " " + successive_items + " )? \"]\" space";
} else {
rule = "\"[\" space " + item_rule_name + " " + successive_items + " \"]\" space";
}
return _add_rule(rule_name, rule);
}
} else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
return _visit_pattern(schema["pattern"], rule_name);
} else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
return _add_primitive(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
} else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) {
auto prim_name = schema_format + "-string";
return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name)));
} else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) {
std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
return _add_rule(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
} else if ((schema_type.is_null() || schema_type == "string") && DATE_RULES.find(schema_format) != DATE_RULES.end()) {
for (const auto & kv : DATE_RULES) {
_add_rule(kv.first, kv.second);
}
return schema_format + "-string";
} else if (schema.empty() || schema_type == "object") {
return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
for (const auto & n : OBJECT_RULE_NAMES) {
_add_rule(n, PRIMITIVE_RULES.at(n));
}
return _add_rule(rule_name, "object");
} else {
if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
_errors.push_back("Unrecognized schema: " + schema.dump());
return "";
}
// TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
return _add_primitive(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
return _add_rule(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
}
}

View File

@@ -129,7 +129,7 @@ llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
int idx = -1);
int idx = 0);
// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
llama_token_data_array llama_sampling_prepare(

View File

@@ -43,18 +43,17 @@ AnyModel = TypeVar("AnyModel", bound="type[Model]")
class Model(ABC):
_model_classes: dict[str, type[Model]] = {}
def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool):
def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
self.dir_model = dir_model
self.ftype = ftype
self.fname_out = fname_out
self.is_big_endian = is_big_endian
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
self.use_temp_file = use_temp_file
self.is_safetensors = self._is_model_safetensors()
self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
self.part_names = self._get_part_names()
self.hparams = Model.load_hparams(self.dir_model)
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
@property
@@ -161,7 +160,7 @@ class Model(ABC):
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
@@ -228,14 +227,15 @@ class Model(ABC):
return ("pytorch_model.bin",)
return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
# used for GPT-2 BPE and WordPiece vocabs
def get_basic_vocab(self) -> tuple[list[str], list[int]]:
def _set_vocab_gpt2(self):
dir_model = self.dir_model
hparams = self.hparams
tokens: list[str] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
tokenizer = AutoTokenizer.from_pretrained(dir_model)
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
@@ -255,15 +255,11 @@ class Model(ABC):
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
return tokens, toktypes
def _set_vocab_gpt2(self) -> None:
tokens, toktypes = self.get_basic_vocab()
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_qwen(self):
@@ -327,7 +323,8 @@ class Model(ABC):
toktypes: list[int] = []
if not tokenizer_path.is_file():
raise FileNotFoundError(f"File not found: {tokenizer_path}")
print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
sys.exit(1)
tokenizer = SentencePieceProcessor(str(tokenizer_path))
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
@@ -514,17 +511,6 @@ class BloomModel(Model):
class MPTModel(Model):
model_arch = gguf.MODEL_ARCH.MPT
def set_vocab(self):
try:
self._set_vocab_gpt2()
except Exception:
# Fallback for SEA-LION model
self._set_vocab_sentencepiece()
self.gguf_writer.add_add_bos_token(False)
self.gguf_writer.add_pad_token_id(3)
self.gguf_writer.add_eos_token_id(1)
self.gguf_writer.add_unk_token_id(0)
def set_gguf_parameters(self):
block_count = self.hparams["n_layers"]
self.gguf_writer.add_name(self.dir_model.name)
@@ -538,10 +524,7 @@ class MPTModel(Model):
self.gguf_writer.add_layer_norm_eps(1e-5)
if self.hparams["attn_config"]["clip_qkv"] is not None:
self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
if self.hparams["attn_config"]["alibi"]:
self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])
else:
self.gguf_writer.add_max_alibi_bias(0.0)
self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])
def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
@@ -1221,14 +1204,6 @@ class LlamaModel(Model):
except FileNotFoundError:
self._set_vocab_llama_hf()
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
special_vocab._set_special_token("prefix", 32007)
special_vocab._set_special_token("suffix", 32008)
special_vocab._set_special_token("middle", 32009)
special_vocab._set_special_token("eot", 32010)
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
@@ -1241,8 +1216,6 @@ class LlamaModel(Model):
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
n_head = self.hparams.get("num_attention_heads")
n_kv_head = self.hparams.get("num_key_value_heads")
n_experts = self.hparams.get("num_local_experts")
experts = dict()
for name, data_torch in self.get_tensors():
# we don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
@@ -1263,153 +1236,6 @@ class LlamaModel(Model):
data = data.squeeze()
# process the experts separately
if name.find("block_sparse_moe.experts") != -1:
experts[name] = data
if len(experts) >= n_experts:
# merge the experts into a single 3d tensor
for bid in range(block_count):
for wid in range(1, 4):
full = True
for xid in range(n_experts):
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
if ename not in experts:
full = False
break
if not full:
continue
datas = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
datas.append(experts[ename])
del experts[ename]
data = np.stack(datas, axis=0)
data_dtype = data.dtype
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
if self.ftype == 1 and data_dtype == np.float32:
data = data.astype(np.float16)
merged_name = f"layers.{bid}.feed_forward.experts.w{wid}.weight"
new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
continue
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# 1d tensors need to be converted to float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts.keys()}")
@Model.register("GrokForCausalLM")
class GrokModel(Model):
model_arch = gguf.MODEL_ARCH.GROK
def set_vocab(self):
self._set_vocab_sentencepiece()
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_name("Grok")
def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
n_experts = self.hparams.get("num_local_experts")
experts = dict()
for name, data_torch in self.get_tensors():
# we don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
continue
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
data = data_torch.squeeze().numpy()
# process the experts separately
if name.find(".moe.") != -1:
experts[name] = data
if len(experts) >= n_experts:
# merge the experts into a single 3d tensor
for bid in range(block_count):
for wid in ["linear", "linear_1", "linear_v"]:
full = True
for xid in range(n_experts):
ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
if ename not in experts:
full = False
break
if not full:
continue
datas = []
for xid in range(n_experts):
ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
datas.append(experts[ename])
del experts[ename]
data = np.stack(datas, axis=0)
data_dtype = data.dtype
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
if self.ftype == 1 and data_dtype == np.float32:
data = data.astype(np.float16)
merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
continue
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
@@ -1436,100 +1262,19 @@ class GrokModel(Model):
self.gguf_writer.add_tensor(new_name, data)
@Model.register("DbrxForCausalLM")
class DbrxModel(Model):
model_arch = gguf.MODEL_ARCH.DBRX
@Model.register("GrokForCausalLM")
class GrokModel(Model):
model_arch = gguf.MODEL_ARCH.GROK
def set_vocab(self):
self._set_vocab_sentencepiece()
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def set_gguf_parameters(self):
ffn_config = self.hparams["ffn_config"]
attn_config = self.hparams["attn_config"]
self.gguf_writer.add_name(self.hparams["model_type"])
self.gguf_writer.add_block_count(self.hparams["n_layers"])
self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
self.gguf_writer.add_head_count(self.hparams["n_heads"])
self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
self.gguf_writer.add_layer_norm_eps(1e-5)
self.gguf_writer.add_file_type(self.ftype)
print(f"gguf: file type = {self.ftype}")
def write_tensors(self):
block_count = self.hparams.get("n_layers")
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
for name, data_torch in self.get_tensors():
n_expert = self.hparams["ffn_config"]["moe_num_experts"]
n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
n_embd = self.hparams["d_model"]
# Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
# original implementation expects (n_expert, n_ff, n_embd) for all experts weights
# But llama.cpp moe graph works differently
# AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
# so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
"ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert}
"ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
experts = False
for exp_tensor_name in exp_tensor_names.keys():
if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
experts = True
data_torch = data_torch.view(n_expert, n_ff, n_embd)
if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
data_torch = data_torch.permute(*permute_tensor)
break
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
data = data_torch.squeeze().numpy()
# map tensor names
# In MoE models the ffn tensors are typically most of the model weights,
# and need to be quantizable. Quantize expects tensor names to be suffixed by .weight.
# Every other model has the weight names ending in .weight,
# let's assume that is the convention which is not the case for dbrx:
# https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# Most of the codebase that takes in 1D tensors only handles F32 tensors
# and most of the outputs tensors are F32.
if data_dtype != np.float32 and n_dims == 1:
print(f"Can not map tensor {name!r}: all 1D tensors must be F32")
sys.exit()
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1:
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
super().set_gguf_parameters()
self.gguf_writer.add_name("Grok")
@Model.register("MiniCPMForCausalLM")
@@ -2151,25 +1896,34 @@ class BertModel(Model):
self.gguf_writer.add_pooling_type(pooling_type)
def set_vocab(self):
tokens, toktypes = self.get_basic_vocab()
self.vocab_size = len(tokens)
# use huggingface vocab to get all tokens
vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
tokens, scores, toktypes = zip(*vocab.all_tokens())
assert len(tokens) == vocab.vocab_size
self.vocab_size = vocab.vocab_size
# we need this to validate the size of the token_type embeddings
# though currently we are passing all zeros to the token_type embeddings
self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B"
n_token_types = len(set(toktypes))
self.gguf_writer.add_token_type_count(n_token_types)
# convert to phantom space vocab
def phantom(tok):
if tok.startswith("[") and tok.endswith("]"):
def phantom(tok, typ):
if tok.startswith(b"[") and tok.endswith(b"]"):
return tok
if tok.startswith("##"):
if tok.startswith(b"##"):
return tok[2:]
return "\u2581" + tok
tokens = list(map(phantom, tokens))
return b"\xe2\x96\x81" + tok
tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
# set up bos and eos tokens (cls and sep)
self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
# add vocab to gguf
self.gguf_writer.add_tokenizer_model("bert")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
# handle special tokens
@@ -2241,6 +1995,16 @@ class NomicBertModel(BertModel):
super().set_gguf_parameters()
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
def get_tensors(self):
assert self.vocab_size is not None
for name, data in super().get_tensors():
# Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
data = data[:self.vocab_size, :]
yield name, data
@Model.register("GemmaForCausalLM")
class GemmaModel(Model):
@@ -2248,13 +2012,6 @@ class GemmaModel(Model):
def set_vocab(self):
self._set_vocab_sentencepiece()
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
special_vocab._set_special_token("prefix", 67)
special_vocab._set_special_token("suffix", 69)
special_vocab._set_special_token("middle", 68)
special_vocab._set_special_token("eot", 70)
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
hparams = self.hparams
@@ -2423,8 +2180,7 @@ class MambaModel(Model):
data = data.astype(np.float32)
# if f16 desired, convert big float32 2-dim weight tensors to float16
new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else ""
if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
@@ -2475,7 +2231,6 @@ def parse_args() -> argparse.Namespace:
"model", type=Path,
help="directory containing model file",
)
parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)")
return parser.parse_args()
@@ -2519,7 +2274,7 @@ def main() -> None:
with torch.inference_mode():
model_class = Model.from_model_architecture(hparams["architectures"][0])
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file)
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
print("Set model parameters")
model_instance.set_gguf_parameters()

View File

@@ -1,6 +1,4 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import os
import sys

View File

@@ -33,7 +33,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
import gguf
if TYPE_CHECKING:
from typing_extensions import Self, TypeAlias
from typing import TypeAlias
if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
faulthandler.register(signal.SIGUSR1)
@@ -139,8 +139,7 @@ class GGMLFileType(enum.IntEnum):
dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
if dt is None:
raise ValueError(self)
# Convert all 1D tensors to F32. Most of the codebase that takes in 1D tensors only handles F32 tensors, and most of the outputs tensors are F32.
# Also The 1d tensors aren't much of a performance/size issue. So instead of having to have separate F32 and F16 implementations of both, just convert everything to F32 for now.
# 1D tensors are always F32.
return dt if len(tensor.shape) > 1 else DT_F32
@@ -517,7 +516,7 @@ class LlamaHfVocab(Vocab):
tokenizer_model = "llama"
name = "hfft"
def __init__(self, base_path: Path):
def __init__(self, base_path: Path, ignore_nonllama: bool = False):
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
# if this fails, FileNotFoundError propagates to caller
with open(fname_tokenizer, encoding='utf-8') as f:
@@ -525,7 +524,9 @@ class LlamaHfVocab(Vocab):
# pre-check so we know if we need transformers
tokenizer_model: dict[str, Any] = tokenizer_json['model']
if (
if ignore_nonllama:
pass # workaround incorrect use of this class for WordPiece
elif (
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'Sequence'
):
@@ -645,17 +646,16 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
class Tensor(ABC):
ndarray: NDArray
data_type: DataType
@abstractmethod
def astype(self, data_type: DataType) -> Self: ...
def astype(self, data_type: DataType) -> Tensor: ...
@abstractmethod
def permute(self, n_head: int, n_head_kv: int) -> Self: ...
def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
@abstractmethod
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> Self: ...
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
@abstractmethod
def part(self, n_part: int) -> Self: ...
def part(self, n_part: int) -> UnquantizedTensor: ...
@abstractmethod
def to_ggml(self) -> GGMLCompatibleTensor: ...
@@ -672,13 +672,13 @@ class UnquantizedTensor(Tensor):
self.ndarray = ndarray
self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
def astype(self, data_type: DataType) -> UnquantizedTensor:
def astype(self, data_type: DataType) -> Tensor:
dtype = data_type.dtype
if self.data_type == DT_BF16:
self.ndarray = bf16_to_fp32(self.ndarray)
return UnquantizedTensor(self.ndarray.astype(dtype))
def to_ggml(self) -> Self:
def to_ggml(self) -> UnquantizedTensor:
return self
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
@@ -828,15 +828,6 @@ def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
def pack_experts_lazy(lazy_tensors: list[LazyTensor]) -> LazyTensor:
def load() -> Tensor:
tensors = [lazy_tensor.load() for lazy_tensor in lazy_tensors]
return UnquantizedTensor(np.array([tensor.ndarray for tensor in tensors]))
s = lazy_tensors[0].shape.copy()
s.insert(0, len(lazy_tensors))
return LazyTensor(load, s, lazy_tensors[0].data_type, 'pack_experts ' + ' | '.join(lt.description for lt in lazy_tensors))
# Functionality that simulates `torch.load` but where individual tensors are
# only loaded into memory on demand, not all at once.
# PyTorch can't do this natively as of time of writing:
@@ -1255,22 +1246,6 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
tmp = model
# merge experts into one tensor
if params.n_experts and params.n_experts > 0:
for i_l in range(params.n_layer):
for w in range(1, 4):
experts = []
for e in range(params.n_experts):
if f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight" in model:
experts.append(model[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"])
del tmp[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"]
elif f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight" in model:
experts.append(model[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"])
del tmp[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"]
else:
raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight")
tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts)
# HF models permut or pack some of the tensors, so we need to undo that
for i in itertools.count():
if f"model.layers.{i}.self_attn.q_proj.weight" in model:
@@ -1350,7 +1325,7 @@ def load_some_model(path: Path) -> ModelPlus:
# Be extra-friendly and accept either a file or a directory:
if path.is_dir():
# Check if it's a set of safetensors files first
globs = ["model-00001-of-*.safetensors", "model.safetensors", "consolidated.safetensors"]
globs = ["model-00001-of-*.safetensors", "model.safetensors"]
files = [file for glob in globs for file in path.glob(glob)]
if not files:
# Try the PyTorch patterns too, with lower priority

View File

@@ -1,119 +0,0 @@
## Add a new model architecture to `llama.cpp`
Adding a model requires few steps:
1. Convert the model to GGUF
2. Define the model architecture in `llama.cpp`
3. Build the GGML graph implementation
After following these steps, you can open PR.
Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
- [main](../examples/main)
- [imatrix](../examples/imatrix)
- [quantize](../examples/quantize)
- [server](../examples/server)
### 1. Convert the model to GGUF
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
The required steps to implement for an HF model are:
1. Define the model `Model.register` annotation in a new `Model` subclass, example:
```python
@Model.register("MyModelForCausalLM")
class MyModel(Model):
model_arch = gguf.MODEL_ARCH.GROK
```
2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py)
Add an enum entry in `MODEL_ARCH`, the model human friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`.
Example for `falcon` model:
```python
MODEL_ARCH.FALCON: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_NORM_2,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
]
```
3. Map the original tensor names to the standardize equivalent in GGUF
As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.
Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file.
If the tensor name is part of a repetitive layer/block, the key word `bid` substitutes it.
Example for the normalization tensor in attention layers:
```python
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
# Attention norm
MODEL_TENSOR.ATTN_NORM: (
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen
"transformer.blocks.{bid}.norm_1", # mpt
...
)
}
```
`transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF.
Depending on the model configuration, tokenizer, code and tensors layout, you will have to override:
- `Model#set_gguf_parameters`
- `Model#set_vocab`
- `Model#write_tensors`
NOTE: Tensor names must end with `.weight` suffix, that is the convention and several tools like `quantize` expect this to proceed the weights.
### 2. Define the model architecture in `llama.cpp`
The model params and tensors layout must be defined in `llama.cpp`:
1. Define a new `llm_arch`
2. Define the tensors layout in `LLM_TENSOR_NAMES`
3. Add any non standard metadata in `llm_load_hparams`
4. Create the tensors for inference in `llm_load_tensors`
5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
### 3. Build the GGML graph implementation
This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
Have a look to existing implementation like `build_llama`, `build_dbrx` or `build_bert`.
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support of missing backend operations can be added in another PR.
Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
## GGUF specification
https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
## Resources
- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268
- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009
- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283
- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406
- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423
- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204
- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491
- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515
- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948

View File

@@ -19,7 +19,6 @@ else()
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
add_subdirectory(eval-callback)
add_subdirectory(finetune)
add_subdirectory(gritlm)
add_subdirectory(gguf-split)

View File

@@ -10,16 +10,16 @@ There are 2 modes of operation:
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
```bash
./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99
# custom set of batches
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
```
## Sample results

View File

@@ -32,15 +32,13 @@ int main(int argc, char ** argv) {
gpt_params params;
if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
printf(" example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
return 1 ;
}
int n_kv_max = 2048;
int n_batch = 2048;
int n_ubatch = 512;
int is_pp_shared = 0;
int n_gpu_layers = 0;
@@ -58,31 +56,23 @@ int main(int argc, char ** argv) {
}
if (argc >= 4) {
n_batch = std::atoi(argv[3]);
is_pp_shared = std::atoi(argv[3]);
}
if (argc >= 5) {
n_ubatch = std::atoi(argv[4]);
n_gpu_layers = std::atoi(argv[4]);
}
if (argc >= 6) {
is_pp_shared = std::atoi(argv[5]);
n_pp = parse_list(argv[5]);
}
if (argc >= 7) {
n_gpu_layers = std::atoi(argv[6]);
n_tg = parse_list(argv[6]);
}
if (argc >= 8) {
n_pp = parse_list(argv[7]);
}
if (argc >= 9) {
n_tg = parse_list(argv[8]);
}
if (argc >= 10) {
n_pl = parse_list(argv[9]);
n_pl = parse_list(argv[7]);
}
// init LLM
@@ -110,8 +100,7 @@ int main(int argc, char ** argv) {
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max;
ctx_params.n_batch = n_batch;
ctx_params.n_ubatch = n_ubatch;
ctx_params.n_batch = 512;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -169,7 +158,7 @@ int main(int argc, char ** argv) {
}
LOG_TEE("\n");
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");

View File

@@ -123,10 +123,10 @@ int main(int argc, char ** argv) {
inputs.push_back(inp);
}
// add SEP if not present
// add eos if not present
for (auto & inp : inputs) {
if (inp.empty() || inp.back() != llama_token_sep(model)) {
inp.push_back(llama_token_sep(model));
if (inp.empty() || inp.back() != llama_token_eos(model)) {
inp.push_back(llama_token_eos(model));
}
}

View File

@@ -1,9 +0,0 @@
set(TARGET eval-callback)
add_executable(${TARGET} eval-callback.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TEST_TARGET test-eval-callback)
add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)

View File

@@ -1,95 +0,0 @@
# llama.cpp/examples/eval-callback
A simple example which demonstrates how to use callback during the inference.
It simply prints to the console all operations and tensor data.
Usage:
```shell
eval-callback \
--hf-repo ggml-org/models \
--hf-file phi-2/ggml-model-q4_0.gguf \
--model phi-2-q4_0.gguf \
--prompt hello \
--seed 42 \
-ngl 33
```
Will print:
```shell
llm_load_tensors: offloaded 33/33 layers to GPU
...
llama_new_context_with_model: n_ctx = 512
...
llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB
llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB
llama_new_context_with_model: graph nodes = 1225
llama_new_context_with_model: graph splits = 2
ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1}
[
[
[ -0.0181, 0.0272, 0.0272, ...],
],
]
ggml_debug: norm-0 = (f32) NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
[
[
[ -0.6989, 1.0636, 1.0636, ...],
],
]
ggml_debug: norm_w-0 = (f32) MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
[
[
[ -0.1800, 0.2817, 0.2632, ...],
],
]
ggml_debug: attn_norm-0 = (f32) ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
[
[
[ -0.1863, 0.2970, 0.2604, ...],
],
]
ggml_debug: wqkv-0 = (f32) MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1}
[
[
[ -1.1238, 1.2876, -1.8086, ...],
],
]
ggml_debug: bqkv-0 = (f32) ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
],
]
ggml_debug: bqkv-0 (view) = (f32) VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
],
]
ggml_debug: Qcur-0 = (f32) CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
],
]
ggml_debug: Qcur-0 (reshaped) = (f32) RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
[ -0.3608, 0.5076, -1.8866, ...],
[ 1.7643, 0.0273, -2.1065, ...],
...
],
]
ggml_debug: Qcur-0 = (f32) ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
[ -0.3608, 0.5076, -1.8866, ...],
[ 1.7643, 0.0273, -2.1065, ...],
...
],
]
```

View File

@@ -1,195 +0,0 @@
#include "common.h"
#include "llama.h"
#include "ggml.h"
#include <cstdio>
#include <random>
#include <string>
#include <tuple>
#include <vector>
/**
* This the arbitrary data which will be passed to each callback.
* Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
*/
struct callback_data {
std::vector<uint8_t> data;
};
static std::string ggml_ne_string(const ggml_tensor * t) {
std::string str;
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
str += std::to_string(t->ne[i]);
if (i + 1 < GGML_MAX_DIMS) {
str += ", ";
}
}
return str;
}
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
GGML_ASSERT(n > 0);
float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
printf(" [\n");
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
if (i2 == n && ne[2] > 2*n) {
printf(" ..., \n");
i2 = ne[2] - n;
}
printf(" [\n");
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
if (i1 == n && ne[1] > 2*n) {
printf(" ..., \n");
i1 = ne[1] - n;
}
printf(" [");
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
if (i0 == n && ne[0] > 2*n) {
printf("..., ");
i0 = ne[0] - n;
}
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v;
if (type == GGML_TYPE_F16) {
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i);
} else if (type == GGML_TYPE_F32) {
v = *(float *) data + i;
} else if (type == GGML_TYPE_I32) {
v = (float) *(int32_t *) data + i;
} else if (type == GGML_TYPE_I16) {
v = (float) *(int16_t *) data + i;
} else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) data + i;
} else {
GGML_ASSERT(false);
}
printf("%12.4f", v);
sum += v;
if (i0 < ne[0] - 1) printf(", ");
}
printf("],\n");
}
printf(" ],\n");
}
printf(" ]\n");
printf(" sum = %f\n", sum);
}
}
/**
* GGML operations callback during the graph execution.
*
* @param t current tensor
* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
* see ggml_backend_sched_eval_callback
* @param user_data user data to pass at each call back
* @return true to receive data or continue the graph, false otherwise
*/
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (callback_data *) user_data;
const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];
if (ask) {
return true; // Always retrieve data
}
char src1_str[128] = {0};
if (src1) {
sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
}
printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
t->name, ggml_type_name(t->type), ggml_op_desc(t),
src0->name, ggml_ne_string(src0).c_str(),
src1 ? src1_str : "",
ggml_ne_string(t).c_str());
// copy the data from the GPU memory if needed
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
if (!is_host) {
auto n_bytes = ggml_nbytes(t);
cb_data->data.resize(n_bytes);
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
}
if (!ggml_is_quantized(t->type)) {
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
}
return true;
}
static bool run(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
return true;
}
int main(int argc, char ** argv) {
callback_data cb_data;
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}
print_build_info();
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
llama_backend_init();
llama_numa_init(params.numa);
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
params.cb_eval = ggml_debug;
params.cb_eval_user_data = &cb_data;
params.warmup = false;
// init
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
return 1;
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str());
}
bool OK = run(ctx, params);
if (!OK) {
return 1;
}
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
return 0;
}

View File

@@ -1,5 +0,0 @@
set(TARGET gbnf-validator)
add_executable(${TARGET} gbnf-validator.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common grammar-parser llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -1,132 +0,0 @@
#define LLAMA_API_INTERNAL
#include "grammar-parser.h"
#include "ggml.h"
#include "llama.h"
#include "unicode.h"
#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>
static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
auto decoded = decode_utf8(input_str, {});
const auto & code_points = decoded.first;
size_t pos = 0;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
auto prev_stacks = grammar->stacks;
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
if (grammar->stacks.empty()) {
error_pos = pos;
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
grammar->stacks = prev_stacks;
return false;
}
++pos;
}
for (const auto & stack : grammar->stacks) {
if (stack.empty()) {
return true;
}
}
error_pos = pos;
error_msg = "Unexpected end of input";
return false;
}
static void print_error_message(const std::string & input_str, size_t error_pos, const std::string & error_msg) {
fprintf(stdout, "Input string is invalid according to the grammar.\n");
fprintf(stdout, "Error: %s at position %zu\n", error_msg.c_str(), error_pos);
fprintf(stdout, "\n");
fprintf(stdout, "Input string:\n");
fprintf(stdout, "%s", input_str.substr(0, error_pos).c_str());
if (error_pos < input_str.size()) {
fprintf(stdout, "\033[1;31m%c", input_str[error_pos]);
if (error_pos+1 < input_str.size()) {
fprintf(stdout, "\033[0;31m%s", input_str.substr(error_pos+1).c_str());
}
fprintf(stdout, "\033[0m\n");
}
}
int main(int argc, char** argv) {
if (argc != 3) {
fprintf(stdout, "Usage: %s <grammar_filename> <input_filename>\n", argv[0]);
return 1;
}
const std::string grammar_filename = argv[1];
const std::string input_filename = argv[2];
// Read the GBNF grammar file
FILE* grammar_file = fopen(grammar_filename.c_str(), "r");
if (!grammar_file) {
fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str());
return 1;
}
fseek(grammar_file, 0, SEEK_END);
size_t grammar_size = ftell(grammar_file);
fseek(grammar_file, 0, SEEK_SET);
std::string grammar_str(grammar_size, ' ');
fread(&grammar_str[0], 1, grammar_size, grammar_file);
fclose(grammar_file);
// Parse the GBNF grammar
auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
// will be empty (default) if there are parse errors
if (parsed_grammar.rules.empty()) {
fprintf(stdout, "%s: failed to parse grammar\n", __func__);
return 1;
}
// Ensure that there is a "root" node.
if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) {
fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__);
return 1;
}
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
// Create the LLAMA grammar
auto grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
// Read the input file
FILE* input_file = fopen(input_filename.c_str(), "r");
if (!input_file) {
fprintf(stdout, "Failed to open input file: %s\n", input_filename.c_str());
return 1;
}
fseek(input_file, 0, SEEK_END);
size_t input_size = ftell(input_file);
fseek(input_file, 0, SEEK_SET);
std::string input_str(input_size, ' ');
fread(&input_str[0], 1, input_size, input_file);
fclose(input_file);
// Validate the input string against the grammar
size_t error_pos;
std::string error_msg;
bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg);
if (is_valid) {
fprintf(stdout, "Input string is valid according to the grammar.\n");
} else {
print_error_message(input_str, error_pos, error_msg);
}
// Clean up
llama_grammar_free(grammar);
return 0;
}

View File

@@ -5,6 +5,5 @@ CLI to split / merge GGUF files.
**Command line options:**
- `--split`: split GGUF to multiple GGUF, default operation.
- `--split-max-size`: max size per split in `M` or `G`, f.ex. `500M` or `2G`.
- `--split-max-tensors`: maximum tensors in each split: default(128)
- `--merge`: merge multiple GGUF to a single GGUF.

View File

@@ -59,10 +59,10 @@ static size_t split_str_to_n_bytes(std::string str) {
int n;
if (str.back() == 'M') {
sscanf(str.c_str(), "%d", &n);
n_bytes = (size_t)n * 1024 * 1024; // megabytes
n_bytes = n * 1024 * 1024; // megabytes
} else if (str.back() == 'G') {
sscanf(str.c_str(), "%d", &n);
n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes
n_bytes = n * 1024 * 1024 * 1024; // gigabytes
} else {
throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
}

View File

@@ -1,89 +0,0 @@
#!/bin/bash
set -eu
if [ $# -lt 1 ]
then
echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
echo "example: $0 ../../build/bin ../../tmp"
exit 1
fi
if [ $# -gt 1 ]
then
TMP_DIR=$2
else
TMP_DIR=/tmp
fi
set -x
SPLIT=$1/gguf-split
MAIN=$1/main
WORK_PATH=$TMP_DIR/gguf-split
CUR_DIR=$(pwd)
mkdir -p "$WORK_PATH"
# Clean up in case of previously failed test
rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
# 1. Get a model
(
cd $WORK_PATH
"$CUR_DIR"/../../scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
)
echo PASS
# 2. Split with max tensors strategy
$SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
echo PASS
echo
# 2b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32
echo PASS
echo
# 3. Merge
$SPLIT --merge $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-merge.gguf
echo PASS
echo
# 3b. Test the merged model is loading properly
$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
echo PASS
echo
# 4. Split with no tensor in metadata
#$SPLIT --split-max-tensors 32 --no-tensor-in-metadata $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
#echo PASS
#echo
# 4b. Test the sharded model is loading properly
#$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf --random-prompt --n-predict 32
#echo PASS
#echo
# 5. Merge
#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf
#echo PASS
#echo
# 5b. Test the merged model is loading properly
#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32
#echo PASS
#echo
# 6. Split with size strategy
$SPLIT --split-max-size 2G $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-2G
echo PASS
echo
# 6b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32
echo PASS
echo
# Clean up
rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf

View File

@@ -142,7 +142,7 @@ static bool gguf_ex_read_0(const std::string & fname) {
}
// read and create ggml_context containing the tensors and their data
static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
static bool gguf_ex_read_1(const std::string & fname) {
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
@@ -206,7 +206,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
printf("\n\n");
// check data
if (check_data) {
{
const float * data = (const float *) cur->data;
for (int j = 0; j < ggml_nelements(cur); ++j) {
if (data[j] != 100 + i) {
@@ -229,16 +229,9 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
int main(int argc, char ** argv) {
if (argc < 3) {
printf("usage: %s data.gguf r|w [n]\n", argv[0]);
printf("r: read data.gguf file\n");
printf("w: write data.gguf file\n");
printf("n: no check of tensor data\n");
printf("usage: %s data.gguf r|w\n", argv[0]);
return -1;
}
bool check_data = true;
if (argc == 4) {
check_data = false;
}
const std::string fname(argv[1]);
const std::string mode (argv[2]);
@@ -249,7 +242,7 @@ int main(int argc, char ** argv) {
GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
} else if (mode == "r") {
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
GGML_ASSERT(gguf_ex_read_1(fname, check_data) && "failed to read gguf file");
GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
}
return 0;

View File

@@ -98,36 +98,35 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
// this has been adapted to the new format of storing merged experts in a single 3d tensor
// ref: https://github.com/ggerganov/llama.cpp/pull/6387
if (t->op == GGML_OP_MUL_MAT_ID) {
const int idx = ((int32_t *) t->op_params)[0];
const ggml_tensor * ids = t->src[2];
const int n_as = src0->ne[2];
const int n_as = ((int32_t *) t->op_params)[1];
// the top-k selected expert ids are stored in the ids tensor
// for simplicity, always copy ids to host, because it is small
GGML_ASSERT(ids->ne[1] == src1->ne[1]);
m_ids.resize(ggml_nbytes(ids)/sizeof(int));
ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
auto & e = m_stats[wname];
++e.ncall;
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
// using the following line, we can correct for that if needed by replacing the line above with:
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
// the top-k selected expert ids are stored in the src0 tensor
// for simplicity, always copy src0 to host, because it is small
// take into account that src0 is not contiguous!
GGML_ASSERT(src0->ne[1] == src1->ne[1]);
GGML_ASSERT(n_as*ggml_nrows(src0)*sizeof(int) == GGML_PAD(ggml_nbytes(src0), n_as*sizeof(int)));
m_ids.resize(ggml_nbytes(src0)/sizeof(int));
ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0));
// loop over all possible experts, regardless if they are used or not in the batch
// this is necessary to guarantee equal number of "ncall" for each tensor
for (int ex = 0; ex < n_as; ++ex) {
size_t e_start = ex*src1->ne[0];
src0 = t->src[2 + ex];
wname = filter_tensor_name(src0->name);
auto& e = m_stats[wname];
if (e.values.empty()) {
e.values.resize(src1->ne[0]*n_as, 0);
e.values.resize(src1->ne[0], 0);
}
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
exit(1); //GGML_ASSERT(false);
}
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
// using the following line, we can correct for that if needed
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
++e.ncall;
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
}
@@ -137,7 +136,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (excur != ex) continue;
const float * x = data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[e_start + j] += x[j]*x[j];
e.values[j] += x[j]*x[j];
}
}
if (e.ncall > m_last_call) {
@@ -347,13 +346,12 @@ static void process_logits(
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const int n_ctx = llama_n_ctx(ctx);
auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -595,18 +593,24 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
llama_model_params mparams = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
llama_context_params cparams = llama_context_params_from_gpt_params(params);
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
params.cb_eval = ik_collect_imatrix;
params.cb_eval_user_data = NULL;
params.warmup = false;
cparams.cb_eval = ik_collect_imatrix;
cparams.cb_eval_user_data = NULL;
// init
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
llama_context * ctx = llama_new_context_with_model(model, cparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: unable to create context\n", __func__);
return 1;
}

View File

@@ -36,11 +36,6 @@ The `infill` program offers a seamless way to interact with LLaMA models, allowi
### Example
Download a model that supports infill, for example CodeLlama:
```console
scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
```
```bash
./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
```

View File

@@ -239,7 +239,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%s\n", get_system_info(params).c_str());
}
const bool add_bos = llama_should_add_bos_token(model);
GGML_ASSERT(llama_add_eos_token(model) != 1);
LOG("add_bos: %d\n", add_bos);
bool suff_rm_leading_spc = params.escape;
@@ -280,10 +279,10 @@ int main(int argc, char ** argv) {
if (ctx_guidance) {
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
original_prompt_len = original_inp.size();

View File

@@ -6,94 +6,37 @@ import re
import sys
from typing import Any, Dict, List, Set, Tuple, Union
def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False):
if not separator_rule:
if min_items == 0 and max_items == 1:
return f'{item_rule}?'
elif min_items == 1 and max_items is None:
return f'{item_rule}+'
result = ''
if min_items > 0:
if item_rule_is_literal and separator_rule is None:
result = '"' + (item_rule[1:-1] * min_items) + '"'
else:
result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items)
def opt_repetitions(up_to_n, prefix_with_sep=False):
'''
- n=4, no sep: '(a (a (a (a)?)?)?)?'
- n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?'
- n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
'''
content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule
if up_to_n == 0:
return ''
elif up_to_n == 1:
return f'({content})?'
elif separator_rule and not prefix_with_sep:
return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?'
else:
return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n)
if min_items > 0 and max_items != min_items:
result += ' '
if max_items is not None:
result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
else:
item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
if min_items == 0 and separator_rule:
result = f'({item_rule} {item_operator}*)?'
else:
result += f'{item_operator}*'
return result
class BuiltinRule:
def __init__(self, content: str, deps: list = None):
self.content = content
self.deps = deps or []
_up_to_15_digits = _build_repetition('[0-9]', 0, 15)
# whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?'
PRIMITIVE_RULES = {
'boolean' : BuiltinRule('("true" | "false") space', []),
'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []),
'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []),
'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']),
'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
'uuid' : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []),
'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []),
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
'null' : BuiltinRule('"null" space', []),
'boolean': '("true" | "false") space',
'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
'value' : 'object | array | string | number | boolean',
'object' : '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
'array' : '"[" space ( value ("," space value)* )? "]" space',
'uuid' : '"\\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + ' "\\"" space',
'string': r''' "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space''',
'null': '"null" space',
}
OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value']
# TODO: support "uri", "email" string formats
STRING_FORMAT_RULES = {
'date' : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time' : BuiltinRule('date "T" time', ['date', 'time']),
'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']),
'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
DATE_RULES = {
'date' : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )',
'time' : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
'date-time': 'date "T" time',
'date-string': '"\\"" date "\\"" space',
'time-string': '"\\"" time "\\"" space',
'date-time-string': '"\\"" date-time "\\"" space',
}
DOTALL = '[\\U00000000-\\U0010FFFF]'
DOT = '[^\\x0A\\x0D]'
RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()])
RESERVED_NAMES = set(["root", *PRIMITIVE_RULES.keys(), *DATE_RULES.keys()])
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
@@ -103,6 +46,8 @@ GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']'
NON_LITERAL_SET = set('|.()[]{}*+?')
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])'
TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\.[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits
class SchemaConverter:
def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
@@ -110,9 +55,7 @@ class SchemaConverter:
self._allow_fetch = allow_fetch
self._dotall = dotall
self._raw_pattern = raw_pattern
self._rules = {
'space': SPACE_RULE,
}
self._rules = {'space': SPACE_RULE}
self._refs = {}
self._refs_being_resolved = set()
@@ -122,29 +65,6 @@ class SchemaConverter:
)
return f'"{escaped}"'
def not_literal(self, literal: str, dotall: bool = True, maybe_escaped_underscores = False) -> str:
'''
not_literal('a') -> '[^a]'
not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?'
'''
assert len(literal) > 0, 'Empty literal not supported'
def recurse(i: int):
c = literal[i]
if maybe_escaped_underscores and c == '_':
yield f'[^{c}\\\\]'
yield ' | '
yield f'"\\\\"? "{c}"'
else:
yield f'[^{c}]'
if i < len(literal) - 1:
yield ' | '
yield self._format_literal(c)
yield ' ('
yield from recurse(i + 1)
yield ')?'
return ''.join(('(', *recurse(0), ')'))
def _add_rule(self, name, rule):
esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
if esc_name not in self._rules or self._rules[esc_name] == rule:
@@ -249,10 +169,10 @@ class SchemaConverter:
def get_dot():
if self._dotall:
rule = DOTALL
rule = '[\\U00000000-\\U0010FFFF]'
else:
# Accept any character... except \n and \r line break chars (\x0A and \xOD)
rule = DOT
rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]'
return self._add_rule(f'dot', rule)
def join_seq():
@@ -326,14 +246,26 @@ class SchemaConverter:
(sub, sub_is_literal) = seq[-1]
if not sub_is_literal:
id = sub_rule_ids.get(sub)
if id is None:
id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub)
sub_rule_ids[sub] = id
sub = id
if min_times == 0 and max_times is None:
seq[-1] = (f'{sub}*', False)
elif min_times == 0 and max_times == 1:
seq[-1] = (f'{sub}?', False)
elif min_times == 1 and max_times is None:
seq[-1] = (f'{sub}+', False)
else:
if not sub_is_literal:
id = sub_rule_ids.get(sub)
if id is None:
id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub)
sub_rule_ids[sub] = id
sub = id
seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False)
seq[-1] = (
' '.join(
([f'"{sub[1:-1] * min_times}"'] if sub_is_literal else [sub] * min_times) +
([f'{sub}?'] * (max_times - min_times) if max_times is not None else [f'{sub}*'])),
False
)
else:
literal = ''
while i < length:
@@ -441,47 +373,49 @@ class SchemaConverter:
' "]" space')
else:
item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
list_item_operator = f'( "," space {item_rule_name} )'
successive_items = ""
min_items = schema.get("minItems", 0)
max_items = schema.get("maxItems")
return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space')
if min_items > 0:
successive_items = list_item_operator * (min_items - 1)
min_items -= 1
if max_items is not None and max_items > min_items:
successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
else:
successive_items += list_item_operator + "*"
if min_items == 0:
rule = f'"[" space ( {item_rule_name} {successive_items} )? "]" space'
else:
rule = f'"[" space {item_rule_name} {successive_items} "]" space'
return self._add_rule(rule_name, rule)
elif schema_type in (None, 'string') and 'pattern' in schema:
return self._visit_pattern(schema['pattern'], rule_name)
elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''):
return self._add_primitive(
return self._add_rule(
'root' if rule_name == 'root' else schema_format,
PRIMITIVE_RULES['uuid']
)
elif schema_type in (None, 'string') and f'{schema_format}-string' in STRING_FORMAT_RULES:
prim_name = f'{schema_format}-string'
return self._add_rule(rule_name, self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name]))
elif schema_type == 'string' and ('minLength' in schema or 'maxLength' in schema):
char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
min_len = schema.get('minLength', 0)
max_len = schema.get('maxLength')
return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
elif schema_type in (None, 'string') and schema_format in DATE_RULES:
for t, r in DATE_RULES.items():
self._add_rule(t, r)
return schema_format + '-string'
elif (schema_type == 'object') or (len(schema) == 0):
return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
for n in OBJECT_RULE_NAMES:
self._add_rule(n, PRIMITIVE_RULES[n])
return self._add_rule(rule_name, 'object')
else:
assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
# TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
return self._add_primitive('root' if rule_name == 'root' else schema_type, PRIMITIVE_RULES[schema_type])
def _add_primitive(self, name: str, rule: BuiltinRule):
n = self._add_rule(name, rule.content)
for dep in rule.deps:
dep_rule = PRIMITIVE_RULES.get(dep) or STRING_FORMAT_RULES.get(dep)
assert dep_rule, f'Rule {dep} not known'
if dep not in self._rules:
self._add_primitive(dep, dep_rule)
return n
return self._add_rule(
'root' if rule_name == 'root' else schema_type,
PRIMITIVE_RULES[schema_type]
)
def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
prop_order = self._prop_order
@@ -503,7 +437,7 @@ class SchemaConverter:
value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
prop_kv_rule_names["*"] = self._add_rule(
f'{sub_name}-kv',
self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
self._add_rule('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
)
optional_props.append("*")

View File

@@ -22,7 +22,7 @@ After building, run: `./llava-cli` to see the usage. For example:
## Model conversion
1. Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
- Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
```sh
git clone https://huggingface.co/mtgv/MobileVLM-1.7B

View File

@@ -24,7 +24,7 @@ After building, run: `./llava-cli` to see the usage. For example:
## LLaVA 1.5
1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
- Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
```sh
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b

View File

@@ -146,6 +146,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
int n_past = 0;
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
std::string system_prompt, user_prompt;
size_t image_pos = prompt.find("<image>");
@@ -179,7 +180,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
}
}
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);

View File

@@ -64,10 +64,13 @@ int main(int argc, char ** argv) {
std::tie(model, ctx) = llama_init_from_gpt_params(params);
// Tokenize the prompt
const bool add_bos = llama_should_add_bos_token(model);
LOG("add_bos tgt: %d\n", add_bos);
std::vector<llama_token> inp;
std::vector<llama_token> all;
inp = ::llama_tokenize(ctx, params.prompt, true, true);
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
all = inp;
const int max_context_size = llama_n_ctx(ctx);

View File

@@ -28,8 +28,10 @@ int main(int argc, char ** argv){
GGML_ASSERT(model != nullptr);
// tokenize the prompt
const bool add_bos = llama_should_add_bos_token(model);
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, true, true);
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
fprintf(stderr, "%s: tokenization done\n", __func__);

View File

@@ -34,8 +34,11 @@ int main(int argc, char ** argv){
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
// tokenize the prompt
const bool add_bos = llama_should_add_bos_token(model);
LOG("add_bos tgt: %d\n", add_bos);
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, true, true);
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
llama_ngram_cache ngram_cache_context;
llama_ngram_cache ngram_cache_dynamic;

View File

@@ -42,8 +42,11 @@ int main(int argc, char ** argv){
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
// tokenize the prompt
const bool add_bos = llama_should_add_bos_token(model);
LOG("add_bos tgt: %d\n", add_bos);
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, true, true);
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
llama_ngram_cache ngram_cache_context;
llama_ngram_cache ngram_cache_dynamic;

View File

@@ -304,15 +304,13 @@ These options help improve the performance and memory usage of the LLaMA models.
- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.
### Grammars & JSON schemas
### Grammars
- `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax.
- `--json-schema SCHEMA`: Specify a [JSON schema](https://json-schema.org/) to constrain model output to (e.g. `{}` for any JSON object, or `{"items": {"type": "string", "minLength": 10, "maxLength": 100}, "minItems": 10}` for a JSON array of strings with size constraints). If a schema uses external `$ref`s, you should use `--grammar "$( python examples/json_schema_to_grammar.py myschema.json )"` instead.
### Quantization
For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize).
For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
## Additional Options

View File

@@ -235,7 +235,7 @@ int main(int argc, char ** argv) {
// The file exists and is not empty
session_tokens.resize(n_ctx);
size_t n_token_count_out = 0;
if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
return 1;
}
@@ -246,7 +246,6 @@ int main(int argc, char ** argv) {
}
const bool add_bos = llama_should_add_bos_token(model);
GGML_ASSERT(llama_add_eos_token(model) != 1);
LOG("add_bos: %d\n", add_bos);
std::vector<llama_token> embd_inp;
@@ -256,7 +255,7 @@ int main(int argc, char ** argv) {
if (params.chatml) {
params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
}
embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
} else {
LOG("use session tokens\n");
embd_inp = session_tokens;
@@ -278,10 +277,10 @@ int main(int argc, char ** argv) {
if (ctx_guidance) {
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
original_prompt_len = original_inp.size();
@@ -340,14 +339,14 @@ int main(int argc, char ** argv) {
}
// prefix & suffix for instruct mode
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
// chatml prefix & suffix
const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true);
const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
@@ -694,7 +693,7 @@ int main(int argc, char ** argv) {
// optionally save the session on first sample (for faster prompt loading next time)
if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
need_to_save_session = false;
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
LOG("saved session to %s\n", path_session.c_str());
}
@@ -936,7 +935,7 @@ int main(int argc, char ** argv) {
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
}
llama_print_timings(ctx);

View File

@@ -3,18 +3,19 @@
TODO
## Llama 2 70B Scorechart
| Quantization | Model size (GiB) | Perplexity | Delta to fp16 |
|--------------|------------------|------------|---------------|
| Q4_0 | 36.20 | 3.5550 | 3.61% |
| Q4_1 | 40.20 | 3.5125 | 2.37% |
| Q5_0 | 44.20 | 3.4744 | 1.26% |
| Q2_K | 27.27 | 3.7339 | 8.82% |
| Q3_K_S | 27.86 | 3.7019 | 7.89% |
| Q3_K_M | 30.83 | 3.5932 | 4.72% |
| Q3_K_L | 33.67 | 3.5617 | 3.80% |
| Q4_K_S | 36.39 | 3.4852 | 1.57% |
| Q4_K_M | 38.54 | 3.4725 | 1.20% |
| Q5_K_S | 44.20 | 3.4483 | 0.50% |
| Q5_K_M | 45.41 | 3.4451 | 0.40% |
| Q6_K | 52.70 | 3.4367 | 0.16% |
| fp16 | 128.5 | 3.4313 | - |
Quantization | Model size (GiB) | Perplexity | Delta to fp16
-- | -- | -- | --
Q4_0 | 36.20 | 3.5550 | 3.61%
Q4_1 | 40.20 | 3.5125 | 2.37%
Q5_0 | 44.20 | 3.4744 | 1.26%
Q2_K | 27.27 | 3.7339 | 8.82%
Q3_K_S | 27.86 | 3.7019 | 7.89%
Q3_K_M | 30.83 | 3.5932 | 4.72%
Q3_K_L | 33.67 | 3.5617 | 3.80%
Q4_K_S | 36.39 | 3.4852 | 1.57%
Q4_K_M | 38.54 | 3.4725 | 1.20%
Q5_K_S | 44.20 | 3.4483 | 0.50%
Q5_K_M | 45.41 | 3.4451 | 0.40%
Q6_K | 52.70 | 3.4367 | 0.16%
fp16 | 128.5 | 3.4313 | -

View File

@@ -315,11 +315,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
// BOS tokens will be added for each chunk before eval
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
const int n_ctx = llama_n_ctx(ctx);
@@ -455,7 +454,6 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// BOS tokens will be added for each chunk before eval
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
std::ofstream logits_stream;
if (!params.logits_file.empty()) {
@@ -472,7 +470,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -773,6 +771,9 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
// This is needed as usual for LLaMA models
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
// The tasks should be randomized so the score stabilizes quickly.
bool randomize_tasks = true;
@@ -817,7 +818,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
for (size_t j = 0; j < 4; j++) {
hs_cur.ending[j] = prompt_lines[idx*6+2+j];
hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], add_bos);
}
// determine the common prefix of the endings
@@ -836,7 +837,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;
//GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, true).size());
//GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, add_bos).size());
// Delete the selected random example from the prompt
if (randomize_tasks) {
@@ -1109,9 +1110,12 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
// This is needed as usual for LLaMA models
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
for (auto & task : data) {
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true);
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, add_bos);
task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, add_bos);
task.common_prefix = 0;
for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
@@ -1126,8 +1130,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
task.seq_tokens[0].size() - task.common_prefix +
task.seq_tokens[1].size() - task.common_prefix;
task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size();
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], add_bos).size();
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], add_bos).size();
}
fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
@@ -1318,7 +1322,7 @@ struct multiple_choice_task {
std::vector<float> log_probs;
};
static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) {
if (task.question.empty() || task.mc1.answers.empty()) {
if (log_error) {
printf("%s: found bad task with empty question and/or answers\n", __func__);
@@ -1333,7 +1337,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
}
return false;
}
task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true));
task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, add_bos));
}
auto min_len = task.seq_tokens.front().size();
for (auto& seq : task.seq_tokens) {
@@ -1432,6 +1436,9 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
n_task = params.multiple_choice_tasks;
}
// This is needed as usual for LLaMA models
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
printf("%s: preparing task data", __func__);
fflush(stdout);
if (n_task > 500) {
@@ -1439,7 +1446,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
fflush(stdout);
std::atomic<int> counter(0);
std::atomic<int> n_bad(0);
auto prepare = [&counter, &n_bad, &tasks, ctx] () {
auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () {
int num_tasks = tasks.size();
int n_bad_local = 0;
while (true) {
@@ -1450,7 +1457,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
}
int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
for (int i = first; i < last; ++i) {
if (!multiple_choice_prepare_one_task(ctx, tasks[i], false)) ++n_bad_local;
if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local;
}
}
};
@@ -1472,7 +1479,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
int i_task = 0;
for (auto& task : tasks) {
++i_task;
if (!multiple_choice_prepare_one_task(ctx, task, true)) {
if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) {
return;
}
if (i_task%n_dot == 0) {
@@ -1708,7 +1715,6 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
const int nv = 2*((n_vocab + 1)/2) + 4;
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
@@ -1852,20 +1858,12 @@ int main(int argc, char ** argv) {
const int32_t n_ctx = params.n_ctx;
if (n_ctx <= 0) {
fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
return 1;
}
const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence;
if (ppl) {
const int32_t n_seq = std::max(1, params.n_batch / n_ctx);
const int32_t n_kv = n_seq * n_ctx;
int n_seq = std::max(1, params.n_batch / n_ctx);
int32_t n_kv = n_seq * n_ctx;
params.n_parallel = n_seq;
params.n_ctx = n_kv;
params.n_ctx = n_kv;
params.n_batch = std::min(params.n_batch, n_kv);
} else {
params.n_batch = std::min(params.n_batch, params.n_ctx);

View File

@@ -4,17 +4,17 @@ TODO
## Llama 2 7B
| Quantization | Bits per Weight (BPW) |
|--------------|-----------------------|
| Q2_K | 3.35 |
| Q3_K_S | 3.50 |
| Q3_K_M | 3.91 |
| Q3_K_L | 4.27 |
| Q4_K_S | 4.58 |
| Q4_K_M | 4.84 |
| Q5_K_S | 5.52 |
| Q5_K_M | 5.68 |
| Q6_K | 6.56 |
Quantization | Bits per Weight (BPW)
-- | --
Q2_K | 3.35
Q3_K_S | 3.50
Q3_K_M | 3.91
Q3_K_L | 4.27
Q4_K_S | 4.58
Q4_K_M | 4.84
Q5_K_S | 5.52
Q5_K_M | 5.68
Q6_K | 6.56
## Llama 2 13B
Quantization | Bits per Weight (BPW)

View File

@@ -116,13 +116,13 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
std::ifstream in(imatrix_file.c_str(), std::ios::binary);
if (!in) {
printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
exit(1);
return;
}
int n_entries;
in.read((char *)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) {
printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
exit(1);
return;
}
for (int i = 0; i < n_entries; ++i) {
int len; in.read((char *)&len, sizeof(len));
@@ -130,11 +130,11 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
in.read((char *)name_as_vec.data(), len);
if (in.fail()) {
printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
exit(1);
return;
}
name_as_vec[len] = 0;
std::string name{name_as_vec.data()};
auto & e = imatrix_data[name];
auto & e = imatrix_data[std::move(name)];
int ncall;
in.read((char *)&ncall, sizeof(ncall));
int nval;
@@ -142,22 +142,18 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
if (in.fail() || nval < 1) {
printf("%s: failed reading number of values for entry %d\n", __func__, i);
imatrix_data = {};
exit(1);
return;
}
e.resize(nval);
in.read((char *)e.data(), nval*sizeof(float));
if (in.fail()) {
printf("%s: failed reading data for entry %d\n", __func__, i);
imatrix_data = {};
exit(1);
return;
}
if (ncall > 0) {
for (auto& v : e) v /= ncall;
}
if (getenv("LLAMA_TRACE")) {
printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
}
}
printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
}

View File

@@ -8,7 +8,7 @@ print(subprocess.check_output(
"python",
os.path.join(
os.path.dirname(os.path.realpath(__file__)),
"json_schema_to_grammar.py"),
"json-schema-to-grammar.py"),
*rest,
"-",
"--raw-pattern",

View File

@@ -24,7 +24,6 @@ int main(int argc, char ** argv) {
std::string result0;
std::string result1;
std::string result2;
// init
llama_model * model;
@@ -45,8 +44,8 @@ int main(int argc, char ** argv) {
// save state (rng, logits, embedding and kv_cache) to file
{
std::vector<uint8_t> state_mem(llama_state_get_size(ctx));
const size_t written = llama_state_get_data(ctx, state_mem.data());
std::vector<uint8_t> state_mem(llama_get_state_size(ctx));
const size_t written = llama_copy_state_data(ctx, state_mem.data());
FILE *fp_write = fopen("dump_state.bin", "wb");
fwrite(state_mem.data(), 1, written, fp_write);
@@ -98,13 +97,13 @@ int main(int argc, char ** argv) {
// load state (rng, logits, embedding and kv_cache) from file
{
std::vector<uint8_t> state_mem(llama_state_get_size(ctx2));
std::vector<uint8_t> state_mem(llama_get_state_size(ctx2));
FILE * fp_read = fopen("dump_state.bin", "rb");
const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
fclose(fp_read);
if (read != llama_state_set_data(ctx2, state_mem.data())) {
if (read != llama_set_state_data(ctx2, state_mem.data())) {
fprintf(stderr, "\n%s : failed to read state\n", __func__);
llama_free(ctx2);
llama_free_model(model);
@@ -142,104 +141,16 @@ int main(int argc, char ** argv) {
n_past += 1;
}
printf("\n\n");
printf("\n");
llama_free(ctx2);
llama_free_model(model);
if (result0 != result1) {
fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
return 1;
}
// make new context
auto* ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
printf("\nsingle seq run: %s", params.prompt.c_str());
// load state (rng, logits, embedding and kv_cache) from file
{
std::vector<uint8_t> state_mem(llama_state_get_size(ctx3));
FILE * fp_read = fopen("dump_state.bin", "rb");
const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
fclose(fp_read);
if (read != llama_state_set_data(ctx3, state_mem.data())) {
fprintf(stderr, "\n%s : failed to read state\n", __func__);
llama_free(ctx3);
llama_free_model(model);
return 1;
}
fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
}
// restore state (last tokens)
n_past = n_past_saved;
// save seq 0 and load into seq 1
{
// save kv of seq 0
std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx3, 0));
const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), 0);
if (ncopy != seq_store.size()) {
fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
llama_free(ctx3);
llama_free_model(model);
return 1;
}
fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
// erase whole kv
llama_kv_cache_clear(ctx3);
fprintf(stderr, "%s : kv cache cleared\n", __func__);
// restore kv into seq 1
const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), 1);
if (nset != seq_store.size()) {
fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
llama_free(ctx3);
llama_free_model(model);
return 1;
}
fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
}
// third run with seq 1 instead of 0
for (auto i = 0; i < params.n_predict; i++) {
auto * logits = llama_get_logits(ctx3);
auto n_vocab = llama_n_vocab(model);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
auto next_token = llama_sample_token(ctx3, &candidates_p);
auto next_token_str = llama_token_to_piece(ctx3, next_token);
printf("%s", next_token_str.c_str());
result2 += next_token_str;
if (llama_decode(ctx3, llama_batch_get_one(&next_token, 1, n_past, 1))) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_free(ctx3);
llama_free_model(model);
return 1;
}
n_past += 1;
}
printf("\n");
llama_free(ctx3);
llama_free_model(model);
if (result0 != result2) {
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
return 1;
}
fprintf(stderr, "\n%s : success\n", __func__);
return 0;

View File

@@ -11,7 +11,7 @@ install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common json-schema-to-grammar ${CMAKE_THREAD_LIBS_INIT})
if (LLAMA_SERVER_SSL)
find_package(OpenSSL REQUIRED)
target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)

View File

@@ -11,57 +11,57 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
* Continuous batching
* Multimodal (wip)
* Monitoring endpoints
* Schema-constrained JSON response format
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
**Command line options:**
- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend.
- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching, this parameter is used only if one token is to be processed on CPU backend.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU.
- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
- `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused
- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository. Default: unused
- `-hff FILE, --hf-file FILE`: Hugging Face model file. Default: unused
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (default: unused).
- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository (default: unused).
- `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused).
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is `512`, but LLaMA models were built with a context of `2048`, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of `4096`.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs, this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default, GPU `0` is used.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs, this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default, the data is split in proportion to VRAM, but this may not be optimal for performance.
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`
- `-ub N`, `--ubatch-size N`: Physical maximum batch size. Default: `512`
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`.
- `-ub N`, `--ubatch-size N`: physical maximum batch size. Default: `512`.
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
- `--numa STRATEGY`: Attempt one of the below optimization strategies that help on some NUMA systems
- `--numa distribute`: Spread execution evenly over all nodes
- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system
page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
- `--numa numactl`: Use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggerganov/llama.cpp/issues/1437
- `--numa`: Attempt optimizations that may help on some NUMA systems.
- `--numa`: Attempt optimizations that help on some NUMA systems.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`
- `--port`: Set the port to listen. Default: `8080`
- `--path`: Path from which to serve static files. Default: disabled
- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
- `--embedding`: Enable embedding extraction. Default: disabled
- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`
- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
- `--port`: Set the port to listen. Default: `8080`.
- `--path`: path from which to serve static files (default: disabled)
- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
- `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s.
- `--embedding`: Enable embedding extraction, Default: disabled.
- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend. Used together with group attention width `--grp-attn-w`. Default: `1`, which is disabled.
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend. Used together with group attention factor `--grp-attn-n`. Default: `512`
- `-n N, --n-predict N`: Set the maximum tokens to predict. Default: `-1`
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
- `-n N, --n-predict N`: Set the maximum tokens to predict (default: -1)
- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
- `--metrics`: enable prometheus `/metrics` compatible endpoint. Default: disabled
- `--slot-save-path PATH`: Specifies the path where the state of slots (the prompt cache) can be stored. If not provided, the slot management endpoints will be disabled.
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json`
- `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled)
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- `--log-disable`: Output logs to stdout only, not to `llama.log`. default: enabled.
- `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json)
**If compiled with `LLAMA_SERVER_SSL=ON`**
- `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
@@ -69,7 +69,7 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
## Build
`server` is built alongside everything else from the root of the project
server is build alongside everything else from the root of the project
- Using `make`:
@@ -85,7 +85,7 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
## Build with SSL
`server` can also be built with SSL support using OpenSSL 3
server can also be built with SSL support using OpenSSL 3
- Using `make`:
@@ -135,7 +135,7 @@ docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggerganov/
## Testing with CURL
Using [curl](https://curl.se/). On Windows, `curl.exe` should be available in the base OS.
Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS.
```sh
curl --request POST \
@@ -159,7 +159,7 @@ mkdir llama-client
cd llama-client
```
Create a index.js file and put this inside:
Create a index.js file and put inside this:
```javascript
const prompt = `Building a website can be done in 10 simple steps:`;
@@ -190,8 +190,8 @@ node index.js
- 503 -> `{"status": "loading model"}` if the model is still being loaded.
- 500 -> `{"status": "error"}` if the model failed to load.
- 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.
- 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available.
- 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slots are currently available.
- 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available.
- 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available.
If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set.
@@ -205,77 +205,75 @@ node index.js
- The model's `tokenizer.ggml.add_bos_token` metadata is `true`
- The system prompt is empty
`temperature`: Adjust the randomness of the generated text. Default: `0.8`
`temperature`: Adjust the randomness of the generated text (default: 0.8).
`dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` Default: `0.0`, which is disabled.
`dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` (default: 0.0, 0.0 = disabled).
`dynatemp_exponent`: Dynamic temperature exponent. Default: `1.0`
`dynatemp_exponent`: Dynamic temperature exponent (default: 1.0).
`top_k`: Limit the next token selection to the K most probable tokens. Default: `40`
`top_k`: Limit the next token selection to the K most probable tokens (default: 40).
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).
`min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`
`min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token (default: 0.05).
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the prompt.
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
`stop`: Specify a JSON array of stopping strings.
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
`tfs_z`: Enable tail free sampling with parameter z. Default: `1.0`, which is disabled.
`tfs_z`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
`typical_p`: Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
`typical_p`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).
`repeat_penalty`: Control the repetition of token sequences in the generated text. Default: `1.1`
`repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1).
`repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.
`repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
`penalize_nl`: Penalize newline tokens when applying the repeat penalty. Default: `true`
`penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true).
`presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled.
`presence_penalty`: Repeat alpha presence penalty (default: 0.0, 0.0 = disabled).
`frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
`frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled);
`penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens. Default: `null`, which is to use the original `prompt`.
`penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens (default: `null` = use the original `prompt`).
`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
`mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
`mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0).
`mirostat_eta`: Set the Mirostat learning rate, parameter eta. Default: `0.1`
`mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).
`grammar`: Set grammar for grammar-based sampling. Default: no grammar
`grammar`: Set grammar for grammar-based sampling (default: no grammar)
`json_schema`: Set a JSON schema for grammar-based sampling (e.g. `{"items": {"type": "string"}, "minItems": 10, "maxItems": 100}` of a list of strings, or `{}` for any JSON). See [tests](../../tests/test-json-schema-to-grammar.cpp) for supported features. Default: no JSON schema.
`seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).
`seed`: Set the random number generator (RNG) seed. Default: `-1`, which is a random seed.
`ignore_eos`: Ignore end of stream token and continue generating (default: false).
`ignore_eos`: Ignore end of stream token and continue generating. Default: `false`
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. (default: []).
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token. Default: `0`
`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`
`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum (default: 0)
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
`id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1`
`id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)
`cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. Default: `false`
`cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. (default: false)
`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. (default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values)
### Result JSON
- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion.
- Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
@@ -289,7 +287,7 @@ node index.js
},
{
"prob": float,
"tok_str": "<second most likely token>"
"tok_str": "<second most likely tonen>"
},
...
]
@@ -359,16 +357,14 @@ Notice that each `probs` is an array of length `n_probs`.
- `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots.
- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
- `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
- `default_generation_settings` - the default generation settings for the `/completion` endpoint, has the same fields as the `generation_settings` response object from the `/completion` endpoint.
- `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only model with [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, ChatML template will be used.
*Options:*
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}`), similar to other OpenAI-inspired API providers.
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such are `mirostat` are supported.
*Examples:*
@@ -518,67 +514,16 @@ Available metrics:
- `llamacpp:tokens_predicted_total`: Number of generation tokens processed.
- `llamacpp:prompt_tokens_seconds`: Average prompt throughput in tokens/s.
- `llamacpp:predicted_tokens_seconds`: Average generation throughput in tokens/s.
- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. `1` means 100 percent usage.
- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. 1 means 100 percent usage.
- `llamacpp:kv_cache_tokens`: KV-cache tokens.
- `llamacpp:requests_processing`: Number of requests processing.
- `llamacpp:requests_deferred`: Number of requests deferred.
- **POST** `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
*Options:*
`filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter.
### Result JSON
```json
{
"id_slot": 0,
"filename": "slot_save_file.bin",
"n_saved": 1745,
"n_written": 14309796,
"timings": {
"save_ms": 49.865
}
}
```
- **POST** `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file.
*Options:*
`filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter.
### Result JSON
```json
{
"id_slot": 0,
"filename": "slot_save_file.bin",
"n_restored": 1745,
"n_read": 14309796,
"timings": {
"restore_ms": 42.937
}
}
```
- **POST** `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot.
### Result JSON
```json
{
"id_slot": 0,
"n_erased": 1745
}
```
- `llamacpp:requests_processing`: Number of request processing.
- `llamacpp:requests_deferred`: Number of request deferred.
## More examples
### Change system prompt on runtime
To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt`. This only needs to be used once.
To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt` to achieve that. This only needs to be done once to establish it.
`prompt`: Specify a context that you want all connecting clients to respect.
@@ -617,11 +562,11 @@ bash chat.sh
### OAI-like API
The HTTP `server` supports an OAI-like API: https://github.com/openai/openai-openapi
The HTTP server supports OAI-like API: https://github.com/openai/openai-openapi
### API errors
`server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
Server returns error in the same format as OAI: https://github.com/openai/openai-openapi
Example of an error:

View File

@@ -2,15 +2,13 @@
Benchmark is using [k6](https://k6.io/).
##### Install k6 and sse extension
##### Install k6
SSE is not supported by default in k6, you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.
Follow instruction from: https://k6.io/docs/get-started/installation/
Example:
Example for ubuntu:
```shell
go install go.k6.io/xk6/cmd/xk6@latest
xk6 build master \
--with github.com/phymbert/xk6-sse
snap install k6
```
#### Download a dataset
@@ -48,7 +46,7 @@ server --host localhost --port 8080 \
For 500 chat completions request with 8 concurrent users during maximum 10 minutes, run:
```shell
./k6 run script.js --duration 10m --iterations 500 --vus 8
k6 run script.js --duration 10m --iterations 500 --vus 8
```
The benchmark values can be overridden with:
@@ -88,33 +86,3 @@ K6 metrics might be compared against [server metrics](../README.md), with:
```shell
curl http://localhost:8080/metrics
```
### Using the CI python script
The `bench.py` script does several steps:
- start the server
- define good variable for k6
- run k6 script
- extract metrics from prometheus
It aims to be used in the CI, but you can run it manually:
```shell
LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \
--runner-label local \
--name local \
--branch `git rev-parse --abbrev-ref HEAD` \
--commit `git rev-parse HEAD` \
--scenario script.js \
--duration 5m \
--hf-repo ggml-org/models \
--hf-file phi-2/ggml-model-q4_0.gguf \
--model-path-prefix models \
--parallel 4 \
-ngl 33 \
--batch-size 2048 \
--ubatch-size 256 \
--ctx-size 4096 \
--n-prompts 200 \
--max-prompt-tokens 256 \
--max-tokens 256
```

View File

@@ -16,7 +16,6 @@ import matplotlib
import matplotlib.dates
import matplotlib.pyplot as plt
import requests
from statistics import mean
def main(args_in: list[str] | None = None) -> None:
@@ -76,6 +75,7 @@ def main(args_in: list[str] | None = None) -> None:
data['metrics'][metric_name][metric_metric]=value
github_env.write(
f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")
token_seconds = data['metrics']['llamacpp_tokens_second']['avg']
iterations = data['root_group']['checks']['success completion']['passes']
except Exception:
@@ -109,7 +109,6 @@ def main(args_in: list[str] | None = None) -> None:
# Prometheus
end_time = time.time()
prometheus_metrics = {}
if is_server_listening("0.0.0.0", 9090):
metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']
@@ -128,7 +127,6 @@ def main(args_in: list[str] | None = None) -> None:
values = metric_data['data']['result'][0]['values']
timestamps, metric_values = zip(*values)
metric_values = [float(value) for value in metric_values]
prometheus_metrics[metric] = metric_values
timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
plt.figure(figsize=(16, 10), dpi=80)
plt.plot(timestamps_dt, metric_values, label=metric)
@@ -178,20 +176,17 @@ xychart-beta
# 140 chars max for commit status description
bench_results = {
"i": iterations,
"req": {
"p95": round(data['metrics']["http_req_duration"]["p(95)"], 2),
"avg": round(data['metrics']["http_req_duration"]["avg"], 2),
"p90": data['metrics']["http_req_duration"]["p(90)"],
"avg": data['metrics']["http_req_duration"]["avg"],
},
"pp": {
"p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
"avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
"0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
"p90": data['metrics']["llamacpp_prompt_tokens"]["p(90)"],
"avg": data['metrics']["llamacpp_prompt_tokens"]["avg"],
},
"tg": {
"p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
"avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
"0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
"p90": data['metrics']["llamacpp_tokens_second"]["p(90)"],
"avg": data['metrics']["llamacpp_tokens_second"]["avg"],
},
}
with open("results.github.env", 'a') as github_env:
@@ -205,7 +200,7 @@ xychart-beta
def start_benchmark(args):
k6_path = './k6'
k6_path = 'k6'
if 'BENCH_K6_BIN_PATH' in os.environ:
k6_path = os.environ['BENCH_K6_BIN_PATH']
k6_args = [

View File

@@ -1,4 +1,4 @@
import sse from 'k6/x/sse'
import http from 'k6/http'
import {check, sleep} from 'k6'
import {SharedArray} from 'k6/data'
import {Counter, Rate, Trend} from 'k6/metrics'
@@ -53,9 +53,7 @@ const data = new SharedArray('conversations', function () {
const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -88,62 +86,35 @@ export default function () {
}
],
"model": model,
"stream": true,
"seed": 42,
"stream": false,
"max_tokens": max_tokens
}
const params = {method: 'POST', body: JSON.stringify(payload)};
const body = JSON.stringify(payload)
const startTime = new Date()
let promptEvalEndTime = null
let prompt_tokens = 0
let completions_tokens = 0
let finish_reason = null
const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
client.on('event', function (event) {
if (promptEvalEndTime == null) {
promptEvalEndTime = new Date()
}
let chunk = JSON.parse(event.data)
let choice = chunk.choices[0]
if (choice.finish_reason) {
finish_reason = choice.finish_reason
}
if (chunk.usage) {
prompt_tokens = chunk.usage.prompt_tokens
llamacpp_prompt_tokens.add(prompt_tokens)
llamacpp_prompt_tokens_total_counter.add(prompt_tokens)
completions_tokens = chunk.usage.completion_tokens
llamacpp_completion_tokens.add(completions_tokens)
llamacpp_completion_tokens_total_counter.add(completions_tokens)
}
})
client.on('error', function (e) {
console.log('An unexpected error occurred: ', e.error());
throw e;
})
let res = http.post(`${server_url}/chat/completions`, body, {
headers: {'Content-Type': 'application/json'},
timeout: '300s'
})
check(res, {'success completion': (r) => r.status === 200})
const endTime = new Date()
if (res.status === 200) {
const completions = res.json()
const promptEvalTime = promptEvalEndTime - startTime
if (promptEvalTime > 0) {
llamacpp_prompt_processing_second.add(prompt_tokens / (promptEvalEndTime - startTime) * 1.e3)
}
llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)
const completion_time = endTime - promptEvalEndTime
if (completions_tokens > 0 && completion_time > 0) {
llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3)
llamacpp_completion_tokens.add(completions.usage.completion_tokens)
llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3)
} else {
console.error(`response: ${res.body} request=${payload}`)
}
llamacpp_completions_truncated_rate.add(finish_reason === 'length')
llamacpp_completions_stop_rate.add(finish_reason === 'stop')
sleep(0.3)
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -222,7 +222,6 @@
temperature: 0.7,
repeat_last_n: 256, // 0 = disable penalty, -1 = context size
repeat_penalty: 1.18, // 1.0 = disabled
penalize_nl: false,
top_k: 40, // <= 0 to use vocab size
top_p: 0.95, // 1.0 = disabled
min_p: 0.05, // 0 = disabled
@@ -406,7 +405,7 @@
throw new Error("already running");
}
controller.value = new AbortController();
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: location.pathname.replace(/\/+$/, '') })) {
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: document.baseURI.replace(/\/+$/, '') })) {
const data = chunk.data;
if (data.stop) {
@@ -628,7 +627,6 @@
const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }
const updateParamsBool = (el) => params.value = { ...params.value, [el.target.name]: el.target.checked }
const grammarJsonSchemaPropOrder = signal('')
const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value
@@ -672,15 +670,6 @@
`
};
const BoolField = ({ label, name, value }) => {
return html`
<div>
<label for="${name}">${label}</label>
<input type="checkbox" id="${name}" name="${name}" checked="${value}" onclick=${updateParamsBool} />
</div>
`
};
const userTemplateReset = (e) => {
e.preventDefault();
userTemplateResetToDefaultAndApply()
@@ -780,7 +769,6 @@
${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })}
${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}
@@ -1015,10 +1003,6 @@
}
function App(props) {
useEffect(() => {
const query = new URLSearchParams(location.search).get("q");
if (query) chat(query);
}, []);
return html`
<div class="mode-${session.value.type}">

View File

@@ -1,95 +1,33 @@
// WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first.
// WARNING: This file was ported from json-schema-to-grammar.py, please fix bugs / add features there first.
const SPACE_RULE = '" "?';
function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
const separatorRule = opts.separatorRule ?? '';
const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false
if (separatorRule === '') {
if (minItems === 0 && maxItems === 1) {
return `${itemRule}?`;
} else if (minItems === 1 && maxItems === undefined) {
return `${itemRule}+`;
}
}
let result = '';
if (minItems > 0) {
if (itemRuleIsLiteral && separatorRule === '') {
result = `"${itemRule.slice(1, -1).repeat(minItems)}"`;
} else {
result = Array.from({ length: minItems }, () => itemRule)
.join(separatorRule !== '' ? ` ${separatorRule} ` : ' ');
}
}
const optRepetitions = (upToN, prefixWithSep=false) => {
const content = separatorRule !== '' && prefixWithSep ? `${separatorRule} ${itemRule}` : itemRule;
if (upToN === 0) {
return '';
} else if (upToN === 1) {
return `(${content})?`;
} else if (separatorRule !== '' && !prefixWithSep) {
return `(${content} ${optRepetitions(upToN - 1, true)})?`;
} else {
return Array.from({ length: upToN }, () => `(${content}`).join(' ').trim() + Array.from({ length: upToN }, () => ')?').join('');
}
};
if (minItems > 0 && maxItems !== minItems) {
result += ' ';
}
if (maxItems !== undefined) {
result += optRepetitions(maxItems - minItems, minItems > 0);
} else {
const itemOperator = `(${separatorRule !== '' ? separatorRule + ' ' : ''}${itemRule})`;
if (minItems === 0 && separatorRule !== '') {
result = `(${itemRule} ${itemOperator}*)?`;
} else {
result += `${itemOperator}*`;
}
}
return result;
}
class BuiltinRule {
constructor(content, deps) {
this.content = content;
this.deps = deps || [];
}
}
const UP_TO_15_DIGITS = _buildRepetition('[0-9]', 0, 15);
const PRIMITIVE_RULES = {
boolean : new BuiltinRule('("true" | "false") space', []),
'decimal-part' : new BuiltinRule('[0-9] ' + UP_TO_15_DIGITS, []),
'integral-part': new BuiltinRule('[0-9] | [1-9] ' + UP_TO_15_DIGITS, []),
number : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
integer : new BuiltinRule('("-"? integral-part) space', ['integral-part']),
value : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
uuid : new BuiltinRule('"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space', []),
char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])`, []),
string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
null : new BuiltinRule('"null" space', []),
boolean: '("true" | "false") space',
number: '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
integer: '("-"? ([0-9] | [1-9] [0-9]*)) space',
value: 'object | array | string | number | boolean',
object: '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
array: '"[" space ( value ("," space value)* )? "]" space',
uuid: '"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space',
string: ` "\\"" (
[^"\\\\] |
"\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\\"" space`,
null: '"null" space',
};
const OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value'];
// TODO: support "uri", "email" string formats
const STRING_FORMAT_RULES = {
'date' : new BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time' : new BuiltinRule('date "T" time', ['date', 'time']),
'date-string' : new BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string' : new BuiltinRule('"\\"" time "\\"" space', ['time']),
'date-time-string': new BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
}
const DATE_RULES = {
'date' : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )',
'time' : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
'date-time': 'date "T" time',
'date-string': '"\\"" date "\\"" space',
'time-string': '"\\"" time "\\"" space',
'date-time-string': '"\\"" date-time "\\"" space',
};
const RESERVED_NAMES = {'root': true, ...PRIMITIVE_RULES, ...STRING_FORMAT_RULES};
const RESERVED_NAMES = {'root': true, ...PRIMITIVE_RULES, ...DATE_RULES};
const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g;
const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g;
@@ -220,7 +158,7 @@ export class SchemaConverter {
rule = '[\\U00000000-\\U0010FFFF]';
} else {
// Accept any character... except \n and \r line break chars (\x0A and \xOD)
rule = '[^\\x0A\\x0D]';
rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]';
}
return this._addRule('dot', rule);
};
@@ -321,19 +259,26 @@ export class SchemaConverter {
let [sub, subIsLiteral] = seq[seq.length - 1];
if (!subIsLiteral) {
let id = subRuleIds[sub];
if (id === undefined) {
id = this._addRule(`${name}-${Object.keys(subRuleIds).length + 1}`, sub);
subRuleIds[sub] = id;
if (minTimes === 0 && maxTimes === Infinity) {
seq[seq.length - 1] = [`${sub}*`, false];
} else if (minTimes === 0 && maxTimes === 1) {
seq[seq.length - 1] = [`${sub}?`, false];
} else if (minTimes === 1 && maxTimes === Infinity) {
seq[seq.length - 1] = [`${sub}+`, false];
} else {
if (!subIsLiteral) {
let id = subRuleIds[sub];
if (id === undefined) {
id = this._addRule(`${name}-${Object.keys(subRuleIds).length + 1}`, sub);
subRuleIds[sub] = id;
}
sub = id;
}
sub = id;
}
seq[seq.length - 1] = [
_buildRepetition(subIsLiteral ? `"${sub}"` : sub, minTimes, maxTimes, {itemRuleIsLiteral: subIsLiteral}),
false
];
const repeatedSub = Array.from({ length: minTimes }, () => subIsLiteral ? `"${sub.slice(1, -1).repeat(minTimes)}"` : sub);
const optionalSub = maxTimes !== undefined ? Array.from({ length: maxTimes - minTimes }, () => `${sub}?`) : [`${sub}*`];
seq[seq.length - 1] = [repeatedSub.concat(optionalSub).join(' '), false];
}
} else {
let literal = '';
while (i < length) {
@@ -449,50 +394,49 @@ export class SchemaConverter {
);
} else {
const itemRuleName = this.visit(items, `${name ?? ''}${name ? '-' : ''}item`);
const minItems = schema.minItems || 0;
const listItemOperator = `( "," space ${itemRuleName} )`;
let successiveItems = '';
let minItems = schema.minItems || 0;
const maxItems = schema.maxItems;
return this._addRule(ruleName, '"[" space ' + _buildRepetition(itemRuleName, minItems, maxItems, {separatorRule: '"," space'}) + ' "]" space');
if (minItems > 0) {
successiveItems = listItemOperator.repeat(minItems - 1);
minItems--;
}
if (maxItems !== undefined && maxItems > minItems) {
successiveItems += `${listItemOperator}?`.repeat(maxItems - minItems - 1);
} else {
successiveItems += `${listItemOperator}*`;
}
const rule = minItems === 0
? `"[" space ( ${itemRuleName} ${successiveItems} )? "]" space`
: `"[" space ${itemRuleName} ${successiveItems} "]" space`;
return this._addRule(ruleName, rule);
}
} else if ((schemaType === undefined || schemaType === 'string') && 'pattern' in schema) {
return this._visitPattern(schema.pattern, ruleName);
} else if ((schemaType === undefined || schemaType === 'string') && /^uuid[1-5]?$/.test(schema.format || '')) {
return this._addPrimitive(
ruleName === 'root' ? 'root' : schemaFormat,
PRIMITIVE_RULES['uuid']
);
} else if ((schemaType === undefined || schemaType === 'string') && `${schema.format}-string` in STRING_FORMAT_RULES) {
const primName = `${schema.format}-string`
return this._addRule(ruleName, this._addPrimitive(primName, STRING_FORMAT_RULES[primName]));
} else if (schemaType === 'string' && ('minLength' in schema || 'maxLength' in schema)) {
const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']);
const minLen = schema.minLength || 0;
const maxLen = schema.maxLength;
return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space');
return this._addRule(
ruleName === 'root' ? 'root' : schemaFormat,
PRIMITIVE_RULES['uuid'])
} else if ((schemaType === undefined || schemaType === 'string') && schema.format in DATE_RULES) {
for (const [t, r] of Object.entries(DATE_RULES)) {
this._addRule(t, r);
}
return schemaFormat + '-string';
} else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object']));
for (const n of OBJECT_RULE_NAMES) {
this._addRule(n, PRIMITIVE_RULES[n]);
}
return this._addRule(ruleName, 'object');
} else {
if (!(schemaType in PRIMITIVE_RULES)) {
throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`);
}
// TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
return this._addPrimitive(ruleName === 'root' ? 'root' : schemaType, PRIMITIVE_RULES[schemaType]);
return this._addRule(ruleName === 'root' ? 'root' : schemaType, PRIMITIVE_RULES[schemaType]);
}
}
_addPrimitive(name, rule) {
let n = this._addRule(name, rule.content);
for (const dep of rule.deps) {
const depRule = PRIMITIVE_RULES[dep] || STRING_FORMAT_RULES[dep];
if (!depRule) {
throw new Error(`Rule ${dep} not known`);
}
if (!(dep in this._rules)) {
this._addPrimitive(dep, depRule);
}
}
return n;
}
_buildObjectRule(properties, required, name, additionalProperties) {
const propOrder = this._propOrder;
// sort by position in prop_order (if specified) then by original order
@@ -518,7 +462,7 @@ export class SchemaConverter {
const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`);
propKvRuleNames['*'] = this._addRule(
`${subName}-kv`,
`${this._addPrimitive('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
`${this._addRule('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
optionalProps.push('*');
}

View File

@@ -61,10 +61,7 @@ enum server_task_type {
SERVER_TASK_TYPE_COMPLETION,
SERVER_TASK_TYPE_CANCEL,
SERVER_TASK_TYPE_NEXT_RESPONSE,
SERVER_TASK_TYPE_METRICS,
SERVER_TASK_TYPE_SLOT_SAVE,
SERVER_TASK_TYPE_SLOT_RESTORE,
SERVER_TASK_TYPE_SLOT_ERASE,
SERVER_TASK_TYPE_METRICS
};
struct server_task {
@@ -131,7 +128,6 @@ struct server_params {
bool slots_endpoint = true;
bool metrics_endpoint = false;
std::string slot_save_path;
};
struct server_slot {
@@ -689,7 +685,6 @@ struct server_context {
n_ctx = llama_n_ctx(ctx);
add_bos_token = llama_should_add_bos_token(model);
GGML_ASSERT(llama_add_eos_token(model) != 1);
return true;
}
@@ -759,7 +754,7 @@ struct server_context {
metrics.init();
}
std::vector<llama_token> tokenize(const json & json_prompt, bool add_special) const {
std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const {
// TODO: currently, we tokenize using special tokens by default
// this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
// but it's better compared to completely ignoring ChatML and other chat templates
@@ -777,7 +772,7 @@ struct server_context {
std::vector<llama_token> p;
if (first) {
p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
first = false;
} else {
p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
@@ -794,7 +789,7 @@ struct server_context {
}
} else {
auto s = json_prompt.template get<std::string>();
prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
}
return prompt_tokens;
@@ -859,7 +854,7 @@ struct server_context {
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
// process "json_schema" and "grammar"
if (data.contains("json_schema") && !data["json_schema"].is_null() && data.contains("grammar") && !data["grammar"].is_null()) {
if (data.contains("json_schema") && data.contains("grammar")) {
send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
return false;
} else if (data.contains("json_schema") && !data.contains("grammar")) {
@@ -1059,7 +1054,7 @@ struct server_context {
system_tokens.clear();
if (!system_prompt.empty()) {
system_tokens = ::llama_tokenize(ctx, system_prompt, true);
system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
llama_batch_clear(batch);
@@ -1083,7 +1078,7 @@ struct server_context {
};
if (llama_decode(ctx, batch_view) != 0) {
LOG_ERROR("llama_decode() failed", {});
LOG_TEE("%s: llama_decode() failed\n", __func__);
return;
}
}
@@ -1281,11 +1276,7 @@ struct server_context {
}
void send_error(const int id_task, const int id_multi, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
LOG_ERROR("task error", {
{"id_multi", id_multi},
{"id_task", id_task},
{"error", error},
});
LOG_TEE("task %i - error: %s\n", id_task, error.c_str());
server_task_result res;
res.id = id_task;
@@ -1621,107 +1612,6 @@ struct server_context {
}
queue_results.send(res);
} break;
case SERVER_TASK_TYPE_SLOT_SAVE:
{
int id_slot = task.data["id_slot"];
server_slot * slot = get_slot(id_slot);
if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break;
}
const size_t token_count = slot->cache_tokens.size();
const int64_t t_start = ggml_time_us();
std::string filename = task.data["filename"];
std::string filepath = task.data["filepath"];
const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
const int64_t t_end = ggml_time_us();
const double t_save_ms = (t_end - t_start) / 1000.0;
server_task_result result;
result.id = task.id;
result.stop = true;
result.error = false;
result.data = json {
{ "id_slot", id_slot },
{ "filename", filename },
{ "n_saved", token_count }, // tokens saved
{ "n_written", nwrite }, // bytes written
{ "timings", {
{ "save_ms", t_save_ms }
} }
};
queue_results.send(result);
} break;
case SERVER_TASK_TYPE_SLOT_RESTORE:
{
int id_slot = task.data["id_slot"];
server_slot * slot = get_slot(id_slot);
if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break;
}
const int64_t t_start = ggml_time_us();
std::string filename = task.data["filename"];
std::string filepath = task.data["filepath"];
slot->cache_tokens.resize(slot->n_ctx);
size_t token_count = 0;
size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
if (nread == 0) {
slot->cache_tokens.resize(0);
send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
break;
}
slot->cache_tokens.resize(token_count);
const int64_t t_end = ggml_time_us();
const double t_restore_ms = (t_end - t_start) / 1000.0;
server_task_result result;
result.id = task.id;
result.stop = true;
result.error = false;
result.data = json {
{ "id_slot", id_slot },
{ "filename", filename },
{ "n_restored", token_count }, // tokens restored
{ "n_read", nread }, // bytes read
{ "timings", {
{ "restore_ms", t_restore_ms }
} }
};
queue_results.send(result);
} break;
case SERVER_TASK_TYPE_SLOT_ERASE:
{
int id_slot = task.data["id_slot"];
server_slot * slot = get_slot(id_slot);
if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break;
}
// Erase token cache
const size_t n_erased = slot->cache_tokens.size();
llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1);
slot->cache_tokens.clear();
server_task_result result;
result.id = task.id;
result.stop = true;
result.error = false;
result.data = json {
{ "id_slot", id_slot },
{ "n_erased", n_erased }
};
queue_results.send(result);
} break;
}
}
@@ -1919,7 +1809,7 @@ struct server_context {
prefix_tokens.push_back(llama_token_middle(model));
prompt_tokens = prefix_tokens;
} else {
prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
}
slot.n_past = 0;
@@ -2190,11 +2080,7 @@ struct server_context {
if (ret != 0) {
if (n_batch == 1 || ret < 0) {
// if you get here, it means the KV cache is full - try increasing it via the context size
LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", {
{"i", i},
{"n_batch", ret},
{"ret", ret},
});
LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
for (auto & slot : slots) {
slot.state = SLOT_STATE_PROCESSING;
slot.command = SLOT_COMMAND_NONE;
@@ -2204,16 +2090,12 @@ struct server_context {
break; // break loop of n_batch
}
LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
// retry with half the batch size to try to find a free slot in the KV cache
n_batch /= 2;
i -= n_batch;
LOG_WARNING("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation", {
{"i", i},
{"n_batch", n_batch},
{"ret", ret},
});
continue; // continue loop of n_batch
}
@@ -2307,6 +2189,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch);
printf(" -ub N, --ubatch-size N physical maximum batch size (default: %d)\n", params.n_ubatch);
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
if (llama_supports_mlock()) {
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
@@ -2329,8 +2213,6 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
printf(" or for intermediate results and KV (with split-mode = row)\n");
printf(" -nkvo, --no-kv-offload\n");
printf(" disable KV offload\n");
}
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
@@ -2367,7 +2249,6 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" --log-disable disables logging to a file.\n");
printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
printf(" --metrics enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
printf(" --slot-save-path PATH path to save slot kv cache (default: disabled)\n");
printf("\n");
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
printf(" --override-kv KEY=TYPE:VALUE\n");
@@ -2617,8 +2498,6 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
"See main README.md for information on enabling GPU BLAS support",
{{"n_gpu_layers", params.n_gpu_layers}});
}
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
params.no_kv_offload = true;
} else if (arg == "--split-mode" || arg == "-sm") {
if (++i >= argc) {
invalid_param = true;
@@ -2776,16 +2655,6 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
sparams.slots_endpoint = false;
} else if (arg == "--metrics") {
sparams.metrics_endpoint = true;
} else if (arg == "--slot-save-path") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.slot_save_path = argv[i];
// if doesn't end with DIRECTORY_SEPARATOR, add it
if (!sparams.slot_save_path.empty() && sparams.slot_save_path[sparams.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
sparams.slot_save_path += DIRECTORY_SEPARATOR;
}
} else if (arg == "--chat-template") {
if (++i >= argc) {
invalid_param = true;
@@ -3288,112 +3157,6 @@ int main(int argc, char ** argv) {
res.status = 200; // HTTP OK
};
const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
json request_data = json::parse(req.body);
std::string filename = request_data["filename"];
if (!validate_file_name(filename)) {
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
return;
}
std::string filepath = sparams.slot_save_path + filename;
server_task task;
task.type = SERVER_TASK_TYPE_SLOT_SAVE;
task.data = {
{ "id_slot", id_slot },
{ "filename", filename },
{ "filepath", filepath }
};
const int id_task = ctx_server.queue_tasks.post(task);
ctx_server.queue_results.add_waiting_task_id(id_task);
server_task_result result = ctx_server.queue_results.recv(id_task);
ctx_server.queue_results.remove_waiting_task_id(id_task);
if (result.error) {
res_error(res, result.data);
} else {
res.set_content(result.data.dump(), "application/json");
}
};
const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
json request_data = json::parse(req.body);
std::string filename = request_data["filename"];
if (!validate_file_name(filename)) {
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
return;
}
std::string filepath = sparams.slot_save_path + filename;
server_task task;
task.type = SERVER_TASK_TYPE_SLOT_RESTORE;
task.data = {
{ "id_slot", id_slot },
{ "filename", filename },
{ "filepath", filepath }
};
const int id_task = ctx_server.queue_tasks.post(task);
ctx_server.queue_results.add_waiting_task_id(id_task);
server_task_result result = ctx_server.queue_results.recv(id_task);
ctx_server.queue_results.remove_waiting_task_id(id_task);
if (result.error) {
res_error(res, result.data);
} else {
res.set_content(result.data.dump(), "application/json");
}
};
const auto handle_slots_erase = [&ctx_server, &res_error](const httplib::Request & /* req */, httplib::Response & res, int id_slot) {
server_task task;
task.type = SERVER_TASK_TYPE_SLOT_ERASE;
task.data = {
{ "id_slot", id_slot },
};
const int id_task = ctx_server.queue_tasks.post(task);
ctx_server.queue_results.add_waiting_task_id(id_task);
server_task_result result = ctx_server.queue_results.recv(id_task);
ctx_server.queue_results.remove_waiting_task_id(id_task);
if (result.error) {
res_error(res, result.data);
} else {
res.set_content(result.data.dump(), "application/json");
}
};
const auto handle_slots_action = [&res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
std::string id_slot_str = req.path_params.at("id_slot");
int id_slot;
try {
id_slot = std::stoi(id_slot_str);
} catch (const std::exception &) {
res_error(res, format_error_response("Invalid slot ID", ERROR_TYPE_INVALID_REQUEST));
return;
}
std::string action = req.get_param_value("action");
if (action == "save") {
handle_slots_save(req, res, id_slot);
} else if (action == "restore") {
handle_slots_restore(req, res, id_slot);
} else if (action == "erase") {
handle_slots_erase(req, res, id_slot);
} else {
res_error(res, format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST));
}
};
const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = {
@@ -3756,10 +3519,6 @@ int main(int argc, char ** argv) {
svr->Post("/v1/embeddings", handle_embeddings);
svr->Post("/tokenize", handle_tokenize);
svr->Post("/detokenize", handle_detokenize);
if (!sparams.slot_save_path.empty()) {
// only enable slot endpoints if slot_save_path is set
svr->Post("/slots/:id_slot", handle_slots_action);
}
//
// Start the server

View File

@@ -1,58 +0,0 @@
@llama.cpp
@slotsave
Feature: llama.cpp server slot management
Background: Server startup
Given a server listening on localhost:8080
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And prompt caching is enabled
And 2 slots
And . as slot save path
And 2048 KV cache size
And 42 as server seed
And 24 max tokens to predict
Then the server is starting
Then the server is healthy
Scenario: Save and Restore Slot
# First prompt in slot 1 should be fully processed
Given a user prompt "What is the capital of France?"
And using slot id 1
And a completion request with no api error
Then 24 tokens are predicted matching (Lily|cake)
And 22 prompt tokens are processed
When the slot 1 is saved with filename "slot1.bin"
Then the server responds with status code 200
# Since we have cache, this should only process the last tokens
Given a user prompt "What is the capital of Germany?"
And a completion request with no api error
Then 24 tokens are predicted matching (Thank|special)
And 7 prompt tokens are processed
# Loading the original cache into slot 0,
# we should only be processing 1 prompt token and get the same output
When the slot 0 is restored with filename "slot1.bin"
Then the server responds with status code 200
Given a user prompt "What is the capital of France?"
And using slot id 0
And a completion request with no api error
Then 24 tokens are predicted matching (Lily|cake)
And 1 prompt tokens are processed
# For verification that slot 1 was not corrupted during slot 0 load, same thing
Given a user prompt "What is the capital of Germany?"
And using slot id 1
And a completion request with no api error
Then 24 tokens are predicted matching (Thank|special)
And 1 prompt tokens are processed
Scenario: Erase Slot
Given a user prompt "What is the capital of France?"
And using slot id 1
And a completion request with no api error
Then 24 tokens are predicted matching (Lily|cake)
And 22 prompt tokens are processed
When the slot 1 is erased
Then the server responds with status code 200
Given a user prompt "What is the capital of France?"
And a completion request with no api error
Then 24 tokens are predicted matching (Lily|cake)
And 22 prompt tokens are processed

View File

@@ -49,9 +49,6 @@ def step_server_config(context, server_fqdn, server_port):
context.n_predict = None
context.n_prompts = 0
context.n_server_predict = None
context.slot_save_path = None
context.id_slot = None
context.cache_prompt = None
context.n_slots = None
context.prompt_prefix = None
context.prompt_suffix = None
@@ -122,21 +119,6 @@ def step_server_n_predict(context, n_predict):
context.n_server_predict = n_predict
@step('{slot_save_path} as slot save path')
def step_slot_save_path(context, slot_save_path):
context.slot_save_path = slot_save_path
@step('using slot id {id_slot:d}')
def step_id_slot(context, id_slot):
context.id_slot = id_slot
@step('prompt caching is enabled')
def step_enable_prompt_cache(context):
context.cache_prompt = True
@step('continuous batching')
def step_server_continuous_batching(context):
context.server_continuous_batching = True
@@ -230,8 +212,6 @@ async def step_request_completion(context, api_error):
context.base_url,
debug=context.debug,
n_predict=context.n_predict,
cache_prompt=context.cache_prompt,
id_slot=context.id_slot,
seed=await completions_seed(context),
expect_api_error=expect_api_error,
user_api_key=context.user_api_key)
@@ -731,48 +711,12 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
await asyncio.sleep(0.1)
@step('the slot {slot_id:d} is saved with filename "{filename}"')
@async_run_until_complete
async def step_save_slot(context, slot_id, filename):
async with aiohttp.ClientSession() as session:
async with session.post(f'{context.base_url}/slots/{slot_id}?action=save',
json={"filename": filename},
headers={"Content-Type": "application/json"}) as response:
context.response = response
@step('the slot {slot_id:d} is restored with filename "{filename}"')
@async_run_until_complete
async def step_restore_slot(context, slot_id, filename):
async with aiohttp.ClientSession() as session:
async with session.post(f'{context.base_url}/slots/{slot_id}?action=restore',
json={"filename": filename},
headers={"Content-Type": "application/json"}) as response:
context.response = response
@step('the slot {slot_id:d} is erased')
@async_run_until_complete
async def step_erase_slot(context, slot_id):
async with aiohttp.ClientSession() as session:
async with session.post(f'{context.base_url}/slots/{slot_id}?action=erase',
headers={"Content-Type": "application/json"}) as response:
context.response = response
@step('the server responds with status code {status_code:d}')
def step_server_responds_with_status_code(context, status_code):
assert context.response.status == status_code
async def request_completion(prompt,
base_url,
debug=False,
prompt_prefix=None,
prompt_suffix=None,
n_predict=None,
cache_prompt=False,
id_slot=None,
seed=None,
expect_api_error=None,
user_api_key=None):
@@ -794,8 +738,6 @@ async def request_completion(prompt,
"prompt": prompt,
"input_suffix": prompt_suffix,
"n_predict": n_predict if n_predict is not None else -1,
"cache_prompt": cache_prompt,
"id_slot": id_slot,
"seed": seed if seed is not None else 42
},
headers=headers,
@@ -1162,8 +1104,6 @@ def start_server_background(context):
server_args.extend(['--parallel', context.n_slots])
if context.n_server_predict:
server_args.extend(['--n-predict', context.n_server_predict])
if context.slot_save_path:
server_args.extend(['--slot-save-path', context.slot_save_path])
if context.server_api_key:
server_args.extend(['--api-key', context.server_api_key])
if context.n_ga:

View File

@@ -49,23 +49,12 @@ extern bool server_log_json;
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra);
template <typename T>
static T json_value(const json &body, const std::string &key, const T &default_value) {
// Fallback null to default value
if (body.contains(key) && !body.at(key).is_null()){
try {
return body.value(key, default_value);
}
catch (nlohmann::json_abi_v3_11_3::detail::type_error const&){
std::string message = "Wrong type supplied for parameter '" + key + "'. Expected '" + typeid(default_value).name() + "', using default value.";
server_log("WARN", __func__, __LINE__, message.c_str(), body);
return default_value;
}
} else {
return default_value;
}
return body.contains(key) && !body.at(key).is_null()
? body.value(key, default_value)
: default_value;
}
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
@@ -567,15 +556,6 @@ static std::vector<json> format_partial_response_oaicompat(json result, const st
{"model", modelname},
{"object", "chat.completion.chunk"}
};
if (!finish_reason.empty()) {
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
ret.push_back({"usage", json {
{"completion_tokens", num_tokens_predicted},
{"prompt_tokens", num_prompt_tokens},
{"total_tokens", num_tokens_predicted + num_prompt_tokens}
}});
}
return std::vector<json>({ret});
}

View File

@@ -76,28 +76,6 @@ int main(int argc, char ** argv) {
params.n_threads_batch = params.n_threads_batch_draft;
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
const bool vocab_type_tgt = llama_vocab_type(model_tgt);
LOG("vocab_type tgt: %d\n", vocab_type_tgt);
const bool vocab_type_dft = llama_vocab_type(model_dft);
LOG("vocab_type dft: %d\n", vocab_type_dft);
if (vocab_type_tgt != vocab_type_dft) {
fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__);
fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
return 1;
}
if (
llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
llama_token_eos(model_tgt) != llama_token_eos(model_dft)
) {
fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__);
return 1;
}
{
const int n_vocab_tgt = llama_n_vocab(model_tgt);
const int n_vocab_dft = llama_n_vocab(model_dft);
@@ -127,8 +105,20 @@ int main(int argc, char ** argv) {
// Tokenize the prompt
const bool add_bos_tgt = llama_should_add_bos_token(model_tgt);
LOG("add_bos tgt: %d\n", add_bos_tgt);
const bool add_bos_dft = llama_should_add_bos_token(model_dft);
LOG("add_bos dft: %d\n", add_bos_dft);
if (add_bos_tgt != add_bos_dft) {
fprintf(stderr, "%s: error: draft model add_bos must match target model to use speculation but ", __func__);
fprintf(stderr, "add_bos_dft = %d while add_bos_tgt = %d\n", add_bos_dft, add_bos_tgt);
return 1;
}
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true);
inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);
const int max_context_size = llama_n_ctx(ctx_tgt);
const int max_tokens_list_size = max_context_size - 4;

View File

@@ -20,4 +20,4 @@ cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#cmake --build . --config Release --target llama-bench
#build all binary
cmake --build . --config Release -j -v
cmake --build . --config Release -v

View File

@@ -12,7 +12,6 @@ if [ $# -gt 0 ]; then
GGML_SYCL_SINGLE_GPU=1
else
GGML_SYCL_DEVICE=0
GGML_SYCL_SINGLE_GPU=0
fi
#export GGML_SYCL_DEBUG=1

View File

@@ -26,9 +26,11 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_default_params();
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
const bool add_bos = llama_should_add_bos_token(model);
std::vector<llama_token> tokens;
tokens = ::llama_tokenize(model, prompt, true, true);
tokens = ::llama_tokenize(model, prompt, add_bos, true);
for (int i = 0; i < (int) tokens.size(); i++) {
if (printing_ids) {

View File

@@ -1,7 +1,7 @@
#!/bin/bash
#
# ./examples/ts-type-to-grammar.sh "{a:string,b:string,c?:string}"
# python examples/json_schema_to_grammar.py https://json.schemastore.org/tsconfig.json
# python examples/json-schema-to-grammar.py https://json.schemastore.org/tsconfig.json
#
set -euo pipefail
@@ -25,4 +25,4 @@ npx ts-json-schema-generator --unstable --no-top-ref --path "$DTS_FILE" --type M
# https://github.com/YousefED/typescript-json-schema
# npx typescript-json-schema --defaultProps --required "$DTS_FILE" MyType | tee "$SCHEMA_FILE" >&2
./examples/json_schema_to_grammar.py "$SCHEMA_FILE"
./examples/json-schema-to-grammar.py "$SCHEMA_FILE"

18
flake.lock generated
View File

@@ -5,11 +5,11 @@
"nixpkgs-lib": "nixpkgs-lib"
},
"locked": {
"lastModified": 1712014858,
"narHash": "sha256-sB4SWl2lX95bExY2gMFG5HIzvva5AVMJd4Igm+GpZNw=",
"lastModified": 1709336216,
"narHash": "sha256-Dt/wOWeW6Sqm11Yh+2+t0dfEWxoMxGBvv3JpIocFl9E=",
"owner": "hercules-ci",
"repo": "flake-parts",
"rev": "9126214d0a59633752a136528f5f3b9aa8565b7d",
"rev": "f7b3c975cf067e56e7cda6cb098ebe3fb4d74ca2",
"type": "github"
},
"original": {
@@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1712791164,
"narHash": "sha256-3sbWO1mbpWsLepZGbWaMovSO7ndZeFqDSdX0hZ9nVyw=",
"lastModified": 1711163522,
"narHash": "sha256-YN/Ciidm+A0fmJPWlHBGvVkcarYWSC+s3NTPk/P+q3c=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "1042fd8b148a9105f3c0aca3a6177fd1d9360ba5",
"rev": "44d0940ea560dee511026a53f0e2e2cde489b4d4",
"type": "github"
},
"original": {
@@ -37,11 +37,11 @@
"nixpkgs-lib": {
"locked": {
"dir": "lib",
"lastModified": 1711703276,
"narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
"lastModified": 1709237383,
"narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
"rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8",
"type": "github"
},
"original": {

View File

@@ -137,7 +137,7 @@ extern "C" {
/*
Example usage:
// operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
// operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be asigned
// preferrably to run on the same backend as the buffer
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

View File

@@ -447,11 +447,10 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_SYCL)
#include <cstdint>
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };
#define GGML_TABLE_BEGIN(type, name, size) static dpct::global_memory<const type, 1> name(sycl::range<1>(size), {
#define GGML_TABLE_END() });
#define GGML_COMMON_IMPL
#endif

View File

@@ -401,8 +401,10 @@ GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t
GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
if (tensor->view_src != NULL) {
if (tensor->view_src != NULL && tensor->view_offs == 0) {
assert(tensor->view_src->buffer->buft == buffer->buft);
tensor->backend = tensor->view_src->backend;
tensor->extra = tensor->view_src->extra;
return;
}
@@ -1225,7 +1227,7 @@ static void ggml_cuda_op_mul_mat_cublas(
// the main device has a larger memory buffer to hold the results from all GPUs
// ldc == nrows of the matrix that cuBLAS writes into
int64_t ldc = id == ctx.device ? ne0 : row_diff;
int ldc = id == ctx.device ? ne0 : row_diff;
const int compute_capability = ggml_cuda_info().devices[id].cc;
@@ -1377,8 +1379,8 @@ static void ggml_cuda_op_mul_mat(
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
const int64_t nb2 = dst->nb[2];
const int64_t nb3 = dst->nb[3];
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];
GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
@@ -1946,7 +1948,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
} else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// KQV single-batch
ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
} else if (!split && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// KQ + KQV multi-batch
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
} else if (use_dequantize_mul_mat_vec) {
@@ -1960,49 +1962,227 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
}
}
#if 0
template<typename ... Srcs>
static __global__ void k_compute_batched_ptrs_id(
const void ** ptrs_src, void ** ptrs_dst,
int ne12, int ne13,
int ne23,
int nb02, int nb03,
int nb12, int nb13,
int nb2, int nb3,
int r2, int r3,
ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
const half * src1_f16, half * dst_f16,
const int32_t * ids, const int id,
Srcs... src0s) {
int i = ids[id];
half * src0_f16;
const void * srcs_ar[] = { (const half *) src0s... };
if (src0_type == GGML_TYPE_F16) {
src0_f16 = (half *) srcs_ar[i];
} else {
src0_f16 = src0_as_f16;
if (threadIdx.x == 0 && threadIdx.y == 0) {
const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
}
}
int i13 = blockIdx.x * blockDim.x + threadIdx.x;
int i12 = blockIdx.y * blockDim.y + threadIdx.y;
if (i13 >= ne13 || i12 >= ne12) {
return;
}
int i03 = i13 / r3;
int i02 = i12 / r2;
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02 + i03*nb03;
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
}
static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
const struct ggml_tensor * ids = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
const struct ggml_tensor * src00 = dst->src[2];
const int id = dst->op_params[0];
GGML_ASSERT(!ggml_is_transposed(src00));
GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
const int64_t ne01 = src00->ne[1];
const int64_t ne02 = src00->ne[2];
const int64_t ne03 = src00->ne[3];
//const int64_t nb01 = src00->nb[1];
const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
//const int64_t nb11 = src1->nb[1];
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
const int64_t ne1 = ggml_nelements(src1);
const int64_t ne = ggml_nelements(dst);
ggml_cuda_set_device(g_main_device);
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
//ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
//void * src0_ddq = src0_extra->data_device[g_main_device];
//half * src0_as_f16 = (half *) src0_ddq;
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
// convert src1 to fp16
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
GGML_ASSERT(to_fp16_cuda != nullptr);
size_t src1_as = 0;
half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
size_t dst_as = 0;
half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
GGML_ASSERT(ne12 % ne02 == 0);
GGML_ASSERT(ne13 % ne03 == 0);
// broadcast factors
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;
const half alpha_f16 = 1.0f;
const half beta_f16 = 0.0f;
// use cublasGemmBatchedEx
const int ne23 = ne12*ne13;
const void ** ptrs_src = nullptr;
void ** ptrs_dst = nullptr;
size_t ptrs_src_s = 0;
size_t ptrs_dst_s = 0;
ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
int64_t src0_ne = ggml_nelements(src00);
half * src0_as_f16 = nullptr;
size_t src0_as = 0;
if (src00->type != GGML_TYPE_F16) {
src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
}
static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
dim3 block_dims(ne13, ne12);
k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
ptrs_src, ptrs_dst,
ne12, ne13,
ne23,
ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
nb12, nb13,
dst->nb[2], dst->nb[3],
r2, r3,
src00->type, src0_as_f16, src0_ne,
src1_as_f16, dst_f16,
(const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
);
CUDA_CHECK(cudaGetLastError());
CUBLAS_CHECK(
cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
&alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
(const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
&beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
ne23,
CUBLAS_COMPUTE_16F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
if (src0_as != 0) {
ggml_cuda_pool_free(src0_as_f16, src0_as);
}
if (ptrs_src_s != 0) {
ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
}
if (ptrs_dst_s != 0) {
ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
}
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
ggml_cuda_pool_free(src1_as_f16, src1_as);
ggml_cuda_pool_free(dst_f16, dst_as);
}
#endif
static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
#if 0
ggml_cuda_mul_mat_id_cublas(dst);
// TODO: mmq/mmv support
#endif
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
const ggml_tensor * ids = dst->src[2];
GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");
cudaStream_t stream = ctx.stream();
const size_t nb11 = src1->nb[1];
const size_t nb1 = dst->nb[1];
const struct ggml_tensor * ids = src0;
const int32_t id = ((int32_t *) dst->op_params)[0];
const int32_t n_as = src0->ne[2];
const int32_t n_as = ((int32_t *) dst->op_params)[1];
std::vector<char> ids_host(ggml_nbytes(ids));
const char * ids_dev = (const char *) ids->data;
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
ggml_tensor src0_row = *src0;
ggml_tensor src1_row = *src1;
ggml_tensor dst_row = *dst;
char * src0_original = (char *) src0->data;
char * src1_original = (char *) src1->data;
char * dst_original = (char *) dst->data;
src0_row.ne[2] = 1;
src0_row.ne[3] = 1;
src0_row.nb[3] = src0->nb[2];
if (src1->ne[1] == 1) {
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(row_id >= 0 && row_id < n_as);
src0_row.data = src0_original + row_id*src0->nb[2];
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
src1_row.data = src1_original + i01*src1->nb[1];
dst_row.data = dst_original + i01*dst->nb[1];
ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
ggml_cuda_mul_mat(ctx, src0_row, &src1_row, &dst_row);
}
} else {
ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
@@ -2012,6 +2192,8 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
dst_row.data = dst_contiguous.get();
for (int32_t row_id = 0; row_id < n_as; ++row_id) {
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
int64_t num_src1_rows = 0;
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
@@ -2031,8 +2213,6 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
continue;
}
src0_row.data = src0_original + row_id*src0->nb[2];
src1_row.ne[1] = num_src1_rows;
dst_row.ne[1] = num_src1_rows;
@@ -2044,7 +2224,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
dst_row.nb[2] = num_src1_rows*nb1;
dst_row.nb[3] = num_src1_rows*nb1;
ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
ggml_cuda_mul_mat(ctx, src0_row, &src1_row, &dst_row);
num_src1_rows = 0;
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
@@ -2209,7 +2389,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst));
CUDA_CHECK(err);
GGML_ASSERT(false);
}
return true;
@@ -2617,7 +2797,6 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
return false;
}
#if CUDART_VERSION >= 11100
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
if (err != cudaSuccess) {
// clear the error
@@ -2628,9 +2807,6 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
return false;
}
return true;
#else
return false;
#endif
}
GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {

View File

@@ -8,41 +8,32 @@ static inline __device__ void ggml_cuda_swap(T & a, T & b) {
}
template<ggml_sort_order order>
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
// bitonic sort
int col = threadIdx.x;
int row = blockIdx.y;
if (col >= ncols_pad) {
return;
}
if (col >= ncols) return;
const float * x_row = x + row * ncols;
extern __shared__ int dst_row[];
int * dst_row = dst + row * ncols;
// initialize indices
dst_row[col] = col;
if (col < ncols) {
dst_row[col] = col;
}
__syncthreads();
for (int k = 2; k <= ncols_pad; k *= 2) {
for (int k = 2; k <= ncols; k *= 2) {
for (int j = k / 2; j > 0; j /= 2) {
int ixj = col ^ j;
if (ixj > col) {
if ((col & k) == 0) {
if (dst_row[col] >= ncols ||
(dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
x_row[dst_row[col]] > x_row[dst_row[ixj]] :
x_row[dst_row[col]] < x_row[dst_row[ixj]]))
) {
if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
ggml_cuda_swap(dst_row[col], dst_row[ixj]);
}
} else {
if (dst_row[ixj] >= ncols ||
(dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
x_row[dst_row[col]] < x_row[dst_row[ixj]] :
x_row[dst_row[col]] > x_row[dst_row[ixj]]))
) {
if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
ggml_cuda_swap(dst_row[col], dst_row[ixj]);
}
}
@@ -50,35 +41,18 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n
__syncthreads();
}
}
// copy the result to dst without the padding
if (col < ncols) {
dst[row * ncols + col] = dst_row[col];
}
}
static int next_power_of_2(int x) {
int n = 1;
while (n < x) {
n *= 2;
}
return n;
}
static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
// bitonic sort requires ncols to be power of 2
const int ncols_pad = next_power_of_2(ncols);
GGML_ASSERT((ncols & (ncols - 1)) == 0);
const dim3 block_dims(ncols_pad, 1, 1);
const dim3 block_dims(ncols, 1, 1);
const dim3 block_nums(1, nrows, 1);
const size_t shared_mem = ncols_pad * sizeof(int);
GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
if (order == GGML_SORT_ORDER_ASC) {
k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
} else if (order == GGML_SORT_ORDER_DESC) {
k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
} else {
GGML_ASSERT(false);
}

View File

@@ -394,7 +394,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
// TODO: move to ggml-common.h
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
//////////////////////

View File

@@ -4,14 +4,14 @@
#define CUDA_Q8_0_NE_ALIGN 2048
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
const int64_t i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
if (i >= k) {
return;
}
const int64_t ib = i/qk; // block index
const int ib = i/qk; // block index
const int iqs = (i%qk)/qr; // quant index
const int iybs = i - i%qk; // y block start index
const int y_offset = qr == 1 ? 1 : qk/2;
@@ -25,7 +25,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
}
template <bool need_check>
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
#if __CUDA_ARCH__ >= CC_PASCAL
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
@@ -68,13 +68,13 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
template<typename dst_t>
static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
const int64_t i = blockIdx.x;
const int i = blockIdx.x;
// assume 32 threads
const int tid = threadIdx.x;
const int il = tid/8;
const int ir = tid%8;
const int64_t ib = 8*i + ir;
const int ib = 8*i + ir;
if (ib >= nb32) {
return;
}
@@ -96,13 +96,13 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t
template<typename dst_t>
static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
const int64_t i = blockIdx.x;
const int i = blockIdx.x;
// assume 32 threads
const int tid = threadIdx.x;
const int il = tid/8;
const int ir = tid%8;
const int64_t ib = 8*i + ir;
const int ib = 8*i + ir;
if (ib >= nb32) {
return;
}
@@ -313,14 +313,14 @@ template<typename dst_t>
static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const block_q6_K * x = (const block_q6_K *) vx;
const int64_t i = blockIdx.x;
const int i = blockIdx.x;
#if QK_K == 256
// assume 64 threads - this is very slightly better than the one below
const int64_t tid = threadIdx.x;
const int64_t ip = tid/32; // ip is 0 or 1
const int64_t il = tid - 32*ip; // 0...32
const int64_t is = 8*ip + il/16;
const int tid = threadIdx.x;
const int ip = tid/32; // ip is 0 or 1
const int il = tid - 32*ip; // 0...32
const int is = 8*ip + il/16;
dst_t * y = yy + i*QK_K + 128*ip + il;
@@ -337,9 +337,9 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
#else
// assume 32 threads
const int64_t tid = threadIdx.x;
const int64_t ip = tid/16; // 0 or 1
const int64_t il = tid - 16*ip; // 0...15
const int tid = threadIdx.x;
const int ip = tid/16; // 0 or 1
const int il = tid - 16*ip; // 0...15
dst_t * y = yy + i*QK_K + 16*ip + il;
@@ -571,12 +571,12 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
#endif
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}
static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) {
static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
if (k % CUDA_Q8_0_NE_ALIGN == 0) {
const bool need_check = false;
@@ -588,7 +588,7 @@ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half *
}
template<typename dst_t>
static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
#if QK_K == 256
dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -598,7 +598,7 @@ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k
}
template<typename dst_t>
static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
#if QK_K == 256
dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -608,27 +608,27 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k
}
template<typename dst_t>
static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb32 = k / 32;
const int nb = (k + 255) / 256;
dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
}
template<typename dst_t>
static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb32 = k / 32;
const int nb = (k + 255) / 256;
dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
}
template<typename dst_t>
static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
}
template<typename dst_t>
static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
#if QK_K == 256
dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -638,7 +638,7 @@ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k
}
template<typename dst_t>
static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
#if QK_K == 256
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
@@ -648,55 +648,55 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k
}
template<typename dst_t>
static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
}
template<typename dst_t>
static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
}
template<typename dst_t>
static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
}
template<typename dst_t>
static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
}
template<typename dst_t>
static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
}
template<typename dst_t>
static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
}
template<typename dst_t>
static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb = (k + QK_K - 1) / QK_K;
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
}
template<typename dst_t>
static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb = k / QK_K;
dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
}
template<typename dst_t>
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
const int nb = (k + QK_K - 1) / QK_K;
#if QK_K == 64
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
@@ -706,8 +706,8 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t
}
template <typename src_t, typename dst_t>
static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
@@ -719,7 +719,7 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
}
template <typename src_t, typename dst_t>
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}

View File

@@ -3,7 +3,7 @@
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
template<typename T>
using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
typedef to_t_cuda_t<float> to_fp32_cuda_t;
typedef to_t_cuda_t<half> to_fp16_cuda_t;

View File

@@ -1,6 +1,6 @@
#include "common.cuh"
static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
const block_q4_0 * x = (const block_q4_0 *) vx;
const dfloat d = x[ib].d;
@@ -19,7 +19,7 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
#endif // GGML_CUDA_F16
}
static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
const block_q4_1 * x = (const block_q4_1 *) vx;
const dfloat d = __low2half(x[ib].dm);
@@ -39,7 +39,7 @@ static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const in
#endif // GGML_CUDA_F16
}
static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
const block_q5_0 * x = (const block_q5_0 *) vx;
const dfloat d = x[ib].d;
@@ -62,7 +62,7 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
#endif // GGML_CUDA_F16
}
static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
const block_q5_1 * x = (const block_q5_1 *) vx;
const dfloat d = __low2half(x[ib].dm);
@@ -86,7 +86,7 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
#endif // GGML_CUDA_F16
}
static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
const block_q8_0 * x = (const block_q8_0 *) vx;
const dfloat d = x[ib].d;

View File

@@ -565,7 +565,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
}
}
static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
const half * x = (const half *) vx;
// automatic half -> float type cast if dfloat == float
@@ -577,7 +577,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
// qk = quantized weights per x block
// qr = number of quantized weights per data value in x block
const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;
const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row >= nrows) {
return;
@@ -598,7 +598,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
for (int i = 0; i < ncols; i += iter_stride) {
const int col = i + vals_per_iter*tid;
const int64_t ib = ((int64_t)row*ncols + col)/qk; // x block index
const int ib = (row*ncols + col)/qk; // x block index
const int iqs = (col%qk)/qr; // x quant index
const int iybs = col - col%qk; // y block start index

View File

@@ -1,20 +1,20 @@
#include "quantize.cuh"
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) {
const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
const int ix = blockDim.x*blockIdx.x + threadIdx.x;
if (ix >= kx_padded) {
return;
}
const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y;
const int iy = blockDim.y*blockIdx.y + threadIdx.y;
const int64_t i_padded = (int64_t)iy*kx_padded + ix;
const int i_padded = iy*kx_padded + ix;
block_q8_1 * y = (block_q8_1 *) vy;
const int64_t ib = i_padded / QK8_1; // block index
const int64_t iqs = i_padded % QK8_1; // quant index
const int ib = i_padded / QK8_1; // block index
const int iqs = i_padded % QK8_1; // quant index
const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
float amax = fabsf(xi);
@@ -36,8 +36,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
reinterpret_cast<half&>(y[ib].ds.y) = sum;
}
void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) {
const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
const dim3 num_blocks(block_num_x, ky, 1);
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);

View File

@@ -2,4 +2,4 @@
#define CUDA_QUANTIZE_BLOCK_SIZE 256
void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream);
void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream);

View File

@@ -37,7 +37,6 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_DIV_ROW,
GGML_METAL_KERNEL_TYPE_SCALE,
GGML_METAL_KERNEL_TYPE_SCALE_4,
GGML_METAL_KERNEL_TYPE_CLAMP,
GGML_METAL_KERNEL_TYPE_TANH,
GGML_METAL_KERNEL_TYPE_RELU,
GGML_METAL_KERNEL_TYPE_GELU,
@@ -469,7 +468,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE, scale, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4, scale_4, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CLAMP, clamp, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH, tanh, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU, relu, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true);
@@ -715,7 +713,6 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_SCALE:
case GGML_OP_CLAMP:
case GGML_OP_SQR:
case GGML_OP_SUM_ROWS:
return true;
@@ -1157,25 +1154,6 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_CLAMP:
{
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CLAMP].pipeline;
float min;
float max;
memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float));
memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float));
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&min length:sizeof(min) atIndex:2];
[encoder setBytes:&max length:sizeof(max) atIndex:3];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(gf->nodes[i])) {
case GGML_UNARY_OP_TANH:
@@ -1707,31 +1685,37 @@ static enum ggml_status ggml_metal_graph_compute(
{
//GGML_ASSERT(ne00 == ne10);
//GGML_ASSERT(ne03 == ne13);
const int n_as = src0->ne[2];
GGML_ASSERT(src0t == GGML_TYPE_I32);
const int n_as = ((int32_t *) dst->op_params)[1];
// TODO: make this more general
GGML_ASSERT(n_as <= 8);
// max size of the src1ids array in the kernel shared buffer
GGML_ASSERT(ne11 <= 4096);
// src2 = ids
const int64_t ne20 = src2->ne[0]; GGML_UNUSED(ne20);
const int64_t ne21 = src2->ne[1];
const int64_t ne22 = src2->ne[2]; GGML_UNUSED(ne22);
const int64_t ne23 = src2->ne[3]; GGML_UNUSED(ne23);
const int64_t ne20 = src2 ? src2->ne[0] : 0;
const int64_t ne21 = src2 ? src2->ne[1] : 0;
const int64_t ne22 = src2 ? src2->ne[2] : 0;
const int64_t ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23);
const uint64_t nb20 = src2->nb[0]; GGML_UNUSED(nb20);
const uint64_t nb21 = src2->nb[1];
const uint64_t nb22 = src2->nb[2]; GGML_UNUSED(nb22);
const uint64_t nb23 = src2->nb[3]; GGML_UNUSED(nb23);
const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
const uint64_t nb21 = src2 ? src2->nb[1] : 0;
const uint64_t nb22 = src2 ? src2->nb[2] : 0;
const uint64_t nb23 = src2 ? src2->nb[3] : 0; GGML_UNUSED(nb23);
const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t);
const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t);
GGML_ASSERT(src2t == GGML_TYPE_I32);
GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src2));
GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(src1t == GGML_TYPE_F32);
const uint r2 = ne12/ne22;
const uint r3 = ne13/ne23;
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
// to the matrix-vector kernel
int ne11_mm_min = n_as;
@@ -1739,10 +1723,7 @@ static enum ggml_status ggml_metal_graph_compute(
const int idx = ((int32_t *) dst->op_params)[0];
// batch size
GGML_ASSERT(ne21 == ne11); // ?
GGML_ASSERT(ne12 == 1 && ne13 == 1); // no broadcasting
const uint r2 = 1;
const uint r3 = 1;
GGML_ASSERT(ne01 == ne11);
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
@@ -1751,7 +1732,7 @@ static enum ggml_status ggml_metal_graph_compute(
// indirect matrix multiplication
// !!!
if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
ne00 % 32 == 0 && ne00 >= 64 &&
ne20 % 32 == 0 && ne20 >= 64 &&
ne11 > ne11_mm_min) {
// some Metal matrix data types require aligned pointers
@@ -1764,7 +1745,7 @@ static enum ggml_status ggml_metal_graph_compute(
id<MTLComputePipelineState> pipeline = nil;
switch (src0->type) {
switch (src2->type) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32 ].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32 ].pipeline; break;
case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32 ].pipeline; break;
@@ -1793,27 +1774,36 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:3];
[encoder setBytes:&nb21 length:sizeof(nb21) atIndex:4];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:5];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:9];
[encoder setBytes:&ne13 length:sizeof(ne13) atIndex:10];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:11];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:12];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:13];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:14];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:15];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:16];
[encoder setBytes:&r2 length:sizeof(r2) atIndex:17];
[encoder setBytes:&r3 length:sizeof(r3) atIndex:18];
[encoder setBytes:&idx length:sizeof(idx) atIndex:19];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3];
[encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
[encoder setBytes:&ne22 length:sizeof(ne22) atIndex:5];
[encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6];
[encoder setBytes:&nb22 length:sizeof(nb22) atIndex:7];
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:8];
[encoder setBytes:&ne13 length:sizeof(ne13) atIndex:9];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
[encoder setBytes:&r2 length:sizeof(r2) atIndex:16];
[encoder setBytes:&r3 length:sizeof(r3) atIndex:17];
[encoder setBytes:&idx length:sizeof(idx) atIndex:18];
// TODO: how to make this an array? read Metal docs
for (int j = 0; j < 8; ++j) {
// NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
size_t offs_src_cur = 0;
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
}
[encoder setThreadgroupMemoryLength:GGML_PAD(8192 + 2*ne11, 16) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne01 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
} else {
int nth0 = 32;
int nth1 = 1;
@@ -1823,7 +1813,7 @@ static enum ggml_status ggml_metal_graph_compute(
id<MTLComputePipelineState> pipeline = nil;
// use custom matrix x vector kernel
switch (src0t) {
switch (src2t) {
case GGML_TYPE_F32:
{
GGML_ASSERT(src1t == GGML_TYPE_F32);
@@ -1948,12 +1938,7 @@ static enum ggml_status ggml_metal_graph_compute(
{
nth0 = 4;
nth1 = 16;
#if QK_K == 64
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
#else
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
#endif
} break;
default:
{
@@ -1962,8 +1947,8 @@ static enum ggml_status ggml_metal_graph_compute(
}
};
if (ggml_is_quantized(src0t)) {
GGML_ASSERT(ne00 >= nth0*nth1);
if (ggml_is_quantized(src2t)) {
GGML_ASSERT(ne20 >= nth0*nth1);
}
const int64_t _ne1 = 1; // kernels needs a reference in constant memory
@@ -1972,66 +1957,75 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:3];
[encoder setBytes:&nb21 length:sizeof(nb21) atIndex:4];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:5];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:6];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:7];
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:8];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:9];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:10];
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
[encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:12];
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
[encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:18];
[encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:19];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:20];
[encoder setBytes:&r2 length:sizeof(r2) atIndex:21];
[encoder setBytes:&r3 length:sizeof(r3) atIndex:22];
[encoder setBytes:&idx length:sizeof(idx) atIndex:23];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3];
[encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
[encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5];
[encoder setBytes:&ne22 length:sizeof(ne22) atIndex:6];
[encoder setBytes:&nb20 length:sizeof(nb20) atIndex:7];
[encoder setBytes:&nb21 length:sizeof(nb21) atIndex:8];
[encoder setBytes:&nb22 length:sizeof(nb22) atIndex:9];
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
[encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:11];
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
[encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:17];
[encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:18];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:19];
[encoder setBytes:&r2 length:sizeof(r2) atIndex:20];
[encoder setBytes:&r3 length:sizeof(r3) atIndex:21];
[encoder setBytes:&idx length:sizeof(idx) atIndex:22];
// TODO: how to make this an array? read Metal docs
for (int j = 0; j < 8; ++j) {
// NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 ||
src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K ||
src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
size_t offs_src_cur = 0;
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
}
else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 || src2t == GGML_TYPE_Q5_0 ||
src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 || src2t == GGML_TYPE_Q2_K ||
src2t == GGML_TYPE_IQ1_S || src2t == GGML_TYPE_IQ1_M || src2t == GGML_TYPE_IQ2_S) {
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
const int mem_size = src2t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
else if (src2t == GGML_TYPE_IQ3_XXS || src2t == GGML_TYPE_IQ3_S) {
const int mem_size = src2t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) {
else if (src2t == GGML_TYPE_IQ4_NL || src2t == GGML_TYPE_IQ4_XS) {
const int mem_size = 32*sizeof(float);
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
else if (src2t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q3_K) {
else if (src2t == GGML_TYPE_Q3_K) {
#ifdef GGML_QKK_64
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#else
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#endif
}
else if (src0t == GGML_TYPE_Q5_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
else if (src2t == GGML_TYPE_Q5_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q6_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
else if (src2t == GGML_TYPE_Q6_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else {
const int64_t ny = (_ne1 + nrows - 1)/nrows;
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake(ne21, ny, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
}
} break;
@@ -2438,16 +2432,6 @@ static enum ggml_status ggml_metal_graph_compute(
enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
// bitonic sort requires the number of elements to be power of 2
int64_t ne00_padded = 1;
while (ne00_padded < ne00) {
ne00_padded *= 2;
}
// Metal kernels require the buffer size to be multiple of 16 bytes
// https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength
const int mem_size = GGML_PAD(ne00_padded*sizeof(int32_t), 16);
id<MTLComputePipelineState> pipeline = nil;
switch (order) {
@@ -2457,13 +2441,11 @@ static enum ggml_status ggml_metal_graph_compute(
};
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&ne00_padded length:sizeof( int64_t) atIndex:3];
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00_padded, 1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)];
} break;
case GGML_OP_LEAKY_RELU:
{

Some files were not shown because too many files have changed in this diff Show More