Compare commits


4 Commits

Author                  SHA1        Message                                                          Date
Francis Couture-Harpin  3fe362fe49  gguf-py : use ThreadPoolExecutor when writing tensors            2025-04-12 00:00:51 -04:00
                                    - gguf-py : handle (limited) retries for remote tensors
Francis Couture-Harpin  d7db1593ee  Merge branch 'master' into compilade/parallel-convert            2025-04-11 15:18:33 -04:00
Francis Couture-Harpin  d8bab9efa1  gguf-py : add more clarifying comments for multi-thread writes   2025-04-08 21:55:15 -04:00
Francis Couture-Harpin  06e1d3119a  convert : write tensors in parallel                              2025-04-08 16:59:52 -04:00
678 changed files with 45043 additions and 174694 deletions
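
The first and last commit messages name the technique this branch is about: tensor data is produced through a ThreadPoolExecutor, with a limited number of retries when reading a remote tensor fails. As a minimal sketch of that pattern (the names read_with_retries, write_tensors_parallel, and the tensor .read() method are illustrative assumptions here, not the actual gguf-py API):

from concurrent.futures import ThreadPoolExecutor

MAX_RETRIES = 3  # assumed bound; the commit message only says "limited" retries

def read_with_retries(tensor, retries=MAX_RETRIES):
    # Fetch the tensor bytes, retrying a bounded number of times on I/O
    # errors (e.g. a flaky read of a remote tensor).
    for attempt in range(retries):
        try:
            return tensor.read()
        except OSError:
            if attempt == retries - 1:
                raise

def write_tensors_parallel(tensors, out_file, max_workers=4):
    # Reads run concurrently, but results are written in submission order,
    # since the tensor data section of the output file is sequential.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(read_with_retries, t) for t in tensors]
        for fut in futures:
            out_file.write(fut.result())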


@@ -13,7 +13,6 @@ Checks: >
-readability-magic-numbers,
-readability-uppercase-literal-suffix,
-readability-simplify-boolean-expr,
-readability-math-missing-parentheses,
clang-analyzer-*,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,


@@ -14,9 +14,9 @@ WORKDIR /app
COPY . .
RUN if [ "$TARGETARCH" = "amd64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
elif [ "$TARGETARCH" = "arm64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
else \
echo "Unsupported architecture"; \
exit 1; \


@@ -21,7 +21,7 @@ COPY . .
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \


@@ -1,4 +1,4 @@
ARG ONEAPI_VERSION=2025.1.1-0-devel-ubuntu24.04
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
## Build Image
@@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
echo "Building with dynamic libs" && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \


@@ -22,7 +22,7 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
RUN echo "Building with static libs" && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
cmake --build build --config Release --target llama-cli
# TODO: use image with NNRT


@@ -1,10 +1,10 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc4.0.1
ARG MUSA_VERSION=rc3.1.1
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
@@ -21,14 +21,21 @@ RUN apt-get update && \
libcurl4-openssl-dev \
libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \


@@ -40,7 +40,7 @@ WORKDIR /app
COPY . .
RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
&& cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib \


@@ -16,7 +16,7 @@ WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \


@@ -21,15 +21,15 @@ indent_style = tab
[prompts/*.txt]
insert_final_newline = unset
[tools/server/public/*]
[examples/server/public/*]
indent_size = 2
[tools/server/public/deps_*]
[examples/server/public/deps_*]
trim_trailing_whitespace = unset
indent_style = unset
indent_size = unset
[tools/server/deps_*]
[examples/server/deps_*]
trim_trailing_whitespace = unset
indent_style = unset
indent_size = unset
@@ -37,7 +37,7 @@ indent_size = unset
[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
indent_style = tab
[tools/cvector-generator/*.txt]
[examples/cvector-generator/*.txt]
trim_trailing_whitespace = unset
insert_final_newline = unset
@@ -48,7 +48,3 @@ end_of_line = unset
charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset
[vendor/miniaudio/miniaudio.h]
trim_trailing_whitespace = unset
insert_final_newline = unset


@@ -2,9 +2,8 @@
max-line-length = 125
ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
exclude =
# Do not traverse examples and tools
# Do not traverse examples
examples,
tools,
# Do not include package initializers
__init__.py,
# No need to traverse our git directory


@@ -1,22 +0,0 @@
name: "Determine tag name"
description: "Determine the tag name to use for a release"
outputs:
name:
description: "The name of the tag"
value: ${{ steps.tag.outputs.name }}
runs:
using: "composite"
steps:
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi


@@ -1,67 +0,0 @@
name: "Windows - Setup CUDA Toolkit"
description: "Setup CUDA Toolkit for Windows"
inputs:
cuda_version:
description: "CUDA toolkit version"
required: true
runs:
using: "composite"
steps:
- name: Install Cuda Toolkit 11.7
if: ${{ inputs.cuda_version == '11.7' }}
shell: pwsh
run: |
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
choco install unzip -y
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
- name: Install Cuda Toolkit 12.4
if: ${{ inputs.cuda_version == '12.4' }}
shell: pwsh
run: |
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
choco install unzip -y
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8


@@ -5,10 +5,6 @@ inputs:
description: 'CURL version'
required: false
default: '8.6.0_6'
architecture:
description: 'Architecture of the libcurl to download'
required: false
default: 'win64'
outputs:
curl_path:
description: "Path to the downloaded libcurl"
@@ -22,9 +18,8 @@ runs:
shell: powershell
env:
CURL_VERSION: ${{ inputs.curl_version }}
ARCHITECTURE: ${{ inputs.architecture }}
run: |
curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-${env:ARCHITECTURE}-mingw.zip"
curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
mkdir $env:RUNNER_TEMP/libcurl
tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT

.github/labeler.yml

@@ -45,9 +45,7 @@ build:
- CMakePresets.json
examples:
- changed-files:
- any-glob-to-any-file:
- examples/**
- tools/**
- any-glob-to-any-file: examples/**
devops:
- changed-files:
- any-glob-to-any-file:
@@ -72,7 +70,7 @@ android:
server:
- changed-files:
- any-glob-to-any-file:
- tools/server/**
- examples/server/**
ggml:
- changed-files:
- any-glob-to-any-file:
@@ -86,10 +84,3 @@ nix:
embedding:
- changed-files:
- any-glob-to-any-file: examples/embedding/
Ascend NPU:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-cann.h
- ggml/src/ggml-cann/**
- docs/backend/CANN.md


@@ -27,10 +27,10 @@ on:
push:
branches:
- master
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
pull_request_target:
types: [opened, synchronize, reopened]
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
schedule:
- cron: '04 2 * * *'
@@ -69,7 +69,7 @@ jobs:
- name: Install python env
id: pipenv
run: |
cd tools/server/bench
cd examples/server/bench
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
@@ -79,7 +79,7 @@ jobs:
run: |
wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
tar xzf prometheus*.tar.gz --strip-components=1
./prometheus --config.file=tools/server/bench/prometheus.yml &
./prometheus --config.file=examples/server/bench/prometheus.yml &
while ! nc -z localhost 9090; do
sleep 0.1
done
@@ -92,7 +92,7 @@ jobs:
- name: Install k6 and xk6-sse
id: k6_installation
run: |
cd tools/server/bench
cd examples/server/bench
go install go.k6.io/xk6/cmd/xk6@latest
xk6 build master \
--with github.com/phymbert/xk6-sse
@@ -116,7 +116,7 @@ jobs:
- name: Download the dataset
id: download_dataset
run: |
cd tools/server/bench
cd examples/server/bench
wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
- name: Server bench
@@ -126,7 +126,7 @@ jobs:
run: |
set -eux
cd tools/server/bench
cd examples/server/bench
source venv/bin/activate
python bench.py \
--runner-label ${{ env.RUNNER_LABEL }} \
@@ -157,9 +157,9 @@ jobs:
name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
compression-level: 9
path: |
tools/server/bench/*.jpg
tools/server/bench/*.json
tools/server/bench/*.log
examples/server/bench/*.jpg
examples/server/bench/*.json
examples/server/bench/*.log
- name: Commit status
uses: Sibz/github-status-action@v1
@@ -178,17 +178,17 @@ jobs:
with:
client_id: ${{secrets.IMGUR_CLIENT_ID}}
path: |
tools/server/bench/prompt_tokens_seconds.jpg
tools/server/bench/predicted_tokens_seconds.jpg
tools/server/bench/kv_cache_usage_ratio.jpg
tools/server/bench/requests_processing.jpg
examples/server/bench/prompt_tokens_seconds.jpg
examples/server/bench/predicted_tokens_seconds.jpg
examples/server/bench/kv_cache_usage_ratio.jpg
examples/server/bench/requests_processing.jpg
- name: Extract mermaid
id: set_mermaid
run: |
set -eux
cd tools/server/bench
cd examples/server/bench
PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV


@@ -4,37 +4,29 @@ on:
workflow_call:
jobs:
ubuntu-24-riscv64-cpu-cross:
runs-on: ubuntu-24.04
ubuntu-latest-riscv64-cpu-cross:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Setup Riscv
run: |
sudo dpkg --add-architecture riscv64
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF
sudo apt-get update || true ;# Prevent failure due to missing URLs.
sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
/etc/apt/sources.list /etc/apt/apt-mirrors.txt
sudo apt-get clean
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-riscv64-linux-gnu \
g++-14-riscv64-linux-gnu
g++-14-riscv64-linux-gnu \
libcurl4-openssl-dev:riscv64
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
@@ -48,40 +40,35 @@ jobs:
cmake --build build --config Release -j $(nproc)
ubuntu-24-riscv64-vulkan-cross:
runs-on: ubuntu-24.04
ubuntu-latest-riscv64-vulkan-cross:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Setup Riscv
run: |
sudo dpkg --add-architecture riscv64
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF
sudo apt-get update || true ;# Prevent failure due to missing URLs.
sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
/etc/apt/sources.list /etc/apt/apt-mirrors.txt
sudo apt-get clean
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
build-essential \
glslc \
gcc-14-riscv64-linux-gnu \
g++-14-riscv64-linux-gnu \
libvulkan-dev:riscv64
libvulkan-dev:riscv64 \
libcurl4-openssl-dev:riscv64
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
@@ -95,39 +82,34 @@ jobs:
cmake --build build --config Release -j $(nproc)
ubuntu-24-arm64-vulkan-cross:
runs-on: ubuntu-24.04
ubuntu-latest-arm64-vulkan-cross:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Setup Arm64
run: |
sudo dpkg --add-architecture arm64
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF
sudo apt-get update || true ;# Prevent failure due to missing URLs.
sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
/etc/apt/sources.list /etc/apt/apt-mirrors.txt
sudo apt-get clean
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
build-essential \
glslc \
crossbuild-essential-arm64 \
libvulkan-dev:arm64
libvulkan-dev:arm64 \
libcurl4-openssl-dev:arm64
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=aarch64 \
@@ -140,207 +122,3 @@ jobs:
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)
ubuntu-24-ppc64el-cpu-cross:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
- name: Setup PowerPC64le
run: |
sudo dpkg --add-architecture ppc64el
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF
sudo apt-get update || true ;# Prevent failure due to missing URLs.
sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-powerpc64le-linux-gnu \
g++-14-powerpc64le-linux-gnu
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=ppc64 \
-DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)
ubuntu-24-ppc64el-vulkan-cross:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
- name: Setup PowerPC64le
run: |
sudo dpkg --add-architecture ppc64el
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF
sudo apt-get update || true ;# Prevent failure due to missing URLs.
sudo apt-get install -y --no-install-recommends \
build-essential \
glslc \
gcc-14-powerpc64le-linux-gnu \
g++-14-powerpc64le-linux-gnu \
libvulkan-dev:ppc64el
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=ppc64 \
-DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)
debian-13-loongarch64-cpu-cross:
runs-on: ubuntu-24.04
container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
steps:
- uses: actions/checkout@v4
- name: Setup LoongArch
run: |
rm -f /etc/apt/sources.list.d/*
cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
EOF
( echo 'quiet "true";'; \
echo 'APT::Get::Assume-Yes "true";'; \
echo 'APT::Install-Recommends "false";'; \
echo 'Acquire::Check-Valid-Until "false";'; \
echo 'Acquire::Retries "5";'; \
) > /etc/apt/apt.conf.d/99snapshot-repos
apt-get update
apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
dpkg --add-architecture loong64
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
EOF
apt-get update || true ;# Prevent failure due to missing URLs.
apt-get install -y --no-install-recommends \
build-essential \
gcc-14-loongarch64-linux-gnu \
g++-14-loongarch64-linux-gnu
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
-DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)
debian-13-loongarch64-vulkan-cross:
runs-on: ubuntu-24.04
container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
steps:
- uses: actions/checkout@v4
- name: Setup LoongArch
run: |
rm -f /etc/apt/sources.list.d/*
cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
EOF
( echo 'quiet "true";'; \
echo 'APT::Get::Assume-Yes "true";'; \
echo 'APT::Install-Recommends "false";'; \
echo 'Acquire::Check-Valid-Until "false";'; \
echo 'Acquire::Retries "5";'; \
) > /etc/apt/apt.conf.d/99snapshot-repos
apt-get update
apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
dpkg --add-architecture loong64
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
EOF
apt-get update || true ;# Prevent failure due to missing URLs.
apt-get install -y --no-install-recommends \
build-essential \
glslc \
gcc-14-loongarch64-linux-gnu \
g++-14-loongarch64-linux-gnu \
libvulkan-dev:loong64
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
-DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)

File diff suppressed because it is too large


@@ -36,13 +36,10 @@ jobs:
matrix:
config:
# Multi-stage build
# Note: the arm64 images are failing, which prevents the amd64 images from being built
# https://github.com/ggml-org/llama.cpp/issues/11888
#- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }


@@ -1,749 +0,0 @@
name: Release
on:
workflow_dispatch: # allows manual triggering
inputs:
create_release:
description: 'Create new release'
required: true
type: boolean
push:
branches:
- master
paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"
jobs:
macOS-arm64:
runs-on: macos-14
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: macOS-latest-cmake-arm64
evict-old-files: 1d
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
brew install curl
- name: Build
id: cmake_build
run: |
sysctl -a
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DGGML_RPC=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
name: llama-bin-macos-arm64.zip
macOS-x64:
runs-on: macos-13
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: macOS-latest-cmake-x64
evict-old-files: 1d
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
brew install curl
- name: Build
id: cmake_build
run: |
sysctl -a
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
name: llama-bin-macos-x64.zip
ubuntu-22-cpu:
strategy:
matrix:
include:
- build: 'x64'
os: ubuntu-22.04
# GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
# - build: 'arm64'
# os: ubuntu-22.04-arm
runs-on: ${{ matrix.os }}
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-cpu-cmake
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
- name: Build
id: cmake_build
run: |
cmake -B build \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
-DLLAMA_FATAL_WARNINGS=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
name: llama-bin-ubuntu-${{ matrix.build }}.zip
ubuntu-22-vulkan:
runs-on: ubuntu-22.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-22-cmake-vulkan
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
- name: Build
id: cmake_build
run: |
cmake -B build \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
-DGGML_VULKAN=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
name: llama-bin-ubuntu-vulkan-x64.zip
windows-cpu:
runs-on: windows-latest
strategy:
matrix:
include:
- arch: 'x64'
- arch: 'arm64'
steps:
- name: Clone
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-latest-cmake-cpu-${{ matrix.arch }}
variant: ccache
evict-old-files: 1d
- name: Install Ninja
run: |
choco install ninja
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
with:
architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
- name: Build
shell: cmd
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch }}
cmake -S . -B build -G "Ninja Multi-Config" ^
-D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
-DGGML_NATIVE=OFF ^
-DGGML_BACKEND_DL=ON ^
-DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^
-DGGML_OPENMP=ON ^
-DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
${{ env.CMAKE_ARGS }}
cmake --build build --config Release
- name: Pack artifacts
id: pack_artifacts
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.42.34433\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-bin-win-cpu-${{ matrix.arch }}.zip
name: llama-bin-win-cpu-${{ matrix.arch }}.zip
windows:
runs-on: windows-latest
env:
OPENBLAS_VERSION: 0.3.23
VULKAN_VERSION: 1.4.309.0
strategy:
matrix:
include:
- backend: 'vulkan'
arch: 'x64'
defines: '-DGGML_VULKAN=ON'
target: 'ggml-vulkan'
- backend: 'opencl-adreno'
arch: 'arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
target: 'ggml-opencl'
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
variant: ccache
evict-old-files: 1d
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.backend == 'vulkan' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
- name: Install Ninja
id: install_ninja
run: |
choco install ninja
- name: Install OpenCL Headers and Libs
id: install_opencl
if: ${{ matrix.backend == 'opencl-adreno' && matrix.arch == 'arm64' }}
run: |
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
cmake -B build `
-DBUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build build --target install
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
cd OpenCL-ICD-Loader
cmake -B build-arm64-release `
-A arm64 `
-DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build build-arm64-release --target install --config release
- name: Build
id: cmake_build
run: |
cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF
cmake --build build --config Release --target ${{ matrix.target }}
- name: Pack artifacts
id: pack_artifacts
run: |
7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
windows-cuda:
runs-on: windows-2022
strategy:
matrix:
cuda: ['12.4']
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Install ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-cuda-${{ matrix.cuda }}
variant: ccache
evict-old-files: 1d
- name: Install Cuda Toolkit
uses: ./.github/actions/windows-setup-cuda
with:
cuda_version: ${{ matrix.cuda }}
- name: Install Ninja
id: install_ninja
run: |
choco install ninja
- name: Build
id: cmake_build
shell: cmd
run: |
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
cmake -S . -B build -G "Ninja Multi-Config" ^
-DGGML_BACKEND_DL=ON ^
-DGGML_NATIVE=OFF ^
-DGGML_CPU=OFF ^
-DGGML_CUDA=ON ^
-DLLAMA_CURL=OFF
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda
- name: Pack artifacts
id: pack_artifacts
run: |
7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
name: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
- name: Copy and pack Cuda runtime
run: |
echo "Cuda install location: ${{ env.CUDA_PATH }}"
$dst='.\build\bin\cudart\'
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\*
- name: Upload Cuda runtime
uses: actions/upload-artifact@v4
with:
path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
windows-sycl:
runs-on: windows-latest
defaults:
run:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-latest-cmake-sycl
variant: ccache
evict-old-files: 1d
- name: Install
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
- name: Build
id: cmake_build
shell: cmd
run: |
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
cmake -G "Ninja" -B build ^
-DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
-DCMAKE_BUILD_TYPE=Release ^
-DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
-DGGML_CPU=OFF -DGGML_SYCL=ON ^
-DLLAMA_CURL=OFF
cmake --build build --target ggml-sycl -j
- name: Build the release package
id: pack_artifacts
run: |
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
echo "cp oneAPI running time dll files to ./build/bin done"
7z a llama-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload the release package
uses: actions/upload-artifact@v4
with:
path: llama-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip
windows-hip:
runs-on: windows-latest
strategy:
matrix:
include:
- name: "radeon"
gpu_targets: "gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Clone rocWMMA repository
id: clone_rocwmma
run: |
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: windows-latest-cmake-hip-${{ matrix.name }}-x64
evict-old-files: 1d
- name: Install
id: depends
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP SDK installation"
- name: Verify ROCm
id: verify
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- name: Build
id: cmake_build
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
-DCMAKE_BUILD_TYPE=Release `
-DGGML_BACKEND_DL=ON `
-DGGML_NATIVE=OFF `
-DGGML_CPU=OFF `
-DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGGML_HIP=ON `
-DLLAMA_CURL=OFF
cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
- name: Pack artifacts
id: pack_artifacts
run: |
7z a llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-bin-win-hip-${{ matrix.name }}-x64.zip
name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
ios-xcode-build:
runs-on: macos-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Build
id: cmake_build
run: |
sysctl -a
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=iOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
- name: xcodebuild for swift package
id: xcodebuild
run: |
./build-xcframework.sh
- name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
name: llama-${{ steps.tag.outputs.name }}-xcframework
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
# Fine-grained permission
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
permissions:
contents: write # for creating release
runs-on: ubuntu-latest
needs:
- windows
- windows-cpu
- windows-cuda
- windows-sycl
- windows-hip
- ubuntu-22-cpu
- ubuntu-22-vulkan
- macOS-arm64
- macOS-x64
- ios-xcode-build
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v4
with:
path: ./artifact
merge-multiple: true
- name: Move artifacts
id: move_artifacts
run: |
mkdir -p release
echo "Adding CPU backend files to existing zips..."
for arch in x64 arm64; do
cpu_zip="artifact/llama-bin-win-cpu-${arch}.zip"
temp_dir=$(mktemp -d)
echo "Extracting CPU backend for $arch..."
unzip "$cpu_zip" -d "$temp_dir"
echo "Adding CPU files to $arch zips..."
for target_zip in artifact/llama-bin-win-*-${arch}.zip; do
if [[ "$target_zip" == "$cpu_zip" ]]; then
continue
fi
echo "Adding CPU backend to $(basename "$target_zip")"
realpath_target_zip=$(realpath "$target_zip")
(cd "$temp_dir" && zip -r "$realpath_target_zip" .)
done
rm -rf "$temp_dir"
done
echo "Renaming and moving zips to release..."
for zip_file in artifact/llama-bin-win-*.zip; do
base_name=$(basename "$zip_file" .zip)
zip_name="llama-${{ steps.tag.outputs.name }}-${base_name#llama-}.zip"
echo "Moving $zip_file to release/$zip_name"
mv "$zip_file" "release/$zip_name"
done
echo "Moving other artifacts..."
mv -v artifact/*.zip release
- name: Create release
id: create_release
uses: ggml-org/action-create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ steps.tag.outputs.name }}
- name: Upload release
id: upload_release
uses: actions/github-script@v3
with:
github-token: ${{secrets.GITHUB_TOKEN}}
script: |
const path = require('path');
const fs = require('fs');
const release_id = '${{ steps.create_release.outputs.id }}';
for (let file of await fs.readdirSync('./release')) {
if (path.extname(file) === '.zip') {
console.log('uploadReleaseAsset', file);
await github.repos.uploadReleaseAsset({
owner: context.repo.owner,
repo: context.repo.repo,
release_id: release_id,
name: file,
data: await fs.readFileSync(`./release/${file}`)
});
}
}


@@ -15,10 +15,10 @@ on:
push:
branches:
- master
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
env:
LLAMA_LOG_COLORS: 1
@@ -74,7 +74,7 @@ jobs:
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt
pip install -r examples/server/tests/requirements.txt
# Setup nodejs (to be used for verifying bundled index.html)
- uses: actions/setup-node@v4
@@ -84,14 +84,14 @@ jobs:
- name: WebUI - Install dependencies
id: webui_lint
run: |
cd tools/server/webui
cd examples/server/webui
npm ci
- name: WebUI - Check code format
id: webui_format
run: |
git config --global --add safe.directory $(realpath .)
cd tools/server/webui
cd examples/server/webui
git status
npm run format
@@ -108,7 +108,7 @@ jobs:
id: verify_server_index_html
run: |
git config --global --add safe.directory $(realpath .)
cd tools/server/webui
cd examples/server/webui
git status
npm run build
@@ -161,26 +161,26 @@ jobs:
env:
GITHUB_ACTIONS: "true"
run: |
cd tools/server/tests
cd examples/server/tests
./tests.sh
- name: Tests (sanitizers)
id: server_integration_tests_sanitizers
if: ${{ matrix.sanitizer != '' }}
run: |
cd tools/server/tests
cd examples/server/tests
LLAMA_SANITIZE=1 ./tests.sh
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd tools/server/tests
cd examples/server/tests
SLOW_TESTS=1 ./tests.sh
server-windows:
runs-on: windows-2022
runs-on: windows-2019
steps:
- name: Clone
@@ -211,7 +211,7 @@ jobs:
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r tools/server/tests/requirements.txt
pip install -r examples/server/tests/requirements.txt
- name: Copy Libcurl
id: prepare_libcurl
@@ -224,7 +224,7 @@ jobs:
id: server_integration_tests
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
run: |
cd tools/server/tests
cd examples/server/tests
$env:PYTHONIOENCODING = ":replace"
pytest -v -x -m "not slow"
@@ -232,6 +232,6 @@ jobs:
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
run: |
cd tools/server/tests
cd examples/server/tests
$env:SLOW_TESTS = "1"
pytest -v -x


@@ -1,42 +0,0 @@
name: Update Winget Package
on:
workflow_dispatch: # allows manual triggering
schedule:
- cron: '28 5 * * *' # Update every day at 5:28 UTC
jobs:
update:
name: Update Winget Package
runs-on: ubuntu-latest
steps:
- name: Install cargo binstall
uses: cargo-bins/cargo-binstall@268643a6b5ea099f5718ee5cd3ff7dc89a5eb49b
- name: Install komac
run: |
cargo binstall komac@2.11.2 -y
- name: Find latest release
id: find_latest_release
uses: actions/github-script@v6
with:
script: |
const { data: releases } = await github.rest.repos.listReleases({
owner: context.repo.owner,
repo: context.repo.repo,
});
console.log("Latest release:", releases[0].tag_name);
return releases[0].tag_name;
- name: Update manifest
env:
VERSION: ${{ steps.find_latest_release.outputs.result }}
run: |
echo "Updating manifest..."
komac update --version ${{ env.VERSION }} \
--urls "https://github.com/ggml-org/llama.cpp/releases/download/${{ env.VERSION }}/llama-${{ env.VERSION }}-bin-win-vulkan-x64.zip" \
--token ${{ secrets.WINGET_GITHUB_TOKEN }} \
--submit \
ggml.llamacpp

.gitignore

@@ -96,11 +96,11 @@ perf-*.txt
# Examples
examples/jeopardy/results.txt
tools/server/*.css.hpp
tools/server/*.html.hpp
tools/server/*.js.hpp
tools/server/*.mjs.hpp
tools/server/*.gz.hpp
examples/server/*.css.hpp
examples/server/*.html.hpp
examples/server/*.js.hpp
examples/server/*.mjs.hpp
examples/server/*.gz.hpp
!build_64.sh
!examples/*.bat
!examples/*/*.kts
@@ -110,7 +110,7 @@ tools/server/*.gz.hpp
# Server Web UI temporary files
node_modules
tools/server/webui/dist
examples/server/webui/dist
# Python


@@ -77,7 +77,6 @@ option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE
# extra artifacts
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
@@ -159,11 +158,6 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
if (MINGW)
# Target Windows 8 for PrefetchVirtualMemory
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
endif()
#
# build the library
#
@@ -193,10 +187,6 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
add_subdirectory(pocs)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
add_subdirectory(tools)
endif()
#
# install
#
@@ -257,3 +247,20 @@ configure_file(cmake/llama.pc.in
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
#
# copy the license files
#
# Check if running in GitHub Actions
if(DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
message(STATUS "Running inside GitHub Actions - copying license files")
# Copy all files from licenses/ to build/bin/
file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
foreach(LICENSE_FILE ${LICENSE_FILES})
get_filename_component(FILENAME ${LICENSE_FILE} NAME)
configure_file(${LICENSE_FILE} "${CMAKE_BINARY_DIR}/bin/${FILENAME}" COPYONLY)
endforeach()
endif()

View File

@@ -38,6 +38,15 @@
}
},
{
"name": "arm64-windows-msvc", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
"toolset": { "value": "host=x64", "strategy": "external" },
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
}
},
{
"name": "arm64-windows-llvm", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
@@ -64,6 +73,10 @@
{ "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
{ "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
{ "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
{ "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
{ "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
{ "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },

View File

@@ -2,7 +2,7 @@
/ci/ @ggerganov
/.devops/*.Dockerfile @ngxson
/tools/server/ @ngxson
/examples/server/ @ngxson
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
/ggml/src/ggml-cuda/mmv.* @JohannesGaessler

Makefile
View File

@@ -367,7 +367,7 @@ ifdef LLAMA_SERVER_SSL
endif
ifndef GGML_NO_CPU_AARCH64
MK_CPPFLAGS += -DGGML_USE_CPU_REPACK
MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
endif
# warnings
@@ -780,6 +780,10 @@ ifdef GGML_HIP
MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
ifdef GGML_HIP_UMA
MK_CPPFLAGS += -DGGML_HIP_UMA
endif # GGML_HIP_UMA
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
@@ -970,7 +974,7 @@ OBJ_GGML = \
$(DIR_GGML)/src/ggml-threading.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
$(DIR_GGML)/src/ggml-cpu/repack.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
@@ -1156,10 +1160,10 @@ $(LIB_COMMON_S): $(OBJ_COMMON)
# Clean generated server assets
clean-server-assets:
find tools/server -type f -name "*.js.hpp" -delete
find tools/server -type f -name "*.mjs.hpp" -delete
find tools/server -type f -name "*.css.hpp" -delete
find tools/server -type f -name "*.html.hpp" -delete
find examples/server -type f -name "*.js.hpp" -delete
find examples/server -type f -name "*.mjs.hpp" -delete
find examples/server -type f -name "*.css.hpp" -delete
find examples/server -type f -name "*.html.hpp" -delete
# Clean rule
clean: clean-server-assets
@@ -1179,7 +1183,7 @@ clean: clean-server-assets
# Helper function that replaces .c, .cpp, and .cu file endings with .o:
GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
llama-cli: tools/main/main.cpp \
llama-cli: examples/main/main.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1187,7 +1191,12 @@ llama-cli: tools/main/main.cpp \
@echo '==== Run ./llama-cli -h for help. ===='
@echo
llama-run: tools/run/run.cpp \
llama-infill: examples/infill/infill.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-run: examples/run/run.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1202,7 +1211,7 @@ llama-simple-chat: examples/simple-chat/simple-chat.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-tokenize: tools/tokenize/tokenize.cpp \
llama-tokenize: examples/tokenize/tokenize.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1212,27 +1221,27 @@ llama-batched: examples/batched/batched.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-batched-bench: tools/batched-bench/batched-bench.cpp \
llama-batched-bench: examples/batched-bench/batched-bench.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-quantize: tools/quantize/quantize.cpp \
llama-quantize: examples/quantize/quantize.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-quantize-stats: tools/quantize-stats/quantize-stats.cpp \
llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-perplexity: tools/perplexity/perplexity.cpp \
llama-perplexity: examples/perplexity/perplexity.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-imatrix: tools/imatrix/imatrix.cpp \
llama-imatrix: examples/imatrix/imatrix.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1274,7 +1283,7 @@ llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/s
$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-gguf-split: tools/gguf-split/gguf-split.cpp \
llama-gguf-split: examples/gguf-split/gguf-split.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1284,7 +1293,7 @@ llama-eval-callback: examples/eval-callback/eval-callback.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-cvector-generator: tools/cvector-generator/cvector-generator.cpp \
llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1294,12 +1303,12 @@ llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-bench: tools/llama-bench/llama-bench.cpp \
llama-bench: examples/llama-bench/llama-bench.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-export-lora: tools/export-lora/export-lora.cpp \
llama-export-lora: examples/export-lora/export-lora.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1355,17 +1364,17 @@ llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
ifdef GGML_RPC
rpc-server: tools/rpc/rpc-server.cpp \
rpc-server: examples/rpc/rpc-server.cpp \
$(OBJ_GGML)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
endif # GGML_RPC
llama-server: \
tools/server/server.cpp \
tools/server/utils.hpp \
tools/server/httplib.h \
tools/server/index.html.hpp \
tools/server/loading.html.hpp \
examples/server/server.cpp \
examples/server/utils.hpp \
examples/server/httplib.h \
examples/server/index.html.hpp \
examples/server/loading.html.hpp \
common/chat.cpp \
common/chat.h \
common/chat-template.hpp \
@@ -1373,10 +1382,10 @@ llama-server: \
common/minja.hpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Itools/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
# Portable equivalent of `cd tools/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
tools/server/%.hpp: tools/server/public/% FORCE Makefile
# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
examples/server/%.hpp: examples/server/public/% FORCE Makefile
@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
echo "unsigned char $${NAME}[] = {" && \
cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
@@ -1389,36 +1398,36 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
libllava.a: tools/mtmd/llava.cpp \
tools/mtmd/llava.h \
tools/mtmd/clip.cpp \
tools/mtmd/clip.h \
libllava.a: examples/llava/llava.cpp \
examples/llava/llava.h \
examples/llava/clip.cpp \
examples/llava/clip.h \
common/stb_image.h \
common/base64.hpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
llama-llava-cli: tools/mtmd/llava-cli.cpp \
tools/mtmd/llava.cpp \
tools/mtmd/llava.h \
tools/mtmd/clip.cpp \
tools/mtmd/clip.h \
llama-llava-cli: examples/llava/llava-cli.cpp \
examples/llava/llava.cpp \
examples/llava/llava.h \
examples/llava/clip.cpp \
examples/llava/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
llama-minicpmv-cli: tools/mtmd/minicpmv-cli.cpp \
tools/mtmd/llava.cpp \
tools/mtmd/llava.h \
tools/mtmd/clip.cpp \
tools/mtmd/clip.h \
llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
examples/llava/llava.cpp \
examples/llava/llava.h \
examples/llava/clip.cpp \
examples/llava/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
llama-qwen2vl-cli: tools/mtmd/qwen2vl-cli.cpp \
tools/mtmd/llava.cpp \
tools/mtmd/llava.h \
tools/mtmd/clip.cpp \
tools/mtmd/clip.h \
llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
examples/llava/llava.cpp \
examples/llava/llava.h \
examples/llava/clip.cpp \
examples/llava/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
@@ -1475,12 +1484,12 @@ tests/test-double-float: tests/test-double-float.cpp
tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -Itools/server -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-chat: tests/test-chat.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -Itools/server -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-opt: tests/test-opt.cpp \

View File

@@ -3,7 +3,6 @@
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
[![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
@@ -17,10 +16,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
## Hot topics
- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)); `libllava` will be deprecated
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
@@ -29,30 +26,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
----
## Quick start
Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
- Run with Docker - see our [Docker documentation](docs/docker.md)
- Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
- Build from source by cloning this repository - check out [our build guide](docs/build.md)
Once installed, you'll need a model to work with. Head to the [Obtaining and quantizing models](#obtaining-and-quantizing-models) section to learn more.
Example command:
```sh
# Use a local model file
llama-cli -m my_model.gguf
# Or download and run a model directly from Hugging Face
llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
# Launch OpenAI-compatible API server
llama-server -hf ggml-org/gemma-3-1b-it-GGUF
```
## Description
The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
@@ -62,7 +35,7 @@ range of hardware - locally and in the cloud.
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
- AVX, AVX2, AVX512 and AMX support for x86 architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
- Vulkan and SYCL backend support
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
@@ -155,7 +128,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
<details>
<summary>Bindings</summary>
- Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama)
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
@@ -255,7 +227,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
</details>
## Supported backends
| Backend | Target devices |
@@ -264,13 +235,23 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [BLAS](docs/build.md#blas-build) | All |
| [BLIS](docs/backend/BLIS.md) | All |
| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
| [MUSA](docs/build.md#musa) | Moore Threads GPU |
| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
| [CUDA](docs/build.md#cuda) | Nvidia GPU |
| [HIP](docs/build.md#hip) | AMD GPU |
| [Vulkan](docs/build.md#vulkan) | GPU |
| [CANN](docs/build.md#cann) | Ascend NPU |
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) | All |
## Building the project
The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
- Clone this repository and build locally, see [how to build](docs/build.md)
- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
- Use a Docker image, see [documentation for Docker](docs/docker.md)
- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
## Obtaining and quantizing models
@@ -279,11 +260,7 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible model from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
```sh
llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
```
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible model from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
By default, the CLI downloads from Hugging Face; you can switch to other endpoints with the environment variable `MODEL_ENDPOINT`. For example, you may opt to download model checkpoints from ModelScope or other model-sharing communities by setting it, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.
@@ -298,9 +275,9 @@ The Hugging Face platform provides a variety of online tools for converting, qua
- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggml-org/llama.cpp/discussions/9268)
- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggml-org/llama.cpp/discussions/9669)
To learn more about model quantization, [read this documentation](tools/quantize/README.md)
To learn more about model quantization, [read this documentation](examples/quantize/README.md)
## [`llama-cli`](tools/main)
## [`llama-cli`](examples/main)
#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
@@ -363,7 +340,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
</details>
## [`llama-server`](tools/server)
## [`llama-server`](examples/server)
#### A lightweight, [OpenAI API](https://github.com/openai/openai-openapi) compatible, HTTP server for serving LLMs.
@@ -433,7 +410,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
</details>
## [`llama-perplexity`](tools/perplexity)
## [`llama-perplexity`](examples/perplexity)
#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.
@@ -458,10 +435,10 @@ To learn more about model quantization, [read this documentation](tools/quantize
</details>
[^1]: [tools/perplexity/README.md](./tools/perplexity/README.md)
[^1]: [examples/perplexity/README.md](./examples/perplexity/README.md)
[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
## [`llama-bench`](tools/llama-bench)
## [`llama-bench`](examples/llama-bench)
#### Benchmark the performance of the inference for various parameters.
@@ -482,7 +459,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
</details>
## [`llama-run`](tools/run)
## [`llama-run`](examples/run)
#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
@@ -526,8 +503,8 @@ To learn more about model quantization, [read this documentation](tools/quantize
## Other documentation
- [main (cli)](tools/main/README.md)
- [server](tools/server/README.md)
- [main (cli)](examples/main/README.md)
- [server](examples/server/README.md)
- [GBNF grammars](grammars/README.md)
#### Development documentation
@@ -593,12 +570,4 @@ automatically. For example:
$ echo "source ~/.llama-completion.bash" >> ~/.bashrc
```
## Dependencies
- [yhirose/cpp-httplib](https://github.com/yhirose/cpp-httplib) - Single-header HTTP server, used by `llama-server` - MIT license
- [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
- [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
## References

View File

@@ -40,8 +40,7 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru
### Untrusted environments or networks
If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value
* Encrypt your data if sending it over the network.
### Multi-Tenant environments

View File

@@ -8,7 +8,6 @@ TVOS_MIN_OS_VERSION=16.4
BUILD_SHARED_LIBS=OFF
LLAMA_BUILD_EXAMPLES=OFF
LLAMA_BUILD_TOOLS=OFF
LLAMA_BUILD_TESTS=OFF
LLAMA_BUILD_SERVER=OFF
GGML_METAL=ON
@@ -32,7 +31,6 @@ COMMON_CMAKE_ARGS=(
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
-DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
-DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
-DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
-DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
-DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
@@ -117,7 +115,6 @@ setup_framework_structure() {
# Copy all required headers (common for all platforms)
cp include/llama.h ${header_path}
cp ggml/include/ggml.h ${header_path}
cp ggml/include/ggml-opt.h ${header_path}
cp ggml/include/ggml-alloc.h ${header_path}
cp ggml/include/ggml-backend.h ${header_path}
cp ggml/include/ggml-metal.h ${header_path}

View File

@@ -54,7 +54,7 @@ docker run --privileged -it \
-v $HOME/llama.cpp/ci-cache:/ci-cache \
-v $HOME/llama.cpp/ci-results:/ci-results \
-v $PWD:/ws -w /ws \
mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
mthreads/musa:rc3.1.1-devel-ubuntu22.04
```
Inside the container, execute the following commands:

View File

@@ -46,20 +46,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
fi
if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON"
if command -v nvidia-smi >/dev/null 2>&1; then
CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')
if [[ -n "$CUDA_ARCH" && "$CUDA_ARCH" =~ ^[0-9]+$ ]]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH}"
else
echo "Warning: Using fallback CUDA architectures"
CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=61;70;75;80;86;89"
fi
else
echo "Error: nvidia-smi not found, cannot build with CUDA"
exit 1
fi
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -200,8 +187,8 @@ function gg_run_test_scripts_debug {
set -e
(cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./tools/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
set +e
}
@@ -224,8 +211,8 @@ function gg_run_test_scripts_release {
set -e
(cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./tools/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
set +e
}

View File

@@ -0,0 +1,6 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )
set( target arm64-pc-windows-msvc )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

View File

@@ -41,20 +41,14 @@ endif()
if(MSVC)
set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
if (CMAKE_VS_PLATFORM_NAME)
set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
else()
set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
endif()
set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
else()
execute_process(
COMMAND ${CMAKE_C_COMPILER} --version
COMMAND sh -c "\"$@\" --version | head -1" _ ${CMAKE_C_COMPILER}
OUTPUT_VARIABLE OUT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
string(REGEX REPLACE " *\n.*" "" OUT "${OUT}")
set(BUILD_COMPILER ${OUT})
execute_process(
COMMAND ${CMAKE_C_COMPILER} -dumpmachine
OUTPUT_VARIABLE OUT

View File

@@ -3,3 +3,9 @@ set( CMAKE_SYSTEM_PROCESSOR x86_64 )
set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )
set( arch_c_flags "-march=native" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )

View File

@@ -39,9 +39,7 @@ add_custom_command(
COMMENT "Generating build details from Git"
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
-P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
VERBATIM
@@ -58,24 +56,21 @@ add_library(${TARGET} STATIC
arg.cpp
arg.h
base64.hpp
chat-parser.cpp
chat-parser.h
chat.cpp
chat.h
common.cpp
common.h
console.cpp
console.h
json-partial.cpp
json-partial.h
json-schema-to-grammar.cpp
json.hpp
llguidance.cpp
log.cpp
log.h
minja/chat-template.hpp
minja/minja.hpp
ngram-cache.cpp
ngram-cache.h
regex-partial.cpp
regex-partial.h
sampling.cpp
sampling.h
speculative.cpp
@@ -122,8 +117,8 @@ if (LLAMA_LLGUIDANCE)
ExternalProject_Add(llguidance_ext
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
# v0.7.20 (+ fix to build on GCC 15):
GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
# v0.7.10:
GIT_TAG 0309d2a6bf40abda35344a362edc71e06d5009f8
PREFIX ${CMAKE_BINARY_DIR}/llguidance
SOURCE_DIR ${LLGUIDANCE_SRC}
BUILD_IN_SOURCE TRUE
@@ -144,30 +139,6 @@ if (LLAMA_LLGUIDANCE)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
endif ()
target_include_directories(${TARGET} PUBLIC . ../vendor)
target_include_directories(${TARGET} PUBLIC .)
target_compile_features (${TARGET} PUBLIC cxx_std_17)
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
#
# copy the license files
#
# Check if running in GitHub Actions
if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
message(STATUS "Running inside GitHub Actions - copying license files")
# Copy all files from licenses/ to build/bin/
file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
foreach(LICENSE_FILE ${LICENSE_FILES})
get_filename_component(FILENAME ${LICENSE_FILE} NAME)
add_custom_command(
POST_BUILD
TARGET ${TARGET}
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${LICENSE_FILE}"
"$<TARGET_FILE_DIR:llama>/${FILENAME}"
COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
endforeach()
endif()

File diff suppressed because it is too large

View File

@@ -78,12 +78,3 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
// function to be used by test-arg-parser
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
bool common_has_curl();
struct common_remote_params {
std::vector<std::string> headers;
long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
};
// get remote file content, returns <http_code, raw_response_body>
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);

View File

@@ -1,380 +0,0 @@
#include "chat-parser.h"
#include "common.h"
#include "log.h"
#include "regex-partial.h"
#include <optional>
#include <stdexcept>
#include <string>
#include <vector>
using json = nlohmann::ordered_json;
common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax)
: input_(input), is_partial_(is_partial), syntax_(syntax)
{
result_.role = "assistant";
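// Pick a "healing marker": a random string that is guaranteed not to occur
// in the input, so it can later be appended to truncated JSON to make it
// parseable and then located and stripped again (see the json helpers below).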
while (true) {
std::string id = std::to_string(std::rand());
if (input.find(id) == std::string::npos) {
healing_marker_ = id;
break;
}
}
}
std::string common_chat_msg_parser::str(const common_string_range & rng) const {
GGML_ASSERT(rng.begin <= rng.end);
return input_.substr(rng.begin, rng.end - rng.begin);
}
void common_chat_msg_parser::add_content(const std::string &content) {
result_.content += content;
}
void common_chat_msg_parser::add_reasoning_content(const std::string &reasoning_content) {
result_.reasoning_content += reasoning_content;
}
bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::string & id, const std::string & arguments) {
if (name.empty()) {
return false;
}
common_chat_tool_call tool_call;
tool_call.name = name;
tool_call.arguments = arguments;
tool_call.id = id;
// LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
result_.tool_calls.emplace_back(tool_call);
return true;
}
bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
return add_tool_call(name, id, arguments);
}
bool common_chat_msg_parser::add_tool_calls(const json & arr) {
for (const auto & item : arr) {
if (!add_tool_call(item)) {
return false;
}
}
return true;
}
void common_chat_msg_parser::finish() {
if (!is_partial_ && pos_ != input_.size()) {
throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
}
}
bool common_chat_msg_parser::consume_spaces() {
const auto length = input_.size();
auto consumed = false;
while (pos_ < length && std::isspace(input_[pos_])) {
++pos_;
consumed = true;
}
return consumed;
}
bool common_chat_msg_parser::try_consume_literal(const std::string & literal) {
auto pos = pos_;
for (auto i = 0u; i < literal.size(); ++i) {
if (pos >= input_.size()) {
return false;
}
if (input_[pos] != literal[i]) {
return false;
}
++pos;
}
pos_ = pos;
return true;
}
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_literal(const std::string & literal) {
auto idx = input_.find(literal, pos_);
if (idx != std::string::npos) {
find_regex_result res;
res.prelude = input_.substr(pos_, idx - pos_);
auto end = idx + literal.size();
res.groups.emplace_back(common_string_range{idx, end});
move_to(end);
return res;
}
if (is_partial_) {
idx = string_find_partial_stop(input_, literal);
if (idx != std::string::npos && idx >= pos_) {
find_regex_result res;
res.prelude = input_.substr(pos_, idx - pos_);
auto end = input_.size();
res.groups.emplace_back(common_string_range{idx, end});
move_to(end);
return res;
}
}
return std::nullopt;
}
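// Usage sketch for the partial-match behavior above (hypothetical input):
//   common_chat_msg_parser p("planning...</thi", /* is_partial */ true, {});
//   auto res = p.try_find_literal("</think>");
//   // res->prelude == "planning..." and the parser consumed to the end of
//   // the input, since the trailing "</thi" may still become the literal.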
void common_chat_msg_parser::consume_literal(const std::string & literal) {
if (!try_consume_literal(literal)) {
throw common_chat_msg_partial_exception(literal);
}
}
bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
auto stripped_reasoning = string_strip(reasoning);
if (stripped_reasoning.empty()) {
return;
}
if (syntax_.reasoning_in_content) {
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "<think>" : start_think);
add_content(stripped_reasoning);
if (closed) {
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
}
} else {
add_reasoning_content(stripped_reasoning);
}
};
if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
if (auto res = try_find_literal(end_think)) {
handle_reasoning(res->prelude, /* closed */ true);
consume_spaces();
return true;
}
auto rest = consume_rest();
if (!rest.empty()) {
handle_reasoning(rest, /* closed */ !is_partial());
}
// Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
// if (!syntax_.thinking_forced_open) {
// throw common_chat_msg_partial_exception(end_think);
// }
return true;
}
}
return false;
}
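// Example: with a non-none reasoning_format and input
// "<think>plan steps</think> final answer", this consumes through "</think>",
// stores "plan steps" in the reasoning content, and leaves pos_ at
// "final answer" for the caller to handle as regular content.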
std::string common_chat_msg_parser::consume_rest() {
auto rest = input_.substr(pos_);
pos_ = input_.size();
return rest;
}
// Tries to find the regex, consumes it (pos_ right after it) and returns the prelude (right before it) and the groups.
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from, bool add_prelude_to_content) {
auto m = regex.search(input_, from == std::string::npos ? pos_ : from);
if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
return std::nullopt;
}
auto prelude = input_.substr(pos_, m.groups[0].begin - pos_);
pos_ = m.groups[0].end;
if (add_prelude_to_content) {
add_content(prelude);
}
if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
if (is_partial()) {
throw common_chat_msg_partial_exception(regex.str());
}
return std::nullopt;
}
return find_regex_result{prelude, m.groups};
}
common_chat_msg_parser::find_regex_result common_chat_msg_parser::consume_regex(const common_regex & regex) {
if (auto result = try_consume_regex(regex)) {
return *result;
}
throw common_chat_msg_partial_exception(regex.str());
}
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_consume_regex(const common_regex & regex) {
auto m = regex.search(input_, pos_);
if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
return std::nullopt;
}
if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
if (is_partial()) {
throw common_chat_msg_partial_exception(regex.str());
}
return std::nullopt;
}
if (m.groups[0].begin != pos_) {
// Didn't match at the current position.
return std::nullopt;
}
pos_ = m.groups[0].end;
return find_regex_result {
/* .prelude = */ "",
m.groups,
};
}
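// Parse (possibly truncated) JSON starting at pos_. When the input is cut off,
// common_json_parse "heals" it with healing_marker_ so that it still parses,
// recording where the marker landed; a healed result is only accepted when
// this is a partial parse, otherwise it is an error.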
std::optional<common_json> common_chat_msg_parser::try_consume_json() {
auto it = input_.cbegin() + pos_;
const auto end = input_.cend();
common_json result;
if (!common_json_parse(it, end, healing_marker_, result)) {
return std::nullopt;
}
pos_ = std::distance(input_.cbegin(), it);
if (result.healing_marker.marker.empty()) {
// No healing marker, just return the parsed json
return result;
}
if (!is_partial()) {
throw common_chat_msg_partial_exception("JSON");
}
return result;
}
common_json common_chat_msg_parser::consume_json() {
if (auto result = try_consume_json()) {
return *result;
}
throw common_chat_msg_partial_exception("JSON");
}
common_chat_msg_parser::consume_json_result common_chat_msg_parser::consume_json_with_dumped_args(
const std::vector<std::vector<std::string>> & args_paths,
const std::vector<std::vector<std::string>> & content_paths
) {
if (auto result = try_consume_json_with_dumped_args(args_paths, content_paths)) {
return *result;
}
throw common_chat_msg_partial_exception("JSON");
}
std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parser::try_consume_json_with_dumped_args(
const std::vector<std::vector<std::string>> & args_paths,
const std::vector<std::vector<std::string>> & content_paths
) {
auto partial = try_consume_json();
if (!partial) {
return std::nullopt;
}
auto is_arguments_path = [&](const std::vector<std::string> & path) {
return std::find(args_paths.begin(), args_paths.end(), path) != args_paths.end();
};
auto is_content_path = [&](const std::vector<std::string> & path) {
return std::find(content_paths.begin(), content_paths.end(), path) != content_paths.end();
};
if (partial->healing_marker.marker.empty()) {
if (args_paths.empty()) {
// No arguments to dump, and JSON was parsed fully.
return consume_json_result {
partial->json,
/* .is_partial = */ false,
};
}
if (is_arguments_path({})) {
// Entire JSON is the arguments and was parsed fully.
return consume_json_result {
partial->json.dump(),
/* .is_partial = */ false,
};
}
}
LOG_DBG("Parsed partial JSON: %s (json_healing_marker: %s)\n", partial->json.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
auto found_healing_marker = false;
std::vector<std::string> path;
std::function<json(const json &)> remove_unsupported_healings_and_dump_args = [&](const json & j) -> json {
if (is_arguments_path(path)) {
auto arguments = j.dump();
if (is_partial() && !partial->healing_marker.marker.empty()) {
auto idx = arguments.find(partial->healing_marker.json_dump_marker);
if (idx != std::string::npos) {
arguments.resize(idx);
found_healing_marker = true;
}
if (arguments == "\"") {
// This happens because of completing `:"$magic` after `"arguments"`
arguments = "";
}
}
return arguments;
}
if (is_content_path(path)) {
if (!j.is_string()) {
throw std::runtime_error("Content path must be a string");
}
std::string str = j;
auto idx = str.find(partial->healing_marker.marker); // not using json_dump_marker as we're inside a string
if (idx != std::string::npos) {
str.resize(idx);
found_healing_marker = true;
}
return str;
}
if (j.is_object()) {
auto obj = json::object();
for (const auto & p : j.items()) {
const auto & key = p.key();
const auto & value = p.value();
const std::string key_str = key; // NOLINT
auto idx = key_str.find(healing_marker_);
if (idx != std::string::npos) {
found_healing_marker = true;
break;
}
path.push_back(key_str);
if (value.is_string()) {
const std::string value_str = value;
if (value_str.find(healing_marker_) != std::string::npos) {
found_healing_marker = true;
if (is_content_path(path)) {
if (partial->healing_marker.marker == partial->healing_marker.json_dump_marker) {
// The healing occurred inside the string: good. Otherwise we just ditch the entire key/value pair.
obj[key] = remove_unsupported_healings_and_dump_args(value);
}
}
break;
}
obj[key] = value;
} else {
obj[key] = remove_unsupported_healings_and_dump_args(value);
}
path.pop_back();
}
return obj;
}
if (j.is_array()) {
auto arr = json::array();
for (const auto & value : j) {
if (value.is_string()) {
std::string str = value;
auto idx = str.find(healing_marker_);
if (idx != std::string::npos) {
// Don't heal array values that aren't in the arguments.
found_healing_marker = true;
break;
}
}
arr.push_back(remove_unsupported_healings_and_dump_args(value));
}
return arr;
}
return j;
};
auto cleaned = remove_unsupported_healings_and_dump_args(partial->json);
LOG_DBG("Cleaned up JSON %s to %s (json_healing_marker : '%s')\n", partial->json.dump().c_str(), cleaned.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
return consume_json_result {
cleaned,
/* .is_partial = */ found_healing_marker,
};
}

View File

@@ -1,118 +0,0 @@
#pragma once
#include "chat.h"
#include "json-partial.h"
#include "regex-partial.h"
#include <nlohmann/json.hpp>
#include <optional>
#include <string>
#include <vector>
class common_chat_msg_partial_exception : public std::runtime_error {
public:
common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}
};
class common_chat_msg_parser {
std::string input_;
bool is_partial_;
common_chat_syntax syntax_;
std::string healing_marker_;
size_t pos_ = 0;
common_chat_msg result_;
public:
common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
const std::string & input() const { return input_; }
size_t pos() const { return pos_; }
const std::string & healing_marker() const { return healing_marker_; }
const bool & is_partial() const { return is_partial_; }
const common_chat_msg & result() const { return result_; }
const common_chat_syntax & syntax() const { return syntax_; }
void move_to(size_t pos) {
if (pos > input_.size()) {
throw std::runtime_error("Invalid position!");
}
pos_ = pos;
}
void move_back(size_t n) {
if (pos_ < n) {
throw std::runtime_error("Can't move back that far!");
}
pos_ -= n;
}
// Get the substring of the input at the given range
std::string str(const common_string_range & rng) const;
// Appends to the result.content field
void add_content(const std::string & content);
// Appends to the result.reasoning_content field
void add_reasoning_content(const std::string & reasoning_content);
// Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.
bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);
// Adds a tool call using the "name", "id" and "arguments" fields of the json object
bool add_tool_call(const nlohmann::ordered_json & tool_call);
// Adds an array of tool calls using their "name", "id" and "arguments" fields.
bool add_tool_calls(const nlohmann::ordered_json & arr);
void finish();
bool consume_spaces();
void consume_literal(const std::string & literal);
bool try_parse_reasoning(const std::string & start_think, const std::string & end_think);
std::string consume_rest();
struct find_regex_result {
std::string prelude;
std::vector<common_string_range> groups;
};
std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true);
bool try_consume_literal(const std::string & literal);
std::optional<find_regex_result> try_find_literal(const std::string & literal);
find_regex_result consume_regex(const common_regex & regex);
std::optional<find_regex_result> try_consume_regex(const common_regex & regex);
std::optional<common_json> try_consume_json();
common_json consume_json();
struct consume_json_result {
nlohmann::ordered_json value;
bool is_partial;
};
/*
Consumes (possibly partial) JSON and converts specific subtrees to (possibly truncated) JSON strings.
By default, object keys can't be truncated, nor can string values (their corresponding key is removed),
e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}`.
But one can allow subpaths to be kept truncated, and possibly json-dumped to truncated JSON strings:
- with `content_paths={{"foo"}}` -> `{"foo": "b` -> `{"foo": "b"}`
- with `args_paths={{"foo"}}` -> `{"foo": {"b` -> `{"foo": "{b"}`
*/
consume_json_result consume_json_with_dumped_args(
const std::vector<std::vector<std::string>> & args_paths = {},
const std::vector<std::vector<std::string>> & content_paths = {}
);
std::optional<consume_json_result> try_consume_json_with_dumped_args(
const std::vector<std::vector<std::string>> & args_paths = {},
const std::vector<std::vector<std::string>> & content_paths = {}
);
};
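// Sketch of the truncation handling documented above (hypothetical values):
//   common_chat_msg_parser p("{\"foo\": {\"b", /* is_partial */ true, {});
//   auto res = p.consume_json_with_dumped_args(/* args_paths */ {{"foo"}});
//   // res.is_partial == true and res.value matches the `{"foo": "{b"}` shape
//   // shown above: the subtree at "foo" was json-dumped and truncated at the
//   // healing marker.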

File diff suppressed because it is too large

View File

@@ -3,8 +3,6 @@
#pragma once
#include "common.h"
#include <functional>
#include <chrono>
#include <string>
#include <vector>
@@ -14,19 +12,11 @@ struct common_chat_tool_call {
std::string name;
std::string arguments;
std::string id;
bool operator==(const common_chat_tool_call & other) const {
return name == other.name && arguments == other.arguments && id == other.id;
}
};
struct common_chat_msg_content_part {
std::string type;
std::string text;
bool operator==(const common_chat_msg_content_part & other) const {
return type == other.type && text == other.text;
}
};
struct common_chat_msg {
@@ -37,51 +27,6 @@ struct common_chat_msg {
std::string reasoning_content;
std::string tool_name;
std::string tool_call_id;
template <class T> T to_json_oaicompat() const;
bool empty() const {
return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
}
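// Keep tool-call ids stable across successive (partial) parses of the same
// streamed message: reuse cached ids where available, caching either the
// parsed id or a freshly generated one for calls seen for the first time.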
void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
for (auto i = 0u; i < tool_calls.size(); i++) {
if (ids_cache.size() <= i) {
auto id = tool_calls[i].id;
if (id.empty()) {
id = gen_tool_call_id();
}
ids_cache.push_back(id);
}
tool_calls[i].id = ids_cache[i];
}
}
bool operator==(const common_chat_msg & other) const {
return role == other.role
&& content == other.content
&& content_parts == other.content_parts
&& tool_calls == other.tool_calls
&& reasoning_content == other.reasoning_content
&& tool_name == other.tool_name
&& tool_call_id == other.tool_call_id;
}
bool operator!=(const common_chat_msg & other) const {
return !(*this == other);
}
};
struct common_chat_msg_diff {
std::string reasoning_content_delta;
std::string content_delta;
size_t tool_call_index = std::string::npos;
common_chat_tool_call tool_call_delta;
static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
bool operator==(const common_chat_msg_diff & other) const {
return content_delta == other.content_delta
&& tool_call_index == other.tool_call_index
&& tool_call_delta == other.tool_call_delta;
}
};
struct common_chat_tool {
@@ -103,11 +48,14 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_LLAMA_3_X,
COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
COMMON_CHAT_FORMAT_DEEPSEEK_R1,
COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
COMMON_CHAT_FORMAT_HERMES_2_PRO,
COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
COMMON_CHAT_FORMAT_COMMAND_R7B,
COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
@@ -122,9 +70,7 @@ struct common_chat_templates_inputs {
std::vector<common_chat_tool> tools;
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
bool parallel_tool_calls = false;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
bool enable_thinking = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
bool extract_reasoning = true;
};
struct common_chat_params {
@@ -132,21 +78,11 @@ struct common_chat_params {
std::string prompt;
std::string grammar;
bool grammar_lazy = false;
bool thinking_forced_open = false;
std::vector<common_grammar_trigger> grammar_triggers;
std::vector<std::string> preserved_tokens;
std::vector<std::string> additional_stops;
};
struct common_chat_syntax {
common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
// Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
bool reasoning_in_content = false;
bool thinking_forced_open = false;
bool parse_tool_calls = true;
};
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
@@ -183,9 +119,8 @@ std::string common_chat_format_example(
const struct common_chat_templates * tmpls,
bool use_jinja);
const char* common_chat_format_name(common_chat_format format);
const char* common_reasoning_format_name(common_reasoning_format format);
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
std::string common_chat_format_name(common_chat_format format);
common_chat_msg common_chat_parse( const std::string & input, common_chat_format format);
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
@@ -198,5 +133,3 @@ template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common
// T can be std::string containing JSON or nlohmann::ordered_json
template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);

View File

@@ -203,7 +203,6 @@ bool set_process_priority(enum ggml_sched_priority prio) {
DWORD p = NORMAL_PRIORITY_CLASS;
switch (prio) {
case GGML_SCHED_PRIO_LOW: p = BELOW_NORMAL_PRIORITY_CLASS; break;
case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
@@ -229,7 +228,6 @@ bool set_process_priority(enum ggml_sched_priority prio) {
int p = 0;
switch (prio) {
case GGML_SCHED_PRIO_LOW: p = 5; break;
case GGML_SCHED_PRIO_NORMAL: p = 0; break;
case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
case GGML_SCHED_PRIO_HIGH: p = -10; break;
@@ -445,25 +443,6 @@ void string_replace_all(std::string & s, const std::string & search, const std::
s = std::move(builder);
}
bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
}
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
if (!str.empty() && !stop.empty()) {
const char text_last_char = str.back();
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
if (stop[char_index] == text_last_char) {
const auto current_partial = stop.substr(0, char_index + 1);
if (string_ends_with(str, current_partial)) {
return str.size() - char_index - 1;
}
}
}
}
return std::string::npos;
}
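// Usage sketch (hypothetical strings): when streaming, hold back any suffix
// that could still become a stop word until the match resolves.
//   size_t pos = string_find_partial_stop("Hello <|eo", "<|eot_id|>");
//   // pos == 6: the suffix "<|eo" is a prefix of the stop word, so the
//   // caller should withhold everything from position 6 onwards for now.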
std::string regex_escape(const std::string & s) {
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
return std::regex_replace(s, special_chars, "\\$0");
@@ -851,7 +830,7 @@ std::string fs_get_cache_directory() {
if (getenv("LLAMA_CACHE")) {
cache_directory = std::getenv("LLAMA_CACHE");
} else {
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
#ifdef __linux__
if (std::getenv("XDG_CACHE_HOME")) {
cache_directory = std::getenv("XDG_CACHE_HOME");
} else {
@@ -861,9 +840,7 @@ std::string fs_get_cache_directory() {
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
#elif defined(_WIN32)
cache_directory = std::getenv("LOCALAPPDATA");
#else
# error Unknown architecture
#endif
#endif // __linux__
cache_directory = ensure_trailing_slash(cache_directory);
cache_directory += "llama.cpp";
}
@@ -905,16 +882,13 @@ struct common_init_result common_init_from_params(common_params & params) {
ok = false;
}
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
if (!has_eos && !has_sep) {
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
ok = false;
} else if (!has_eos) {
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
} else if (!has_sep) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
}
if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
ok = false;
}
@@ -934,7 +908,7 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}
if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
params.ctx_shift = false;
}
@@ -1041,7 +1015,7 @@ struct common_init_result common_init_from_params(common_params & params) {
if (llama_model_has_decoder(model)) {
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
}
llama_memory_clear(llama_get_memory(lctx), true);
llama_kv_self_clear(lctx);
llama_synchronize(lctx);
llama_perf_context_reset(lctx);
llama_set_warmup(lctx, false);
@@ -1107,9 +1081,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
}
mparams.progress_callback = params.load_progress_callback;
mparams.progress_callback_user_data = params.load_progress_callback_user_data;
return mparams;
}
@@ -1123,6 +1094,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.n_threads = params.cpuparams.n_threads;
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
cparams.logits_all = params.logits_all;
cparams.embeddings = params.embedding;
cparams.rope_scaling_type = params.rope_scaling_type;
cparams.rope_freq_base = params.rope_freq_base;
@@ -1140,8 +1112,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.offload_kqv = !params.no_kv_offload;
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;
if (params.reranking) {
cparams.embeddings = true;
@@ -1334,6 +1304,81 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
return text;
}
//
// KV cache utils
//
void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
llama_kv_cache_view_cell * c_curr = view.cells;
llama_seq_id * cs_curr = view.cells_sequences;
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
if (i % row_size == 0) {
printf("\n%5d: ", i);
}
int seq_count = 0;
for (int j = 0; j < view.n_seq_max; j++) {
if (cs_curr[j] >= 0) { seq_count++; }
}
putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
}
printf("\n=== Done dumping\n");
}
void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
std::unordered_map<llama_seq_id, size_t> seqs;
llama_kv_cache_view_cell * c_curr = view.cells;
llama_seq_id * cs_curr = view.cells_sequences;
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
for (int j = 0; j < view.n_seq_max; j++) {
if (cs_curr[j] < 0) { continue; }
if (seqs.find(cs_curr[j]) == seqs.end()) {
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
const size_t sz = seqs.size();
seqs[cs_curr[j]] = sz;
}
}
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
}
printf("=== Sequence legend: ");
for (const auto & it : seqs) {
printf("%zu=%d, ", it.second, it.first);
}
printf("'+'=other sequence ids");
c_curr = view.cells;
cs_curr = view.cells_sequences;
for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
if (i % row_size == 0) {
printf("\n%5d: ", i);
}
for (int j = 0; j < view.n_seq_max; j++) {
if (cs_curr[j] >= 0) {
const auto & it = seqs.find(cs_curr[j]);
putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
} else {
putchar('.');
}
}
putchar(' ');
}
printf("\n=== Done dumping\n");
}
//
// Embedding utils
//
@@ -1518,20 +1563,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
return result;
}
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
const int64_t ne_datapoint = llama_n_ctx(ctx);
const int64_t ndata = (tokens.size() - ne_datapoint - 1) / stride;
ggml_opt_dataset_t result = ggml_opt_dataset_init(
GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);
llama_token * data = (llama_token *) ggml_opt_dataset_data(result)->data;
llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
for (int64_t idata = 0; idata < ndata; ++idata) {
memcpy(data + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
}
return result;
}
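// Editor's note (illustration, not part of the diff): with ne_datapoint = 4 and
// stride = 2 over tokens t0..t11, the loop above produces ndata = 3 windows:
//
//   idata 0: data = [t0 t1 t2 t3], labels = [t1 t2 t3 t4]
//   idata 1: data = [t2 t3 t4 t5], labels = [t3 t4 t5 t6]
//   idata 2: data = [t4 t5 t6 t7], labels = [t5 t6 t7 t8]
//
// i.e. labels are the inputs shifted by one position -- the usual next-token
// prediction target.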

View File

@@ -6,7 +6,6 @@
#include <set>
#include <string>
#include <string_view>
#include <vector>
#include <sstream>
@@ -67,6 +66,7 @@ enum llama_example {
LLAMA_EXAMPLE_COMMON,
LLAMA_EXAMPLE_SPECULATIVE,
LLAMA_EXAMPLE_MAIN,
LLAMA_EXAMPLE_INFILL,
LLAMA_EXAMPLE_EMBEDDING,
LLAMA_EXAMPLE_PERPLEXITY,
LLAMA_EXAMPLE_RETRIEVAL,
@@ -76,7 +76,7 @@ enum llama_example {
LLAMA_EXAMPLE_SERVER,
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
LLAMA_EXAMPLE_EXPORT_LORA,
LLAMA_EXAMPLE_MTMD,
LLAMA_EXAMPLE_LLAVA,
LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,
@@ -96,7 +96,6 @@ enum common_sampler_type {
COMMON_SAMPLER_TYPE_XTC = 8,
COMMON_SAMPLER_TYPE_INFILL = 9,
COMMON_SAMPLER_TYPE_PENALTIES = 10,
COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
};
// dimensionality reduction methods, used by cvector-generator
@@ -115,7 +114,7 @@ enum common_grammar_trigger_type {
COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
};
struct common_grammar_trigger {
@@ -162,7 +161,6 @@ struct common_params_sampling {
std::vector<enum common_sampler_type> samplers = {
COMMON_SAMPLER_TYPE_PENALTIES,
COMMON_SAMPLER_TYPE_DRY,
COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
COMMON_SAMPLER_TYPE_TOP_K,
COMMON_SAMPLER_TYPE_TYPICAL_P,
COMMON_SAMPLER_TYPE_TOP_P,
@@ -215,8 +213,7 @@ struct common_params_vocoder {
enum common_reasoning_format {
COMMON_REASONING_FORMAT_NONE,
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
};
struct common_params {
@@ -292,7 +289,6 @@ struct common_params {
int32_t verbosity = 0;
int32_t control_vector_layer_start = -1; // layer range for control vector
int32_t control_vector_layer_end = -1; // layer range for control vector
bool offline = false;
int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -325,17 +321,17 @@ struct common_params {
bool flash_attn = false; // flash attention
bool no_perf = false; // disable performance metrics
bool ctx_shift = true; // context shift on infinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool logits_all = false; // return logits for all tokens in the batch
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data
bool no_op_offload = false; // globally disable offload host tensor operations to device
bool single_turn = false; // single turn chat conversation
@@ -344,10 +340,8 @@ struct common_params {
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
// multimodal models (see tools/mtmd)
// multimodal models (see examples/llava)
struct common_params_model mmproj;
bool mmproj_use_gpu = true; // use GPU for multimodal model
bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s)
// embedding
@@ -370,8 +364,6 @@ struct common_params {
bool use_jinja = false; // NOLINT
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int reasoning_budget = -1;
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
std::vector<std::string> api_keys;
@@ -415,14 +407,13 @@ struct common_params {
bool process_output = false; // collect data for the output tensor
bool compute_ppl = true; // whether to compute perplexity
bool parse_special = false; // whether to parse special tokens during imatrix tokenization
// cvector-generator params
int n_pca_batch = 100;
int n_pca_iterations = 1000;
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
bool spm_infill = false; // suffix/prefix/middle pattern for infill
@@ -431,11 +422,6 @@ struct common_params {
// common params
std::string out_file; // output filename for all example programs
// optional callback for model loading progress and cancellation:
// called with a progress value between 0.0 and 1.0.
// return false from callback to abort model loading or true to continue
llama_progress_callback load_progress_callback = NULL;
void * load_progress_callback_user_data = NULL;
};
// call once at the start of a program if it uses libcommon
@@ -513,9 +499,10 @@ static bool string_starts_with(const std::string & str,
return str.rfind(prefix, 0) == 0;
}
// While we wait for C++20's std::string::ends_with...
bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
static bool string_ends_with(const std::string & str,
const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
}
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
@@ -624,6 +611,16 @@ std::string common_detokenize(
const std::vector<llama_token> & tokens,
bool special = true);
//
// KV cache utils
//
// Dump the KV cache view with the number of sequences per cell.
void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
// Dump the KV cache view showing individual sequences in each cell (long output).
void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
//
// Embedding utils
//
@@ -665,9 +662,3 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
}
//
// training utils
//
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
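// Editor's note: a hedged usage sketch for the KV cache dump helpers declared
// above, assuming the llama_kv_cache_view_* API from the same revision:
//
//   llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_seq_max=*/4);
//   llama_kv_cache_view_update(ctx, &view);
//   common_kv_cache_dump_view(view);      // one char per cell, 80 cells/row
//   common_kv_cache_dump_view_seqs(view); // per-sequence detail, 40 cells/row
//   llama_kv_cache_view_free(&view);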

View File

@@ -1,256 +0,0 @@
#include "json-partial.h"
#include "log.h"
#include <nlohmann/json.hpp>
#include <string>
using json = nlohmann::ordered_json;
enum common_json_stack_element_type {
COMMON_JSON_STACK_ELEMENT_OBJECT,
COMMON_JSON_STACK_ELEMENT_KEY,
COMMON_JSON_STACK_ELEMENT_ARRAY,
};
struct common_json_stack_element {
common_json_stack_element_type type;
std::string key;
};
bool common_json_parse(
const std::string & input,
const std::string & healing_marker,
common_json & out)
{
std::string::const_iterator it = input.begin();
const auto end = input.end();
return common_json_parse(it, end, healing_marker, out);
}
bool common_json_parse(
std::string::const_iterator & it,
const std::string::const_iterator & end,
const std::string & healing_marker,
common_json & out)
{
// https://json.nlohmann.me/features/parsing/sax_interface/
struct json_error_locator : public nlohmann::json_sax<json> {
std::size_t position;
bool found_error;
std::string last_token;
std::string exception_message;
std::vector<common_json_stack_element> stack;
json_error_locator() : position(0), found_error(false) {}
bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT
this->position = position - 1;
this->found_error = true;
this->last_token = last_token;
this->exception_message = ex.what();
return false;
}
void close_value() {
if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) {
stack.pop_back();
}
}
bool null() override { // NOLINT
close_value();
return true;
}
bool boolean(bool) override { // NOLINT
close_value();
return true;
}
bool number_integer(number_integer_t) override { // NOLINT
close_value();
return true;
}
bool number_unsigned(number_unsigned_t) override { // NOLINT
close_value();
return true;
}
bool number_float(number_float_t, const string_t &) override { // NOLINT
close_value();
return true;
}
bool string(string_t &) override { // NOLINT
close_value();
return true;
}
bool binary(binary_t &) override { // NOLINT
close_value();
return true;
}
bool start_object(std::size_t) override { // NOLINT
stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""});
return true;
}
bool end_object() override {
GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT);
stack.pop_back();
close_value();
return true;
}
bool key(string_t & key) override { // NOLINT
stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key});
return true;
}
bool start_array(std::size_t) override { // NOLINT
stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""});
return true;
}
bool end_array() override {
GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY);
stack.pop_back();
close_value();
return true;
}
};
json_error_locator err_loc;
auto start = it;
json::sax_parse(it, end, &err_loc);
if (err_loc.found_error) {
it = start;
auto temptative_end = it + err_loc.position;
// LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str());
auto input = std::string(it, temptative_end);
try {
out.json = json::parse(input);
// out.json = json::parse(it, temptative_end);
it = temptative_end;
return true;
} catch (const std::exception & ex) {
// No, needs healing.
LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str());
}
auto can_parse = [](const std::string & str) {
try {
auto _ = json::parse(str); // NOLINT
return true;
} catch (const std::exception &) {
return false;
}
};
if (!healing_marker.empty() && !err_loc.stack.empty()) {
std::string str(it, temptative_end);
auto last_non_sp_pos = str.find_last_not_of(" \n\r\t");
if (last_non_sp_pos == std::string::npos) {
throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
}
auto last_non_sp_char = str[last_non_sp_pos];
// Used to detect stops on a number, which may not be complete.
auto was_maybe_number = [&]() {
if (!str.empty() && std::isspace(str.back())) {
return false;
}
return std::isdigit(last_non_sp_char) ||
last_non_sp_char == '.' ||
last_non_sp_char == 'e' ||
last_non_sp_char == 'E' ||
last_non_sp_char == '-';
};
std::string closing;
for (size_t i = err_loc.stack.size(); i > 0; i--) {
auto & el = err_loc.stack[i - 1];
if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
closing += "}";
} else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
closing += "]";
} else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) {
throw std::runtime_error("Unexpected stack element type");
}
}
const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
// We're inside an object value
if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) {
// Was about to create an object value
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
} else if (can_parse(str + ": 1" + closing)) {
str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing;
} else if (last_non_sp_char == '{' && can_parse(str + closing)) {
// Was about to create an object
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
} else if (can_parse(str + "\"" + closing)) {
// Was inside an object value string
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
// Was inside an object value string after an escape
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
} else {
// find last :
auto last_pos = str.find_last_of(':');
if (last_pos == std::string::npos) {
throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
}
// Cutting back to opening : for object value
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
}
} else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) {
// Was about to create an array value
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
} else if (can_parse(str + "\"" + closing)) {
// Was inside an array value string
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
// Was inside an array value string after an escape
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
} else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
// Had just finished a value
str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
} else {
auto last_pos = str.find_last_of("[,");
if (last_pos == std::string::npos) {
throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location");
}
// Cutting back to last [ or , for array value
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
}
} else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
if ((last_non_sp_char == '{' && can_parse(str + closing)) ||
(last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) {
// Was about to create an object key+value
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
} else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) {
// Was about to create an object key+value
str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing;
} else if (can_parse(str + "\": 1" + closing)) {
// Was inside an object key string
str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing;
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
// Was inside an object key string after an escape
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
} else {
auto last_pos = str.find_last_of(':');
if (last_pos == std::string::npos) {
throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
}
// fprintf(stderr, "Cutting back to last : for object key+value\n");
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
}
} else {
throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
}
// fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str());
out.json = json::parse(str);
it = temptative_end;
return true;
}
// TODO: handle unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
// fprintf(stderr, "Closing: TODO\n");
return false;
}
out.json = json::parse(it, end);
it = end;
return true;
}
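// Editor's note: a minimal sketch of the healing behaviour implemented above
// (input and marker are illustrative):
//
//   common_json out;
//   bool ok = common_json_parse("{\"name\": \"Ja", "$m$", out);
//   // ok == true; the truncated value string is closed with the marker, so
//   // out.json dumps as {"name":"Ja$m$"} and
//   // out.healing_marker.json_dump_marker == "$m$" marks where to cut.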

View File

@@ -1,38 +0,0 @@
#pragma once
#include <nlohmann/json.hpp>
// Healing marker (empty if the JSON was fully parsed / wasn't healed).
struct common_healing_marker {
// Raw marker.
std::string marker;
// Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo whitespace, assuming the same dump format).
std::string json_dump_marker;
};
// Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string)
struct common_json {
nlohmann::ordered_json json;
common_healing_marker healing_marker;
};
// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty.
//
// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON.
// This allows parsing the resulting healed JSON string, while still being able to cut it back at the healing marker if needed.
// (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format).
//
// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again).
bool common_json_parse(
const std::string & input,
const std::string & healing_marker,
common_json & out);
// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds.
bool common_json_parse(
std::string::const_iterator & it,
const std::string::const_iterator & end,
const std::string & healing_marker,
common_json & out);
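// Editor's note: sketch of the round-trip described above (values illustrative):
//
//   common_json cj;
//   if (common_json_parse("{\"a\": [1, 2,", "$m$", cj)) {
//       auto dump = cj.json.dump();  // {"a":[1,2,"$m$"]}
//       auto cut  = dump.substr(0, dump.find(cj.healing_marker.json_dump_marker));
//       // cut == "{\"a\":[1,2," -- the original partial JSON, modulo spaces
//   }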

View File

@@ -1,9 +1,8 @@
#include "json-schema-to-grammar.h"
#include "common.h"
#include <nlohmann/json.hpp>
#include <algorithm>
#include <fstream>
#include <map>
#include <regex>
#include <sstream>
@@ -17,9 +16,6 @@ using json = nlohmann::ordered_json;
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
auto has_max = max_items != std::numeric_limits<int>::max();
if (max_items == 0) {
return "";
}
if (min_items == 0 && max_items == 1) {
return item_rule + "?";
}
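// Editor's note: the two early returns above mean, with item_rule = "item":
//
//   build_repetition("item", 0, 0) == ""       // zero repetitions allowed
//   build_repetition("item", 0, 1) == "item?"  // optional single item
//
// larger ranges fall through to the general (elided) repetition logic.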

View File

@@ -1,9 +1,9 @@
#pragma once
#include <nlohmann/json_fwd.hpp>
#include <functional>
#include <string>
#include "ggml.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
bool force_gbnf = false);

File diff suppressed because it is too large

View File

@@ -189,7 +189,6 @@ static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab)
/* .tokenize_fn = */ llama_sampler_llg_tokenize_fn,
/* .use_approximate_greedy_tokenize_fn = */ false,
/* .tokenize_user_data = */ vocab,
/* .slices = */ nullptr,
};
char error_buffer[1024];

View File

@@ -13,16 +13,14 @@
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <ctime>
#include <exception>
#include <iomanip>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include <nlohmann/json.hpp>
#include <json.hpp>
using json = nlohmann::ordered_json;
@@ -395,8 +393,8 @@ class chat_template {
for (const auto & message_ : adjusted_messages) {
auto message = message_;
if (!message.contains("role") || (!message.contains("content") && !message.contains("tool_calls"))) {
throw std::runtime_error("message must have 'role' and one of 'content' or 'tool_calls' fields: " + message.dump());
if (!message.contains("role") || !message.contains("content")) {
throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
}
std::string role = message.at("role");
@@ -417,6 +415,7 @@ class chat_template {
}
}
if (polyfill_tool_calls) {
auto content = message.at("content");
auto tool_calls = json::array();
for (const auto & tool_call : message.at("tool_calls")) {
if (tool_call.at("type") != "function") {
@@ -435,11 +434,8 @@ class chat_template {
auto obj = json {
{"tool_calls", tool_calls},
};
if (message.contains("content")) {
auto content = message.at("content");
if (!content.is_null() && !content.empty()) {
obj["content"] = content;
}
if (!content.is_null() && !content.empty()) {
obj["content"] = content;
}
message["content"] = obj.dump(2);
message.erase("tool_calls");
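// Editor's note: with the polyfill above, for a template without native
// tool-call support an assistant message like (illustrative)
//   {"role": "assistant", "content": "hi", "tool_calls": [...]}
// is rewritten so that its content becomes the JSON dump
//   {"tool_calls": [...], "content": "hi"}
// i.e. the calls are serialized into the content field as plain JSON text.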

View File

@@ -11,7 +11,6 @@
#include <algorithm>
#include <cctype>
#include <cstddef>
#include <cstdint>
#include <cmath>
#include <exception>
#include <functional>
@@ -29,7 +28,7 @@
#include <utility>
#include <vector>
#include <nlohmann/json.hpp>
#include <json.hpp>
using json = nlohmann::ordered_json;
@@ -234,7 +233,7 @@ public:
}
} else if (is_object()) {
if (!index.is_hashable())
throw std::runtime_error("Unhashable type: " + index.dump());
throw std::runtime_error("Unashable type: " + index.dump());
auto it = object_->find(index.primitive_);
if (it == object_->end())
throw std::runtime_error("Key not found: " + index.dump());
@@ -253,7 +252,7 @@ public:
auto index = key.get<int>();
return array_->at(index < 0 ? array_->size() + index : index);
} else if (object_) {
if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
auto it = object_->find(key.primitive_);
if (it == object_->end()) return Value();
return it->second;
@@ -262,7 +261,7 @@ public:
}
void set(const Value& key, const Value& value) {
if (!object_) throw std::runtime_error("Value is not an object: " + dump());
if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
(*object_)[key.primitive_] = value;
}
Value call(const std::shared_ptr<Context> & context, ArgumentsValue & args) const {
@@ -399,7 +398,7 @@ public:
}
return false;
} else if (object_) {
if (!value.is_hashable()) throw std::runtime_error("Unhashable type: " + value.dump());
if (!value.is_hashable()) throw std::runtime_error("Unashable type: " + value.dump());
return object_->find(value.primitive_) != object_->end();
} else {
throw std::runtime_error("contains can only be called on arrays and objects: " + dump());
@@ -417,7 +416,7 @@ public:
return const_cast<Value*>(this)->at(index);
}
Value& at(const Value & index) {
if (!index.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
if (!index.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
if (is_array()) return array_->at(index.get<int>());
if (is_object()) return object_->at(index.primitive_);
throw std::runtime_error("Value is not an array or object: " + dump());
@@ -677,8 +676,8 @@ public:
class VariableExpr : public Expression {
std::string name;
public:
VariableExpr(const Location & loc, const std::string& n)
: Expression(loc), name(n) {}
VariableExpr(const Location & location, const std::string& n)
: Expression(location), name(n) {}
std::string get_name() const { return name; }
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
if (!context->contains(name)) {
@@ -1201,9 +1200,9 @@ public:
class SliceExpr : public Expression {
public:
std::shared_ptr<Expression> start, end, step;
SliceExpr(const Location & loc, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e, std::shared_ptr<Expression> && st = nullptr)
: Expression(loc), start(std::move(s)), end(std::move(e)), step(std::move(st)) {}
std::shared_ptr<Expression> start, end;
SliceExpr(const Location & loc, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e)
: Expression(loc), start(std::move(s)), end(std::move(e)) {}
Value do_evaluate(const std::shared_ptr<Context> &) const override {
throw std::runtime_error("SliceExpr not implemented");
}
@@ -1220,35 +1219,18 @@ public:
if (!index) throw std::runtime_error("SubscriptExpr.index is null");
auto target_value = base->evaluate(context);
if (auto slice = dynamic_cast<SliceExpr*>(index.get())) {
auto len = target_value.size();
auto wrap = [len](int64_t i) -> int64_t {
if (i < 0) {
return i + len;
}
return i;
};
int64_t step = slice->step ? slice->step->evaluate(context).get<int64_t>() : 1;
if (!step) {
throw std::runtime_error("slice step cannot be zero");
}
int64_t start = slice->start ? wrap(slice->start->evaluate(context).get<int64_t>()) : (step < 0 ? len - 1 : 0);
int64_t end = slice->end ? wrap(slice->end->evaluate(context).get<int64_t>()) : (step < 0 ? -1 : len);
auto start = slice->start ? slice->start->evaluate(context).get<int64_t>() : 0;
auto end = slice->end ? slice->end->evaluate(context).get<int64_t>() : (int64_t) target_value.size();
if (target_value.is_string()) {
std::string s = target_value.get<std::string>();
std::string result;
if (start < end && step == 1) {
result = s.substr(start, end - start);
} else {
for (int64_t i = start; step > 0 ? i < end : i > end; i += step) {
result += s[i];
}
}
return result;
if (start < 0) start = s.size() + start;
if (end < 0) end = s.size() + end;
return s.substr(start, end - start);
} else if (target_value.is_array()) {
if (start < 0) start = target_value.size() + start;
if (end < 0) end = target_value.size() + end;
auto result = Value::array();
for (int64_t i = start; step > 0 ? i < end : i > end; i += step) {
for (auto i = start; i < end; ++i) {
result.push_back(target_value.at(i));
}
return result;
@@ -1323,8 +1305,6 @@ public:
if (name == "iterable") return l.is_iterable();
if (name == "sequence") return l.is_array();
if (name == "defined") return !l.is_null();
if (name == "true") return l.to_bool();
if (name == "false") return !l.to_bool();
throw std::runtime_error("Unknown type for 'is' operator: " + name);
};
auto value = eval();
@@ -1540,10 +1520,6 @@ public:
vargs.expectArgs("endswith method", {1, 1}, {0, 0});
auto suffix = vargs.args[0].get<std::string>();
return suffix.length() <= str.length() && std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
} else if (method->get_name() == "startswith") {
vargs.expectArgs("startswith method", {1, 1}, {0, 0});
auto prefix = vargs.args[0].get<std::string>();
return prefix.length() <= str.length() && std::equal(prefix.begin(), prefix.end(), str.begin());
} else if (method->get_name() == "title") {
vargs.expectArgs("title method", {0, 0}, {0, 0});
auto res = str;
@@ -2106,37 +2082,28 @@ private:
while (it != end && consumeSpaces() && peekSymbols({ "[", "." })) {
if (!consumeToken("[").empty()) {
std::shared_ptr<Expression> index;
auto slice_loc = get_location();
std::shared_ptr<Expression> start, end, step;
bool has_first_colon = false, has_second_colon = false;
if (!peekSymbols({ ":" })) {
start = parseExpression();
}
if (!consumeToken(":").empty()) {
has_first_colon = true;
if (!peekSymbols({ ":", "]" })) {
end = parseExpression();
}
std::shared_ptr<Expression> index;
if (!consumeToken(":").empty()) {
has_second_colon = true;
if (!peekSymbols({ "]" })) {
step = parseExpression();
auto slice_end = parseExpression();
index = std::make_shared<SliceExpr>(slice_end->location, nullptr, std::move(slice_end));
} else {
auto slice_start = parseExpression();
if (!consumeToken(":").empty()) {
consumeSpaces();
if (peekSymbols({ "]" })) {
index = std::make_shared<SliceExpr>(slice_start->location, std::move(slice_start), nullptr);
} else {
auto slice_end = parseExpression();
index = std::make_shared<SliceExpr>(slice_start->location, std::move(slice_start), std::move(slice_end));
}
} else {
index = std::move(slice_start);
}
}
}
if (!index) throw std::runtime_error("Empty index in subscript");
if (consumeToken("]").empty()) throw std::runtime_error("Expected closing bracket in subscript");
if ((has_first_colon || has_second_colon) && (start || end || step)) {
index = std::make_shared<SliceExpr>(slice_loc, std::move(start), std::move(end), std::move(step));
} else {
index = std::move(start);
}
if (!index) throw std::runtime_error("Empty index in subscript");
if (consumeToken("]").empty()) throw std::runtime_error("Expected closing bracket in subscript");
value = std::make_shared<SubscriptExpr>(value->location, std::move(value), std::move(index));
value = std::make_shared<SubscriptExpr>(value->location, std::move(value), std::move(index));
} else if (!consumeToken(".").empty()) {
auto identifier = parseIdentifier();
if (!identifier) throw std::runtime_error("Expected identifier in subscript");
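// Editor's note: examples of the Python-style slicing the SliceExpr changes
// above implement (template inputs are illustrative):
//
//   {{ "hello"[1:4] }}    -> "ell"
//   {{ "hello"[::-1] }}   -> "olleh"    (negative step reverses)
//   {{ [0,1,2,3][-2:] }}  -> [2, 3]     (negative start wraps to len-2)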

View File

@@ -1,204 +0,0 @@
#include "regex-partial.h"
#include "common.h"
#include <functional>
#include <optional>
common_regex::common_regex(const std::string & pattern) :
pattern(pattern),
rx(pattern),
rx_reversed_partial(regex_to_reversed_partial_regex(pattern)) {}
common_regex_match common_regex::search(const std::string & input, size_t pos, bool as_match) const {
std::smatch match;
if (pos > input.size()) {
throw std::runtime_error("Position out of bounds");
}
auto start = input.begin() + pos;
auto found = as_match
? std::regex_match(start, input.end(), match, rx)
: std::regex_search(start, input.end(), match, rx);
if (found) {
common_regex_match res;
res.type = COMMON_REGEX_MATCH_TYPE_FULL;
for (size_t i = 0; i < match.size(); ++i) {
auto begin = pos + match.position(i);
res.groups.emplace_back(begin, begin + match.length(i));
}
return res;
}
std::match_results<std::string::const_reverse_iterator> srmatch;
if (std::regex_match(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial)) {
auto group = srmatch[1].str();
if (group.length() != 0) {
auto it = srmatch[1].second.base();
// auto position = static_cast<size_t>(std::distance(input.begin(), it));
if ((!as_match) || it == input.begin()) {
common_regex_match res;
res.type = COMMON_REGEX_MATCH_TYPE_PARTIAL;
const size_t begin = std::distance(input.begin(), it);
const size_t end = input.size();
if (begin == std::string::npos || end == std::string::npos || begin > end) {
throw std::runtime_error("Invalid range");
}
res.groups.push_back({begin, end});
return res;
}
}
}
return {};
}
/*
Transforms a regex pattern to a partial match pattern that operates on a reversed input string to find partial final matches of the original pattern.
Ideally we'd like to use boost::match_partial (https://beta.boost.org/doc/libs/1_59_0/libs/regex/doc/html/boost_regex/partial_matches.html)
to see if a string ends with a partial regex match, but it's not in std::regex yet.
Instead, we'll transform the regex into a partial-match regex operating as a full match on the reverse iterators of the input.
- /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:d)?c)?b)?a).*
- /a|b/ -> (a|b).*
- /a*?/ -> error, could match ""
- /a*b/ -> ((?:b)?a*+).* (final repetitions become eager)
- /.*?ab/ -> ((?:b)?a).* (merge .*)
- /a.*?b/ -> ((?:b)?.*?a).* (keep reluctant matches)
- /a(bc)d/ -> ((?:(?:d)?(?:(?:c)?b))?a).*
- /a(bc|de)/ -> ((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a).*
- /ab{2,4}c/ -> abbb?b?c -> ((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a).*
The regex will match a reversed string fully, and the end of the first (and only) capturing group will indicate the reversed start of the original partial pattern
(i.e. just where the final .* starts in the inverted pattern; all other groups are turned into non-capturing groups, and reluctant quantifiers are ignored)
*/
std::string regex_to_reversed_partial_regex(const std::string & pattern) {
auto it = pattern.begin();
const auto end = pattern.end();
std::function<std::string()> process = [&]() {
std::vector<std::vector<std::string>> alternatives(1);
std::vector<std::string> * sequence = &alternatives.back();
while (it != end) {
if (*it == '[') {
auto start = it;
++it;
while (it != end) {
if ((*it == '\\') && (++it != end)) {
++it;
} else if ((it != end) && (*it == ']')) {
break;
} else {
++it;
}
}
if (it == end) {
throw std::runtime_error("Unmatched '[' in pattern");
}
++it;
sequence->push_back(std::string(start, it));
} else if (*it == '*' || *it == '?' || *it == '+') {
if (sequence->empty()) {
throw std::runtime_error("Quantifier without preceding element");
}
sequence->back() += *it;
auto is_star = *it == '*';
++it;
if (is_star) {
if (*it == '?') {
++it;
}
}
} else if (*it == '{') {
if (sequence->empty()) {
throw std::runtime_error("Repetition without preceding element");
}
++it;
auto start = it;
while (it != end && *it != '}') {
++it;
}
if (it == end) {
throw std::runtime_error("Unmatched '{' in pattern");
}
auto parts = string_split(std::string(start, it), ",");
++it;
if (parts.size() > 2) {
throw std::runtime_error("Invalid repetition range in pattern");
}
auto parseOptInt = [&](const std::string & s, const std::optional<int> & def = std::nullopt) -> std::optional<int> {
if (s.empty()) {
return def;
}
return std::stoi(s);
};
auto min = parseOptInt(parts[0], 0);
auto max = parts.size() == 1 ? min : parseOptInt(parts[1]);
if (min && max && *max < *min) {
throw std::runtime_error("Invalid repetition range in pattern");
}
// Brutal but... let's repeat at least min times, then ? for the delta between min & max (or * for unbounded)
auto part = sequence->back();
sequence->pop_back();
for (int i = 0; i < *min; i++) {
sequence->push_back(part);
}
if (max) {
for (int i = *min; i < *max; i++) {
sequence->push_back(part + "?");
}
} else {
sequence->push_back(part + "*");
}
} else if (*it == '(') {
++it;
if (it != end && *it == '?' && (it + 1 != end) && *(it + 1) == ':') {
it += 2;
}
auto sub = process();
if (*it != ')') {
throw std::runtime_error("Unmatched '(' in pattern");
}
++it;
auto & part = sequence->emplace_back("(?:");
part += sub;
part += ")";
} else if (*it == ')') {
break;
} else if (*it == '|') {
++it;
alternatives.emplace_back();
sequence = &alternatives.back();
} else if (*it == '\\' && (++it != end)) {
auto str = std::string("\\") + *it;
sequence->push_back(str);
++it;
} else if (it != end) {
sequence->push_back(std::string(1, *it));
++it;
}
}
// /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:d)?c)?b)?a).*
// if n(=4) parts, opening n-1(=3) non-capturing groups after the 1 capturing group
// We'll do the outermost capturing group and final .* in the enclosing function.
std::vector<std::string> res_alts;
for (const auto & parts : alternatives) {
auto & res = res_alts.emplace_back();
for (size_t i = 0; i < parts.size() - 1; i++) {
res += "(?:";
}
for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
res += *it;
if (it != parts.rend() - 1) {
res += ")?";
}
}
}
return string_join(res_alts, "|");
};
auto res = process();
if (it != end) {
throw std::runtime_error("Unmatched '(' in pattern");
}
return "(" + res + ")[\\s\\S]*";
}
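// Editor's note: quick checks of the transformation above:
//
//   regex_to_reversed_partial_regex("abcd") == "((?:(?:(?:d)?c)?b)?a)[\\s\\S]*"
//   regex_to_reversed_partial_regex("a|b")  == "(a|b)[\\s\\S]*"
//
// Matching the *reversed* input against the result detects whether the input
// ends with a prefix of the original pattern.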

View File

@@ -1,56 +0,0 @@
#pragma once
#include <regex>
#include <string>
enum common_regex_match_type {
COMMON_REGEX_MATCH_TYPE_NONE,
COMMON_REGEX_MATCH_TYPE_PARTIAL,
COMMON_REGEX_MATCH_TYPE_FULL,
};
struct common_string_range {
size_t begin;
size_t end;
common_string_range(size_t begin, size_t end) : begin(begin), end(end) {
if (begin > end) {
throw std::runtime_error("Invalid range");
}
}
// prevent default ctor
common_string_range() = delete;
bool empty() const {
return begin == end;
}
bool operator==(const common_string_range & other) const {
return begin == other.begin && end == other.end;
}
};
struct common_regex_match {
common_regex_match_type type = COMMON_REGEX_MATCH_TYPE_NONE;
std::vector<common_string_range> groups;
bool operator==(const common_regex_match & other) const {
return type == other.type && groups == other.groups;
}
bool operator!=(const common_regex_match & other) const {
return !(*this == other);
}
};
class common_regex {
std::string pattern;
std::regex rx;
std::regex rx_reversed_partial;
public:
explicit common_regex(const std::string & pattern);
common_regex_match search(const std::string & input, size_t pos, bool as_match = false) const;
const std::string & str() const { return pattern; }
};
// For testing only (pretty print of failures).
std::string regex_to_reversed_partial_regex(const std::string & pattern);
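// Editor's note: hedged usage sketch (pattern and input are illustrative):
//
//   common_regex re("<tool_call>");
//   auto m = re.search("some text <tool", /*pos=*/0);
//   // m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL; m.groups[0] spans the
//   // trailing "<tool", which is enough to withhold streamed output until
//   // the match is confirmed or rejected.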

View File

@@ -1,7 +1,6 @@
#include "sampling.h"
#include "common.h"
#include "log.h"
#include <cmath>
#include <unordered_map>
@@ -161,7 +160,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE
} else {
std::vector<std::string> trigger_patterns;
std::vector<std::string> patterns_at_start;
std::vector<std::string> patterns_anywhere;
std::vector<llama_token> trigger_tokens;
for (const auto & trigger : params.grammar_triggers) {
@@ -173,13 +172,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
break;
}
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
{
patterns_anywhere.push_back(trigger.value);
break;
}
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
{
trigger_patterns.push_back(trigger.value);
const auto & pattern = trigger.value;
(trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
break;
}
case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
@@ -193,6 +189,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
}
}
std::vector<std::string> trigger_patterns;
if (!patterns_at_start.empty()) {
trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
}
if (!patterns_anywhere.empty()) {
trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
}
@@ -229,48 +229,51 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
params.logit_bias.data()));
if (params.mirostat == 0) {
for (const auto & cnstr : params.samplers) {
switch (cnstr) {
case COMMON_SAMPLER_TYPE_DRY:
{
std::vector<const char *> c_breakers;
c_breakers.reserve(params.dry_sequence_breakers.size());
for (const auto & str : params.dry_sequence_breakers) {
c_breakers.push_back(str.c_str());
}
if (params.top_n_sigma >= 0) {
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
llama_sampler_chain_add(result->chain, llama_sampler_init_temp (params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
} else {
for (const auto & cnstr : params.samplers) {
switch (cnstr) {
case COMMON_SAMPLER_TYPE_DRY:
{
std::vector<const char *> c_breakers;
c_breakers.reserve(params.dry_sequence_breakers.size());
for (const auto & str : params.dry_sequence_breakers) {
c_breakers.push_back(str.c_str());
}
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
}
break;
case COMMON_SAMPLER_TYPE_TOP_K:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
break;
case COMMON_SAMPLER_TYPE_TOP_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
break;
case COMMON_SAMPLER_TYPE_MIN_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_XTC:
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
break;
case COMMON_SAMPLER_TYPE_TYPICAL_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_TEMPERATURE:
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break;
case COMMON_SAMPLER_TYPE_INFILL:
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
break;
case COMMON_SAMPLER_TYPE_PENALTIES:
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
break;
default:
GGML_ASSERT(false && "unknown sampler type");
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
}
break;
case COMMON_SAMPLER_TYPE_TOP_K:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
break;
case COMMON_SAMPLER_TYPE_TOP_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_MIN_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_XTC:
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
break;
case COMMON_SAMPLER_TYPE_TYPICAL_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_TEMPERATURE:
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break;
case COMMON_SAMPLER_TYPE_INFILL:
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
break;
case COMMON_SAMPLER_TYPE_PENALTIES:
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
break;
default:
GGML_ASSERT(false && "unknown sampler type");
}
}
}
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
@@ -472,7 +475,6 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
case COMMON_SAMPLER_TYPE_XTC: return 'x';
@@ -488,7 +490,6 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
@@ -503,7 +504,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{ "dry", COMMON_SAMPLER_TYPE_DRY },
{ "top_k", COMMON_SAMPLER_TYPE_TOP_K },
{ "top_p", COMMON_SAMPLER_TYPE_TOP_P },
{ "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
{ "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
@@ -517,7 +517,6 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
{ "top-k", COMMON_SAMPLER_TYPE_TOP_K },
{ "top-p", COMMON_SAMPLER_TYPE_TOP_P },
{ "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
{ "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
{ "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
@@ -534,16 +533,14 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
auto sampler = sampler_canonical_name_map.find(name);
if (sampler != sampler_canonical_name_map.end()) {
samplers.push_back(sampler->second);
continue;
}
if (allow_alt_names) {
sampler = sampler_alt_name_map.find(name);
if (sampler != sampler_alt_name_map.end()) {
samplers.push_back(sampler->second);
continue;
} else {
if (allow_alt_names) {
sampler = sampler_alt_name_map.find(name);
if (sampler != sampler_alt_name_map.end()) {
samplers.push_back(sampler->second);
}
}
}
LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
}
return samplers;
@@ -555,7 +552,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
@@ -570,8 +566,6 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
const auto sampler = sampler_name_map.find(c);
if (sampler != sampler_name_map.end()) {
samplers.push_back(sampler->second);
} else {
LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
}
}

View File

@@ -144,8 +144,6 @@ llama_tokens common_speculative_gen_draft(
auto & smpl = spec->smpl;
auto & prompt = spec->prompt;
auto * mem = llama_get_memory(ctx);
int reuse_i = 0;
int reuse_n = 0;
@@ -175,7 +173,7 @@ llama_tokens common_speculative_gen_draft(
result.reserve(params.n_draft);
if (reuse_n == 0) {
llama_memory_clear(mem, false);
llama_kv_self_clear(ctx);
prompt.clear();
} else {
@@ -194,14 +192,14 @@ llama_tokens common_speculative_gen_draft(
}
if (reuse_i > 0) {
llama_memory_seq_rm (mem, 0, 0, reuse_i);
llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);
llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
}
if (reuse_n < (int) prompt.size()) {
llama_memory_seq_rm (mem, 0, reuse_n, -1);
llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
prompt.erase(prompt.begin() + reuse_n, prompt.end());
}
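// Editor's note: a worked example of the cache-reuse arithmetic above.
// With prompt = [A B C D E], reuse_i = 1, reuse_n = 3:
//
//   seq_rm (0, 0, 1)       -> drop A
//   seq_add(0, 1, -1, -1)  -> shift B C D E back by one position
//   seq_rm (0, 3, -1)      -> drop E, beyond the reused window
//
// leaving prompt = [B C D]; only the genuinely new suffix is re-decoded.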

File diff suppressed because it is too large

View File

@@ -1,6 +1,28 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This script downloads the tokenizer models of the specified models from Huggingface and
# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
#
# This is necessary in order to analyze the type of pre-tokenizer used by the model and
# provide the necessary information to llama.cpp via the GGUF header in order to implement
# the same pre-tokenizer.
#
# ref: https://github.com/ggml-org/llama.cpp/pull/6920
#
# Instructions:
#
# - Add a new model to the "models" list
# - Run the script with your huggingface token:
#
# python3 convert_hf_to_gguf_update.py <huggingface_token>
#
# - The convert_hf_to_gguf.py script will have been updated with the new get_vocab_base_pre() function
# - Update llama.cpp with the new pre-tokenizer if necessary
#
# TODO: generate tokenizer tests for llama.cpp
#
import logging
import os
import pathlib
@@ -10,7 +32,6 @@ import requests
import sys
import json
import shutil
import argparse
from hashlib import sha256
from enum import IntEnum, auto
@@ -20,11 +41,6 @@ logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert_hf_to_gguf_update")
sess = requests.Session()
convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
convert_py = convert_py_pth.read_text(encoding="utf-8")
hf_token_pth = pathlib.Path.home() / ".cache" / "huggingface" / "token"
hf_token = hf_token_pth.read_text(encoding="utf-8").strip() if hf_token_pth.exists() else None
class TOKENIZER_TYPE(IntEnum):
SPM = auto()
@@ -33,49 +49,20 @@ class TOKENIZER_TYPE(IntEnum):
UGM = auto()
DOC_STRING = """
This script downloads the tokenizer models of the specified models from Huggingface and
generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
/!\\ It is intended to be used by contributors and is not meant to be run by end users
This is necessary in order to analyze the type of pre-tokenizer used by the model and
provide the necessary information to llama.cpp via the GGUF header in order to implement
the same pre-tokenizer.
ref: https://github.com/ggml-org/llama.cpp/pull/6920
Instructions:
- Add a new model to the "models" list
- Run the script with your huggingface token
By default, token will be read from ~/.cache/huggingface/token
- The convert_hf_to_gguf.py script will have been updated with the new get_vocab_base_pre() function
- Update llama.cpp with the new pre-tokenizer if necessary
"""
# TODO: generate tokenizer tests for llama.cpp
parser = argparse.ArgumentParser(description=DOC_STRING, formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument(
"--full", action="store_true",
help="download full list of models - make sure you have access to all of them",
)
parser.add_argument(
"hf_token",
help="optional HF token",
nargs="?",
)
args = parser.parse_args()
hf_token = args.hf_token if args.hf_token is not None else hf_token
if hf_token is None:
logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token")
sys.exit(1)
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
if len(sys.argv) == 2:
token = sys.argv[1]
if not token.startswith("hf_"):
logger.info("Huggingface token seems invalid")
logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
sys.exit(1)
else:
logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
sys.exit(1)
# TODO: add models here, base models preferred
models = [
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
@@ -116,6 +103,7 @@ models = [
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
{"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
{"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
{"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
{"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
@@ -126,17 +114,7 @@ models = [
{"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
{"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
{"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
{"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
{"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
]
# some models are known to be broken upstream, so we will skip them as exceptions
pre_computed_hashes = [
# chatglm-bpe has 2 hashes, why?
{"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b"},
{"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
]
@@ -189,29 +167,9 @@ def download_model(model):
if os.path.isfile(save_path):
logger.info(f"{name}: File {save_path} already exists - skipping")
continue
download_file_with_auth(f"{repo}/resolve/main/{file}", hf_token, save_path)
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
# get list of existing models and chkhsh from the convert_hf_to_gguf.py file
# returns mapping res --> chkhsh
def get_existing_models(convert_py):
pattern = r'if chkhsh == "([a-f0-9]{64})":\s*\n\s*.*\s*res = "([^"]+)"'
matches = re.findall(pattern, convert_py)
output = {}
for chkhsh, res in matches:
output[res] = chkhsh
return output
existing_models = {}
all_models = models.copy()
if not args.full:
# Filter out models that already exist in convert_hf_to_gguf.py
existing_models = get_existing_models(convert_py)
all_models = models.copy()
models = [model for model in all_models if model["name"] not in existing_models]
logger.info(f"Downloading {len(models)} models...")
for model in models:
try:
download_model(model)
@@ -222,10 +180,9 @@ for model in models:
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
src_ifs = ""
for model in [*all_models, *pre_computed_hashes]:
for model in models:
name = model["name"]
tokt = model["tokt"]
chkhsh = model.get("chkhsh")
if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
continue
@@ -236,44 +193,35 @@ for model in [*all_models, *pre_computed_hashes]:
continue
# create the tokenizer
if chkhsh is not None:
# if the model has a pre-computed hash, use it
logger.info(f"Using pre-computed hash for model {name}: {chkhsh}")
elif name in existing_models:
# if the model already exists in convert_hf_to_gguf.py, skip compute hash
chkhsh = existing_models[name]
else:
# otherwise, compute the hash of the tokenizer
try:
logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
if name == "t5":
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
else:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except OSError as e:
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded
try:
if name == "t5":
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
else:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except OSError as e:
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded
chktok = tokenizer.encode(CHK_TXT)
chkhsh = sha256(str(chktok).encode()).hexdigest()
chktok = tokenizer.encode(CHK_TXT)
chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.info(f"model: {name}")
logger.info(f"tokt: {tokt}")
logger.info(f"repo: {model['repo']}")
logger.info(f"chktok: {chktok}")
logger.info(f"chkhsh: {chkhsh}")
logger.info(f"model: {name}")
logger.info(f"tokt: {tokt}")
logger.info(f"repo: {model['repo']}")
logger.info(f"chktok: {chktok}")
logger.info(f"chkhsh: {chkhsh}")
# print the "pre_tokenizer" content from the tokenizer.json
with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
cfg = json.load(f)
normalizer = cfg["normalizer"]
logger.info("normalizer: " + json.dumps(normalizer, indent=4))
pre_tokenizer = cfg["pre_tokenizer"]
logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
if "ignore_merges" in cfg["model"]:
logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
# print the "pre_tokenizer" content from the tokenizer.json
with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
cfg = json.load(f)
normalizer = cfg["normalizer"]
logger.info("normalizer: " + json.dumps(normalizer, indent=4))
pre_tokenizer = cfg["pre_tokenizer"]
logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
if "ignore_merges" in cfg["model"]:
logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
logger.info("")
logger.info("")
src_ifs += f" if chkhsh == \"{chkhsh}\":\n"
src_ifs += f" # ref: {model['repo']}\n"
@@ -321,6 +269,8 @@ src_func = f"""
return res
"""
convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
convert_py = convert_py_pth.read_text(encoding="utf-8")
convert_py = re.sub(
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
lambda m: m.group(1) + src_func + m.group(3),
@@ -336,7 +286,7 @@ logger.info("+++ convert_hf_to_gguf.py was updated")
tests = [
"ied 4 ½ months",
"Äpfel",
"Führer",
"",
" ",
" ",
@@ -415,10 +365,6 @@ for model in models:
logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
continue # Skip this model and continue with the next one in the loop
if not os.path.exists(f"models/ggml-vocab-{name}.gguf"):
logger.info(f"Skip vocab files for model {name}, no GGUF file found")
continue
with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
for text in tests:
f.write(f"{text}")

View File

@@ -24,7 +24,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
import gguf
# reuse model definitions from convert_hf_to_gguf.py
from convert_hf_to_gguf import LazyTorchTensor, ModelBase
from convert_hf_to_gguf import LazyTorchTensor, Model
logger = logging.getLogger("lora-to-gguf")
@@ -340,11 +340,11 @@ if __name__ == '__main__':
sys.exit(1)
else:
logger.info(f"Loading base model: {dir_base_model.name}")
hparams = ModelBase.load_hparams(dir_base_model)
hparams = Model.load_hparams(dir_base_model)
with torch.inference_mode():
try:
model_class = ModelBase.from_model_architecture(hparams["architectures"][0])
model_class = Model.from_model_architecture(hparams["architectures"][0])
except NotImplementedError:
logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1)

155
docs/backend/CANN.md Executable file → Normal file
View File

@@ -8,7 +8,6 @@
- [DataType Supports](#datatype-supports)
- [Docker](#docker)
- [Linux](#linux)
- [Environment variable setup](#environment-variable-setup)
- [TODO](#todo)
@@ -57,82 +56,60 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
## Model Supports
| Model Name | FP16 | Q4_0 | Q8_0 |
| Model Name | FP16 | Q8_0 | Q4_0 |
|:----------------------------|:-----:|:----:|:----:|
| Llama-2 | √ | √ | √ |
| Llama-3 | √ | √ | √ |
| Mistral-7B | √ | √ | √ |
| Mistral MOE | √ | √ | √ |
| DBRX | - | - | - |
| Falcon | √ | | √ |
| Chinese LLaMA/Alpaca | | | |
| Vigogne(French) | | | |
| BERT | x | x | x |
| Koala | √ | √ | √ |
| Baichuan | √ | | |
| Aquila 1 & 2 | √ | | |
| Starcoder models | √ | | |
| Refact | | | |
| MPT | √ | √ | √ |
| Bloom | √ | √ | √ |
| Yi models | √ | √ | √ |
| stablelm models | | | |
| DeepSeek models | x | x | x |
| Qwen models | √ | √ | √ |
| PLaMo-13B | √ | √ | √ |
| Phi models | √ | √ | √ |
| PhiMoE | √ | √ | √ |
| GPT-2 | √ | √ | √ |
| Orion | √ | √ | √ |
| InternLM2 | √ | √ | √ |
| CodeShell | √ | √ | √ |
| Gemma | √ | √ | √ |
| Mamba | √ | √ | √ |
| Xverse | √ | √ | √ |
| command-r models | √ | √ | √ |
| Grok-1 | - | - | - |
| SEA-LION | √ | √ | √ |
| AquilaChat2-7B | √ | √ | √ |
| Baichuan-7b | √ | √ | √ |
| Baichuan2-7B-Chat | √ | √ | √ |
| bitnet_b1_58-large | √ | √ | √ |
| bloom-560m | | x | |
| bloomz-alpaca-560m | √ | x | √ |
| c4ai-command-r-35B-v01 | x | x | x |
| chatglm3-6B | x | x | x |
| chinese-alpaca-2-1.3b | | | |
| CodeShell-7B | √ | √ | √ |
| deepseek-ai_deepseek-coder-1.3B-base | x | x | x |
| deepseek-ai_DeepSeek-V2-Lite | x | x | x |
| deepseek-coder-6.7B-instruct | x | x | x |
| DeepSeek-V2-Lite-64x1.5B | x | x | x |
| falcon-7b-instruct | √ | √ | √ |
| flan-t5-large | √ | √ | √ |
| gemma-2-9b-it | √ | √ | √ |
| glm-4-9B | x | x | x |
| gpt2 | | | |
| Gpt2-163M | √ | √ | √ |
| granite-3B-code-instruct | √ | √ | √ |
| GritLM-7B | √ | √ | √ |
| OLMo | √ | √ | √ |
| OLMo 2 | √ | √ | √ |
| OLMoE | √ | √ | √ |
| Granite models | √ | √ | √ |
| GPT-NeoX | √ | √ | √ |
| Pythia | √ | √ | √ |
| Snowflake-Arctic MoE | - | - | - |
| Smaug | √ | √ | √ |
| Poro 34B | √ | √ | √ |
| Bitnet b1.58 models | | x | x |
| Flan-T5 | √ | √ | √ |
| Open Elm models | x | √ | √ |
| chatGLM3-6B + ChatGLM4-9b + GLMEdge-1.5b + GLMEdge-4b | √ | √ | √ |
| GLM-4-0414 | √ | √ | √ |
| SmolLM | | | |
| EXAONE-3.0-7.8B-Instruct | | | |
| FalconMamba Models | √ | √ | √ |
| Jais Models | - | x | x |
| Bielik-11B-v2.3 | | | |
| RWKV-6 | - | √ | √ |
| QRWKV-6 | √ | | √ |
| GigaChat-20B-A3B | x | x | x |
| Trillion-7B-preview | √ | √ | √ |
| Ling models | | | |
**Multimodal**
| Model Name | FP16 | Q4_0 | Q8_0 |
|:----------------------------|:-----:|:----:|:----:|
| LLaVA 1.5 models, LLaVA 1.6 models | x | x | x |
| BakLLaVA | √ | √ | √ |
| Obsidian | √ | - | - |
| ShareGPT4V | x | - | - |
| MobileVLM 1.7B/3B models | - | - | - |
| Yi-VL | - | - | - |
| Mini CPM | √ | √ | √ |
| Moondream | √ | √ | √ |
| Bunny | √ | - | - |
| GLM-EDGE | √ | √ | √ |
| Qwen2-VL | √ | √ | √ |
| internlm2_5-7b-chat | √ | √ | √ |
| koala-7B-HF | √ | √ | √ |
| Llama-2-7b-chat-hf | √ | √ | √ |
| Llama-3-Smaug-8B | √ | √ | √ |
| Llama2-Chinese-7b-Chat | √ | √ | √ |
| Llama3-8B | √ | √ | √ |
| Llama3-8b-chinese | | | |
| mamba-130m-hf | √ | √ | √ |
| Mistral-7B-Instruct-v0.2 | √ | √ | √ |
| Mixtral-8x7B-Instruct-v0.1 | x | | |
| mpt-7B | √ | √ | √ |
| OLMo-1B-hf | | √ | √ |
| OpenELM-3B-Instruct | √ | √ | √ |
| Orion-14b-base | √ | √ | √ |
| phi1 | x | x | x |
| phi2 | x | x | x |
| Phi-3-mini-4k-instruct | √ | √ | √ |
| plamo-13b | | | |
| pythia-70M | x | x | x |
| Qwen-7B | | √ | √ |
| Qwen2-1.5B-Instruct | √ | x | √ |
| Refact-1_6B-fim | | | |
| SmolLM-135M | √ | √ | √ |
| stablelm-zephyr | x | x | x |
| stablelm-2-zephyr-1_6b | x | x | x |
| starcoderbase-1b | √ | √ | √ |
| starcoder2-3b | √ | √ | √ |
| vigogne-7b-chat | | √ | √ |
| xverse-7b-chat | √ | √ | √ |
| Yi-6b-Chat | | | |
@@ -281,34 +258,6 @@ cmake --build build --config release
### **GitHub contribution**:
Please add the **[CANN]** prefix/tag in issues/PRs titles to help the CANN-team check/address them without delay.
## Updates
### Basic Flash Attention Support
The basic FA kernel with aclnnops has been added in aclnn_ops.cpp.
Currently, the FA only supports the cases with FP16 KV tensors and NO logit softcap.
Since the aclnn interface for flash attention cannot support the logit softcap, we will only update the quantized version in the future.
Authors from Peking University: Bizhao Shi (bshi@pku.edu.cn), Yuxin Yang (yxyang@pku.edu.cn), Ruiyang Ma (ruiyang@stu.pku.edu.cn), and Guojie Luo (gluo@pku.edu.cn).
We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers from Huawei Technologies Co., Ltd for their help during the code development and pull request.
## Environment variable setup
### GGML_CANN_ASYNC_MODE
Enables asynchronous operator submission. Disabled by default.
### GGML_CANN_MEM_POOL
Specifies the memory pool management strategy:
- vmm: Utilizes a virtual memory manager pool. If hardware support for VMM is unavailable, falls back to the legacy (leg) memory pool.
- prio: Employs a priority queue-based memory pool management.
- leg: Uses a fixed-size buffer pool.
### GGML_CANN_DISABLE_BUF_POOL_CLEAN
Controls automatic cleanup of the memory pool. This option is only effective when using the prio or leg memory pool strategies.
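For example, the variables above can be combined on the command line before launching an inference run. The sketch below is illustrative only: the model path and layer-offload count are placeholders, and setting `GGML_CANN_ASYNC_MODE=1` assumes the flag is read as a boolean:
```sh
# enable asynchronous operator submission and the priority-queue memory pool
export GGML_CANN_ASYNC_MODE=1
export GGML_CANN_MEM_POOL=prio
./build/bin/llama-cli -m path_to_model -ngl 32 -p "Building a website can be done in 10 simple steps:"
```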
## TODO
- Support more models and data types.

View File

@@ -17,25 +17,25 @@
**SYCL** is a high-level parallel programming model designed to improve developers' productivity when writing code across various hardware accelerators such as CPUs, GPUs, and FPGAs. It is a single-source language designed for heterogeneous computing and based on standard C++17.
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to Intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over Intel iGPUs and dGPUs.
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
### Llama.cpp + SYCL
The llama.cpp SYCL backend is primarily designed for **Intel GPUs**.
SYCL cross-platform capabilities enable support for Nvidia GPUs as well, with limited support for AMD.
The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it also supports other vendor GPUs: Nvidia and AMD.
## Recommended Release
The following releases are verified and recommended:
The SYCL backend would be broken by some PRs due to no online CI.
The following release is verified with good quality:
|Commit ID|Tag|Release|Verified Platform| Update date|
|-|-|-|-|-|
|24e86cae7219b0f3ede1d5abdf5bf3ad515cccb8|b5377 |[llama-b5377-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b5377/llama-b5377-bin-win-sycl-x64.zip) |ArcB580/Linux/oneAPI 2025.1<br>LNL Arc GPU/Windows 11/oneAPI 2025.1.1|2025-05-15|
|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
@@ -106,14 +106,15 @@ SYCL backend supports Intel GPU Family:
|-------------------------------|---------|---------------------------------------|
| Intel Data Center Max Series | Support | Max 1550, 1100 |
| Intel Data Center Flex Series | Support | Flex 170 |
| Intel Arc Series | Support | Arc 770, 730M, Arc A750, B580 |
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake, Arrow Lake, Lunar Lake |
| Intel iGPU | Support | iGPU in 13700k, 13400, i5-1250P, i7-1260P, i7-1165G7 |
| Intel Arc Series | Support | Arc 770, 730M, Arc A750 |
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake, Arrow Lake |
| Intel iGPU | Support | iGPU in 13700k,iGPU in 13400, i5-1250P, i7-1260P, i7-1165G7 |
*Notes:*
- **Memory**
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For example, *llama-2-7b.Q4_0* requires at least 8.0GB for an integrated GPU and 4.0GB for a discrete GPU.
- **Execution Unit (EU)**
@@ -137,11 +138,9 @@ Note: AMD GPU support is highly experimental and is incompatible with F16.
Additionally, it only supports GPUs with a sub_group_size (warp size) of 32.
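As a rough, untested sketch, an AMD-targeted build combines the build variables documented later in this file; `gfx1030` is only an illustration and must match your GPU:
```sh
source /opt/intel/oneapi/setvars.sh
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=gfx1030 \
    -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build build --config Release -j
```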
## Docker
The docker build option is currently limited to *Intel GPU* targets.
The docker build option is currently limited to *intel GPU* targets.
### Build image
```sh
# Using FP16
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
@@ -149,10 +148,9 @@ docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f
*Notes*:
To build in default FP32 *(Slower than FP16 alternative)*, set `--build-arg="GGML_SYCL_F16=OFF"` in the previous command.
To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command.
You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
Check the [documentation for Docker](../docker.md) to see the available images.
### Run container
@@ -252,7 +250,7 @@ sycl-ls
- **Intel GPU**
When targeting an intel GPU, the user should expect one or more devices among the available SYCL devices. Please make sure that at least one GPU is present via `sycl-ls`, for instance `[level_zero:gpu]` in the sample output below:
When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`level_zero:gpu`] in the sample output below:
```
[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
@@ -284,7 +282,7 @@ For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:
#### Intel GPU
```sh
```
./examples/sycl/build.sh
```
@@ -353,7 +351,7 @@ cmake --build build --config Release -j -v
#### Retrieve and prepare model
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as an example.
##### Check device
@@ -400,15 +398,11 @@ Choose one of following methods to run.
```sh
./examples/sycl/run-llama2.sh 0
# OR
./examples/sycl/run-llama3.sh 0
```
- Use multiple devices:
```sh
./examples/sycl/run-llama2.sh
# OR
./examples/sycl/run-llama3.sh
```
2. Command line
@@ -431,13 +425,13 @@ Examples:
- Use device 0:
```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm none -mg 0
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
```
- Use multiple devices:
```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm layer
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```
*Notes:*
@@ -458,7 +452,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
1. Install GPU driver
Intel GPU drivers instructions guide and download page can be found here: [Get Intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
Intel GPU drivers instructions guide and download page can be found here: [Get intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
2. Install Visual Studio
@@ -635,7 +629,7 @@ Once it is completed, final results will be in **build/Release/bin**
#### Retrieve and prepare model
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as an example.
##### Check device
@@ -654,7 +648,7 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
build\bin\llama-ls-sycl-device.exe
```
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *Intel GPU* it would look like the following:
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
```
found 2 SYCL devices:
| | | |Compute |Max compute|Max work|Max sub| |
@@ -664,14 +658,13 @@ found 2 SYCL devices:
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
```
#### Choose level-zero devices
|Chosen Device ID|Setting|
|-|-|
|0|Default option. You may also want to `set ONEAPI_DEVICE_SELECTOR="level_zero:0"`|
|0|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"` or no action|
|1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"` or `set ONEAPI_DEVICE_SELECTOR="level_zero:*"`|
|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
#### Execute
@@ -680,13 +673,7 @@ Choose one of following methods to run.
1. Script
```
examples\sycl\win-run-llama-2.bat
```
or
```
examples\sycl\win-run-llama-3.bat
examples\sycl\win-run-llama2.bat
```
2. Command line
@@ -710,13 +697,13 @@ Examples:
- Use device 0:
```
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm none -mg 0
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
```
- Use multiple devices:
```
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm layer
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
```
@@ -727,9 +714,7 @@ Note:
```sh
detect 1 SYCL GPUs: [0] with top Max compute units:512
```
Or
```sh
use 1 SYCL GPUs: [0] with Max compute units:512
```
@@ -741,17 +726,14 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| Name | Value | Function |
|--------------------|---------------------------------------|---------------------------------------------|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path. |
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better performance than FP16 on quantized models|
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. (1.) |
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
| GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. |
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
1. FP16 is recommended for better prompt processing performance on quantized models. Performance is equivalent in text generation but set `GGML_SYCL_F16=OFF` if you are experiencing issues with FP16 builds.
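Putting the build variables together, a representative Linux configure step might look like the following sketch (not the only supported combination):
```sh
source /opt/intel/oneapi/setvars.sh
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=INTEL -DGGML_SYCL_F16=ON \
    -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build build --config Release -j
```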
#### Runtime
| Name | Value | Function |
@@ -759,7 +741,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimization features based on Intel GPU type, to compare the performance increase |
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
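For example, to capture debug logs and opt in to the experimental SYCL Graphs path for a comparison run (a sketch; the model path is a placeholder):
```sh
GGML_SYCL_DEBUG=1 GGML_SYCL_DISABLE_GRAPH=0 ZES_ENABLE_SYSMAN=1 \
    ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Hello" -n 32 -ngl 99 -sm layer
```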
@@ -769,7 +750,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
## Q&A
- Error: `error while loading shared libraries: libsycl.so: cannot open shared object file: No such file or directory`.
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
- Potential cause: oneAPI is not installed, or its environment variables are not set.
- Solution: Install *oneAPI base toolkit* and enable its ENV through: `source /opt/intel/oneapi/setvars.sh`.
@@ -798,18 +779,18 @@ use 1 SYCL GPUs: [0] with Max compute units:512
It's the same for other projects, including the llama.cpp SYCL backend.
- `Native API failed. Native API returns: 39 (UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY)`, `ggml_backend_sycl_buffer_type_alloc_buffer: can't allocate 3503030272 Bytes of memory on device`, or `failed to allocate SYCL0 buffer`
- Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer`
You are running out of Device Memory.
Device Memory is not enough.
|Reason|Solution|
|-|-|
| The default context is too big. It leads to excessive memory usage.|Set `-c 8192` or a smaller value.|
| The model is too big and requires more memory than what is available.|Choose a smaller model or change to a smaller quantization, like Q5 -> Q4;<br>Alternatively, use more than one device to load model.|
|Default Context is too big. It leads to more memory usage.|Set `-c 8192` or smaller value.|
|Model is big and require more memory than device's.|Choose smaller quantized model, like Q5 -> Q4;<br>Use more than one devices to load model.|
### **GitHub contribution**:
Please add the `SYCL :` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.
Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
## TODO
- Review ZES_ENABLE_SYSMAN: https://github.com/intel/compute-runtime/blob/master/programmers-guide/SYSMAN.md#support-and-limitations
- NA

View File

@@ -1,9 +1,5 @@
# Build llama.cpp locally
The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server.
**To get the Code:**
```bash
@@ -67,7 +63,6 @@ cmake --build build --config Release
cmake --preset x64-windows-llvm-release
cmake --build build-x64-windows-llvm-release
```
- Curl is enabled by default and requires the libcurl development libraries to be installed; it can be turned off with `-DLLAMA_CURL=OFF`.
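For instance, a minimal build without libcurl could be configured as follows (an illustrative sketch):
```bash
cmake -B build -DLLAMA_CURL=OFF
cmake --build build --config Release
```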
## BLAS Build
@@ -264,6 +259,8 @@ You can download it from your Linux distro's package manager or from here: [ROCm
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build --config Release -- -j 16
```
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
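A UMA-enabled configure step might therefore look like the following sketch, reusing the `gfx1030` target from the example above:
```bash
cmake -S . -B build -DGGML_HIP=ON -DGGML_HIP_UMA=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
    && cmake --build build --config Release -- -j 16
```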
To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
@@ -299,10 +296,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
If your GPU is not officially supported, you can set the environment variable `HSA_OVERRIDE_GFX_VERSION` to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
### Unified Memory
On Linux it is possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
## Vulkan
**Windows**

View File

@@ -9,10 +9,10 @@ Adding a model requires few steps:
After following these steps, you can open a PR.
Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
- [main](/tools/main/)
- [imatrix](/tools/imatrix/)
- [quantize](/tools/quantize/)
- [server](/tools/server/)
- [main](/examples/main/)
- [imatrix](/examples/imatrix/)
- [quantize](/examples/quantize/)
- [server](/examples/server/)
### 1. Convert the model to GGUF

View File

@@ -22,9 +22,6 @@ Additionally, there the following images, similar to the above:
- `ghcr.io/ggml-org/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-intel`: Same as `full` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-intel`: Same as `light` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-intel`: Same as `server` but compiled with SYCL support. (platforms: `linux/amd64`)
The GPU-enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library), you'll need to build the images locally for now.
@@ -107,7 +104,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment
The defaults are:
- `MUSA_VERSION` set to `rc4.0.1`
- `MUSA_VERSION` set to `rc3.1.1`
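As an illustration, overriding the default could look like the sketch below; the Dockerfile path and `full` target are assumptions about the repository layout:
```bash
docker build -t local/llama.cpp:full-musa --build-arg MUSA_VERSION=rc4.0.1 \
    --target full -f .devops/musa.Dockerfile .
```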
The resulting images are essentially the same as the non-MUSA images:

View File

@@ -2,6 +2,7 @@
[chat.h](../common/chat.h) (https://github.com/ggml-org/llama.cpp/pull/9639) adds support for [OpenAI-style function calling](https://platform.openai.com/docs/guides/function-calling) and is used in:
- `llama-server` when started w/ `--jinja` flag
- `llama-cli` (WIP: https://github.com/ggml-org/llama.cpp/pull/11556)
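For example, a server with templating enabled can be started like this (a sketch; the model path and port are placeholders):
```bash
llama-server --jinja -m model.gguf --port 8080
```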
## Universal support w/ Native & Generic handlers
@@ -324,65 +325,36 @@ To get the official template from original HuggingFace repos, you can use [scrip
> [!TIP]
> If there is no official `tool_use` Jinja template, you may want to set `--chat-template chatml` to use a default that works with many models (YMMV!), or write your own (e.g. we provide a custom [llama-cpp-deepseek-r1.jinja](../models/templates/llama-cpp-deepseek-r1.jinja) for DeepSeek R1 distills)
> [!CAUTION]
> Beware of extreme KV quantizations (e.g. `-ctk q4_0`), they can substantially degrade the model's tool calling performance.
Test in CLI (or with any library / software that can use OpenAI-compatible API backends):
```bash
curl http://localhost:8080/v1/chat/completions -d '{
"model": "gpt-3.5-turbo",
"tools": [
{
"type":"function",
"function":{
"name":"python",
"description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
"parameters":{
"type":"object",
"properties":{
"code":{
"type":"string",
"description":"The code to run in the ipython interpreter."
}
},
"required":["code"]
"model": "gpt-3.5-turbo",
"tools": [
{
"type":"function",
"function":{
"name":"python",
"description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
"parameters":{
"type":"object",
"properties":{
"code":{
"type":"string",
"description":"The code to run in the ipython interpreter."
}
},
"required":["code"]
}
}
],
"messages": [
{
"role": "user",
"content": "Print a hello world message with python."
}
]
}'
curl http://localhost:8080/v1/chat/completions -d '{
"model": "gpt-3.5-turbo",
"messages": [
{"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things."},
{"role": "user", "content": "What is the weather in Istanbul?"}
],
"tools": [{
"type":"function",
"function":{
"name":"get_current_weather",
"description":"Get the current weather in a given location",
"parameters":{
"type":"object",
"properties":{
"location":{
"type":"string",
"description":"The city and country/state, e.g. `San Francisco, CA`, or `Paris, France`"
}
},
"required":["location"]
}
}
}]
}
}
],
"messages": [
{
"role": "user",
"content": "Print a hello world message with python."
}
]
}'
```

View File

@@ -1,42 +1,28 @@
# Install pre-built version of llama.cpp
| Install via | Windows | Mac | Linux |
|-------------|---------|-----|-------|
| Winget | ✅ | | |
| Homebrew | | ✅ | ✅ |
| MacPorts | | ✅ | |
| Nix | | ✅ | ✅ |
## Homebrew
## Winget (Windows)
```sh
winget install llama.cpp
```
The package is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/issues/8188
## Homebrew (Mac and Linux)
On Mac and Linux, the homebrew package manager can be used via
```sh
brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668
## MacPorts (Mac)
## MacPorts
```sh
sudo port install llama.cpp
```
see also: https://ports.macports.org/port/llama.cpp/details/
See also: https://ports.macports.org/port/llama.cpp/details/
## Nix
## Nix (Mac and Linux)
On Mac and Linux, the Nix package manager can be used via
```sh
nix profile install nixpkgs#llama-cpp
```
For flake enabled installs.
Or
@@ -48,3 +34,13 @@ nix-env --file '<nixpkgs>' --install --attr llama-cpp
For non-flake enabled installs.
This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
## Flox
On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
```sh
flox install llama-cpp
```
Flox follows the nixpkgs build of llama.cpp.

View File

@@ -1,109 +0,0 @@
# Multimodal
llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools that support this feature:
- [llama-mtmd-cli](../tools/mtmd/README.md)
- [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API
Currently, we support **image** and **audio** input. Audio is highly experimental and may have reduced quality.
To enable it, you can use one of the 2 methods below:
- Use `-hf` option with a supported model (see a list of pre-quantized model below)
- To load a model using `-hf` while disabling multimodal, use `--no-mmproj`
- To load a model using `-hf` while using a custom mmproj file, use `--mmproj local_file.gguf`
- Use `-m model.gguf` option with `--mmproj file.gguf` to specify text and multimodal projector respectively
By default, the multimodal projector will be offloaded to the GPU. To disable this, add `--no-mmproj-offload`.
For example:
```sh
# simple usage with CLI
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF
# simple usage with server
llama-server -hf ggml-org/gemma-3-4b-it-GGUF
# using local file
llama-server -m gemma-3-4b-it-Q4_K_M.gguf --mmproj mmproj-gemma-3-4b-it-Q4_K_M.gguf
# no GPU offload
llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
```
## Pre-quantized models
These are ready-to-use models, most of them come with `Q4_K_M` quantization by default. They can be found at the Hugging Face page of the ggml-org: https://huggingface.co/collections/ggml-org/multimodal-ggufs-68244e01ff1f39e5bebeeedc
Replace `(tool_name)` with the name of the binary you want to use, for example `llama-mtmd-cli` or `llama-server`.
NOTE: some models may require a large context window, for example: `-c 8192`
**Vision models**:
```sh
# Gemma 3
(tool_name) -hf ggml-org/gemma-3-4b-it-GGUF
(tool_name) -hf ggml-org/gemma-3-12b-it-GGUF
(tool_name) -hf ggml-org/gemma-3-27b-it-GGUF
# SmolVLM
(tool_name) -hf ggml-org/SmolVLM-Instruct-GGUF
(tool_name) -hf ggml-org/SmolVLM-256M-Instruct-GGUF
(tool_name) -hf ggml-org/SmolVLM-500M-Instruct-GGUF
(tool_name) -hf ggml-org/SmolVLM2-2.2B-Instruct-GGUF
(tool_name) -hf ggml-org/SmolVLM2-256M-Video-Instruct-GGUF
(tool_name) -hf ggml-org/SmolVLM2-500M-Video-Instruct-GGUF
# Pixtral 12B
(tool_name) -hf ggml-org/pixtral-12b-GGUF
# Qwen 2 VL
(tool_name) -hf ggml-org/Qwen2-VL-2B-Instruct-GGUF
(tool_name) -hf ggml-org/Qwen2-VL-7B-Instruct-GGUF
# Qwen 2.5 VL
(tool_name) -hf ggml-org/Qwen2.5-VL-3B-Instruct-GGUF
(tool_name) -hf ggml-org/Qwen2.5-VL-7B-Instruct-GGUF
(tool_name) -hf ggml-org/Qwen2.5-VL-32B-Instruct-GGUF
(tool_name) -hf ggml-org/Qwen2.5-VL-72B-Instruct-GGUF
# Mistral Small 3.1 24B (IQ2_M quantization)
(tool_name) -hf ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF
# InternVL 2.5 and 3
(tool_name) -hf ggml-org/InternVL2_5-1B-GGUF
(tool_name) -hf ggml-org/InternVL2_5-4B-GGUF
(tool_name) -hf ggml-org/InternVL3-1B-Instruct-GGUF
(tool_name) -hf ggml-org/InternVL3-2B-Instruct-GGUF
(tool_name) -hf ggml-org/InternVL3-8B-Instruct-GGUF
(tool_name) -hf ggml-org/InternVL3-14B-Instruct-GGUF
# Llama 4 Scout
(tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF
# Moondream2 20250414 version
(tool_name) -hf ggml-org/moondream2-20250414-GGUF
```
**Audio models**:
```sh
# Ultravox 0.5
(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF
(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF
# Qwen2-Audio and SeaLLM-Audio
# note: no pre-quantized GGUF for this model, as the results are very poor
# ref: https://github.com/ggml-org/llama.cpp/pull/13760
```
**Mixed modalities**:
```sh
# Qwen2.5 Omni
# Capabilities: audio input, vision input
(tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF
(tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF
```

View File

@@ -12,30 +12,60 @@ llama_add_compile_flags()
# examples
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN)
else()
add_subdirectory(batched-bench)
add_subdirectory(batched)
add_subdirectory(embedding)
add_subdirectory(eval-callback)
if (NOT WIN32)
# disabled on Windows because it uses internal functions not exported with LLAMA_API
add_subdirectory(gbnf-validator)
endif()
add_subdirectory(gguf-hash)
add_subdirectory(gguf-split)
add_subdirectory(gguf)
add_subdirectory(gritlm)
add_subdirectory(imatrix)
add_subdirectory(infill)
add_subdirectory(llama-bench)
add_subdirectory(lookahead)
add_subdirectory(lookup)
add_subdirectory(main)
add_subdirectory(parallel)
add_subdirectory(passkey)
add_subdirectory(perplexity)
add_subdirectory(quantize)
add_subdirectory(retrieval)
if (LLAMA_BUILD_SERVER)
add_subdirectory(server)
endif()
add_subdirectory(save-load-state)
add_subdirectory(run)
add_subdirectory(simple)
add_subdirectory(simple-chat)
add_subdirectory(speculative)
add_subdirectory(speculative-simple)
add_subdirectory(tokenize)
add_subdirectory(tts)
add_subdirectory(gen-docs)
add_subdirectory(training)
if (NOT GGML_BACKEND_DL)
add_subdirectory(convert-llama2c-to-ggml)
# these examples use the backends directly and cannot be built with dynamic loading
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(cvector-generator)
add_subdirectory(export-lora)
if (NOT WIN32)
# disabled on Windows because it uses internal functions not exported with LLAMA_API
add_subdirectory(quantize-stats)
endif()
add_subdirectory(llava)
if (GGML_RPC)
add_subdirectory(rpc)
endif()
if (GGML_SYCL)
add_subdirectory(sycl)
endif()

View File

@@ -57,8 +57,6 @@ int main(int argc, char ** argv) {
return 1;
}
auto * mem = llama_get_memory(ctx);
const int32_t n_kv_max = llama_n_ctx(ctx);
llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
@@ -125,8 +123,8 @@ int main(int argc, char ** argv) {
common_batch_clear(batch);
for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
for (int i = 0; i < pp; ++i) {
for (int i = 0; i < pp; ++i) {
for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
common_batch_add(batch, 0, i, { j }, false);
}
}
@@ -134,7 +132,7 @@ int main(int argc, char ** argv) {
const auto t_pp_start = ggml_time_us();
llama_memory_clear(mem, false);
llama_kv_self_clear(ctx);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
@@ -143,7 +141,7 @@ int main(int argc, char ** argv) {
if (is_pp_shared) {
for (int32_t i = 1; i < pl; ++i) {
llama_memory_seq_cp(mem, 0, i, -1, -1);
llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
}
}

View File

@@ -116,7 +116,7 @@ if llama_decode(context, batch) != 0 {
}
for i in 1 ..< n_parallel {
llama_memory_seq_cp(llama_get_memory(context), 0, Int32(i), 0, batch.n_tokens)
llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
}
if n_parallel > 1 {

View File

@@ -342,7 +342,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
}
static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
llama_memory_clear(llama_get_memory(ctx), true);
llama_kv_self_clear(ctx);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;

View File

@@ -35,14 +35,23 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
const struct llama_model * model = llama_get_model(ctx);
// clear previous kv_cache values (irrelevant for embeddings)
llama_memory_clear(llama_get_memory(ctx), true);
llama_kv_self_clear(ctx);
// run model
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_decode(ctx, batch) < 0) {
LOG_ERR("%s : failed to process\n", __func__);
if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
// encoder-only model
if (llama_encode(ctx, batch) < 0) {
LOG_ERR("%s : failed to encode\n", __func__);
}
} else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
// decoder-only model
if (llama_decode(ctx, batch) < 0) {
LOG_ERR("%s : failed to decode\n", __func__);
}
}
for (int i = 0; i < batch.n_tokens; i++) {
@@ -80,13 +89,6 @@ int main(int argc, char ** argv) {
common_init();
params.embedding = true;
// utilize the full context
if (params.n_batch < params.n_ctx) {
LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);
params.n_batch = params.n_ctx;
}
// For non-causal models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch;
@@ -132,6 +134,7 @@ int main(int argc, char ** argv) {
// max batch size
const uint64_t n_batch = params.n_batch;
GGML_ASSERT(params.n_batch >= params.n_ctx);
// tokenize the prompts and trim
std::vector<std::vector<int32_t>> inputs;
@@ -236,24 +239,9 @@ int main(int argc, char ** argv) {
LOG("\n");
}
} else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
const uint32_t n_cls_out = llama_model_n_cls_out(model);
std::vector<std::string> cls_out_labels;
for (uint32_t i = 0; i < n_cls_out; i++) {
const char * label = llama_model_cls_label(model, i);
const std::string label_i(label == nullptr ? "" : label);
cls_out_labels.emplace_back(label_i.empty() ? std::to_string(i) : label_i);
}
for (int j = 0; j < n_embd_count; j++) {
for (uint32_t i = 0; i < n_cls_out; i++) {
// NOTE: if you change this log - update the tests in ci/run.sh
if (n_cls_out == 1) {
LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
} else {
LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str());
}
}
// NOTE: if you change this log - update the tests in ci/run.sh
LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
}
} else {
// print the first part of the embeddings or for a single prompt, the full embedding

View File

@@ -0,0 +1,5 @@
set(TARGET llama-gbnf-validator)
add_executable(${TARGET} gbnf-validator.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,5 +1,5 @@
#include "../src/unicode.h"
#include "../src/llama-grammar.h"
#include "unicode.h"
#include "llama-grammar.h"
#include <cstdio>
#include <cstdlib>

View File

@@ -45,7 +45,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
}
// clear previous kv_cache values (irrelevant for embeddings)
llama_memory_clear(llama_get_memory(ctx), true);
llama_kv_self_clear(ctx);
llama_set_embeddings(ctx, true);
llama_set_causal_attn(ctx, false);
@@ -102,7 +102,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
llama_token eos_token = llama_vocab_eos(vocab);
llama_memory_clear(llama_get_memory(ctx), true);
llama_kv_self_clear(ctx);
llama_set_embeddings(ctx, false);
llama_set_causal_attn(ctx, true);

View File

@@ -1,4 +1,4 @@
# llama.cpp/tools/imatrix
# llama.cpp/examples/imatrix
Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantized models.
More information is available here: https://github.com/ggml-org/llama.cpp/pull/4861

View File

@@ -24,8 +24,7 @@ static void print_usage(int, char ** argv) {
LOG("\n %s \\\n"
" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
" [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
" [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...] \\\n"
" [--parse-special]\n" , argv[0]);
" [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
LOG("\n");
}
@@ -47,7 +46,7 @@ private:
common_params m_params;
std::mutex m_mutex;
int m_last_call = 0;
std::vector<char> m_src1_data;
std::vector<float> m_src1_data;
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
};
@@ -94,13 +93,11 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
const bool is_host = ggml_backend_buffer_is_host(src1->buffer);
if (!is_host) {
const size_t src1_nbytes = ggml_nbytes(src1);
m_src1_data.resize(src1_nbytes);
ggml_backend_tensor_get(src1, m_src1_data.data(), 0, src1_nbytes);
m_src1_data.resize(ggml_nelements(src1));
ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
}
const char * data = is_host ? (const char *) src1->data : m_src1_data.data();
GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
// this has been adapted to the new format of storing merged experts in a single 3d tensor
// ref: https://github.com/ggml-org/llama.cpp/pull/6387
@@ -147,7 +144,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
const int64_t i11 = idx % src1->ne[1];
const int64_t i12 = row;
const float * x = (const float *)(data + i11*src1->nb[1] + i12*src1->nb[2]);
const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[e_start + j] += x[j]*x[j];
@@ -183,7 +180,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
++e.ncall;
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
for (int row = 0; row < (int)src1->ne[1]; ++row) {
const float * x = (const float *) (data + row * src1->nb[1]);
const float * x = data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
e.counts[j]++;
@@ -440,7 +437,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
auto tim1 = std::chrono::high_resolution_clock::now();
LOG_INF("%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true, params.parse_special);
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
auto tim2 = std::chrono::high_resolution_clock::now();
LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -498,7 +495,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
llama_memory_clear(llama_get_memory(ctx), true);
llama_kv_self_clear(ctx);
llama_batch batch = llama_batch_init(n_batch, 0, 1);
@@ -586,6 +583,7 @@ int main(int argc, char ** argv) {
params.out_file = "imatrix.dat" ;
params.n_ctx = 512;
params.logits_all = true;
params.escape = false;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {

View File

@@ -0,0 +1,5 @@
set(TARGET llama-infill)
add_executable(${TARGET} infill.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

47
examples/infill/README.md Normal file
View File

@@ -0,0 +1,47 @@
# llama.cpp/example/infill
This example shows how to use the infill mode with Code Llama models supporting infill mode.
Currently the 7B and 13B models support infill mode.
Infill supports most of the options available in the main example.
For further information, have a look at the main README.md in llama.cpp/example/main/README.md.
## Common Options
In this section, we cover the most commonly used options for running the `infill` program with the LLaMA models:
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
- `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
## Input Prompts
The `infill` program provides several ways to interact with the LLaMA models using input prompts:
- `--in-prefix PROMPT_BEFORE_CURSOR`: Provide the prefix directly as a command-line option.
- `--in-suffix PROMPT_AFTER_CURSOR`: Provide the suffix directly as a command-line option.
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
## Interaction
The `infill` program offers a seamless way to interact with LLaMA models, allowing users to receive real-time infill suggestions. The interactive mode can be triggered using `--interactive` and `--interactive-first`.
### Interaction Options
- `-i, --interactive`: Run the program in interactive mode, allowing users to get real-time code suggestions from the model.
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
- `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text.
### Example
Download a model that supports infill, for example CodeLlama:
```console
scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
```
```bash
./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
```

590
examples/infill/infill.cpp Normal file
View File

@@ -0,0 +1,590 @@
#include "arg.h"
#include "common.h"
#include "console.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
static llama_context ** g_ctx;
static llama_model ** g_model;
static common_sampler ** g_smpl;
static common_params * g_params;
static std::vector<llama_token> * g_input_tokens;
static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false;
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
static void sigint_handler(int signo) {
if (signo == SIGINT) {
if (!is_interacting) {
is_interacting = true;
} else {
console::cleanup();
LOG("\n");
common_perf_print(*g_ctx, *g_smpl);
// make sure all logs are flushed
LOG("Interrupted by user\n");
common_log_pause(common_log_main());
_exit(130);
}
}
}
#endif
int main(int argc, char ** argv) {
common_params params;
g_params = &params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
return 1;
}
common_init();
auto & sparams = params.sampling;
console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); });
if (params.logits_all) {
LOG_ERR("\n************\n");
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
LOG_ERR("************\n\n");
return 0;
}
if (params.embedding) {
LOG_ERR("\n************\n");
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
LOG_ERR("************\n\n");
return 0;
}
if (params.n_ctx != 0 && params.n_ctx < 8) {
LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
LOG_ERR("\n************\n");
LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
LOG_ERR("************\n\n");
return 0;
}
if (params.rope_freq_base != 0.0) {
LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
}
if (params.rope_freq_scale != 0.0) {
LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
}
LOG_INF("%s: llama backend init\n", __func__);
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model = nullptr;
llama_context * ctx = nullptr;
common_sampler * smpl = nullptr;
g_model = &model;
g_ctx = &ctx;
g_smpl = &smpl;
// load the model and apply lora adapter, if any
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
common_init_result llama_init = common_init_from_params(params);
model = llama_init.model.get();
ctx = llama_init.context.get();
if (model == NULL) {
LOG_ERR("%s: unable to load model\n", __func__);
return 1;
}
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_ctx_train = llama_model_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG_DBG("n_ctx: %d\n", n_ctx);
if (n_ctx > n_ctx_train) {
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
}
// print system information
{
LOG_INF("\n");
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
}
const bool add_bos = llama_vocab_get_add_bos(vocab);
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
std::vector<llama_token> embd_inp;
std::vector<llama_token> embd_end;
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
GGML_ASSERT(llama_vocab_fim_pre(vocab) >= 0);
GGML_ASSERT(llama_vocab_fim_suf(vocab) >= 0);
inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
if (add_bos) {
embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
}
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
const llama_token middle_token = llama_vocab_fim_mid(vocab);
if (middle_token >= 0) {
embd_inp.push_back(middle_token);
}
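// resulting layout (illustrative; assuming add_bos == true, and <FIM_MID> only
// when the vocab defines it):
//   default:      [BOS] <FIM_PRE> prefix tokens <FIM_SUF> suffix tokens <FIM_MID>
//   --spm-infill: [BOS] <FIM_SUF> suffix tokens <FIM_PRE> prefix tokens <FIM_MID>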
LOG_DBG("add_bos: %d\n", add_bos);
LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
// Should not run without any tokens
if (embd_inp.empty()) {
embd_inp.push_back(llama_vocab_bos(vocab));
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
}
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
}
// number of tokens to keep when resetting context
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
params.n_keep = (int)embd_inp.size();
}
LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
// enable interactive mode if interactive start is specified
if (params.interactive_first) {
params.interactive = true;
}
if (params.verbose_prompt) {
LOG_INF("\n");
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
}
if (params.n_keep > 0) {
LOG_INF("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
}
LOG_CNT("'\n");
}
LOG_INF("\n");
}
if (params.interactive) {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = sigint_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
LOG_INF("%s: interactive mode on.\n", __func__);
if (params.input_prefix_bos) {
LOG_INF("Input prefix with BOS\n");
}
if (!params.input_prefix.empty()) {
LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
}
if (!params.input_suffix.empty()) {
LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
}
}
smpl = common_sampler_init(model, sparams);
LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_INF("\n");
LOG_INF("\n##### Infill mode #####\n\n");
if (params.interactive) {
const char *control_message;
if (params.multiline_input) {
control_message = " - To return control to LLaMA, end your input with '\\'.\n"
" - To return control without starting a new line, end your input with '/'.\n";
} else {
control_message = " - Press Return to return control to LLaMA.\n"
" - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n";
}
LOG_INF("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
LOG_INF( " - Press Ctrl+C to interject at any time.\n");
#endif
LOG_INF( "%s\n", control_message);
is_interacting = params.interactive_first;
}
bool input_echo = true;
int n_past = 0;
int n_remain = params.n_predict;
int n_consumed = 0;
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
std::ostringstream output_ss; g_output_ss = &output_ss;
// the first thing we will do is to output the prompt, so set color accordingly
console::set_display(console::prompt);
std::vector<llama_token> embd;
while (n_remain != 0 || params.interactive) {
// predict
if (!embd.empty()) {
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
// --prompt or --file which uses the same value.
int max_embd_size = n_ctx - 4;
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
if ((int) embd.size() > max_embd_size) {
const int skipped_tokens = (int) embd.size() - max_embd_size;
embd.resize(max_embd_size);
console::set_display(console::error);
LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console::set_display(console::reset);
}
// infinite text generation via context swapping
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
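// worked example (hypothetical numbers): with n_ctx = 512, n_keep = 10 and
// n_past = 512, n_left = 501 and n_discard = 250; tokens [11, 261) are removed,
// tokens [261, 512) are shifted down by 250, and generation continues without
// re-evaluating the kept prefix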
if (n_past + (int) embd.size() > n_ctx) {
if (params.n_predict == -2) {
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
}
const int n_left = n_past - params.n_keep - 1;
const int n_discard = n_left/2;
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1, params.n_keep + n_discard + 1);
llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
n_past -= n_discard;
LOG_DBG("after swap: n_past = %d\n", n_past);
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
}
// evaluate tokens in batches
// embd is typically prepared beforehand to fit within a batch, but not always
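// e.g. (hypothetical sizes) embd.size() = 1200 with n_batch = 512 decodes in
// chunks of 512, 512 and 176 tokens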
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) {
n_eval = params.n_batch;
}
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
n_past += n_eval;
LOG_DBG("n_past = %d\n", n_past);
}
}
embd.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
const llama_token id = common_sampler_sample(smpl, ctx, -1);
common_sampler_accept(smpl, id, true);
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
embd.push_back(id);
// echo this to console
input_echo = true;
// decrement remaining sampling budget
--n_remain;
LOG_DBG("n_remain: %d\n", n_remain);
} else {
// some user input remains from prompt or interaction, forward it to processing
LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
common_sampler_accept(smpl, embd_inp[n_consumed], false);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
break;
}
}
}
// display text
if (input_echo) {
for (auto id : embd) {
const std::string token_str = common_token_to_piece(ctx, id);
LOG("%s", token_str.c_str());
if (embd.size() > 1) {
input_tokens.push_back(id);
} else {
output_tokens.push_back(id);
output_ss << token_str;
}
}
}
// reset color to default if there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) {
console::set_display(console::reset);
}
// if not currently processing queued inputs
if ((int) embd_inp.size() <= n_consumed) {
// deal with eot token in infill mode
if ((common_sampler_last(smpl) == llama_vocab_eot(vocab) || is_interacting) && params.interactive) {
if (is_interacting && !params.interactive_first) {
// print an eot token
LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
}
LOG("\n");
console::set_display(console::user_input);
std::string buffer;
std::string line;
bool another_line = true;
// set a new prefix via stdin
do {
another_line = console::readline(line, params.multiline_input);
buffer += line;
} while (another_line);
// check if we got an empty line, if so we use the old input
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
params.input_prefix = buffer;
}
buffer.clear();
// set a new suffix via stdin
do {
another_line = console::readline(line, params.multiline_input);
buffer += line;
} while (another_line);
// check if we got an empty line
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
params.input_suffix = buffer;
}
buffer.clear();
// done taking input, reset color
console::set_display(console::reset);
if (params.escape) {
// process escape sequences; for the initial prompt this is done in common.cpp when the params are loaded, but in interactive mode we need to do it here
string_process_escapes(params.input_prefix);
string_process_escapes(params.input_suffix);
}
// tokenize new prefix and suffix
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
if (add_bos) {
embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
}
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
if (middle_token >= 0) {
embd_inp.push_back(middle_token);
}
embd.clear();
n_remain = params.n_predict;
n_past = 0;
n_consumed = 0;
is_interacting = false;
}
// deal with end of generation tokens in interactive mode
else if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
LOG_DBG("found EOS token\n");
if (params.interactive) {
is_interacting = true;
LOG("\n");
console::set_display(console::user_input);
}
}
if (n_past > 0 && is_interacting && !params.interactive) {
LOG_DBG("waiting for user input\n");
if (params.input_prefix_bos) {
LOG_DBG("adding input prefix BOS token\n");
embd_inp.push_back(llama_vocab_bos(vocab));
}
std::string buffer;
if (!params.input_prefix.empty()) {
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
buffer += params.input_prefix;
LOG("%s", buffer.c_str());
}
std::string line;
bool another_line = true;
do {
another_line = console::readline(line, params.multiline_input);
buffer += line;
} while (another_line);
// done taking input, reset color
console::set_display(console::reset);
// Add tokens to embd only if the input buffer is non-empty
// Entering an empty line lets the user pass control back
if (buffer.length() > 1) {
// append input suffix if any
if (!params.input_suffix.empty()) {
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
buffer += params.input_suffix;
LOG("%s", params.input_suffix.c_str());
}
LOG_DBG("buffer: '%s'\n", buffer.c_str());
const size_t original_size = embd_inp.size();
const auto line_inp = common_tokenize(ctx, buffer, false);
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i];
output_tokens.push_back(token);
output_ss << common_token_to_piece(ctx, token);
}
n_remain -= line_inp.size();
LOG_DBG("n_remain: %d\n", n_remain);
} else {
LOG_DBG("empty line, passing control back\n");
}
input_echo = false; // do not echo this again
}
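// once control returns from the user, clear the interaction flag and reset
// the sampler state for the next turn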
if (n_past > 0) {
if (is_interacting) {
common_sampler_reset(smpl);
}
is_interacting = false;
}
}
// end of generation
if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !params.interactive) {
break;
}
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
// We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
n_remain = params.n_predict;
is_interacting = true;
}
}
if (!params.interactive && n_remain <= 0) {
LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
}
LOG("\n");
common_perf_print(ctx, smpl);
common_sampler_free(smpl);
llama_backend_free();
return 0;
}

View File

@@ -10,9 +10,6 @@ from typing import Any, List, Optional, Set, Tuple, Union
def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
if max_items == 0:
return ""
if min_items == 0 and max_items == 1:
return f'{item_rule}?'
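# e.g. (illustrative) _build_repetition('item', min_items=0, max_items=0) -> ''
#      and _build_repetition('item', min_items=0, max_items=1) -> 'item?'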

View File

@@ -1,4 +1,4 @@
# llama.cpp/tools/llama-bench
# llama.cpp/examples/llama-bench
Performance testing tool for llama.cpp.
@@ -20,50 +20,40 @@ Performance testing tool for llama.cpp.
## Syntax
```
usage: llama-bench [options]
usage: ./llama-bench [options]
options:
-h, --help
--numa <distribute|isolate|numactl> numa mode (default: disabled)
-r, --repetitions <n> number of times to repeat each test (default: 5)
--prio <0|1|2|3> process/thread priority (default: 0)
--delay <0...N> (seconds) delay between each test (default: 0)
-o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: md)
-oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: none)
-v, --verbose verbose output
--progress print test progress indicators
test parameters:
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
-p, --n-prompt <n> (default: 512)
-n, --n-gen <n> (default: 128)
-pg <pp,tg> (default: )
-d, --n-depth <n> (default: 0)
-b, --batch-size <n> (default: 2048)
-ub, --ubatch-size <n> (default: 512)
-ctk, --cache-type-k <t> (default: f16)
-ctv, --cache-type-v <t> (default: f16)
-dt, --defrag-thold <f> (default: -1)
-t, --threads <n> (default: system dependent)
-t, --threads <n> (default: 8)
-C, --cpu-mask <hex,hex> (default: 0x0)
--cpu-strict <0|1> (default: 0)
--poll <0...100> (default: 50)
-ngl, --n-gpu-layers <n> (default: 99)
-rpc, --rpc <rpc_servers> (default: none)
-rpc, --rpc <rpc_servers> (default: )
-sm, --split-mode <none|layer|row> (default: layer)
-mg, --main-gpu <i> (default: 0)
-nkvo, --no-kv-offload <0|1> (default: 0)
-fa, --flash-attn <0|1> (default: 0)
-mmp, --mmap <0|1> (default: 1)
--numa <distribute|isolate|numactl> (default: disabled)
-embd, --embeddings <0|1> (default: 0)
-ts, --tensor-split <ts0/ts1/..> (default: 0)
-ot --override-tensors <tensor name pattern>=<buffer type>;...
(default: disabled)
-nopo, --no-op-offload <0|1> (default: 0)
-r, --repetitions <n> (default: 5)
--prio <0|1|2|3> (default: 0)
--delay <0...N> (seconds) (default: 0)
-o, --output <csv|json|jsonl|md|sql> (default: md)
-oe, --output-err <csv|json|jsonl|md|sql> (default: none)
-v, --verbose (default: 0)
Multiple values can be given for each parameter by separating them with ','
or by specifying the parameter multiple times. Ranges can be given as
'first-last' or 'first-last+step' or 'first-last*mult'.
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
```
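For example, a hypothetical invocation using the range syntax (here `10-30+10` expands to `10,20,30`):
```
$ ./llama-bench -ngl 10-30+10
```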
llama-bench can perform three types of tests:
@@ -76,10 +66,12 @@ With the exception of `-r`, `-o` and `-v`, all options can be specified multiple
Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition.
Using the `-d <n>` option, each test can be run at a specified context depth, prefilling the KV cache with `<n>` tokens.
For a description of the other options, see the [main example](../main/README.md).
Note:
- When using the SYCL backend, a hang can occur in some cases. Please set `--mmp 0` as a workaround.
## Examples
### Text generation with different models
@@ -156,19 +148,6 @@ $ ./llama-bench -ngl 10,20,30,31,32,33,34,35
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 |
### Different prefilled context
```
$ ./llama-bench -d 0,512
```
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 | 7340.20 ± 23.45 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 | 120.60 ± 0.59 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 @ d512 | 6425.91 ± 18.88 |
| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 @ d512 | 116.71 ± 0.60 |
## Output formats
By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option.
@@ -191,9 +170,9 @@ $ ./llama-bench -o csv
```
```csv
build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434"
"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617"
build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
```
### JSON
@@ -205,78 +184,64 @@ $ ./llama-bench -o json
```json
[
{
"build_commit": "8cf427ff",
"build_number": 5163,
"cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor",
"gpu_info": "NVIDIA GeForce RTX 4080",
"backends": "CUDA",
"model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
"model_type": "qwen2 7B Q4_K - Medium",
"model_size": 4677120000,
"model_n_params": 7615616512,
"n_batch": 2048,
"n_ubatch": 512,
"n_threads": 8,
"cpu_mask": "0x0",
"cpu_strict": false,
"poll": 50,
"type_k": "f16",
"type_v": "f16",
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
"metal": false,
"gpu_blas": true,
"blas": true,
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
"model_filename": "models/7B/ggml-model-q4_0.gguf",
"model_type": "llama 7B mostly Q4_0",
"model_size": 3825065984,
"model_n_params": 6738415616,
"n_batch": 512,
"n_threads": 16,
"f16_kv": true,
"n_gpu_layers": 99,
"split_mode": "layer",
"main_gpu": 0,
"no_kv_offload": false,
"flash_attn": false,
"mul_mat_q": true,
"tensor_split": "0.00",
"use_mmap": true,
"embeddings": false,
"n_prompt": 512,
"n_gen": 0,
"n_depth": 0,
"test_time": "2025-04-24T11:58:50Z",
"avg_ns": 72135640,
"stddev_ns": 1453752,
"avg_ts": 7100.002165,
"stddev_ts": 140.341520,
"samples_ns": [ 74601900, 71632900, 71745200, 71952700, 70745500 ],
"samples_ts": [ 6863.1, 7147.55, 7136.37, 7115.79, 7237.21 ]
"test_time": "2023-09-23T12:09:57Z",
"avg_ns": 212365953,
"stddev_ns": 985423,
"avg_ts": 2410.974041,
"stddev_ts": 11.163766,
"samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ],
"samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ]
},
{
"build_commit": "8cf427ff",
"build_number": 5163,
"cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor",
"gpu_info": "NVIDIA GeForce RTX 4080",
"backends": "CUDA",
"model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
"model_type": "qwen2 7B Q4_K - Medium",
"model_size": 4677120000,
"model_n_params": 7615616512,
"n_batch": 2048,
"n_ubatch": 512,
"n_threads": 8,
"cpu_mask": "0x0",
"cpu_strict": false,
"poll": 50,
"type_k": "f16",
"type_v": "f16",
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
"metal": false,
"gpu_blas": true,
"blas": true,
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
"model_filename": "models/7B/ggml-model-q4_0.gguf",
"model_type": "llama 7B mostly Q4_0",
"model_size": 3825065984,
"model_n_params": 6738415616,
"n_batch": 512,
"n_threads": 16,
"f16_kv": true,
"n_gpu_layers": 99,
"split_mode": "layer",
"main_gpu": 0,
"no_kv_offload": false,
"flash_attn": false,
"mul_mat_q": true,
"tensor_split": "0.00",
"use_mmap": true,
"embeddings": false,
"n_prompt": 0,
"n_gen": 128,
"n_depth": 0,
"test_time": "2025-04-24T11:58:51Z",
"avg_ns": 1076767880,
"stddev_ns": 9449585,
"avg_ts": 118.881588,
"stddev_ts": 1.041811,
"samples_ns": [ 1075361300, 1065089400, 1071761200, 1081934900, 1089692600 ],
"samples_ts": [ 119.03, 120.178, 119.43, 118.307, 117.464 ]
"test_time": "2023-09-23T12:09:59Z",
"avg_ns": 977425219,
"stddev_ns": 9268593,
"avg_ts": 130.965708,
"stddev_ts": 1.238924,
"samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ],
"samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ]
}
]
```
@@ -289,8 +254,8 @@ $ ./llama-bench -o jsonl
```
```json lines
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]}
{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]}
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]}
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]}
```
@@ -306,32 +271,25 @@ $ ./llama-bench -o sql
CREATE TABLE IF NOT EXISTS test (
build_commit TEXT,
build_number INTEGER,
cuda INTEGER,
metal INTEGER,
gpu_blas INTEGER,
blas INTEGER,
cpu_info TEXT,
gpu_info TEXT,
backends TEXT,
model_filename TEXT,
model_type TEXT,
model_size INTEGER,
model_n_params INTEGER,
n_batch INTEGER,
n_ubatch INTEGER,
n_threads INTEGER,
cpu_mask TEXT,
cpu_strict INTEGER,
poll INTEGER,
type_k TEXT,
type_v TEXT,
f16_kv INTEGER,
n_gpu_layers INTEGER,
split_mode TEXT,
main_gpu INTEGER,
no_kv_offload INTEGER,
flash_attn INTEGER,
mul_mat_q INTEGER,
tensor_split TEXT,
use_mmap INTEGER,
embeddings INTEGER,
n_prompt INTEGER,
n_gen INTEGER,
n_depth INTEGER,
test_time TEXT,
avg_ns INTEGER,
stddev_ns INTEGER,
@@ -339,6 +297,6 @@ CREATE TABLE IF NOT EXISTS test (
stddev_ts REAL
);
INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613');
INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647');
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
```

Some files were not shown because too many files have changed in this diff.