Compare commits

..

12 Commits

Author SHA1 Message Date
Sigbjørn Skjæret
d3a2eb592d disable on windows 2025-05-31 23:17:18 +02:00
Sigbjørn Skjæret
7210ebe230 revert build changes 2025-05-31 23:16:56 +02:00
Sigbjørn Skjæret
05f94a0e90 add arch to matrix 2025-05-31 22:54:37 +02:00
Sigbjørn Skjæret
f9a27178e5 download in batches 2025-05-31 22:35:26 +02:00
Sigbjørn Skjæret
de8ec1348b Merge branch 'master' into cisc/test-tokenizers-remote 2025-05-31 21:25:34 +02:00
Sigbjørn Skjæret
8e1125a8db copy curl dll for tests 2025-05-31 21:22:37 +02:00
Sigbjørn Skjæret
4b4843adf3 windows builds adds build type to runtime output 2025-05-30 11:51:46 +02:00
Sigbjørn Skjæret
d97b9ade51 correct working directory for all builds
..and change cache file name as per suggestion.
2025-05-28 12:49:36 +02:00
Sigbjørn Skjæret
0fe7183ae4 fix prototype for non-curl builds 2025-05-28 11:11:02 +02:00
Sigbjørn Skjæret
ecbc92acd0 correct working directory 2025-05-28 10:16:34 +02:00
Sigbjørn Skjæret
42ff1867bc add test-tokenizers-remote 2025-05-28 09:51:44 +02:00
Sigbjørn Skjæret
2d2e059f4f make common_download_file_single/multiple public 2025-05-28 09:50:41 +02:00
538 changed files with 43318 additions and 147082 deletions

View File

@@ -22,8 +22,8 @@ AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: false
BinPackParameters: false # OnePerLine
BinPackArguments: true
BinPackParameters: true # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
BraceWrapping:
@@ -70,17 +70,14 @@ ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '".*"'
- Regex: '^<.*\.h>'
Priority: 1
SortPriority: 0
- Regex: '^<.*\.h>'
- Regex: '^<.*'
Priority: 2
SortPriority: 0
- Regex: '^<.*'
Priority: 3
SortPriority: 0
- Regex: '.*'
Priority: 4
Priority: 3
SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''

View File

@@ -1,130 +0,0 @@
# ==============================================================================
# ARGUMENTS
# ==============================================================================
# Define the CANN base image for easier version updates later
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10
# ==============================================================================
# BUILD STAGE
# Compile all binary files and libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS build
# Define the Ascend chip model for compilation. Default is Ascend910B3
ARG ASCEND_SOC_TYPE=Ascend910B3
# -- Install build dependencies --
RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
yum clean all && \
rm -rf /var/cache/yum
# -- Set the working directory --
WORKDIR /app
# -- Copy project files --
COPY . .
# -- Set CANN environment variables (required for compilation) --
# Using ENV instead of `source` allows environment variables to persist across the entire image layer
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
# ... You can add other environment variables from the original file as needed ...
# For brevity, only core variables are listed here. You can paste the original ENV list here.
# -- Build llama.cpp --
# Use the passed ASCEND_SOC_TYPE argument and add general build options
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
&& \
cmake -B build \
-DGGML_CANN=ON \
-DCMAKE_BUILD_TYPE=Release \
-DSOC_TYPE=${ASCEND_SOC_TYPE} \
. && \
cmake --build build --config Release -j$(nproc)
# -- Organize build artifacts for copying in later stages --
# Create a lib directory to store all .so files
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
# Create a full directory to store all executables and Python scripts
RUN mkdir -p /app/full && \
cp build/bin/* /app/full/ && \
cp *.py /app/full/ && \
cp -r gguf-py /app/full/ && \
cp -r requirements /app/full/ && \
cp requirements.txt /app/full/
# If you have a tools.sh script, make sure it is copied here
# cp .devops/tools.sh /app/full/tools.sh
# ==============================================================================
# BASE STAGE
# Create a minimal base image with CANN runtime and common libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS base
# -- Install runtime dependencies --
RUN yum install -y libgomp curl && \
yum clean all && \
rm -rf /var/cache/yum
# -- Set CANN environment variables (required for runtime) --
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=/app:${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
# ... You can add other environment variables from the original file as needed ...
WORKDIR /app
# Copy compiled .so files from the build stage
COPY --from=build /app/lib/ /app
# ==============================================================================
# FINAL STAGES (TARGETS)
# ==============================================================================
### Target: full
# Complete image with all tools, Python bindings, and dependencies
# ==============================================================================
FROM base AS full
COPY --from=build /app/full /app
# Install Python dependencies
RUN yum install -y git python3 python3-pip && \
pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
pip3 install --no-cache-dir -r requirements.txt && \
yum clean all && \
rm -rf /var/cache/yum
# You need to provide a tools.sh script as the entrypoint
ENTRYPOINT ["/app/tools.sh"]
# If there is no tools.sh, you can set the default to start the server
# ENTRYPOINT ["/app/llama-server"]
### Target: light
# Lightweight image containing only llama-cli
# ==============================================================================
FROM base AS light
COPY --from=build /app/full/llama-cli /app
ENTRYPOINT [ "/app/llama-cli" ]
### Target: server
# Dedicated server image containing only llama-server
# ==============================================================================
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

View File

@@ -49,23 +49,19 @@ COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update && \
apt-get install -y \
git \
python3 \
python3-pip \
python3-venv && \
python3 -m venv /opt/venv && \
. /opt/venv/bin/activate && \
pip install --upgrade pip setuptools wheel && \
pip install -r requirements.txt && \
apt autoremove -y && \
apt clean -y && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENV PATH="/opt/venv/bin:$PATH"
ENTRYPOINT ["/app/tools.sh"]

View File

@@ -1,10 +1,10 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc4.2.0
ARG MUSA_VERSION=rc4.0.1
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build

View File

@@ -47,7 +47,6 @@ let
inherit (lib)
cmakeBool
cmakeFeature
optionalAttrs
optionals
strings
;
@@ -198,7 +197,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
];
# Environment variables needed for ROCm
env = optionalAttrs useRocm {
env = optionals useRocm {
ROCM_PATH = "${rocmPackages.clr}";
HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
};

View File

@@ -1,8 +1,8 @@
ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=6.4
ARG AMDGPU_VERSION=6.4
ARG ROCM_VERSION=6.3
ARG AMDGPU_VERSION=6.3
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
set -e
# Read the first argument into a variable

View File

@@ -40,7 +40,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
multiple: true
validations:
required: true

View File

@@ -42,7 +42,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
multiple: true
validations:
required: true

18
.github/labeler.yml vendored
View File

@@ -1,4 +1,10 @@
# https://github.com/actions/labeler
Kompute:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-kompute.h
- ggml/src/ggml-kompute/**
- README-kompute.md
Apple Metal:
- changed-files:
- any-glob-to-any-file:
@@ -80,15 +86,3 @@ nix:
embedding:
- changed-files:
- any-glob-to-any-file: examples/embedding/
Ascend NPU:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-cann.h
- ggml/src/ggml-cann/**
- docs/backend/CANN.md
OpenCL:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-opencl.h
- ggml/src/ggml-opencl/**

View File

@@ -1,51 +0,0 @@
name: Build relocatable cmake package
on:
workflow_dispatch:
workflow_call:
jobs:
linux:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y build-essential tcl
- name: Build
run: |
PREFIX="$(pwd)"/inst
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
-DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release
cmake --install build --prefix "$PREFIX" --config Release
export LLAMA_CONFIG="$PREFIX"/lib/cmake/llama/llama-config.cmake
tclsh <<'EOF'
set build(commit) [string trim [exec git rev-parse --short HEAD]]
set build(number) [string trim [exec git rev-list --count HEAD]]
set build(version) "0.0.$build(number)"
set llamaconfig [read [open "$env(LLAMA_CONFIG)" r]]
set checks [list "set\\(LLAMA_VERSION \\s+$build(version)\\)" \
"set\\(LLAMA_BUILD_COMMIT\\s+$build(commit)\\)" \
"set\\(LLAMA_BUILD_NUMBER\\s+$build(number)\\)"]
puts -nonewline "Checking llama-config.cmake version... "
foreach check $checks {
if {![regexp -expanded -- $check $llamaconfig]} {
puts "\"$check\" failed!"
exit 1
}
}
puts "success."
EOF
cd examples/simple-cmake-pkg
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX"/lib/cmake
cmake --build build

View File

@@ -48,98 +48,98 @@ jobs:
cmake --build build --config Release -j $(nproc)
# ubuntu-24-riscv64-vulkan-cross:
# runs-on: ubuntu-24.04
ubuntu-24-riscv64-vulkan-cross:
runs-on: ubuntu-24.04
# steps:
# - uses: actions/checkout@v4
# - name: Setup Riscv
# run: |
# sudo dpkg --add-architecture riscv64
steps:
- uses: actions/checkout@v4
- name: Setup Riscv
run: |
sudo dpkg --add-architecture riscv64
# # Add arch-specific repositories for non-amd64 architectures
# cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
# deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
# deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
# deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
# deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
# EOF
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF
# sudo apt-get update || true ;# Prevent failure due to missing URLs.
sudo apt-get update || true ;# Prevent failure due to missing URLs.
# sudo apt-get install -y --no-install-recommends \
# build-essential \
# glslc \
# gcc-14-riscv64-linux-gnu \
# g++-14-riscv64-linux-gnu \
# libvulkan-dev:riscv64
sudo apt-get install -y --no-install-recommends \
build-essential \
glslc \
gcc-14-riscv64-linux-gnu \
g++-14-riscv64-linux-gnu \
libvulkan-dev:riscv64
# - name: Build
# run: |
# cmake -B build -DLLAMA_CURL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_VULKAN=ON \
# -DGGML_OPENMP=OFF \
# -DLLAMA_BUILD_EXAMPLES=ON \
# -DLLAMA_BUILD_TOOLS=ON \
# -DLLAMA_BUILD_TESTS=OFF \
# -DCMAKE_SYSTEM_NAME=Linux \
# -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
# -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
# -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
# -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
# -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
# -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
# -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
# -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
# cmake --build build --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
# ubuntu-24-arm64-vulkan-cross:
# runs-on: ubuntu-24.04
ubuntu-24-arm64-vulkan-cross:
runs-on: ubuntu-24.04
# steps:
# - uses: actions/checkout@v4
# - name: Setup Arm64
# run: |
# sudo dpkg --add-architecture arm64
steps:
- uses: actions/checkout@v4
- name: Setup Arm64
run: |
sudo dpkg --add-architecture arm64
# # Add arch-specific repositories for non-amd64 architectures
# cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list
# deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
# deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
# deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
# deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
# EOF
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF
# sudo apt-get update || true ;# Prevent failure due to missing URLs.
sudo apt-get update || true ;# Prevent failure due to missing URLs.
# sudo apt-get install -y --no-install-recommends \
# build-essential \
# glslc \
# crossbuild-essential-arm64 \
# libvulkan-dev:arm64
sudo apt-get install -y --no-install-recommends \
build-essential \
glslc \
crossbuild-essential-arm64 \
libvulkan-dev:arm64
# - name: Build
# run: |
# cmake -B build -DLLAMA_CURL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_VULKAN=ON \
# -DGGML_OPENMP=OFF \
# -DLLAMA_BUILD_EXAMPLES=ON \
# -DLLAMA_BUILD_TOOLS=ON \
# -DLLAMA_BUILD_TESTS=OFF \
# -DCMAKE_SYSTEM_NAME=Linux \
# -DCMAKE_SYSTEM_PROCESSOR=aarch64 \
# -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
# -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
# -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
# -DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \
# -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
# -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
# -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=aarch64 \
-DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
-DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
# cmake --build build --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
ubuntu-24-ppc64el-cpu-cross:
runs-on: ubuntu-24.04
@@ -185,144 +185,31 @@ jobs:
cmake --build build --config Release -j $(nproc)
# ubuntu-24-ppc64el-vulkan-cross:
# runs-on: ubuntu-24.04
# steps:
# - uses: actions/checkout@v4
# - name: Setup PowerPC64le
# run: |
# sudo dpkg --add-architecture ppc64el
# # Add arch-specific repositories for non-amd64 architectures
# cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
# EOF
# sudo apt-get update || true ;# Prevent failure due to missing URLs.
# sudo apt-get install -y --no-install-recommends \
# build-essential \
# glslc \
# gcc-14-powerpc64le-linux-gnu \
# g++-14-powerpc64le-linux-gnu \
# libvulkan-dev:ppc64el
# - name: Build
# run: |
# cmake -B build -DLLAMA_CURL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_VULKAN=ON \
# -DGGML_OPENMP=OFF \
# -DLLAMA_BUILD_EXAMPLES=ON \
# -DLLAMA_BUILD_TOOLS=ON \
# -DLLAMA_BUILD_TESTS=OFF \
# -DCMAKE_SYSTEM_NAME=Linux \
# -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
# -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
# -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
# -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
# -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
# -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
# -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
# -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
# cmake --build build --config Release -j $(nproc)
debian-13-loongarch64-cpu-cross:
ubuntu-24-ppc64el-vulkan-cross:
runs-on: ubuntu-24.04
container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
steps:
- uses: actions/checkout@v4
- name: Setup LoongArch
- name: Setup PowerPC64le
run: |
rm -f /etc/apt/sources.list.d/*
cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
EOF
( echo 'quiet "true";'; \
echo 'APT::Get::Assume-Yes "true";'; \
echo 'APT::Install-Recommends "false";'; \
echo 'Acquire::Check-Valid-Until "false";'; \
echo 'Acquire::Retries "5";'; \
) > /etc/apt/apt.conf.d/99snapshot-repos
apt-get update
apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
dpkg --add-architecture loong64
sudo dpkg --add-architecture ppc64el
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF
apt-get update || true ;# Prevent failure due to missing URLs.
sudo apt-get update || true ;# Prevent failure due to missing URLs.
apt-get install -y --no-install-recommends \
build-essential \
gcc-14-loongarch64-linux-gnu \
g++-14-loongarch64-linux-gnu
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
-DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)
debian-13-loongarch64-vulkan-cross:
runs-on: ubuntu-24.04
container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
steps:
- uses: actions/checkout@v4
- name: Setup LoongArch
run: |
rm -f /etc/apt/sources.list.d/*
cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
EOF
( echo 'quiet "true";'; \
echo 'APT::Get::Assume-Yes "true";'; \
echo 'APT::Install-Recommends "false";'; \
echo 'Acquire::Check-Valid-Until "false";'; \
echo 'Acquire::Retries "5";'; \
) > /etc/apt/apt.conf.d/99snapshot-repos
apt-get update
apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
dpkg --add-architecture loong64
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
EOF
apt-get update || true ;# Prevent failure due to missing URLs.
apt-get install -y --no-install-recommends \
sudo apt-get install -y --no-install-recommends \
build-essential \
glslc \
gcc-14-loongarch64-linux-gnu \
g++-14-loongarch64-linux-gnu \
libvulkan-dev:loong64
gcc-14-powerpc64le-linux-gnu \
g++-14-powerpc64le-linux-gnu \
libvulkan-dev:ppc64el
- name: Build
run: |
@@ -334,11 +221,11 @@ jobs:
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
-DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
-DCMAKE_SYSTEM_PROCESSOR=ppc64 \
-DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

View File

@@ -5,43 +5,10 @@ on:
push:
branches:
- master
paths: [
'.github/workflows/build.yml',
'.github/workflows/build-linux-cross.yml',
'.github/workflows/build-cmake-pkg.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.cu',
'**/*.cuh',
'**/*.swift',
'**/*.m',
'**/*.metal',
'**/*.comp'
]
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build.yml',
'.github/workflows/build-linux-cross.yml',
'.github/workflows/build-cmake-pkg.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.cu',
'**/*.cuh',
'**/*.swift',
'**/*.m',
'**/*.metal',
'**/*.comp'
]
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -84,8 +51,7 @@ jobs:
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=OFF \
-DGGML_METAL_SHADER_DEBUG=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
@@ -135,53 +101,6 @@ jobs:
cd build
ctest -L main --verbose --timeout 900
macOS-latest-cmake-arm64-webgpu:
runs-on: macos-14
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: macOS-latest-cmake-arm64-webgpu
evict-old-files: 1d
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
brew install curl
- name: Dawn Dependency
id: dawn-depends
run: |
DAWN_VERSION="v1.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-macos-latest-Release.tar.gz"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
curl -L -o artifact.tar.gz \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
mkdir dawn
tar -xvf artifact.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build
run: |
export CMAKE_PREFIX_PATH=dawn
cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
ubuntu-cpu-cmake:
strategy:
matrix:
@@ -383,57 +302,6 @@ jobs:
-DGGML_VULKAN=ON
cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
export GGML_VK_VISIBLE_DEVICES=0
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 4200
ubuntu-22-cmake-webgpu:
runs-on: ubuntu-22.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-22-cmake-webgpu
evict-old-files: 1d
- name: Vulkan SDK Dependencies
id: vulkan-depends
run: |
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
- name: Dawn Dependency
id: dawn-depends
run: |
sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
DAWN_VERSION="v1.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-ubuntu-latest-Release.tar.gz"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
curl -L -o artifact.tar.gz \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
mkdir dawn
tar -xvf artifact.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build
run: |
export Dawn_DIR=dawn/lib64/cmake/Dawn
cmake -B build -DGGML_WEBGPU=ON
cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
@@ -483,7 +351,7 @@ jobs:
ubuntu-22-cmake-musa:
runs-on: ubuntu-22.04
container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
steps:
- name: Clone
@@ -609,9 +477,6 @@ jobs:
build-linux-cross:
uses: ./.github/workflows/build-linux-cross.yml
build-cmake-pkg:
uses: ./.github/workflows/build-cmake-pkg.yml
macOS-latest-cmake-ios:
runs-on: macos-latest
@@ -762,7 +627,7 @@ jobs:
./build-xcframework.sh
windows-msys2:
runs-on: windows-2025
runs-on: windows-latest
strategy:
fail-fast: false
@@ -812,31 +677,28 @@ jobs:
cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows-latest-cmake:
runs-on: windows-2025
runs-on: windows-latest
env:
OPENBLAS_VERSION: 0.3.23
SDE_VERSION: 9.33.0-2024-01-07
VULKAN_VERSION: 1.4.313.2
VULKAN_VERSION: 1.4.309.0
strategy:
matrix:
include:
- build: 'cpu-x64 (static)'
arch: 'x64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF'
- build: 'cpu-x64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
- build: 'openblas-x64'
arch: 'x64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'vulkan-x64'
arch: 'x64'
defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
- build: 'llvm-arm64'
arch: 'arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
- build: 'llvm-arm64-opencl-adreno'
arch: 'arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
# - build: 'kompute-x64'
# defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
steps:
- name: Clone
@@ -850,6 +712,12 @@ jobs:
variant: ccache
evict-old-files: 1d
- name: Clone Kompute submodule
id: clone_kompute
if: ${{ matrix.build == 'kompute-x64' }}
run: |
git submodule update --init ggml/src/ggml-kompute/kompute
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas-x64' }}
@@ -865,9 +733,9 @@ jobs:
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.build == 'vulkan-x64' }}
if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
@@ -900,8 +768,6 @@ jobs:
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
with:
architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
- name: Build
id: cmake_build
@@ -911,7 +777,6 @@ jobs:
cmake -S . -B build ${{ matrix.defines }} `
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release
- name: Add libopenblas.dll
id: add_libopenblas_dll
@@ -922,7 +787,7 @@ jobs:
- name: Test
id: cmake_test
if: ${{ matrix.arch == 'x64' }}
if: ${{ matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' }}
run: |
cd build
ctest -L main -C Release --verbose --timeout 900
@@ -974,12 +839,12 @@ jobs:
-DGGML_CUDA=ON
cmake --build build
windows-2022-cmake-cuda:
runs-on: windows-2022
windows-2019-cmake-cuda:
runs-on: windows-2019
strategy:
matrix:
cuda: ['12.4']
cuda: ['12.4', '11.7']
steps:
- name: Clone
@@ -1013,7 +878,7 @@ jobs:
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -S . -B build -G "Ninja Multi-Config" ^
-DLLAMA_BUILD_SERVER=ON ^
-DGGML_NATIVE=OFF ^
@@ -1027,7 +892,7 @@ jobs:
cmake --build build --config Release
windows-latest-cmake-sycl:
runs-on: windows-2022
runs-on: windows-latest
defaults:
run:
@@ -1061,7 +926,7 @@ jobs:
windows-latest-cmake-hip:
if: ${{ github.event.inputs.create_release != 'true' }}
runs-on: windows-2022
runs-on: windows-latest
steps:
- name: Clone

View File

@@ -17,7 +17,7 @@ jobs:
steps:
- uses: actions/stale@v5
with:
exempt-issue-labels: "refactoring,help wanted,good first issue,research,bug,roadmap"
exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
days-before-issue-stale: 30
days-before-issue-close: 14
stale-issue-label: "stale"

View File

@@ -1,52 +0,0 @@
name: "Copilot Setup Steps"
# Automatically run the setup steps when they are changed to allow for easy validation, and
# allow manual testing through the repository's "Actions" tab
on:
workflow_dispatch:
push:
paths:
- .github/workflows/copilot-setup-steps.yml
pull_request:
paths:
- .github/workflows/copilot-setup-steps.yml
jobs:
# The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot.
copilot-setup-steps:
runs-on: ubuntu-latest
# Set the permissions to the lowest permissions possible needed for your steps.
# Copilot will be given its own token for its operations.
permissions:
# If you want to clone the repository as part of your setup steps, for example to install dependencies, you'll need the `contents: read` permission. If you don't clone the repository in your setup steps, Copilot will do this for you automatically after the steps complete.
contents: read
# You can define any steps you want, and they will run before the agent starts.
# If you do not check out your code, Copilot will do this for you.
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: copilot-setup-steps
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Install Python dependencies
run: |
python3 -m venv .venv
.venv/bin/activate
pip install -r requirements/requirements-all.txt

View File

@@ -1,45 +0,0 @@
name: Check Pre-Tokenizer Hashes
on:
push:
paths:
- 'convert_hf_to_gguf.py'
- 'convert_hf_to_gguf_update.py'
pull_request:
paths:
- 'convert_hf_to_gguf.py'
- 'convert_hf_to_gguf_update.py'
jobs:
pre-tokenizer-hashes:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Install Python dependencies
run: |
python3 -m venv .venv
.venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt
- name: Update pre-tokenizer hashes
run: |
cp convert_hf_to_gguf.py /tmp
.venv/bin/python convert_hf_to_gguf_update.py --check-missing
- name: Check if committed pre-tokenizer hashes matches generated version
run: |
if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
echo "Differences found:"
diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
exit 1
fi
echo "Model pre-tokenizer hashes are up to date."

View File

@@ -49,8 +49,7 @@ jobs:
run: |
sysctl -a
cmake -B build \
-DCMAKE_INSTALL_RPATH='@loader_path' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
@@ -104,8 +103,7 @@ jobs:
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build \
-DCMAKE_INSTALL_RPATH='@loader_path' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON
@@ -133,9 +131,8 @@ jobs:
include:
- build: 'x64'
os: ubuntu-22.04
# GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
# - build: 'arm64'
# os: ubuntu-22.04-arm
- build: 'arm64'
os: ubuntu-22.04-arm
runs-on: ${{ matrix.os }}
@@ -162,11 +159,6 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
-DLLAMA_FATAL_WARNINGS=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)
@@ -215,11 +207,6 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
-DGGML_VULKAN=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)
@@ -241,7 +228,7 @@ jobs:
name: llama-bin-ubuntu-vulkan-x64.zip
windows-cpu:
runs-on: windows-2025
runs-on: windows-latest
strategy:
matrix:
@@ -277,7 +264,7 @@ jobs:
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch }}
cmake -S . -B build -G "Ninja Multi-Config" ^
-D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
-DGGML_NATIVE=OFF ^
@@ -294,7 +281,7 @@ jobs:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.42.34433\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
- name: Upload artifacts
@@ -304,11 +291,11 @@ jobs:
name: llama-bin-win-cpu-${{ matrix.arch }}.zip
windows:
runs-on: windows-2025
runs-on: windows-latest
env:
OPENBLAS_VERSION: 0.3.23
VULKAN_VERSION: 1.4.313.2
VULKAN_VERSION: 1.4.309.0
strategy:
matrix:
@@ -338,7 +325,7 @@ jobs:
id: get_vulkan
if: ${{ matrix.backend == 'vulkan' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
@@ -386,11 +373,11 @@ jobs:
name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
windows-cuda:
runs-on: windows-2022
runs-on: windows-2019
strategy:
matrix:
cuda: ['12.4']
cuda: ['12.4', '11.7']
steps:
- name: Clone
@@ -418,7 +405,7 @@ jobs:
id: cmake_build
shell: cmd
run: |
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -S . -B build -G "Ninja Multi-Config" ^
-DGGML_BACKEND_DL=ON ^
-DGGML_NATIVE=OFF ^
@@ -454,7 +441,7 @@ jobs:
name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
windows-sycl:
runs-on: windows-2022
runs-on: windows-latest
defaults:
run:
@@ -526,7 +513,7 @@ jobs:
name: llama-bin-win-sycl-x64.zip
windows-hip:
runs-on: windows-2022
runs-on: windows-latest
strategy:
matrix:

View File

@@ -180,7 +180,7 @@ jobs:
server-windows:
runs-on: windows-2022
runs-on: windows-2019
steps:
- name: Clone

View File

@@ -1,40 +0,0 @@
name: Update Operations Documentation
on:
push:
paths:
- 'docs/ops/**'
- 'scripts/create_ops_docs.py'
pull_request:
paths:
- 'docs/ops/**'
- 'scripts/create_ops_docs.py'
jobs:
update-ops-docs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.x'
- name: Generate operations documentation to temporary file
run: |
mkdir -p /tmp/ops_check
./scripts/create_ops_docs.py /tmp/ops_check/ops.md
- name: Check if docs/ops.md matches generated version
run: |
if ! diff -q docs/ops.md /tmp/ops_check/ops.md; then
echo "Operations documentation (docs/ops.md) is not up to date with the backend CSV files."
echo "To fix: run ./scripts/create_ops_docs.py and commit the updated docs/ops.md along with your changes"
echo "Differences found:"
diff docs/ops.md /tmp/ops_check/ops.md || true
exit 1
fi
echo "Operations documentation is up to date."

1
.gitignore vendored
View File

@@ -82,7 +82,6 @@ models/*
models-mnt
!models/.editorconfig
!models/ggml-vocab-*.gguf*
!models/templates
# Zig
zig-out/

3
.gitmodules vendored
View File

@@ -0,0 +1,3 @@
[submodule "kompute"]
path = ggml/src/ggml-kompute/kompute
url = https://github.com/nomic-ai/kompute.git

View File

@@ -89,14 +89,6 @@ option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
if (NOT DEFINED LLAMA_BUILD_NUMBER)
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
endif()
if (NOT DEFINED LLAMA_BUILD_COMMIT)
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
endif()
set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
# override ggml options
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
@@ -120,6 +112,7 @@ endfunction()
llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
@@ -162,17 +155,10 @@ if (LLAMA_USE_SYSTEM_GGML)
endif()
if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
set(GGML_BUILD_NUMBER ${LLAMA_BUILD_NUMBER})
set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT})
add_subdirectory(ggml)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
if (MINGW)
# Target Windows 8 for PrefetchVirtualMemory
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
endif()
#
# build the library
#
@@ -213,6 +199,10 @@ endif()
include(GNUInstallDirs)
include(CMakePackageConfigHelpers)
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")

View File

@@ -55,17 +55,6 @@
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
}
},
{
"name": "x64-linux-gcc", "hidden": true,
"cacheVariables": {
"CMAKE_C_COMPILER": "gcc",
"CMAKE_CXX_COMPILER": "g++"
}
},
{ "name": "x64-linux-gcc-debug", "inherits": [ "base", "x64-linux-gcc", "debug" ] },
{ "name": "x64-linux-gcc-release", "inherits": [ "base", "x64-linux-gcc", "release" ] },
{ "name": "x64-linux-gcc-reldbg", "inherits": [ "base", "x64-linux-gcc", "reldbg" ] },
{ "name": "x64-linux-gcc+static-release", "inherits": [ "base", "x64-linux-gcc", "release", "static" ] },
{ "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },

View File

@@ -9,4 +9,3 @@
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/gguf.cpp @JohannesGaessler
/ggml/src/ggml-vulkan/ @0cc4m

View File

@@ -367,7 +367,7 @@ ifdef LLAMA_SERVER_SSL
endif
ifndef GGML_NO_CPU_AARCH64
MK_CPPFLAGS += -DGGML_USE_CPU_REPACK
MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
endif
# warnings
@@ -970,7 +970,7 @@ OBJ_GGML = \
$(DIR_GGML)/src/ggml-threading.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
$(DIR_GGML)/src/ggml-cpu/repack.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \

View File

@@ -3,12 +3,11 @@
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
[![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
[Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) / [ops](https://github.com/ggml-org/llama.cpp/blob/master/docs/ops.md)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
LLM inference in C/C++
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
## Recent API changes
@@ -17,10 +16,11 @@ LLM inference in C/C++
## Hot topics
- Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
@@ -28,30 +28,6 @@ LLM inference in C/C++
----
## Quick start
Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
- Run with Docker - see our [Docker documentation](docs/docker.md)
- Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
- Build from source by cloning this repository - check out [our build guide](docs/build.md)
Once installed, you'll need a model to work with. Head to the [Obtaining and quantizing models](#obtaining-and-quantizing-models) section to learn more.
Example command:
```sh
# Use a local model file
llama-cli -m my_model.gguf
# Or download and run a model directly from Hugging Face
llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
# Launch OpenAI-compatible API server
llama-server -hf ggml-org/gemma-3-1b-it-GGUF
```
## Description
The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
@@ -134,7 +110,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
#### Multimodal
@@ -155,7 +130,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
<details>
<summary>Bindings</summary>
- Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama)
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
@@ -240,7 +214,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
<details>
<summary>Infrastructure</summary>
- [Paddler](https://github.com/intentee/paddler) - Open-source LLMOps platform for hosting and scaling AI in your own infrastructure
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
@@ -255,7 +229,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
</details>
## Supported backends
| Backend | Target devices |
@@ -270,9 +243,18 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [Vulkan](docs/build.md#vulkan) | GPU |
| [CANN](docs/build.md#cann) | Ascend NPU |
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
## Building the project
The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
- Clone this repository and build locally, see [how to build](docs/build.md)
- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
- Use a Docker image, see [documentation for Docker](docs/docker.md)
- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
## Obtaining and quantizing models
The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
@@ -280,11 +262,7 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
```sh
llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
```
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.
@@ -436,7 +414,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
## [`llama-perplexity`](tools/perplexity)
#### A tool for measuring the [perplexity](tools/perplexity/README.md) [^1] (and other quality metrics) of a model over a given text.
#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.
- <details open>
<summary>Measure the perplexity over a text file</summary>
@@ -459,7 +437,8 @@ To learn more about model quantization, [read this documentation](tools/quantize
</details>
[^1]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
[^1]: [tools/perplexity/README.md](./tools/perplexity/README.md)
[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
## [`llama-bench`](tools/llama-bench)

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
#
# Options
IOS_MIN_OS_VERSION=16.4

View File

@@ -54,7 +54,7 @@ docker run --privileged -it \
-v $HOME/llama.cpp/ci-cache:/ci-cache \
-v $HOME/llama.cpp/ci-results:/ci-results \
-v $PWD:/ws -w /ws \
mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
```
Inside the container, execute the following commands:

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
#
# sample usage:
#
@@ -16,9 +16,6 @@
# # with VULKAN support
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with WebGPU support
# GG_BUILD_WEBGPU=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with MUSA support
# GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
@@ -42,27 +39,14 @@ sd=`dirname $0`
cd $sd/../
SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF"
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
fi
if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON"
if command -v nvidia-smi >/dev/null 2>&1; then
CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')
if [[ -n "$CUDA_ARCH" && "$CUDA_ARCH" =~ ^[0-9]+$ ]]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH}"
else
echo "Warning: Using fallback CUDA architectures"
CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=61;70;75;80;86;89"
fi
else
echo "Error: nvidia-smi not found, cannot build with CUDA"
exit 1
fi
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -84,10 +68,6 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
fi
if [ ! -z ${GG_BUILD_WEBGPU} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"
fi
if [ ! -z ${GG_BUILD_MUSA} ]; then
# Use qy1 by default (MTT S80)
MUSA_ARCH=${MUSA_ARCH:-21}
@@ -786,7 +766,7 @@ function gg_run_rerank_tiny {
model_f16="${path_models}/ggml-model-f16.gguf"
# for this model, the SEP token is "</s>"
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
# sample output
# rerank score 0: 0.029

View File

@@ -7,8 +7,8 @@ llama_add_compile_flags()
# Build info header
#
if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
# Is git submodule
if(NOT IS_DIRECTORY "${GIT_DIR}")
@@ -18,26 +18,36 @@ if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
if (SLASH_POS EQUAL 0)
set(GIT_DIR "${REAL_GIT_DIR}")
else()
set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
endif()
endif()
if(EXISTS "${GIT_DIR}/index")
# For build-info.cpp below
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index")
set(GIT_INDEX "${GIT_DIR}/index")
else()
message(WARNING "Git index not found in git repository.")
set(GIT_INDEX "")
endif()
else()
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
set(GIT_INDEX "")
endif()
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
# Add a custom command to rebuild build-info.cpp when .git/index changes
add_custom_command(
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
COMMENT "Generating build details from Git"
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
-P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
VERBATIM
)
set(TARGET build_info)
add_library(${TARGET} OBJECT ${OUTPUT_FILE})
add_library(${TARGET} OBJECT build-info.cpp)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
@@ -86,7 +96,8 @@ if (LLAMA_CURL)
endif()
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
find_library(CURL_LIBRARY curl REQUIRED)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
endif ()
if (LLAMA_LLGUIDANCE)
@@ -111,13 +122,13 @@ if (LLAMA_LLGUIDANCE)
ExternalProject_Add(llguidance_ext
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
# v1.0.1:
GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
# v0.7.20 (+ fix to build on GCC 15):
GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
PREFIX ${CMAKE_BINARY_DIR}/llguidance
SOURCE_DIR ${LLGUIDANCE_SRC}
BUILD_IN_SOURCE TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND cargo build --release --package llguidance
BUILD_COMMAND cargo build --release
INSTALL_COMMAND ""
BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
UPDATE_COMMAND ""

View File

@@ -24,7 +24,6 @@
#include <cstdarg>
#include <filesystem>
#include <fstream>
#include <list>
#include <regex>
#include <set>
#include <string>
@@ -245,7 +244,7 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
}
// download one single file from remote URL to local path
static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
// Check if the file already exists locally
auto file_exists = std::filesystem::exists(path);
@@ -468,7 +467,7 @@ static bool common_download_file_single(const std::string & url, const std::stri
// download multiple files from remote URLs to local paths
// the input is a vector of pairs <url, path>
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
// Prepare download in parallel
std::vector<std::future<bool>> futures_download;
for (auto const & item : urls) {
@@ -712,12 +711,12 @@ bool common_has_curl() {
return false;
}
static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
LOG_ERR("error: built without CURL, cannot download model from internet\n");
return false;
}
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
return false;
}
@@ -978,10 +977,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
string_process_escapes(seq_breaker);
}
for (auto & pair : params.speculative.replacements) {
string_process_escapes(pair.first);
string_process_escapes(pair.second);
}
}
if (!params.kv_overrides.empty()) {
@@ -993,6 +988,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
params.tensor_buft_overrides.push_back({nullptr, nullptr});
}
if (params.reranking && params.embedding) {
throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
}
if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
throw std::runtime_error(string_format(
"error: the supplied chat template is not supported: %s%s\n",
@@ -1349,9 +1348,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
));
add_opt(common_arg(
{"--prio"}, "N",
string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
[](common_params & params, int prio) {
if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
if (prio < 0 || prio > 3) {
throw std::invalid_argument("invalid value");
}
params.cpuparams.priority = (enum ggml_sched_priority) prio;
@@ -1469,14 +1468,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.swa_full = true;
}
).set_env("LLAMA_ARG_SWA_FULL"));
add_opt(common_arg(
{"--kv-unified", "-kvu"},
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
[](common_params & params) {
params.kv_unified = true;
}
).set_env("LLAMA_ARG_KV_SPLIT"));
add_opt(common_arg(
{"--no-context-shift"},
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -1617,7 +1608,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.antiprompt.emplace_back(value);
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
{"-sp", "--special"},
string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
@@ -2096,13 +2087,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.no_kv_offload = true;
}
).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
add_opt(common_arg(
{"-nr", "--no-repack"},
"disable weight repacking",
[](common_params & params) {
params.no_extra_bufts = true;
}
).set_env("LLAMA_ARG_NO_REPACK"));
add_opt(common_arg(
{"-ctk", "--cache-type-k"}, "TYPE",
string_format(
@@ -2376,35 +2360,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
throw std::invalid_argument("unknown buffer type");
}
// keep strings alive and avoid leaking memory by storing them in a static vector
static std::list<std::string> buft_overrides;
buft_overrides.push_back(tensor_name);
params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
// FIXME: this leaks memory
params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
}
}
));
add_opt(common_arg(
{"--cpu-moe", "-cmoe"},
"keep all Mixture of Experts (MoE) weights in the CPU",
[](common_params & params) {
params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
}
).set_env("LLAMA_ARG_CPU_MOE"));
add_opt(common_arg(
{"--n-cpu-moe", "-ncmoe"}, "N",
"keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
[](common_params & params, int value) {
if (value < 0) {
throw std::invalid_argument("invalid value");
}
for (int i = 0; i < value; ++i) {
// keep strings alive and avoid leaking memory by storing them in a static vector
static std::list<std::string> buft_overrides;
buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
}
}
).set_env("LLAMA_ARG_N_CPU_MOE"));
add_opt(common_arg(
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
"number of layers to store in VRAM",
@@ -2663,15 +2623,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_out_freq = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
{"--output-format"}, "{gguf,dat}",
string_format("output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
[](common_params & params, const std::string & value) {
/**/ if (value == "gguf") { params.imat_dat = -1; }
else if (value == "dat") { params.imat_dat = 1; }
else { throw std::invalid_argument("invalid output format"); }
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
{"--save-frequency"}, "N",
string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
@@ -2700,13 +2651,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.i_chunk = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
{"--show-statistics"},
string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
[](common_params & params) {
params.show_statistics = true;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
{"--parse-special"},
string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
@@ -2766,13 +2710,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.embd_sep = value;
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
add_opt(common_arg(
{"--cls-separator"}, "STRING",
"separator of classification sequences (default \\t) for example \"<#seq#>\"",
[](common_params & params, const std::string & value) {
params.cls_sep = value;
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
add_opt(common_arg(
{"--host"}, "HOST",
string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
@@ -2794,13 +2731,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.public_path = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
add_opt(common_arg(
{"--api-prefix"}, "PREFIX",
string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
[](common_params & params, const std::string & value) {
params.api_prefix = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
add_opt(common_arg(
{"--no-webui"},
string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
@@ -2817,10 +2747,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
add_opt(common_arg(
{"--reranking", "--rerank"},
string_format("enable reranking endpoint on server (default: %s)", "disabled"),
string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
[](common_params & params) {
params.embedding = true;
params.pooling_type = LLAMA_POOLING_TYPE_RANK;
params.reranking = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
add_opt(common_arg(
@@ -2861,16 +2790,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.ssl_file_cert = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
add_opt(common_arg(
{"--chat-template-kwargs"}, "STRING",
string_format("sets additional params for the json template parser"),
[](common_params & params, const std::string & value) {
auto parsed = json::parse(value);
for (const auto & item : parsed.items()) {
params.default_template_kwargs[item.key()] = item.value().dump();
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
add_opt(common_arg(
{"-to", "--timeout"}, "N",
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@@ -2947,9 +2866,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
"- none: leaves thoughts unparsed in `message.content`\n"
"- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
"(default: auto)",
"(default: deepseek)",
[](common_params & params, const std::string & value) {
params.reasoning_format = common_reasoning_format_from_name(value);
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
else { throw std::invalid_argument("invalid value"); }
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
add_opt(common_arg(
@@ -3291,39 +3212,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.model.path = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
add_opt(common_arg(
{"--spec-replace"}, "TARGET", "DRAFT",
"translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
[](common_params & params, const std::string & tgt, const std::string & dft) {
params.speculative.replacements.push_back({ tgt, dft });
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-ctkd", "--cache-type-k-draft"}, "TYPE",
string_format(
"KV cache data type for K for the draft model\n"
"allowed values: %s\n"
"(default: %s)",
get_all_kv_cache_types().c_str(),
ggml_type_name(params.speculative.cache_type_k)
),
[](common_params & params, const std::string & value) {
params.speculative.cache_type_k = kv_cache_type_from_str(value);
}
).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
add_opt(common_arg(
{"-ctvd", "--cache-type-v-draft"}, "TYPE",
string_format(
"KV cache data type for V for the draft model\n"
"allowed values: %s\n"
"(default: %s)",
get_all_kv_cache_types().c_str(),
ggml_type_name(params.speculative.cache_type_v)
),
[](common_params & params, const std::string & value) {
params.speculative.cache_type_v = kv_cache_type_from_str(value);
}
).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
add_opt(common_arg(
{"-mv", "--model-vocoder"}, "FNAME",
@@ -3487,51 +3375,5 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{ "--diffusion-steps" }, "N",
string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
[](common_params & params, int value) { params.diffusion.steps = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-visual" },
string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
params.diffusion.visual_mode ? "true" : "false"),
[](common_params & params) { params.diffusion.visual_mode = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-eps" }, "F",
string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
[](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-algorithm" }, "N",
string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
params.diffusion.algorithm),
[](common_params & params, int value) { params.diffusion.algorithm = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-alg-temp" }, "F",
string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
[](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-block-length" }, "N",
string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
[](common_params & params, int value) { params.diffusion.block_length = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-cfg-scale" }, "F",
string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
[](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-add-gumbel-noise" }, "F",
string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
[](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
return ctx_arg;
}

View File

@@ -87,3 +87,10 @@ struct common_remote_params {
};
// get remote file content, returns <http_code, raw_response_body>
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
// download one single file from remote URL to local path
bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline);
// download multiple files from remote URLs to local paths
// the input is a vector of pairs <url, path>
bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline);

View File

@@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";

View File

@@ -49,21 +49,12 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
// LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
result_.tool_calls.emplace_back(tool_call);
return true;
}
bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
std::string arguments = "";
if (tool_call.contains("arguments")) {
if (tool_call.at("arguments").is_object()) {
arguments = tool_call.at("arguments").dump();
} else {
arguments = tool_call.at("arguments");
}
}
std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
return add_tool_call(name, id, arguments);
}
@@ -387,7 +378,3 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
/* .is_partial = */ found_healing_marker,
};
}
void common_chat_msg_parser::clear_tools() {
result_.tool_calls.clear();
}

View File

@@ -115,6 +115,4 @@ class common_chat_msg_parser {
const std::vector<std::vector<std::string>> & args_paths = {},
const std::vector<std::vector<std::string>> & content_paths = {}
);
void clear_tools();
};

View File

@@ -17,8 +17,6 @@
#include <string>
#include <vector>
using json = nlohmann::ordered_json;
static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
auto time = std::chrono::system_clock::to_time_t(now);
auto local_time = *std::localtime(&time);
@@ -84,10 +82,10 @@ json common_chat_msg::to_json_oaicompat() const
std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg) {
std::vector<common_chat_msg_diff> diffs;
if (previous_msg.reasoning_content != new_msg.reasoning_content) {
auto & diff = diffs.emplace_back();
diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, new_msg.reasoning_content);
}
// if (previous_msg.reasoning_content != current.reasoning_content) {
// auto & diff = diffs.emplace_back();
// diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, current.reasoning_content);
// }
if (previous_msg.content != new_msg.content) {
auto & diff = diffs.emplace_back();
diff.content_delta = string_diff(previous_msg.content, new_msg.content);
@@ -126,8 +124,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
typedef minja::chat_template common_chat_template;
struct common_chat_templates {
bool add_bos;
bool add_eos;
bool has_explicit_template; // Model had builtin template or template overridde was specified.
std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
std::unique_ptr<common_chat_template> template_tool_use;
@@ -144,9 +140,6 @@ struct templates_params {
bool add_generation_prompt = true;
bool enable_thinking = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
json extra_context;
bool add_bos;
bool add_eos;
};
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -392,9 +385,9 @@ json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & t
template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
json delta = json::object();
if (!diff.reasoning_content_delta.empty()) {
delta["reasoning_content"] = diff.reasoning_content_delta;
}
// if (!diff.reasoning_content_delta.empty()) {
// delta["reasoning_content"] = msg.reasoning_content;
// }
if (!diff.content_delta.empty()) {
delta["content"] = diff.content_delta;
}
@@ -449,8 +442,6 @@ std::string common_chat_format_single(
common_chat_templates_inputs inputs;
inputs.use_jinja = use_jinja;
inputs.add_bos = tmpls->add_bos;
inputs.add_eos = tmpls->add_eos;
std::string fmt_past_msg;
if (!past_msg.empty()) {
@@ -475,8 +466,6 @@ std::string common_chat_format_single(
std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
common_chat_templates_inputs inputs;
inputs.use_jinja = use_jinja;
inputs.add_bos = tmpls->add_bos;
inputs.add_eos = tmpls->add_eos;
auto add_simple_msg = [&](auto role, auto content) {
common_chat_msg msg;
msg.role = role;
@@ -552,21 +541,8 @@ common_chat_templates_ptr common_chat_templates_init(
default_template_src = CHATML_TEMPLATE_SRC;
}
}
// TODO @ngxson : this is a temporary hack to prevent chat template from throwing an error
// Ref: https://github.com/ggml-org/llama.cpp/pull/15230#issuecomment-3173959633
if (default_template_src.find("<|channel|>") != std::string::npos
// search for the error message and patch it
&& default_template_src.find("in message.content or") != std::string::npos) {
string_replace_all(default_template_src,
"{%- if \"<|channel|>analysis<|message|>\" in message.content or \"<|channel|>final<|message|>\" in message.content %}",
"{%- if false %}");
}
std::string token_bos = bos_token_override;
std::string token_eos = eos_token_override;
bool add_bos = false;
bool add_eos = false;
if (model) {
const auto * vocab = llama_model_get_vocab(model);
const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
@@ -581,13 +557,9 @@ common_chat_templates_ptr common_chat_templates_init(
};
token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
add_bos = llama_vocab_get_add_bos(vocab);
add_eos = llama_vocab_get_add_eos(vocab);
}
common_chat_templates_ptr tmpls(new common_chat_templates());
tmpls->has_explicit_template = has_explicit_template;
tmpls->add_bos = add_bos;
tmpls->add_eos = add_eos;
try {
tmpls->template_default = std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos);
} catch (const std::exception & e) {
@@ -617,8 +589,6 @@ const char * common_chat_format_name(common_chat_format format) {
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
default:
throw std::runtime_error("Unknown chat format");
}
@@ -627,28 +597,12 @@ const char * common_chat_format_name(common_chat_format format) {
const char * common_reasoning_format_name(common_reasoning_format format) {
switch (format) {
case COMMON_REASONING_FORMAT_NONE: return "none";
case COMMON_REASONING_FORMAT_AUTO: return "auto";
case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
case COMMON_REASONING_FORMAT_GRANITE: return "granite";
default:
throw std::runtime_error("Unknown reasoning format");
}
}
common_reasoning_format common_reasoning_format_from_name(const std::string & format) {
if (format == "none") {
return COMMON_REASONING_FORMAT_NONE;
} else if (format == "auto") {
return COMMON_REASONING_FORMAT_AUTO;
} else if (format == "deepseek") {
return COMMON_REASONING_FORMAT_DEEPSEEK;
} else if (format == "deepseek-legacy") {
return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
}
throw std::runtime_error("Unknown reasoning format: " + format);
}
static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
std::string arguments;
if (builder.is_partial()) {
@@ -765,23 +719,16 @@ static void foreach_function(const json & tools, const std::function<void(const
static std::string apply(
const common_chat_template & tmpl,
const struct templates_params & inputs,
const std::optional<json> & messages_override = std::nullopt,
const std::optional<json> & tools_override = std::nullopt,
const std::optional<json> & additional_context = std::nullopt)
const nlohmann::ordered_json & messages,
const nlohmann::ordered_json & tools,
bool add_generation_prompt,
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json())
{
minja::chat_template_inputs tmpl_inputs;
tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages;
if (tools_override) {
tmpl_inputs.tools = *tools_override;
} else {
tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools;
}
tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
tmpl_inputs.extra_context = inputs.extra_context;
if (additional_context) {
tmpl_inputs.extra_context.merge_patch(*additional_context);
}
tmpl_inputs.messages = messages;
tmpl_inputs.tools = tools;
tmpl_inputs.add_generation_prompt = add_generation_prompt;
tmpl_inputs.extra_context = extra_context;
// TODO: add flag to control date/time, if only for testing purposes.
// tmpl_inputs.now = std::chrono::system_clock::now();
@@ -790,10 +737,10 @@ static std::string apply(
// instead of using `chat_template_options.use_bos_token = false`, since these tokens
// may be needed inside the template / between messages too.
auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
if (inputs.add_bos && string_starts_with(result, tmpl.bos_token())) {
if (string_starts_with(result, tmpl.bos_token())) {
result = result.substr(tmpl.bos_token().size());
}
if (inputs.add_eos && string_ends_with(result, tmpl.eos_token())) {
if (string_ends_with(result, tmpl.eos_token())) {
result = result.substr(0, result.size() - tmpl.eos_token().size());
}
return result;
@@ -880,7 +827,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
inputs.messages,
"Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_GENERIC;
return data;
}
@@ -956,7 +903,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
data.preserved_tokens = {
"[TOOL_CALLS]",
};
data.prompt = apply(tmpl, inputs);
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
return data;
}
@@ -986,7 +933,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
adjusted_messages.push_back(msg);
}
}
data.prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
if (!inputs.enable_thinking) {
@@ -1174,7 +1121,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
} else {
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
}
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
{"date_string", format_time(inputs.now, "%d %b %Y")},
{"tools_in_user_message", false},
{"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
@@ -1239,7 +1186,7 @@ static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool w
static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
auto prompt = apply(tmpl, inputs);
auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
// Hacks to fix the official (broken) prompt.
// It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,
@@ -1331,30 +1278,10 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
tool_calls_end);
}
static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
auto prompt = apply(tmpl, inputs);
data.prompt = prompt;
data.format = COMMON_CHAT_FORMAT_GPT_OSS;
// TODO: support tool calls in GPT-OSS?
return data;
}
static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
// TODO @ngxson : this won't work with --special enabled, we should fix that
builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>");
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
}
}
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
LOG_DBG("%s\n", __func__);
common_chat_params data;
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
{"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
{"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
});
@@ -1410,7 +1337,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
// Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
// If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code.
common_chat_params data;
data.prompt = apply(tmpl, inputs);
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
if (inputs.tools.is_array() && !inputs.tools.empty()) {
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -1537,7 +1464,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
}
data.prompt = apply(tmpl, inputs);
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
// TODO: if (has_raw_python)
return data;
}
@@ -1570,15 +1497,14 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
json extra_context = json {
json additional_context = {
{"enable_thinking", inputs.enable_thinking},
};
extra_context.update(inputs.extra_context);
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, extra_context);
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
if (string_ends_with(data.prompt, "<think>\n")) {
if (!extra_context["enable_thinking"]) {
if (!inputs.enable_thinking) {
data.prompt += "</think>";
} else {
data.thinking_forced_open = true;
@@ -1708,7 +1634,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
"|<function name=\"([^\"]+)\">" // match 5 (function name again)
);
while (auto res = builder.try_find_regex(open_regex)) {
if (auto res = builder.try_find_regex(open_regex)) {
const auto & block_start = res->groups[1];
std::string block_end = block_start.empty() ? "" : "```";
@@ -1730,6 +1656,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
builder.consume_literal(block_end);
builder.consume_spaces();
}
builder.add_content(builder.consume_rest());
} else {
throw common_chat_msg_partial_exception("failed to parse tool call");
}
@@ -1754,124 +1681,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
builder.consume_spaces();
}
}
}
}
builder.add_content(builder.consume_rest());
}
static common_chat_params common_chat_params_init_granite(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
// Pass thinking context for Granite template
json additional_context = {
{"thinking", inputs.enable_thinking},
};
data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ std::nullopt, additional_context);
data.format = COMMON_CHAT_FORMAT_GRANITE;
if (string_ends_with(data.prompt, "<think>\n") || string_ends_with(data.prompt, "<think>")) {
if (!inputs.enable_thinking) {
data.prompt += "</think>";
} else {
data.thinking_forced_open = true;
}
}
if (!inputs.tools.is_null()) {
// Granite uses <|tool_call|> followed by JSON list
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
std::vector<std::string> tool_rules;
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
std::string name = function.at("name");
auto parameters = function.at("parameters");
builder.resolve_refs(parameters);
tool_rules.push_back(builder.add_rule(name + "-call", builder.add_schema(name +
"-args", {
{"type", "object"},
{"properties", {
{"name", {{"const", name}}},
{"arguments", parameters},
}},
{"required", json::array({"name", "arguments"})},
})));
});
auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
auto tool_list = builder.add_rule("tool_list", "\"[\" space " + tool_call + " (\",\" space " + tool_call + ")* space \"]\"");
if (data.thinking_forced_open) {
builder.add_rule("root", "\"</think>\" space \"<response>\" space [^<]* \"</response>\" space \"<|tool_call|>\" space " + tool_list);
} else {
builder.add_rule("root", "\"<|tool_call|>\" space " + tool_list);
}
data.grammar_triggers.push_back({
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
"<|tool_call|>"
});
data.preserved_tokens = {
"<think>",
"</think>",
"<response>",
"</response>",
"<|tool_call|>",
};
});
} else {
// Handle thinking tags for non-tool responses
if (data.thinking_forced_open && inputs.enable_thinking) {
data.grammar_lazy = false;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
builder.add_rule("root", "\"</think>\" space \"<response>\" space .* \"</response>\" space");
});
data.preserved_tokens = {
"<think>",
"</think>",
"<response>",
"</response>",
};
}
}
return data;
}
static void common_chat_parse_granite(common_chat_msg_parser & builder) {
// Parse thinking tags
builder.try_parse_reasoning("<think>", "</think>");
// Parse response tags using regex
static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
if (auto res = builder.try_find_regex(response_regex)) {
// Extract the content between the tags (capture group 1)
auto content = builder.str(res->groups[1]);
builder.add_content(content);
builder.move_to(res->groups[0].end);
}
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
}
// Look for tool calls
static const common_regex tool_call_regex(regex_escape("<|tool_call|>"));
if (auto res = builder.try_find_regex(tool_call_regex)) {
builder.move_to(res->groups[0].end);
// Expect JSON array of tool calls
auto tool_calls_data = builder.consume_json();
if (tool_calls_data.json.is_array()) {
if (!builder.add_tool_calls(tool_calls_data.json)) {
builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
}
} else {
builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
builder.add_content(builder.consume_rest());
}
} else {
builder.add_content(builder.consume_rest());
@@ -1880,7 +1690,7 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
data.prompt = apply(tmpl, inputs);
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
data.grammar_lazy = false;
if (!inputs.json_schema.is_null()) {
@@ -1911,14 +1721,6 @@ static common_chat_params common_chat_templates_apply_jinja(
params.enable_thinking = inputs.enable_thinking;
params.grammar = inputs.grammar;
params.now = inputs.now;
params.add_bos = inputs.add_bos;
params.add_eos = inputs.add_eos;
params.extra_context = json::object();
for (auto el : inputs.chat_template_kwargs) {
params.extra_context[el.first] = json::parse(el.second);
}
if (!inputs.json_schema.empty()) {
params.json_schema = json::parse(inputs.json_schema);
}
@@ -1949,21 +1751,11 @@ static common_chat_params common_chat_templates_apply_jinja(
return common_chat_params_init_command_r7b(tmpl, params);
}
// Granite (IBM) - detects thinking / tools support
if (src.find("elif thinking") != std::string::npos && src.find("<|tool_call|>") != std::string::npos) {
return common_chat_params_init_granite(tmpl, params);
}
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
return common_chat_params_init_hermes_2_pro(tmpl, params);
}
// GPT-OSS
if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
return common_chat_params_init_gpt_oss(tmpl, params);
}
// Use generic handler when mixing tools + JSON schema.
// TODO: support that mix in handlers below.
if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2014,7 +1806,6 @@ static common_chat_params common_chat_templates_apply_legacy(
int alloc_size = 0;
std::vector<llama_chat_message> chat;
std::vector<std::string> contents;
for (const auto & msg : inputs.messages) {
auto content = msg.content;
for (const auto & part : msg.content_parts) {
@@ -2046,7 +1837,7 @@ static common_chat_params common_chat_templates_apply_legacy(
if (res < 0) {
// if the custom "tmpl" is not supported, we throw an error
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
throw std::runtime_error("this custom template is not supported, try using --jinja");
throw std::runtime_error("this custom template is not supported");
}
// if it turns out that our buffer is too small, we resize it
@@ -2116,12 +1907,6 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
case COMMON_CHAT_FORMAT_COMMAND_R7B:
common_chat_parse_command_r7b(builder);
break;
case COMMON_CHAT_FORMAT_GRANITE:
common_chat_parse_granite(builder);
break;
case COMMON_CHAT_FORMAT_GPT_OSS:
common_chat_parse_gpt_oss(builder);
break;
default:
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
}
@@ -2135,14 +1920,10 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
} catch (const common_chat_msg_partial_exception & ex) {
LOG_DBG("Partial parse: %s\n", ex.what());
if (!is_partial) {
builder.clear_tools();
builder.move_to(0);
common_chat_parse_content_only(builder);
throw std::runtime_error(ex.what());
}
}
auto msg = builder.result();
if (!is_partial) {
LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
}
LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
return msg;
}

View File

@@ -7,7 +7,6 @@
#include <chrono>
#include <string>
#include <vector>
#include <map>
struct common_chat_templates;
@@ -71,7 +70,7 @@ struct common_chat_msg {
};
struct common_chat_msg_diff {
std::string reasoning_content_delta;
// std::string reasoning_content_delta;
std::string content_delta;
size_t tool_call_index = std::string::npos;
common_chat_tool_call tool_call_delta;
@@ -109,8 +108,6 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
COMMON_CHAT_FORMAT_HERMES_2_PRO,
COMMON_CHAT_FORMAT_COMMAND_R7B,
COMMON_CHAT_FORMAT_GRANITE,
COMMON_CHAT_FORMAT_GPT_OSS,
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
@@ -128,9 +125,6 @@ struct common_chat_templates_inputs {
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
bool enable_thinking = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
std::map<std::string, std::string> chat_template_kwargs;
bool add_bos = false;
bool add_eos = false;
};
struct common_chat_params {
@@ -191,7 +185,6 @@ std::string common_chat_format_example(
const char* common_chat_format_name(common_chat_format format);
const char* common_reasoning_format_name(common_reasoning_format format);
common_reasoning_format common_reasoning_format_from_name(const std::string & format);
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);

View File

@@ -0,0 +1,24 @@
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
# Only write the build info if it changed
if(EXISTS ${OUTPUT_FILE})
file(READ ${OUTPUT_FILE} CONTENTS)
string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
set(OLD_COMMIT ${CMAKE_MATCH_1})
string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
set(OLD_COMPILER ${CMAKE_MATCH_1})
string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
set(OLD_TARGET ${CMAKE_MATCH_1})
if (
NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
NOT OLD_TARGET STREQUAL BUILD_TARGET
)
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
endif()
else()
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
endif()

View File

@@ -203,7 +203,6 @@ bool set_process_priority(enum ggml_sched_priority prio) {
DWORD p = NORMAL_PRIORITY_CLASS;
switch (prio) {
case GGML_SCHED_PRIO_LOW: p = BELOW_NORMAL_PRIORITY_CLASS; break;
case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
@@ -229,7 +228,6 @@ bool set_process_priority(enum ggml_sched_priority prio) {
int p = 0;
switch (prio) {
case GGML_SCHED_PRIO_LOW: p = 5; break;
case GGML_SCHED_PRIO_NORMAL: p = 0; break;
case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
case GGML_SCHED_PRIO_HIGH: p = -10; break;
@@ -448,15 +446,6 @@ void string_replace_all(std::string & s, const std::string & search, const std::
bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
}
bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
bool has_suffix = string_ends_with(str, suffix);
if (has_suffix) {
str = str.substr(0, str.size() - suffix.size());
}
return has_suffix;
}
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
if (!str.empty() && !stop.empty()) {
const char text_last_char = str.back();
@@ -475,7 +464,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_
std::string regex_escape(const std::string & s) {
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
return std::regex_replace(s, special_chars, "\\$&");
return std::regex_replace(s, special_chars, "\\$0");
}
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
@@ -715,17 +704,11 @@ bool fs_validate_filename(const std::string & filename) {
// disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
#if defined(__clang__)
# pragma clang diagnostic pop
#elif defined(__GNUC__)
# pragma GCC diagnostic pop
#endif
filename_utf32 = converter.from_bytes(filename);
@@ -782,9 +765,6 @@ bool fs_validate_filename(const std::string & filename) {
return true;
}
#include <iostream>
// returns true if successful, false otherwise
bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
@@ -802,16 +782,9 @@ bool fs_create_directory_with_parents(const std::string & path) {
// process path from front to back, procedurally creating directories
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
const std::wstring subpath = wpath.substr(0, pos_slash);
const wchar_t * test = subpath.c_str();
pos_slash += 1;
// skip the drive letter, in some systems it can return an access denied error
if (subpath.length() == 2 && subpath[1] == ':') {
continue;
}
const bool success = CreateDirectoryW(subpath.c_str(), NULL);
const bool success = CreateDirectoryW(test, NULL);
if (!success) {
const DWORD error = GetLastError();
@@ -825,6 +798,8 @@ bool fs_create_directory_with_parents(const std::string & path) {
return false;
}
}
pos_slash += 1;
}
return true;
@@ -920,6 +895,34 @@ struct common_init_result common_init_from_params(common_params & params) {
const llama_vocab * vocab = llama_model_get_vocab(model);
if (params.reranking) {
bool ok = true;
if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
ok = false;
}
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
if (!has_eos && !has_sep) {
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
ok = false;
} else if (!has_eos) {
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
} else if (!has_sep) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
ok = false;
}
if (!ok) {
llama_model_free(model);
return iparams;
}
}
auto cparams = common_context_params_to_llama(params);
llama_context * lctx = llama_init_from_model(model, cparams);
@@ -929,7 +932,7 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}
if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
params.ctx_shift = false;
}
@@ -961,35 +964,6 @@ struct common_init_result common_init_from_params(common_params & params) {
}
}
if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
bool ok = true;
if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
ok = false;
}
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
if (!has_eos && !has_sep) {
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
ok = false;
} else if (!has_eos) {
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
} else if (!has_sep) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
ok = false;
}
if (!ok) {
llama_free(lctx);
llama_model_free(model);
return iparams;
}
}
// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
llama_adapter_lora_ptr lora;
@@ -1014,19 +988,13 @@ struct common_init_result common_init_from_params(common_params & params) {
params.sampling.ignore_eos = false;
}
// initialize once
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
if (llama_vocab_is_eog(vocab, i)) {
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
params.sampling.logit_bias_eog.push_back({i, -INFINITY});
}
}
if (params.sampling.ignore_eos) {
// add EOG biases to the active set of logit biases
params.sampling.logit_bias.insert(
params.sampling.logit_bias.end(),
params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
if (llama_vocab_is_eog(vocab, i)) {
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
params.sampling.logit_bias.push_back({i, -INFINITY});
}
}
}
if (params.sampling.penalty_last_n == -1) {
@@ -1071,7 +1039,7 @@ struct common_init_result common_init_from_params(common_params & params) {
if (llama_model_has_decoder(model)) {
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
}
llama_memory_clear(llama_get_memory(lctx), true);
llama_kv_self_clear(lctx);
llama_synchronize(lctx);
llama_perf_context_reset(lctx);
llama_set_warmup(lctx, false);
@@ -1122,7 +1090,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
mparams.use_extra_bufts = !params.no_extra_bufts;
if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL;
@@ -1173,7 +1140,11 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;
cparams.kv_unified = params.kv_unified;
if (params.reranking) {
cparams.embeddings = true;
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
}
cparams.type_k = params.cache_type_k;
cparams.type_v = params.cache_type_v;
@@ -1307,9 +1278,6 @@ std::vector<llama_token> common_tokenize(
int n_tokens = text.length() + 2 * add_special;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
if (n_tokens == std::numeric_limits<int32_t>::min()) {
throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
}
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);

View File

@@ -8,7 +8,6 @@
#include <string>
#include <string_view>
#include <vector>
#include <map>
#include <sstream>
#ifdef _WIN32
@@ -81,7 +80,6 @@ enum llama_example {
LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,
LLAMA_EXAMPLE_DIFFUSION,
LLAMA_EXAMPLE_COUNT,
};
@@ -178,8 +176,7 @@ struct common_params_sampling {
std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
std::set<llama_token> preserved_tokens;
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
// print the parameters into a string
std::string print() const;
@@ -201,10 +198,6 @@ struct common_params_speculative {
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
@@ -220,26 +213,9 @@ struct common_params_vocoder {
bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
};
struct common_params_diffusion {
int32_t steps = 128;
bool visual_mode = false;
float eps = 0; // epsilon for timesteps
int32_t block_length = 0; // block length for generation
int32_t algorithm = 4; // default algorithm: low-confidence
float alg_temp = 0.0f; // algorithm temperature
float cfg_scale = 0; // classifier-free guidance scale
bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
};
enum common_reasoning_format {
COMMON_REASONING_FORMAT_NONE,
COMMON_REASONING_FORMAT_AUTO,
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
COMMON_REASONING_FORMAT_GRANITE, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
};
struct common_params {
@@ -287,7 +263,6 @@ struct common_params {
struct common_params_sampling sampling;
struct common_params_speculative speculative;
struct common_params_vocoder vocoder;
struct common_params_diffusion diffusion;
struct common_params_model model;
@@ -350,7 +325,6 @@ struct common_params {
bool no_perf = false; // disable performance metrics
bool ctx_shift = true; // context shift on inifinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool kv_unified = false; // enable unified KV cache
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // use mmap for faster loads
@@ -361,7 +335,6 @@ struct common_params {
bool warmup = true; // warmup run
bool check_tensors = false; // validate tensor data
bool no_op_offload = false; // globally disable offload host tensor operations to device
bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
bool single_turn = false; // single turn chat conversation
@@ -381,7 +354,7 @@ struct common_params {
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
std::string embd_sep = "\n"; // separator of embeddings
std::string cls_sep = "\t"; // separator of classification sequences
bool reranking = false; // enable reranking support on server
// server params
int32_t port = 8080; // server listens on this network port
@@ -392,11 +365,10 @@ struct common_params {
std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT
std::string api_prefix = ""; // NOLINT
std::string chat_template = ""; // NOLINT
bool use_jinja = false; // NOLINT
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int reasoning_budget = -1;
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
@@ -405,8 +377,6 @@ struct common_params {
std::string ssl_file_key = ""; // NOLINT
std::string ssl_file_cert = ""; // NOLINT
std::map<std::string, std::string> default_template_kwargs;
// "advanced" endpoints are disabled by default for better security
bool webui = true;
bool endpoint_slots = false;
@@ -441,12 +411,10 @@ struct common_params {
int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
int32_t i_chunk = 0; // start processing from this chunk
int8_t imat_dat = 0; // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)
bool process_output = false; // collect data for the output tensor
bool compute_ppl = true; // whether to compute perplexity
bool show_statistics = false; // show imatrix statistics per tensor
bool parse_special = false; // whether to parse special tokens during imatrix tokenization
bool process_output = false; // collect data for the output tensor
bool compute_ppl = true; // whether to compute perplexity
bool parse_special = false; // whether to parse special tokens during imatrix tokenization
// cvector-generator params
int n_pca_batch = 100;
@@ -546,7 +514,6 @@ static bool string_starts_with(const std::string & str,
// While we wait for C++20's std::string::ends_with...
bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
bool string_remove_suffix(std::string & str, const std::string_view & suffix);
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);

View File

@@ -41,6 +41,49 @@ static std::string build_repetition(const std::string & item_rule, int min_items
return result;
}
/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
class string_view {
const std::string & _str;
const size_t _start;
const size_t _end;
public:
string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
size_t size() const {
return _end - _start;
}
size_t length() const {
return size();
}
operator std::string() const {
return str();
}
std::string str() const {
return _str.substr(_start, _end - _start);
}
string_view substr(size_t pos, size_t len = std::string::npos) const {
return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
}
char operator[](size_t pos) const {
auto index = _start + pos;
if (index >= _end) {
throw std::out_of_range("string_view index out of range");
}
return _str[_start + pos];
}
bool operator==(const string_view & other) const {
std::string this_str = *this;
std::string other_str = other;
return this_str == other_str;
}
};
static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
auto has_min = min_value != std::numeric_limits<int>::min();
auto has_max = max_value != std::numeric_limits<int>::max();
@@ -69,14 +112,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
}
out << "}";
};
std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
[&](const std::string_view & from, const std::string_view & to) {
std::function<void(const string_view &, const string_view &)> uniform_range =
[&](const string_view & from, const string_view & to) {
size_t i = 0;
while (i < from.length() && i < to.length() && from[i] == to[i]) {
i++;
}
if (i > 0) {
out << "\"" << from.substr(0, i) << "\"";
out << "\"" << from.substr(0, i).str() << "\"";
}
if (i < from.length() && i < to.length()) {
if (i > 0) {

View File

@@ -1,39 +1,30 @@
#include "speculative.h"
#include "ggml.h"
#include "llama.h"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include <cstring>
#include <algorithm>
#include <map>
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
struct common_speculative {
struct llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
struct llama_context * ctx_dft;
struct llama_context * ctx;
struct common_sampler * smpl;
llama_batch batch;
llama_tokens prompt_dft;
bool vocab_dft_compatible = true; // whether retokenization is needed
std::map<std::string, std::string> tgt_dft_replacements = {};
llama_tokens prompt;
};
struct common_speculative * common_speculative_init(
struct llama_context * ctx_tgt,
struct llama_context * ctx_dft) {
auto * result = new common_speculative {
/* .ctx_tgt = */ ctx_tgt,
/* .ctx_dft = */ ctx_dft,
/* .smpl = */ nullptr,
/* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
/* .prompt_dft = */ {},
/* .vocab_dft_compatible = */ false,
/* .ctx = */ ctx_dft,
/* .smpl = */ nullptr,
/* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
/* .prompt = */ {},
};
// TODO: optimize or pass from outside?
@@ -68,9 +59,6 @@ struct common_speculative * common_speculative_init(
}
#endif
result->vocab_dft_compatible = common_speculative_are_compatible(ctx_tgt, ctx_dft);
LOG_DBG("vocab_dft_compatible = %d\n", result->vocab_dft_compatible);
return result;
}
@@ -87,8 +75,8 @@ void common_speculative_free(struct common_speculative * spec) {
}
bool common_speculative_are_compatible(
const struct llama_context * ctx_tgt,
const struct llama_context * ctx_dft) {
const struct llama_context * ctx_tgt,
const struct llama_context * ctx_dft) {
const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
const struct llama_model * model_dft = llama_get_model(ctx_dft);
@@ -102,32 +90,31 @@ bool common_speculative_are_compatible(
LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
if (vocab_type_tgt != vocab_type_dft) {
LOG_DBG("%s: draft model vocab type must match target model to use speculation but ", __func__);
LOG_DBG("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
"vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
return false;
}
if (
llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
) {
LOG_DBG("%s: draft model special tokens must match target model to use speculation\n", __func__);
llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) {
LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__);
LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt));
LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft));
return false;
}
{
const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
const int vocab_diff = n_vocab_tgt > n_vocab_dft
? n_vocab_tgt - n_vocab_dft
: n_vocab_dft - n_vocab_tgt;
const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
LOG_DBG("%s: draft model vocab must closely match target model to use speculation but ", __func__);
LOG_DBG("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
"target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
__func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
return false;
}
@@ -135,8 +122,8 @@ bool common_speculative_are_compatible(
const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
LOG_DBG("%s: draft model vocab must match target model to use speculation but ", __func__);
LOG_DBG("token %d content differs - target '%s', draft '%s'\n", i,
LOG_ERR("%s: draft vocab vocab must match target vocab to use speculation but "
"token %d content differs - target '%s', draft '%s'\n", __func__, i,
common_token_to_piece(ctx_tgt, i).c_str(),
common_token_to_piece(ctx_dft, i).c_str());
return false;
@@ -147,93 +134,30 @@ bool common_speculative_are_compatible(
return true;
}
void common_speculative_add_replacement_tgt_dft(
struct common_speculative * spec,
const char *source, const char *dest) {
spec->tgt_dft_replacements[source] = dest;
}
static std::string replace_to_dft(
struct common_speculative * spec,
const std::string& input) {
std::string result = input;
for (const auto & pair : spec->tgt_dft_replacements) {
size_t pos = result.find(pair.first);
while (pos != std::string::npos) {
result.replace(pos, pair.first.length(), pair.second);
pos = result.find(pair.first, pos + pair.second.length());
}
}
return result;
}
static std::string replace_to_tgt(
struct common_speculative * spec,
const std::string& input) {
std::string result = input;
for (const auto& pair : spec->tgt_dft_replacements) {
size_t pos = result.find(pair.second);
while (pos != std::string::npos) {
result.replace(pos, pair.second.length(), pair.first);
pos = result.find(pair.second, pos + pair.first.length());
}
}
return result;
}
llama_tokens common_speculative_gen_draft(
struct common_speculative * spec,
struct common_speculative_params params,
const llama_tokens & prompt_tgt_main_model, // specified in target model vocab
const llama_tokens & prompt_tgt,
llama_token id_last) {
auto & batch = spec->batch;
auto & ctx_tgt = spec->ctx_tgt;
auto & ctx_dft = spec->ctx_dft;
auto & ctx = spec->ctx;
auto & smpl = spec->smpl;
auto & prompt_dft = spec->prompt_dft;
auto * mem_dft = llama_get_memory(ctx_dft);
auto & prompt = spec->prompt;
int reuse_i = 0;
int reuse_n = 0;
const int n_ctx = llama_n_ctx(ctx_dft) - params.n_draft;
llama_tokens prompt_tgt_draft_model;
if (!spec->vocab_dft_compatible) {
std::string text;
text = common_detokenize(ctx_tgt, prompt_tgt_main_model, true);
text = replace_to_dft(spec, text);
LOG_DBG("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str());
prompt_tgt_draft_model = common_tokenize(ctx_dft, text, false, true);
// convert id_last to draft vocab. llama_detokenize is called directly to avoid an allocation
const auto * model_tgt = llama_get_model(ctx_tgt);
const auto * vocab_tgt = llama_model_get_vocab(model_tgt);
int32_t n_chars = llama_detokenize(vocab_tgt, &id_last, 1, nullptr, 0, false, false);
GGML_ASSERT(n_chars < 0 && "failed to detokenize id_last");
text.resize(-n_chars);
llama_detokenize(vocab_tgt, &id_last, 1, text.data(), text.size(), false, false);
text = replace_to_dft(spec, text);
LOG_DBG("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str());
id_last = common_tokenize(ctx_dft, text, false, true)[0];
}
// prompt_tgt's tokens will always be compatible with ctx_dft
const llama_tokens &prompt_tgt =
spec->vocab_dft_compatible ? prompt_tgt_main_model : prompt_tgt_draft_model;
const int n_ctx = llama_n_ctx(ctx) - params.n_draft;
const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);
// reuse as much as possible from the old draft context
// ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
for (int i = 0; i < (int) prompt_dft.size(); ++i) {
for (int i = 0; i < (int) prompt.size(); ++i) {
int cur = 0;
while (i_start + cur < (int) prompt_tgt.size() &&
i + cur < (int) prompt_dft.size() &&
prompt_tgt[i_start + cur] == prompt_dft[i + cur]) {
i + cur < (int) prompt.size() &&
prompt_tgt[i_start + cur] == prompt[i + cur]) {
cur++;
}
@@ -243,20 +167,21 @@ llama_tokens common_speculative_gen_draft(
}
}
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());
llama_tokens result;
result.reserve(params.n_draft);
if (reuse_n == 0) {
llama_memory_clear(mem_dft, false);
prompt_dft.clear();
llama_kv_self_clear(ctx);
prompt.clear();
} else {
// this happens when a previous draft has been discarded (for example, due to being too small), but the
// target model agreed with it. in this case, we simply pass back the previous results to save compute
if (reuse_i + reuse_n < (int) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) {
result.push_back(prompt_dft[i]);
if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
result.push_back(prompt[i]);
if (params.n_draft <= (int) result.size()) {
break;
@@ -267,15 +192,16 @@ llama_tokens common_speculative_gen_draft(
}
if (reuse_i > 0) {
llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);
llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
}
if (reuse_n < (int) prompt_dft.size()) {
llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
if (reuse_n < (int) prompt.size()) {
llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
prompt.erase(prompt.begin() + reuse_n, prompt.end());
}
}
@@ -286,28 +212,28 @@ llama_tokens common_speculative_gen_draft(
//LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);
prompt_dft.push_back(prompt_tgt[i]);
prompt.push_back(prompt_tgt[i]);
}
// we should rarely end-up here during normal decoding
if (batch.n_tokens > 0) {
//LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
llama_decode(ctx_dft, batch);
llama_decode(ctx, batch);
}
const llama_pos n_past = prompt_dft.size();
const llama_pos n_past = prompt.size();
LOG_DBG("%s: n_past = %d\n", __func__, n_past);
common_batch_clear(batch);
common_batch_add (batch, id_last, n_past, { 0 }, true);
prompt_dft.push_back(id_last);
prompt.push_back(id_last);
LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
//LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());
llama_decode(ctx_dft, batch);
llama_decode(ctx, batch);
common_sampler_reset(smpl);
@@ -315,13 +241,13 @@ llama_tokens common_speculative_gen_draft(
for (int i = 0; i < params.n_draft; ++i) {
common_batch_clear(batch);
common_sampler_sample(smpl, ctx_dft, 0, true);
common_sampler_sample(smpl, ctx, 0, true);
const auto * cur_p = common_sampler_get_candidates(smpl);
for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
}
// add drafted token for each sequence
@@ -343,19 +269,10 @@ llama_tokens common_speculative_gen_draft(
common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
// evaluate the drafted tokens on the draft model
llama_decode(ctx_dft, batch);
llama_decode(ctx, batch);
prompt_dft.push_back(id);
prompt.push_back(id);
}
if (!spec->vocab_dft_compatible) {
std::string detokenized = common_detokenize(ctx_dft, result, true);
detokenized = replace_to_tgt(spec, detokenized);
LOG_DBG("draft->main detokenized string: '%s'\n", detokenized.c_str());
result = common_tokenize(ctx_tgt, detokenized, false, true);
if (result.size() > (size_t)params.n_draft) {
result.resize(params.n_draft);
}
}
return result;
}

View File

@@ -12,10 +12,7 @@ struct common_speculative_params {
float p_min = 0.75f; // min probability required to accept a token in the draft
};
struct common_speculative * common_speculative_init(
struct llama_context * ctx_tgt,
struct llama_context * ctx_dft
);
struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
void common_speculative_free(struct common_speculative * spec);
@@ -23,10 +20,6 @@ bool common_speculative_are_compatible(
const struct llama_context * ctx_tgt,
const struct llama_context * ctx_dft);
void common_speculative_add_replacement_tgt_dft(
struct common_speculative * spec,
const char *source, const char *dest);
// sample up to n_draft tokens and add them to the batch using the draft model
llama_tokens common_speculative_gen_draft(
struct common_speculative * spec,

File diff suppressed because it is too large Load Diff

View File

@@ -7,6 +7,7 @@ import pathlib
import re
import requests
import sys
import json
import shutil
import argparse
@@ -59,10 +60,6 @@ parser.add_argument(
"--full", action="store_true",
help="download full list of models - make sure you have access to all of them",
)
parser.add_argument(
"--check-missing", action="store_true",
help="only check for missing pre-tokenizer hashes",
)
parser.add_argument(
"hf_token",
help="optional HF token",
@@ -72,11 +69,8 @@ args = parser.parse_args()
hf_token = args.hf_token if args.hf_token is not None else hf_token
if hf_token is None:
logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
if args.check_missing and args.full:
logger.warning("Downloading full list of models requested, ignoring --check-missing!")
args.check_missing = False
logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token")
sys.exit(1)
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
@@ -134,11 +128,6 @@ models = [
{"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
{"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
{"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
{"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
{"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
{"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
{"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
]
# some models are known to be broken upstream, so we will skip them as exceptions
@@ -147,22 +136,12 @@ pre_computed_hashes = [
{"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b"},
{"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.5-Air", "chkhsh": "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902"},
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
{"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
{"name": "hunyuan-dense", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-4B-Instruct", "chkhsh": "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6"},
# falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
{"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
]
def download_file_with_auth(url, token, save_path):
headers = {"Authorization": f"Bearer {token}"} if token else None
headers = {"Authorization": f"Bearer {token}"}
response = sess.get(url, headers=headers)
response.raise_for_status()
os.makedirs(os.path.dirname(save_path), exist_ok=True)
@@ -232,19 +211,18 @@ if not args.full:
all_models = models.copy()
models = [model for model in all_models if model["name"] not in existing_models]
if not args.check_missing:
logging.info(f"Downloading {len(models)} models...")
for model in models:
try:
download_model(model)
except Exception as e:
logger.error(f"Failed to download model {model['name']}. Error: {e}")
logging.info(f"Downloading {len(models)} models...")
for model in models:
try:
download_model(model)
except Exception as e:
logger.error(f"Failed to download model {model['name']}. Error: {e}")
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
src_ifs = ""
for model in [*pre_computed_hashes, *all_models]:
for model in [*all_models, *pre_computed_hashes]:
name = model["name"]
tokt = model["tokt"]
chkhsh = model.get("chkhsh")
@@ -252,6 +230,11 @@ for model in [*pre_computed_hashes, *all_models]:
if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
continue
# Skip if the tokenizer folder does not exist or there are other download issues previously
if not os.path.exists(f"models/tokenizers/{name}"):
logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
continue
# create the tokenizer
if chkhsh is not None:
# if the model has a pre-computed hash, use it
@@ -261,19 +244,15 @@ for model in [*pre_computed_hashes, *all_models]:
chkhsh = existing_models[name]
else:
# otherwise, compute the hash of the tokenizer
# Fail if the tokenizer folder with config does not exist or there are other download issues previously
if not os.path.isfile(f"models/tokenizers/{name}/tokenizer_config.json"):
raise OSError(f"Config for tokenizer {name} not found. The model may not exist or is not accessible with the provided token.")
try:
logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
if name == "t5":
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
else:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except Exception as e:
raise OSError(f"Error loading tokenizer for model {name}.") from e
except OSError as e:
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded
chktok = tokenizer.encode(CHK_TXT)
chkhsh = sha256(str(chktok).encode()).hexdigest()

View File

@@ -340,7 +340,7 @@ if __name__ == '__main__':
sys.exit(1)
else:
logger.info(f"Loading base model: {dir_base_model.name}")
hparams = ModelBase.load_hparams(dir_base_model, False)
hparams = ModelBase.load_hparams(dir_base_model)
with torch.inference_mode():
try:

View File

@@ -8,7 +8,6 @@
- [DataType Supports](#datatype-supports)
- [Docker](#docker)
- [Linux](#linux)
- [Environment variable setup](#environment-variable-setup)
- [TODO](#todo)
@@ -291,26 +290,5 @@ Authors from Peking University: Bizhao Shi (bshi@pku.edu.cn), Yuxin Yang (yxyang
We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers from Huawei Technologies Co., Ltd for their help during the code development and pull request.
## Environment variable setup
### GGML_CANN_ASYNC_MODE
Enables asynchronous operator submission. Disabled by default.
### GGML_CANN_MEM_POOL
Specifies the memory pool management strategy:
- vmm: Utilizes a virtual memory manager pool. If hardware support for VMM is unavailable, falls back to the legacy (leg) memory pool.
- prio: Employs a priority queue-based memory pool management.
- leg: Uses a fixed-size buffer pool.
### GGML_CANN_DISABLE_BUF_POOL_CLEAN
Controls automatic cleanup of the memory pool. This option is only effective when using the prio or leg memory pool strategies.
### GGML_CANN_WEIGHT_NZ
Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
## TODO
- Support more models and data types.

View File

@@ -757,7 +757,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| Name | Value | Function |
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for intel devices older than Gen 10) |
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |

View File

@@ -1,276 +0,0 @@
> [!IMPORTANT]
> This build documentation is specific only to IBM Z & LinuxONE mainframes (s390x). You can find the build documentation for other architectures: [build.md](build.md).
# Build llama.cpp locally (for s390x)
The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](../include/llama.h).
The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server.
**To get the code:**
```bash
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```
## CPU Build with BLAS
Building llama.cpp with BLAS support is highly recommended as it has shown to provide performance improvements. Make sure to have OpenBLAS installed in your environment.
```bash
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS
cmake --build build --config Release -j $(nproc)
```
**Notes**:
- For faster repeated compilation, install [ccache](https://ccache.dev/)
- By default, VXE/VXE2 is enabled. To disable it (not recommended):
```bash
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS \
-DGGML_VXE=OFF
cmake --build build --config Release -j $(nproc)
```
- By default, NNPA is disabled by default. To enable it:
```bash
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS \
-DGGML_NNPA=ON
cmake --build build --config Release -j $(nproc)
```
- For debug builds:
```bash
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=Debug \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS
cmake --build build --config Debug -j $(nproc)
```
- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:
```bash
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS \
-DBUILD_SHARED_LIBS=OFF
cmake --build build --config Release -j $(nproc)
```
## Getting GGUF Models
All models need to be converted to Big-Endian. You can achieve this in three cases:
1. **Use pre-converted models verified for use on IBM Z & LinuxONE (easiest)**
![File Type - gguf](https://img.shields.io/badge/File_Type-gguf-fff)
You can find popular models pre-converted and verified at [s390x Verified Models](https://huggingface.co/collections/taronaeo/s390x-verified-models-672765393af438d0ccb72a08) or [s390x Runnable Models](https://huggingface.co/collections/taronaeo/s390x-runnable-models-686e951824198df12416017e).
These models have already been converted from `safetensors` to `GGUF` Big-Endian and their respective tokenizers verified to run correctly on IBM z15 and later system.
2. **Convert safetensors model to GGUF Big-Endian directly (recommended)**
![File Type - safetensors](https://img.shields.io/badge/File_Type-safetensors-da1e28)
The model you are trying to convert must be in `safetensors` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct)). Make sure you have downloaded the model repository for this case.
Ensure that you have installed the required packages in advance
```bash
pip3 install -r requirements.txt
```
Convert the `safetensors` model to `GGUF`
```bash
python3 convert_hf_to_gguf.py \
--outfile model-name-be.f16.gguf \
--outtype f16 \
--bigendian \
model-directory/
```
For example,
```bash
python3 convert_hf_to_gguf.py \
--outfile granite-3.3-2b-instruct-be.f16.gguf \
--outtype f16 \
--bigendian \
granite-3.3-2b-instruct/
```
3. **Convert existing GGUF Little-Endian model to Big-Endian**
![File Type - gguf](https://img.shields.io/badge/File_Type-gguf-fff)
The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B GGUF](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.
```bash
python3 gguf-py/gguf/scripts/gguf_convert_endian.py model-name.f16.gguf BIG
```
For example,
```bash
python3 gguf-py/gguf/scripts/gguf_convert_endian.py granite-3.3-2b-instruct-le.f16.gguf BIG
mv granite-3.3-2b-instruct-le.f16.gguf granite-3.3-2b-instruct-be.f16.gguf
```
**Notes:**
- The GGUF endian conversion script may not support all data types at the moment and may fail for some models/quantizations. When that happens, please try manually converting the safetensors model to GGUF Big-Endian via Step 2.
## IBM Accelerators
### 1. SIMD Acceleration
Only available in IBM z15 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14/arch12. In such systems, the APIs can still run but will use a scalar implementation.
### 2. NNPA Vector Intrinsics Acceleration
Only available in IBM z16 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
### 3. zDNN Accelerator
_Only available in IBM z16 / LinuxONE 4 or later system. No support currently available._
### 4. Spyre Accelerator
_Only available with IBM z17 / LinuxONE 5 or later system. No support currently available._
## Performance Tuning
### 1. Virtualization Setup
It is strongly recommended to use only LPAR (Type-1) virtualization to get the most performance.
Note: Type-2 virtualization is not supported at the moment, while you can get it running, the performance will not be the best.
### 2. IFL (Core) Count
It is recommended to allocate a minimum of 8 shared IFLs assigned to the LPAR. Increasing the IFL count past 8 shared IFLs will only improve Prompt Processing performance but not Token Generation.
Note: IFL count does not equate to vCPU count.
### 3. SMT vs NOSMT (Simultaneous Multithreading)
It is strongly recommended to disable SMT via the kernel boot parameters as it negatively affects performance. Please refer to your Linux distribution's guide on disabling SMT via kernel boot parameters.
### 4. BLAS vs NOBLAS
IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongly recommended to use BLAS.
## Frequently Asked Questions (FAQ)
1. I'm getting the following error message while trying to load a model: `gguf_init_from_file_impl: failed to load model: this GGUF file version 50331648 is extremely large, is there a mismatch between the host and model endianness?`
Answer: Please ensure that the model you have downloaded/converted is GGUFv3 Big-Endian. These models are usually denoted with the `-be` suffix, i.e., `granite-3.3-2b-instruct-be.F16.gguf`.
You may refer to the [Getting GGUF Models](#getting-gguf-models) section to manually convert a `safetensors` model to `GGUF` Big Endian.
2. I'm getting extremely poor performance when running inference on a model
Answer: Please refer to the [Appendix B: SIMD Support Matrix](#appendix-b-simd-support-matrix) to check if your model quantization is supported by SIMD acceleration.
3. I'm building on IBM z17 and getting the following error messages: `invalid switch -march=z17`
Answer: Please ensure that your GCC compiler is of minimum GCC 15.1.0 version, and have `binutils` updated to the latest version. If this does not fix the problem, kindly open an issue.
4. Failing to install the `sentencepiece` package using GCC 15+
Answer: The `sentencepiece` team are aware of this as seen in [this issue](https://github.com/google/sentencepiece/issues/1108).
As a temporary workaround, please run the installation command with the following environment variables.
```bash
export CXXFLAGS="-include cstdint"
```
For example,
```bash
CXXFLAGS="-include cstdint" pip3 install -r requirements.txt
```
5. `-DGGML_NNPA=ON` generates gibberish output
Answer: We are aware of this as detailed in [this issue](https://github.com/ggml-org/llama.cpp/issues/14877). Please either try reducing the number of threads, or disable the compile option using `-DGGML_NNPA=OFF`.
## Getting Help on IBM Z & LinuxONE
1. **Bugs, Feature Requests**
Please file an issue in llama.cpp and ensure that the title contains "s390x".
2. **Other Questions**
Please reach out directly to [aionz@us.ibm.com](mailto:aionz@us.ibm.com).
## Appendix A: Hardware Support Matrix
| | Support | Minimum Compiler Version |
| ------- | ------- | ------------------------ |
| IBM z15 | ✅ | |
| IBM z16 | ✅ | |
| IBM z17 | ✅ | GCC 15.1.0 |
- ✅ - supported and verified to run as intended
- 🚫 - unsupported, we are unlikely able to provide support
## Appendix B: SIMD Support Matrix
| | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
| ---------- | ----------- | ---- | ---- | ----- |
| FP32 | ✅ | ✅ | ❓ | ❓ |
| FP16 | ✅ | ✅ | ❓ | ❓ |
| BF16 | 🚫 | 🚫 | ❓ | ❓ |
| Q4_0 | ✅ | ✅ | ❓ | ❓ |
| Q4_1 | ✅ | ✅ | ❓ | ❓ |
| Q5_0 | 🚫 | 🚫 | ❓ | ❓ |
| Q5_1 | 🚫 | 🚫 | ❓ | ❓ |
| Q8_0 | ✅ | ✅ | ❓ | ❓ |
| Q2_K | 🚫 | 🚫 | ❓ | ❓ |
| Q3_K | ✅ | ✅ | ❓ | ❓ |
| Q4_K | ✅ | ✅ | ❓ | ❓ |
| Q5_K | ✅ | ✅ | ❓ | ❓ |
| Q6_K | ✅ | ✅ | ❓ | ❓ |
| TQ1_0 | 🚫 | 🚫 | ❓ | ❓ |
| TQ2_0 | 🚫 | 🚫 | ❓ | ❓ |
| IQ2_XXS | 🚫 | 🚫 | ❓ | ❓ |
| IQ2_XS | 🚫 | 🚫 | ❓ | ❓ |
| IQ2_S | 🚫 | 🚫 | ❓ | ❓ |
| IQ3_XXS | 🚫 | 🚫 | ❓ | ❓ |
| IQ3_S | 🚫 | 🚫 | ❓ | ❓ |
| IQ1_S | 🚫 | 🚫 | ❓ | ❓ |
| IQ1_M | 🚫 | 🚫 | ❓ | ❓ |
| IQ4_NL | ✅ | ✅ | ❓ | ❓ |
| IQ4_XS | ✅ | ✅ | ❓ | ❓ |
| FP32->FP16 | 🚫 | ✅ | ❓ | ❓ |
| FP16->FP32 | 🚫 | ✅ | ❓ | ❓ |
- ✅ - acceleration available
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on July 25, 2025.

View File

@@ -1,9 +1,5 @@
# Build llama.cpp locally
The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](../include/llama.h).
The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server.
**To get the Code:**
```bash
@@ -68,9 +64,6 @@ cmake --build build --config Release
cmake --build build-x64-windows-llvm-release
```
- Curl usage is enabled by default and can be turned off with `-DLLAMA_CURL=OFF`. Otherwise you need to install development libraries for libcurl.
- **Debian / Ubuntu:** `sudo apt-get install libcurl4-openssl-dev` # (or `libcurl4-gnutls-dev` if you prefer GnuTLS)
- **Fedora / RHEL / Rocky / Alma:** `sudo dnf install libcurl-devel`
- **Arch / Manjaro:** `sudo pacman -S curl` # includes libcurl headers
## BLAS Build
@@ -308,8 +301,9 @@ On Linux it is possible to use unified memory architecture (UMA) to share main m
## Vulkan
### For Windows Users:
**w64devkit**
**Windows**
### w64devkit
Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).
@@ -336,7 +330,7 @@ cmake -B build -DGGML_VULKAN=ON
cmake --build build --config Release
```
**Git Bash MINGW64**
### Git Bash MINGW64
Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings
@@ -359,8 +353,7 @@ Now you can load the model in conversation mode using `Vulkan`
build/bin/Release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
```
**MSYS2**
### MSYS2
Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
```sh
pacman -S git \
@@ -376,9 +369,9 @@ cmake -B build -DGGML_VULKAN=ON
cmake --build build --config Release
```
### For Docker users:
**With docker**:
You don't need to install the Vulkan SDK. It will be installed inside the container.
You don't need to install Vulkan SDK. It will be installed inside the container.
```sh
# Build the image
@@ -388,29 +381,32 @@ docker build -t llama-cpp-vulkan --target light -f .devops/vulkan.Dockerfile .
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
### For Linux users:
**Without docker**:
First, follow the official LunarG instructions for the installation and setup of the Vulkan SDK in the [Getting Started with the Linux Tarball Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html) guide.
Firstly, you need to make sure you have installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html)
> [!IMPORTANT]
> After completing the first step, ensure that you have used the `source` command on the `setup_env.sh` file inside of the Vulkan SDK in your current terminal session. Otherwise, the build won't work. Additionally, if you close out of your terminal, you must perform this step again if you intend to perform a build. However, there are ways to make this persistent. Refer to the Vulkan SDK guide linked in the first step for more information about any of this.
For example, on Ubuntu 22.04 (jammy), use the command below:
Second, after verifying that you have followed all of the SDK installation/setup steps, use this command to make sure before proceeding:
```bash
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
apt update -y
apt-get install -y vulkan-sdk
# To verify the installation, use the command below:
vulkaninfo
```
Then, assuming you have `cd` into your llama.cpp folder and there are no errors with running `vulkaninfo`, you can proceed to build llama.cpp using the CMake commands below:
Alternatively your package manager might be able to provide the appropriate libraries.
For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
Then, build llama.cpp using the cmake command below:
```bash
cmake -B build -DGGML_VULKAN=1
cmake --build build --config Release
```
Finally, after finishing your build, you should be able to do something like this:
```bash
# Test the output binary
# "-ngl 99" should offload all of the layers to GPU for most (if not all) models.
./build/bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -ngl 99
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
# You should see in the output, ggml_vulkan detected your GPU. For example:
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
@@ -557,27 +553,6 @@ ninja
To read documentation for how to build on Android, [click here](./android.md)
## WebGPU [In Progress]
The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The currrent implementation is up-to-date with Dawn commit `bed1a61`.
In the llama.cpp directory, build with CMake:
```
cmake -B build -DGGML_WEBGPU=ON
cmake --build build --config Release
```
### Browser Support
WebGPU allows cross-platform access to the GPU from supported browsers. We utilize [Emscripten](https://emscripten.org/) to compile ggml's WebGPU backend to WebAssembly. Emscripten does not officially support WebGPU bindings yet, but Dawn currently maintains its own WebGPU bindings called emdawnwebgpu.
Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/src/emdawnwebgpu/) to download or build the emdawnwebgpu package (Note that it might be safer to build the emdawbwebgpu package locally, so that it stays in sync with the version of Dawn you have installed above). When building using CMake, the path to the emdawnwebgpu port file needs to be set with the flag `EMDAWNWEBGPU_DIR`.
## IBM Z & LinuxONE
To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md)
## Notes about GPU-accelerated backends
The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.

View File

@@ -23,19 +23,11 @@ The convert script reads the model configuration, tokenizer, tensor names+data a
The required steps to implement for an HF model are:
1. Define the model `ModelBase.register` annotation in a new `TextModel` or `MmprojModel` subclass, example:
1. Define the model `Model.register` annotation in a new `Model` subclass, example:
```python
@ModelBase.register("MyModelForCausalLM")
class MyModel(TextModel):
model_arch = gguf.MODEL_ARCH.MYMODEL
```
or
```python
@ModelBase.register("MyModelForConditionalGeneration")
class MyModel(MmprojModel):
@Model.register("MyModelForCausalLM")
class MyModel(Model):
model_arch = gguf.MODEL_ARCH.MYMODEL
```
@@ -83,31 +75,28 @@ block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
`transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF.
Depending on the model configuration, tokenizer, code and tensors layout, you will have to override:
- `TextModel#set_gguf_parameters`
- `MmprojModel#set_gguf_parameters`
- `ModelBase#set_vocab`
- `ModelBase#modify_tensors`
- `Model#set_gguf_parameters`
- `Model#set_vocab`
- `Model#write_tensors`
NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the convention and several tools like `quantize` expect this to proceed the weights.
### 2. Define the model architecture in `llama.cpp`
The model params and tensors layout must be defined in `llama.cpp` source files:
1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
2. In `src/llama-arch.cpp`:
- Add the architecture name to the `LLM_ARCH_NAMES` map.
- Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.
The model params and tensors layout must be defined in `llama.cpp`:
1. Define a new `llm_arch`
2. Define the tensors layout in `LLM_TENSOR_NAMES`
3. Add any non-standard metadata in `llm_load_hparams`
4. Create the tensors for inference in `llm_load_tensors`
5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
### 3. Build the GGML graph implementation
This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `src/llama-model.cpp`.
Create a new struct that inherits from `llm_graph_context` and implement the graph-building logic in its constructor.
Have a look at existing implementations like `llm_build_llama`, `llm_build_dbrx` or `llm_build_bert`.
Then, in the `llama_model::build_graph` method, add a case for your architecture to instantiate your new graph-building struct.
This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR.

View File

@@ -25,9 +25,6 @@ Additionally, there the following images, similar to the above:
- `ghcr.io/ggml-org/llama.cpp:full-intel`: Same as `full` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-intel`: Same as `light` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-intel`: Same as `server` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-vulkan`: Same as `full` but compiled with Vulkan support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-vulkan`: Same as `light` but compiled with Vulkan support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-vulkan`: Same as `server` but compiled with Vulkan support. (platforms: `linux/amd64`)
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now).
@@ -110,7 +107,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment
The defaults are:
- `MUSA_VERSION` set to `rc4.2.0`
- `MUSA_VERSION` set to `rc4.0.1`
The resulting images, are essentially the same as the non-MUSA images:

View File

@@ -11,7 +11,7 @@ Function calling is supported for all models (see https://github.com/ggml-org/ll
- Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
- Functionary v3.1 / v3.2
- Hermes 2/3, Qwen 2.5
- Qwen 2.5 Coder
- Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034)
- Mistral Nemo
- Firefunction v2
- Command R7B

View File

@@ -1,42 +1,28 @@
# Install pre-built version of llama.cpp
| Install via | Windows | Mac | Linux |
|-------------|---------|-----|-------|
| Winget | ✅ | | |
| Homebrew | | ✅ | ✅ |
| MacPorts | | ✅ | |
| Nix | | ✅ | ✅ |
## Homebrew
## Winget (Windows)
```sh
winget install llama.cpp
```
The package is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/issues/8188
## Homebrew (Mac and Linux)
On Mac and Linux, the homebrew package manager can be used via
```sh
brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668
## MacPorts (Mac)
## MacPorts
```sh
sudo port install llama.cpp
```
see also: https://ports.macports.org/port/llama.cpp/details/
See also: https://ports.macports.org/port/llama.cpp/details/
## Nix
## Nix (Mac and Linux)
On Mac and Linux, the Nix package manager can be used via
```sh
nix profile install nixpkgs#llama-cpp
```
For flake enabled installs.
Or
@@ -48,3 +34,13 @@ nix-env --file '<nixpkgs>' --install --attr llama-cpp
For non-flake enabled installs.
This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
## Flox
On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
```sh
flox install llama-cpp
```
Flox follows the nixpkgs build of llama.cpp.

View File

@@ -97,9 +97,6 @@ NOTE: some models may require large context window, for example: `-c 8192`
# Qwen2-Audio and SeaLLM-Audio
# note: no pre-quantized GGUF this model, as they have very poor result
# ref: https://github.com/ggml-org/llama.cpp/pull/13760
# Mistral's Voxtral
(tool_name) -hf ggml-org/Voxtral-Mini-3B-2507-GGUF
```
**Mixed modalities**:
@@ -110,7 +107,3 @@ NOTE: some models may require large context window, for example: `-c 8192`
(tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF
(tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF
```
## Finding more models:
GGUF models on Huggingface with vision capabilities can be found here: https://huggingface.co/models?pipeline_tag=image-text-to-text&sort=trending&search=gguf

View File

@@ -13,7 +13,7 @@ If there are differences in usage, please refer to the official build [documenta
Clone llama.cpp:
```bash
git clone https://github.com/ggml-org/llama.cpp
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```
@@ -29,8 +29,8 @@ cmake --build build --config Release
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us)
```bash
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-o-2_6
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --minicpmv_version 4
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-o-2_6
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
# quantize int4 version

View File

@@ -1,47 +0,0 @@
## MiniCPM-o 4
### Prepare models and code
Download [MiniCPM-o-4](https://huggingface.co/openbmb/MiniCPM-o-4) PyTorch model from huggingface to "MiniCPM-o-4" folder.
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```
Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
### Usage of MiniCPM-o 4
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-4-gguf) by us)
```bash
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-o-4
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-4 --minicpmv-projector ../MiniCPM-o-4/minicpmv.projector --output-dir ../MiniCPM-o-4/ --minicpmv_version 6
python ./convert_hf_to_gguf.py ../MiniCPM-o-4/model
# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-o-4/model/ggml-model-f16.gguf ../MiniCPM-o-4/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Inference on Linux or Mac
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-o-4/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-4/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-o-4/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-4/mmproj-model-f16.gguf
```

View File

@@ -28,8 +28,8 @@ cmake --build build --config Release
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
```bash
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --minicpmv_version 2
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
# quantize int4 version

View File

@@ -12,7 +12,7 @@ If there are differences in usage, please refer to the official build [documenta
Clone llama.cpp:
```bash
git clone https://github.com/ggml-org/llama.cpp
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```
@@ -28,8 +28,8 @@ cmake --build build --config Release
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
```bash
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-2_6
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --minicpmv_version 3
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-V-2_6
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
# quantize int4 version

View File

@@ -1,47 +0,0 @@
## MiniCPM-V 4
### Prepare models and code
Download [MiniCPM-V-4](https://huggingface.co/openbmb/MiniCPM-V-4) PyTorch model from huggingface to "MiniCPM-V-4" folder.
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```
Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
### Usage of MiniCPM-V 4
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-4-gguf) by us)
```bash
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-4
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-4 --minicpmv-projector ../MiniCPM-V-4/minicpmv.projector --output-dir ../MiniCPM-V-4/ --minicpmv_version 5
python ./convert_hf_to_gguf.py ../MiniCPM-V-4/model
# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-V-4/model/ggml-model-f16.gguf ../MiniCPM-V-4/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Inference on Linux or Mac
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4/mmproj-model-f16.gguf
```

View File

@@ -1,102 +0,0 @@
# GGML Operations
List of GGML operations and backend support status.
## How to add a backend to this table:
1. Run `test-backend-ops support --output csv` with your backend name and redirect output to a csv file in `docs/ops/` (e.g., `docs/ops/CUDA.csv`)
2. Regenerate `/docs/ops.md` via `./scripts/create_ops_docs.py`
Legend:
- ✅ Fully supported by this backend
- 🟡 Partially supported by this backend
- ❌ Not supported by this backend
| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan |
|-----------|------|------|------|------|------|------|------|------|
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 |
| CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 |
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 |
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
| GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 |
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ |
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
| LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ |
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
| NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
| PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 |
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ |
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
| RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ |
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| SET | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
| SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ✅ |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ❌ |
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
| SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ |

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -33,7 +33,6 @@ else()
add_subdirectory(speculative-simple)
add_subdirectory(gen-docs)
add_subdirectory(training)
add_subdirectory(diffusion)
if (NOT GGML_BACKEND_DL)
add_subdirectory(convert-llama2c-to-ggml)
# these examples use the backends directly and cannot be built with dynamic loading

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
set -e
AI_NAME="${AI_NAME:-Miku}"

View File

@@ -116,7 +116,7 @@ if llama_decode(context, batch) != 0 {
}
for i in 1 ..< n_parallel {
llama_memory_seq_cp(llama_get_memory(context), 0, Int32(i), 0, batch.n_tokens)
llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
}
if n_parallel > 1 {

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
set -e

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
set -euo pipefail

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
set -e

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
#
# Temporary script - will be removed in the future

View File

@@ -1,5 +0,0 @@
set(TARGET llama-diffusion-cli)
add_executable(${TARGET} diffusion-cli.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,13 +0,0 @@
# Diffusion Text Generation
This directory contains implementations for Diffusion LLMs (DLLMs)
More Info:
- https://github.com/ggml-org/llama.cpp/pull/14644
- https://github.com/ggml-org/llama.cpp/pull/14771
Example of using Dream architechture: `llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual`
Example of using LLaDA architechture: `llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual`

View File

@@ -1,683 +0,0 @@
#include "arg.h"
#include "chat.h"
#include "common.h"
#include "llama.h"
#include "log.h"
#include <limits.h>
#include <algorithm>
#include <cmath>
#include <cstring>
#include <limits>
#include <random>
#include <string>
#include <vector>
enum diffusion_algorithm { ORIGIN = 0, ENTROPY_BASED = 1, MARGIN_BASED = 2, RANDOM = 3, CONFIDENCE_BASED = 4 };
// Unified transfer scheduling methods
enum transfer_schedule {
TIMESTEP_BASED = 0, // Dream-style: (1.0 - s/t) * remaining
BLOCK_BASED = 1, // LLaDA-style: process in blocks with get_num_transfer_tokens
};
typedef bool (*diffusion_step_callback_t)(int32_t step,
int32_t total_steps,
const llama_token * tokens,
int32_t n_tokens,
void * user_data);
struct diffusion_params {
int32_t steps = 0;
float temperature = 0;
llama_token mask_token_id = LLAMA_TOKEN_NULL;
diffusion_step_callback_t step_callback = nullptr;
void * step_callback_user_data = nullptr;
int32_t seed = 0;
bool visual_mode = false;
bool shift_logits = false; // Shift logits by -1 after decode
float top_p = 0.;
int32_t top_k = 0.;
diffusion_algorithm algorithm = CONFIDENCE_BASED;
transfer_schedule schedule = TIMESTEP_BASED;
float cfg_scale = 0.; // Config scale for classifier-free guidance
float eps = 0.; // Timestep scheduling
int32_t block_length = 0; // Block size (for block scheduling)
float alg_temp = 0; // algorithm temperature (0.0 = deterministic)
bool add_gumbel_noise = false; // Add gumbel noise to the logits if temp > 0.0
int32_t max_length = 0; // Maximum sequence length
};
struct callback_data {
diffusion_params * diff_params;
const llama_vocab * vocab;
int32_t n_input;
};
static float calculate_confidence(const llama_token_data_array & cur_p,
diffusion_algorithm algorithm,
std::mt19937 & rng) {
switch (algorithm) {
case CONFIDENCE_BASED:
return cur_p.data[cur_p.selected].p; // Selected token probability
case ENTROPY_BASED:
{
float entropy = 0.0f;
const float epsilon = 1e-10f;
for (size_t i = 0; i < cur_p.size; i++) {
float prob = cur_p.data[i].p;
entropy += prob * logf(prob + epsilon);
}
return -entropy; // Higher entropy = lower confidence
}
case MARGIN_BASED:
return (cur_p.size > 1) ? cur_p.data[0].p - cur_p.data[1].p : cur_p.data[0].p;
case RANDOM:
{
std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
return uniform(rng); // Random confidence
}
case ORIGIN:
return cur_p.data[cur_p.selected].p;
default:
return 0.0f;
}
}
// Unified transfer count calculation function
static int32_t calculate_transfer_count(int32_t step,
int32_t total_steps,
int32_t remaining_masked,
transfer_schedule schedule,
float eps,
const std::vector<int32_t> & num_transfer_tokens = {}) {
switch (schedule) {
case TIMESTEP_BASED:
{
float t = 1.0f - (float) step / total_steps * (1.0f - eps);
float s = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps);
float p_transfer = (step < total_steps - 1) ? (1.0f - s / t) : 1.0f;
return (int32_t) (remaining_masked * p_transfer);
}
case BLOCK_BASED:
if (!num_transfer_tokens.empty() && step < (int32_t) num_transfer_tokens.size()) {
return num_transfer_tokens[step];
}
return remaining_masked / (total_steps - step); // Fallback
default:
return remaining_masked / (total_steps - step);
}
}
static bool diffusion_step_callback(int32_t step,
int32_t total_steps,
const llama_token * tokens,
int32_t n_tokens,
void * user_data) {
(void) user_data;
callback_data * data = static_cast<callback_data *>(user_data);
auto print_progress_bar = [](int32_t step, int32_t total_steps) {
int progress_percent = (step * 100) / total_steps;
int progress_bars = (step * 50) / total_steps;
LOG_INF("\rdiffusion step: %d/%d [%s%s] %d%%",
step,
total_steps,
std::string(progress_bars, '=').c_str(),
std::string(50 - progress_bars, ' ').c_str(),
progress_percent);
};
if (data->diff_params->visual_mode) {
// Visual mode: clear
LOG_INF("\033[2J\033[H"); // Clear screen and move cursor to top-left
print_progress_bar(step, total_steps);
LOG_INF("\n");
std::string current_text = " ";
for (int32_t i = data->n_input; i < n_tokens; i++) {
std::string token_str;
if (tokens[i] != llama_vocab_mask(data->vocab)) {
char piece[256];
int n_chars = llama_token_to_piece(data->vocab, tokens[i], piece, sizeof(piece), 0, false);
if (n_chars > 0) {
piece[n_chars] = '\0';
token_str = piece;
}
} else {
token_str = " ";
}
current_text += token_str;
}
LOG_INF("%s\n", current_text.c_str());
} else {
print_progress_bar(step, total_steps);
}
return true;
}
static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) {
if (temperature == 0.0f) {
return;
}
std::uniform_real_distribution<double> uniform(0.0, 1.0);
for (int32_t i = 0; i < n_vocab; i++) {
double noise = uniform(rng);
// Prevent log(0)
noise = std::max(noise, 1e-20);
double gumbel_noise = std::pow(-std::log(noise), temperature);
logits[i] = std::exp(logits[i]) / gumbel_noise;
}
}
static std::vector<int32_t> get_num_transfer_tokens(int32_t mask_count, int32_t steps) {
std::vector<int32_t> num_transfer_tokens(steps);
int32_t base = mask_count / steps;
int32_t remainder = mask_count % steps;
for (int32_t i = 0; i < steps; i++) {
num_transfer_tokens[i] = base + (i < remainder ? 1 : 0);
}
return num_transfer_tokens;
}
static void diffusion_generate(llama_context * ctx,
const llama_token * input_tokens,
llama_token * output_tokens,
int32_t n_input,
const diffusion_params & params,
int32_t & n_generated) {
n_generated = 0;
if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || params.max_length <= n_input) {
return;
}
const llama_model * model = llama_get_model(ctx);
// Initialize with input and pad with mask tokens
std::copy(input_tokens, input_tokens + n_input, output_tokens);
std::fill(output_tokens + n_input, output_tokens + params.max_length, params.mask_token_id);
std::mt19937 rng(params.seed);
llama_set_causal_attn(ctx, false);
int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
std::vector<llama_token_data> candidates(n_vocab);
std::vector<llama_token_data> conf_candidates;
conf_candidates.reserve(params.max_length);
std::vector<int32_t> mask_positions;
mask_positions.reserve(params.max_length);
// Setup sampler chain
struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
if (params.top_k > 0) {
llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
}
if (params.top_p < 1.0f) {
llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
}
if (params.temperature > 0.0f) {
llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
}
llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));
struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);
llama_batch batch = llama_batch_init(params.max_length, 0, 1);
batch.n_tokens = params.max_length;
// Pre-allocate buffers for CFG if needed
int32_t logits_size = n_vocab * params.max_length;
std::vector<float> cond_logits_buffer;
std::vector<llama_token> un_x_buffer;
if (params.cfg_scale > 0.0f) {
cond_logits_buffer.resize(logits_size);
un_x_buffer.resize(params.max_length);
}
// For block-based processing
std::vector<int32_t> num_transfer_tokens;
int32_t num_blocks = 1;
int32_t steps_per_block = params.steps;
if (params.schedule == BLOCK_BASED) {
GGML_ASSERT(params.max_length % params.block_length == 0);
num_blocks = params.max_length / params.block_length;
GGML_ASSERT(params.steps % num_blocks == 0);
steps_per_block = params.steps / num_blocks;
}
std::vector<float> confidence(params.max_length);
int64_t total_sampling_time = 0;
int64_t total_time = 0;
int64_t time_start = ggml_time_us();
for (int block_num = 0; block_num < num_blocks; block_num++) {
int32_t block_start = (params.schedule == BLOCK_BASED) ? n_input + block_num * params.block_length : 0;
int32_t block_end = (params.schedule == BLOCK_BASED) ?
std::min(n_input + (block_num + 1) * params.block_length, params.max_length) :
params.max_length;
// Count masked tokens in current block for block-based processing
if (params.schedule == BLOCK_BASED) {
int32_t block_mask_count = 0;
for (int i = block_start; i < block_end; i++) {
if (output_tokens[i] == params.mask_token_id) {
block_mask_count++;
}
}
num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block);
}
for (int32_t step = 0; step < steps_per_block; step++) {
int32_t global_step = block_num * steps_per_block + step;
if (params.step_callback) {
if (!params.step_callback(
global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) {
break;
}
}
// Setup batch
for (int32_t i = 0; i < params.max_length; i++) {
batch.token[i] = output_tokens[i];
batch.pos[i] = i;
batch.n_seq_id[i] = 1;
batch.seq_id[i][0] = 0;
batch.logits[i] = 1;
}
float * logits = nullptr;
if (params.cfg_scale > 0.0f) {
int ret = llama_decode(ctx, batch);
if (ret != 0) {
LOG_ERR("Failed to generate conditional");
break;
}
float * cond_logits_ptr = llama_get_logits(ctx);
std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float));
// Unconditional generation (mask input)
std::copy(output_tokens, output_tokens + params.max_length, un_x_buffer.begin());
for (int32_t i = 0; i < n_input; i++) {
un_x_buffer[i] = params.mask_token_id;
}
for (int32_t i = 0; i < params.max_length; i++) {
batch.token[i] = un_x_buffer[i];
}
ret = llama_decode(ctx, batch);
if (ret != 0) {
LOG_ERR("Failed to generate unconditional");
break;
}
float * uncond_logits = llama_get_logits(ctx);
// Apply CFG
for (int32_t i = 0; i < logits_size; i++) {
cond_logits_buffer[i] =
uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]);
}
logits = cond_logits_buffer.data();
} else {
int ret = llama_decode(ctx, batch);
if (ret != 0) {
LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, global_step, ret);
break;
}
logits = llama_get_logits(ctx);
}
if (!logits) {
LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
break;
}
auto get_logits_for_pos = [&](int32_t pos) -> const float * {
if (params.shift_logits) {
return pos == 0 ? logits : logits + (pos - 1) * n_vocab;
}
return logits + (pos) *n_vocab;
};
int64_t time_start_sampling = ggml_time_us();
mask_positions.clear();
for (int32_t i = 0; i < params.max_length; i++) {
if (output_tokens[i] == params.mask_token_id) {
// For block-based, only consider current block
if (params.schedule != BLOCK_BASED || (i >= block_start && i < block_end)) {
mask_positions.push_back(i);
}
}
}
if (mask_positions.empty()) {
break;
}
if (params.add_gumbel_noise && params.temperature > 0.0f) {
add_gumbel_noise(logits, n_vocab, params.temperature, rng);
}
if (params.algorithm == ORIGIN) {
int32_t transfer_count = calculate_transfer_count(
step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
float p_transfer = (float) transfer_count / mask_positions.size();
for (int32_t pos : mask_positions) {
if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
const float * pos_logits = get_logits_for_pos(pos);
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
candidates[token_id].id = token_id;
candidates[token_id].logit = pos_logits[token_id];
candidates[token_id].p = 0.0f;
}
llama_token_data_array cur_p = {
candidates.data(),
(size_t) n_vocab,
-1,
false,
};
llama_sampler_apply(sampler, &cur_p);
output_tokens[pos] = cur_p.data[cur_p.selected].id;
}
}
} else {
std::vector<std::pair<float, int32_t>> confidences;
std::vector<llama_token> sampled_tokens(mask_positions.size());
for (size_t i = 0; i < mask_positions.size(); i++) {
int32_t pos = mask_positions[i];
const float * pos_logits = get_logits_for_pos(pos);
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
candidates[token_id].logit = pos_logits[token_id];
candidates[token_id].p = 0.0f;
candidates[token_id].id = token_id;
}
llama_token_data_array cur_p = {
candidates.data(),
candidates.size(),
-1,
false,
};
llama_sampler_apply(sampler, &cur_p);
llama_token sampled_token = cur_p.data[cur_p.selected].id;
float conf = calculate_confidence(cur_p, params.algorithm, rng);
sampled_tokens[i] = sampled_token;
confidences.emplace_back(conf, i);
}
int32_t transfer_count = calculate_transfer_count(
step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
if (transfer_count > 0) {
if (params.alg_temp == 0.0f) {
std::partial_sort(confidences.begin(),
confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()),
confidences.end(),
[](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
if (a.first != b.first) {
return a.first > b.first;
}
return a.second < b.second;
});
for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
int32_t mask_idx = confidences[i].second;
int32_t pos = mask_positions[mask_idx];
output_tokens[pos] = sampled_tokens[mask_idx];
}
} else {
conf_candidates.clear();
for (size_t i = 0; i < confidences.size(); i++) {
float conf_logit = confidences[i].first / params.alg_temp;
conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f });
}
llama_token_data_array conf_array = {
conf_candidates.data(),
conf_candidates.size(),
-1,
false,
};
for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
llama_sampler_apply(dist_sampler, &conf_array);
int32_t selected_idx = conf_array.selected;
int32_t mask_idx = selected_idx;
int32_t pos = mask_positions[mask_idx];
output_tokens[pos] = sampled_tokens[mask_idx];
conf_candidates[selected_idx].p = 0.0f;
conf_array.selected = -1;
}
}
}
}
int64_t time_end_sampling = ggml_time_us();
total_sampling_time += time_end_sampling - time_start_sampling;
}
}
int64_t time_end = ggml_time_us();
total_time += time_end - time_start;
LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
total_time / 1000.0,
total_time / 1000.0 / params.steps,
total_sampling_time / 1000.0 / params.steps);
llama_batch_free(batch);
llama_sampler_free(sampler);
llama_sampler_free(dist_sampler);
n_generated = params.max_length;
}
static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) {
if (!use_chat_template) {
return prompt;
}
auto chat_templates = common_chat_templates_init(model, "");
common_chat_templates_inputs inputs;
common_chat_msg user_msg;
user_msg.role = "user";
user_msg.content = prompt;
inputs.add_generation_prompt = true;
inputs.messages.push_back(user_msg);
auto result = common_chat_templates_apply(chat_templates.get(), inputs);
return result.prompt;
}
int main(int argc, char ** argv) {
ggml_time_init();
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DIFFUSION)) {
return 1;
}
common_init();
llama_backend_init();
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = params.n_gpu_layers;
model_params.devices = params.devices.data();
model_params.use_mmap = params.use_mmap;
model_params.use_mlock = params.use_mlock;
model_params.check_tensors = params.check_tensors;
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
if (!model) {
LOG_ERR("error: failed to load model '%s'\n", params.model.path.c_str());
return 1;
}
if (!llama_model_is_diffusion(model)) {
LOG_ERR("error: unsupported model for diffusion");
llama_model_free(model);
return 1;
}
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = params.n_ctx;
ctx_params.n_batch = params.n_batch;
ctx_params.n_ubatch = params.n_ubatch;
ctx_params.flash_attn = params.flash_attn;
ctx_params.no_perf = params.no_perf;
ctx_params.type_k = params.cache_type_k;
ctx_params.type_v = params.cache_type_v;
llama_context * ctx = llama_init_from_model(model, ctx_params);
if (!ctx) {
LOG_ERR("error: failed to create context\n");
llama_model_free(model);
return 1;
}
llama_set_n_threads(ctx, params.cpuparams.n_threads, params.cpuparams_batch.n_threads);
const llama_vocab * vocab = llama_model_get_vocab(model);
std::string formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model);
std::vector<llama_token> input_tokens = common_tokenize(vocab,
formatted_prompt,
/*add special tokens*/ true,
/*parse special*/ true);
int n_input = input_tokens.size();
if (n_input >= params.n_ctx) {
LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx);
llama_free(ctx);
llama_model_free(model);
return 1;
}
llama_token mask_token_id = llama_vocab_mask(vocab);
GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL);
bool visual_mode = params.diffusion.visual_mode;
int32_t n_generated = 0;
std::vector<llama_token> output_tokens(params.n_ubatch);
struct diffusion_params diff_params;
char shift_logits_str[8];
if (llama_model_meta_val_str(model, "diffusion.shift_logits", shift_logits_str, sizeof(shift_logits_str)) >= 0) {
diff_params.shift_logits = (strcmp(shift_logits_str, "true") == 0);
} else {
diff_params.shift_logits = true;
}
//Use either eps or block length, but not both
GGML_ASSERT((params.diffusion.eps == 0) ^ (params.diffusion.block_length == 0));
if (params.diffusion.eps) {
diff_params.schedule = TIMESTEP_BASED;
diff_params.eps = params.diffusion.eps;
} else if (params.diffusion.block_length) {
diff_params.schedule = BLOCK_BASED;
diff_params.block_length = params.diffusion.block_length;
}
diff_params.mask_token_id = mask_token_id;
diff_params.seed = params.sampling.seed;
diff_params.temperature = params.sampling.temp;
diff_params.steps = params.diffusion.steps;
diff_params.algorithm = static_cast<diffusion_algorithm>(params.diffusion.algorithm);
diff_params.max_length = params.n_ubatch;
diff_params.top_p = params.sampling.top_p;
diff_params.top_k = params.sampling.top_k;
diff_params.visual_mode = params.diffusion.visual_mode;
diff_params.add_gumbel_noise = params.diffusion.add_gumbel_noise;
diff_params.step_callback = diffusion_step_callback;
callback_data cb_data = { &diff_params, vocab, n_input };
diff_params.step_callback_user_data = &cb_data;
const char * alg_names[] = { "ORIGIN", "ENTROPY_BASED", "MARGIN_BASED", "RANDOM", "CONFIDENCE_BASED" };
const char * sched_names[] = { "TIMESTEP_BASED", "BLOCK_BASED" };
const char * alg_name =
(diff_params.algorithm >= 0 && diff_params.algorithm <= 4) ? alg_names[diff_params.algorithm] : "UNKNOWN";
const char * sched_name =
(diff_params.schedule >= 0 && diff_params.schedule <= 1) ? sched_names[diff_params.schedule] : "UNKNOWN";
LOG_INF("diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id);
LOG_INF("diffusion_params: - %-25s u32 = %d\n", "steps", diff_params.steps);
LOG_INF("diffusion_params: - %-25s u32 = %d\n", "max_length", diff_params.max_length);
LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "algorithm", diff_params.algorithm, alg_name);
LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "schedule", diff_params.schedule, sched_name);
LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "temperature", diff_params.temperature);
if (diff_params.schedule == TIMESTEP_BASED) {
LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", diff_params.eps);
LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", diff_params.alg_temp);
}
if (diff_params.schedule == BLOCK_BASED) {
LOG_INF("diffusion_params: - %-25s u32 = %d\n", "block_length", diff_params.block_length);
LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "cfg_scale", diff_params.cfg_scale);
}
diffusion_generate(ctx, input_tokens.data(), output_tokens.data(), n_input, diff_params, n_generated);
if (n_generated > 0) {
if (visual_mode) {
//clear screen and move cursor to top-left
LOG_INF("\033[2J\033[H");
}
output_tokens.erase(output_tokens.begin(), output_tokens.begin() + n_input);
std::string output_data = common_detokenize(vocab, output_tokens, false);
LOG_INF("\n%s\n", output_data.c_str());
} else {
LOG_INF("Error: diffusion generation failed\n");
}
llama_free(ctx);
llama_model_free(model);
llama_backend_free();
return 0;
}

View File

@@ -37,7 +37,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
// clear previous kv_cache values (irrelevant for embeddings)
llama_memory_clear(llama_get_memory(ctx), true);
llama_kv_self_clear(ctx);
// run model
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
@@ -81,14 +81,6 @@ int main(int argc, char ** argv) {
params.embedding = true;
// if the number of prompts that would be encoded is known in advance, it's more efficient to specify the
// --parallel argument accordingly. for convenience, if not specified, we fallback to unified KV cache
// in order to support any number of prompts
if (params.n_parallel == 1) {
LOG_INF("%s: n_parallel == 1 -> unified KV cache is enabled\n", __func__);
params.kv_unified = true;
}
// utilize the full context
if (params.n_batch < params.n_ctx) {
LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);
@@ -115,7 +107,7 @@ int main(int argc, char ** argv) {
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_ctx_train = llama_model_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
const int n_ctx = llama_n_ctx(ctx);
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
@@ -141,36 +133,10 @@ int main(int argc, char ** argv) {
// max batch size
const uint64_t n_batch = params.n_batch;
// get added sep and eos token, if any
const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : "";
const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
// tokenize the prompts and trim
std::vector<std::vector<int32_t>> inputs;
for (const auto & prompt : prompts) {
std::vector<llama_token> inp;
// split classification pairs and insert expected separator tokens
if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
std::string final_prompt;
for (size_t i = 0; i < pairs.size(); i++) {
final_prompt += pairs[i];
if (i != pairs.size() - 1) {
if (!added_eos_token.empty()) {
final_prompt += added_eos_token;
}
if (!added_sep_token.empty()) {
final_prompt += added_sep_token;
}
}
}
inp = common_tokenize(ctx, final_prompt, true, true);
} else {
inp = common_tokenize(ctx, prompt, true, true);
}
auto inp = common_tokenize(ctx, prompt, true, true);
if (inp.size() > n_batch) {
LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
__func__, (long long int) inp.size(), (long long int) n_batch);
@@ -179,11 +145,11 @@ int main(int argc, char ** argv) {
inputs.push_back(inp);
}
// check if the last token is SEP/EOS
// check if the last token is SEP
// it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
for (auto & inp : inputs) {
if (inp.empty() || (inp.back() != llama_vocab_sep(vocab) && inp.back() != llama_vocab_eos(vocab))) {
LOG_WRN("%s: last token in the prompt is not SEP or EOS\n", __func__);
if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) {
LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
}
}
@@ -270,24 +236,9 @@ int main(int argc, char ** argv) {
LOG("\n");
}
} else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
const uint32_t n_cls_out = llama_model_n_cls_out(model);
std::vector<std::string> cls_out_labels;
for (uint32_t i = 0; i < n_cls_out; i++) {
const char * label = llama_model_cls_label(model, i);
const std::string label_i(label == nullptr ? "" : label);
cls_out_labels.emplace_back(label_i.empty() ? std::to_string(i) : label_i);
}
for (int j = 0; j < n_embd_count; j++) {
for (uint32_t i = 0; i < n_cls_out; i++) {
// NOTE: if you change this log - update the tests in ci/run.sh
if (n_cls_out == 1) {
LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
} else {
LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str());
}
}
// NOTE: if you change this log - update the tests in ci/run.sh
LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
}
} else {
// print the first part of the embeddings or for a single prompt, the full embedding

View File

@@ -55,8 +55,6 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) {
v = *(float *) &data[i];
} else if (type == GGML_TYPE_I64) {
v = (float) *(int64_t *) &data[i];
} else if (type == GGML_TYPE_I32) {
v = (float) *(int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) {
@@ -136,11 +134,6 @@ static bool run(llama_context * ctx, const common_params & params) {
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
if (tokens.empty()) {
LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
return false;
}
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;

View File

@@ -41,11 +41,12 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
// add input to batch (this increments n_tokens)
for (int32_t j = 0; j < n_toks; j++) {
common_batch_add(batch, inputs[j], j, { 0 }, true);
common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
}
// clear previous kv_cache values (irrelevant for embeddings)
llama_memory_clear(llama_get_memory(ctx), true);
llama_kv_self_clear(ctx);
llama_set_embeddings(ctx, true);
llama_set_causal_attn(ctx, false);
// run model
@@ -101,7 +102,8 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
llama_token eos_token = llama_vocab_eos(vocab);
llama_memory_clear(llama_get_memory(ctx), true);
llama_kv_self_clear(ctx);
llama_set_embeddings(ctx, false);
llama_set_causal_attn(ctx, true);
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
@@ -164,8 +166,6 @@ int main(int argc, char * argv[]) {
llama_model_params mparams = common_model_params_to_llama(params);
llama_context_params cparams = common_context_params_to_llama(params);
cparams.embeddings = true;
llama_backend_init();
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
@@ -213,8 +213,6 @@ int main(int argc, char * argv[]) {
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
}
llama_set_embeddings(ctx, false);
// ### Generation ###
// GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
{

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
set -e
MODEL=./models/ggml-vicuna-13b-1.1-q4_0.bin

View File

@@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
}
batch->logits[batch->n_tokens - 1] = true;
llama_memory_clear(llama_get_memory(context), false);
llama_kv_self_clear(context);
const auto t_pp_start = ggml_time_us();
if (llama_decode(context, *batch) != 0) {
@@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
LOGi("Benchmark text generation (tg)");
llama_memory_clear(llama_get_memory(context), false);
llama_kv_self_clear(context);
const auto t_tg_start = ggml_time_us();
for (i = 0; i < tg; i++) {
@@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
const auto t_tg_end = ggml_time_us();
llama_memory_clear(llama_get_memory(context), false);
llama_kv_self_clear(context);
const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
@@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
llama_memory_clear(llama_get_memory(reinterpret_cast<llama_context *>(context)), true);
llama_kv_self_clear(reinterpret_cast<llama_context *>(context));
}

View File

@@ -210,7 +210,7 @@ actor LlamaContext {
}
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
llama_memory_clear(llama_get_memory(context), false)
llama_kv_self_clear(context)
let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;
@@ -223,7 +223,7 @@ actor LlamaContext {
// bench text generation
llama_memory_clear(llama_get_memory(context), false)
llama_kv_self_clear(context)
let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;
@@ -242,7 +242,7 @@ actor LlamaContext {
let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;
llama_memory_clear(llama_get_memory(context), false)
llama_kv_self_clear(context)
let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
@@ -292,7 +292,7 @@ actor LlamaContext {
func clear() {
tokens_list.removeAll()
temporary_invalid_cchars.removeAll()
llama_memory_clear(llama_get_memory(context), true)
llama_kv_self_clear(context)
}
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {

View File

@@ -60,8 +60,6 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model.get();
llama_context * ctx = llama_init.context.get();
auto * mem = llama_get_memory(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
// Tokenize the prompt
@@ -96,7 +94,7 @@ int main(int argc, char ** argv) {
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
for (int s = 1; s < W + G + 1; ++s) {
llama_memory_seq_cp(mem, 0, s, -1, -1);
llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
}
const auto t_enc_end = ggml_time_us();
@@ -429,17 +427,17 @@ int main(int argc, char ** argv) {
// KV cache management
// if no verification token matched, we simply remove all cells from this batch -> no fragmentation
llama_memory_seq_rm(mem, -1, n_past, -1);
llama_kv_self_seq_rm(ctx, -1, n_past, -1);
if (seq_id_best != 0) {
// if a verification token matched, we keep the best sequence and remove the rest
// this leads to some KV cache fragmentation
llama_memory_seq_keep(mem, seq_id_best);
llama_memory_seq_cp (mem, seq_id_best, 0, -1, -1);
llama_memory_seq_rm (mem, seq_id_best, -1, -1);
llama_kv_self_seq_keep(ctx, seq_id_best);
llama_kv_self_seq_cp (ctx, seq_id_best, 0, -1, -1);
llama_kv_self_seq_rm (ctx, seq_id_best, -1, -1);
for (int s = 1; s < W + G + 1; ++s) {
llama_memory_seq_cp(mem, 0, s, -1, -1);
llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
}
}
}

View File

@@ -181,7 +181,7 @@ int main(int argc, char ** argv){
// KV cache management
// clean the cache of draft tokens that weren't accepted
llama_memory_seq_rm(llama_get_memory(ctx), 0, n_past, -1);
llama_kv_self_seq_rm(ctx, 0, n_past, -1);
common_batch_clear(batch_tgt);
common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);

View File

@@ -158,7 +158,7 @@ int main(int argc, char ** argv) {
common_params params;
params.n_predict = 128;
params.n_junk = 1;
params.n_junk = 0;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
return 1;
@@ -182,10 +182,7 @@ int main(int argc, char ** argv) {
const bool is_sp_shared = params.is_pp_shared;
// extra text to insert in each client's prompt in order to make it larger
const int32_t n_junk = std::max(1, params.n_junk);
// signed seed, use negative values to indicate different seeds for the different clients
const int32_t & sseed = params.sampling.seed;
const int32_t n_junk = params.n_junk;
// init llama.cpp
llama_backend_init();
@@ -197,8 +194,6 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model.get();
llama_context * ctx = llama_init.context.get();
auto * mem = llama_get_memory(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
// load the prompts from an external file if there are any
@@ -222,21 +217,11 @@ int main(int argc, char ** argv) {
const int n_ctx = llama_n_ctx(ctx);
if (sseed >= 0) {
LOG_INF("%s: initializing all samplers with the same RNG seed: %d (use a negative seed to have different seeds)\n", __func__, sseed);
} else {
LOG_INF("%s: initializing samplers with different RNG seeds, starting from %d\n", __func__, sseed);
}
std::vector<client> clients(n_clients);
for (size_t i = 0; i < clients.size(); ++i) {
auto & client = clients[i];
client.id = i;
client.smpl = common_sampler_init(model, params.sampling);
if (sseed < 0) {
params.sampling.seed--;
}
}
std::vector<llama_token> tokens_system;
@@ -274,7 +259,7 @@ int main(int argc, char ** argv) {
// assign the system KV cache to all parallel sequences
for (int32_t i = 1; i <= n_clients; ++i) {
llama_memory_seq_cp(mem, 0, i, -1, -1);
llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
}
LOG_INF("\n");
@@ -301,9 +286,9 @@ int main(int argc, char ** argv) {
if (batch.n_tokens == 0) {
// all sequences have ended - clear the entire KV cache
for (int i = 1; i <= n_clients; ++i) {
llama_memory_seq_rm(mem, i, -1, -1);
llama_kv_self_seq_rm(ctx, i, -1, -1);
// but keep the system prompt
llama_memory_seq_cp(mem, 0, i, -1, -1);
llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
}
LOG_INF("%s: clearing the KV cache\n", __func__);
@@ -358,7 +343,7 @@ int main(int argc, char ** argv) {
client.n_decoded = 0;
client.i_batch = batch.n_tokens - 1;
LOG_INF("\033[31mClient %3d, seq %4d, junk = %4d, prompt = %d, started decoding ...\033[0m\n", client.id, client.seq_id, n_junk_cur, client.n_prompt);
LOG_INF("\033[31mClient %3d, seq %4d, junk = %4d, started decoding ...\033[0m\n", client.id, client.seq_id, n_junk_cur);
g_seq_id += 1;
@@ -462,8 +447,8 @@ int main(int argc, char ** argv) {
}
// delete only the generated part of the sequence, i.e. keep the system prompt in the cache
llama_memory_seq_rm(mem, client.id + 1, -1, -1);
llama_memory_seq_cp(mem, 0, client.id + 1, -1, -1);
llama_kv_self_seq_rm(ctx, client.id + 1, -1, -1);
llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1);
const auto t_main_end = ggml_time_us();

View File

@@ -126,8 +126,6 @@ int main(int argc, char ** argv) {
int n_past = 0;
auto * mem = llama_get_memory(ctx);
// fill the KV cache
for (int i = 0; i < n_ctx; i += n_batch) {
if (i > 0 && n_grp > 1) {
@@ -135,10 +133,10 @@ int main(int argc, char ** argv) {
const int ib = i/n_batch - 1;
const int bd = n_batch_grp*(n_grp - 1);
llama_memory_seq_add(mem, 0, n_past - n_batch, n_past, ib*bd);
llama_memory_seq_div(mem, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
llama_kv_self_seq_add(ctx, 0, n_past - n_batch, n_past, ib*bd);
llama_kv_self_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
n_past = llama_memory_seq_pos_max(mem, 0) + 1;
n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
}
common_batch_clear(batch);
@@ -168,10 +166,10 @@ int main(int argc, char ** argv) {
LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
llama_memory_seq_rm (mem, 0, n_keep , n_keep + n_discard);
llama_memory_seq_add(mem, 0, n_keep + n_discard, n_ctx, -n_discard);
llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
n_past = llama_memory_seq_pos_max(mem, 0) + 1;
n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
common_batch_clear(batch);
@@ -197,10 +195,10 @@ int main(int argc, char ** argv) {
if (n_discard > 0) {
LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
llama_memory_seq_rm (mem, 0, n_keep , n_keep + n_discard);
llama_memory_seq_add(mem, 0, n_keep + n_discard, n_ctx, -n_discard);
llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
n_past = llama_memory_seq_pos_max(mem, 0) + 1;
n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
}
}

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
cd `dirname $0`
cd ..

View File

@@ -83,7 +83,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
static void batch_process(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
// clear previous kv_cache values (irrelevant for embeddings)
llama_memory_clear(llama_get_memory(ctx), false);
llama_kv_self_clear(ctx);
// run model
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);

View File

@@ -15,12 +15,6 @@ int main(int argc, char ** argv) {
return 1;
}
if (params.n_parallel == 1) {
// the example uses 2 sequences, so when n_parallel == 1, we need to enable unified kv cache
printf("%s: n_parallel == 1, enabling unified kv cache\n", __func__);
params.kv_unified = true;
}
common_init();
if (params.n_predict < 0) {
@@ -202,7 +196,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
// erase whole kv
llama_memory_clear(llama_get_memory(ctx3), true);
llama_kv_self_clear(ctx3);
fprintf(stderr, "%s : kv cache cleared\n", __func__);
// restore kv into seq 1

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
set -e

View File

@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
auto generate = [&](const std::string & prompt) {
std::string response;
const bool is_first = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) == -1;
const bool is_first = llama_kv_self_seq_pos_max(ctx, 0) == 0;
// tokenize the prompt
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
@@ -113,16 +113,15 @@ int main(int argc, char ** argv) {
while (true) {
// check if we have enough space in the context to evaluate this batch
int n_ctx = llama_n_ctx(ctx);
int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) + 1;
int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0);
if (n_ctx_used + batch.n_tokens > n_ctx) {
printf("\033[0m\n");
fprintf(stderr, "context size exceeded\n");
exit(0);
}
int ret = llama_decode(ctx, batch);
if (ret != 0) {
GGML_ABORT("failed to decode, ret = %d\n", ret);
if (llama_decode(ctx, batch)) {
GGML_ABORT("failed to decode\n");
}
// sample the next token

View File

@@ -65,7 +65,7 @@ int main(int argc, char ** argv) {
ctx_dft = llama_init_dft.context.get();
if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
LOG_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params.speculative.model.path.c_str(), params.model.path.c_str());
return 1;
}
// Tokenize the prompt
@@ -130,10 +130,7 @@ int main(int argc, char ** argv) {
params_spec.n_reuse = llama_n_ctx(ctx_dft) - n_draft;
params_spec.p_min = p_min;
struct common_speculative * spec = common_speculative_init(ctx_tgt, ctx_dft);
for (auto &pair : params.speculative.replacements) {
common_speculative_add_replacement_tgt_dft(spec, pair.first.c_str(), pair.second.c_str());
}
struct common_speculative * spec = common_speculative_init(ctx_dft);
llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);
@@ -220,7 +217,7 @@ int main(int argc, char ** argv) {
{
LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, n_past, -1);
llama_kv_self_seq_rm(ctx_tgt, 0, n_past, -1);
}
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {

View File

@@ -142,8 +142,6 @@ int main(int argc, char ** argv) {
}
}
auto * mem_tgt = llama_get_memory(ctx_tgt);
auto * mem_dft = llama_get_memory(ctx_dft);
// Tokenize the prompt
std::vector<llama_token> inp;
@@ -422,14 +420,14 @@ int main(int argc, char ** argv) {
{
LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
llama_memory_seq_keep(mem_dft, s_keep);
llama_memory_seq_cp (mem_dft, s_keep, 0, -1, -1);
llama_memory_seq_keep(mem_dft, 0);
llama_kv_self_seq_keep(ctx_dft, s_keep);
llama_kv_self_seq_cp (ctx_dft, s_keep, 0, -1, -1);
llama_kv_self_seq_keep(ctx_dft, 0);
llama_memory_seq_rm (mem_tgt, s_keep, n_past_tgt, -1);
llama_memory_seq_keep(mem_tgt, s_keep);
llama_memory_seq_cp (mem_tgt, s_keep, 0, -1, -1);
llama_memory_seq_keep(mem_tgt, 0);
llama_kv_self_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1);
llama_kv_self_seq_keep(ctx_tgt, s_keep);
llama_kv_self_seq_cp (ctx_tgt, s_keep, 0, -1, -1);
llama_kv_self_seq_keep(ctx_tgt, 0);
}
for (int s = 0; s < n_seq_dft; ++s) {
@@ -446,7 +444,7 @@ int main(int argc, char ** argv) {
common_batch_clear(batch_dft);
common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
llama_memory_seq_rm(mem_dft, 0, n_past_dft, -1);
llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1);
// LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
llama_decode(ctx_dft, batch_dft);
@@ -505,8 +503,8 @@ int main(int argc, char ** argv) {
if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
llama_memory_seq_rm(mem_dft, n_seq_cur, -1, -1);
llama_memory_seq_cp(mem_dft, s, n_seq_cur, -1, -1);
llama_kv_self_seq_rm(ctx_dft, n_seq_cur, -1, -1);
llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
// all previous tokens from this branch are now also part of the new branch
for (int t = 0; t < batch_tgt.n_tokens; ++t) {
@@ -587,9 +585,9 @@ int main(int argc, char ** argv) {
// evaluate the target model on the drafted tokens
{
llama_memory_seq_keep(mem_tgt, 0);
llama_kv_self_seq_keep(ctx_tgt, 0);
for (int s = 1; s < n_seq_dft; ++s) {
llama_memory_seq_cp(mem_tgt, 0, s, -1, -1);
llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1);
}
// LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
# MIT license
# Copyright (C) 2024 Intel Corporation

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
# MIT license
# Copyright (C) 2025 Intel Corporation

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
#
# ./examples/ts-type-to-grammar.sh "{a:string,b:string,c?:string}"
# python examples/json_schema_to_grammar.py https://json.schemastore.org/tsconfig.json

Some files were not shown because too many files have changed in this diff Show More