Compare commits


3 Commits

Author SHA1 Message Date
Max Krasnyansky
ac35e50c16 Update tools/llama-bench/llama-bench.cpp
Co-authored-by: Diego Devesa <slarengh@gmail.com>
2025-05-31 15:38:37 -07:00
Max Krasnyansky
9087dd2664 threading: disable SetThreadInfo() calls for older Windows versions
2025-05-30 17:15:45 -07:00
Max Krasnyansky
199a838422 threading: support for GGML_SCHED_PRIO_LOW, update thread info on Windows to avoid throttling
We talked about adding LOW priority for GGML threads in the original threadpool PR.
It might be useful for some cases to avoid contention.

Recent Windows ARM64 releases started parking (offlining) the CPU cores
more aggressively, which results in suboptimal performance with n_threads > 4.
To deal with that, we now disable Power Throttling for our threads at the NORMAL
and higher priorities.

Co-authored-by: Diego Devesa <slarengh@gmail.com>
2025-05-30 17:15:38 -07:00
451 changed files with 36023 additions and 91625 deletions
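The threading commit above covers two Windows-side changes: a new GGML_SCHED_PRIO_LOW scheduling priority and opting worker threads out of Power Throttling (EcoQoS) so the scheduler does not park them onto efficiency cores. The sketch below illustrates the underlying Win32 calls only; it is not the actual ggml implementation, and the mapping of a LOW priority to THREAD_PRIORITY_BELOW_NORMAL is an assumption.

```cpp
// Illustrative sketch (not the ggml code): lower the current thread's priority
// and opt it out of Power Throttling / EcoQoS using documented Win32 APIs.
#include <windows.h>

// Assumed mapping: a "LOW" scheduling priority -> THREAD_PRIORITY_BELOW_NORMAL.
static bool set_low_priority_current_thread() {
    return SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_BELOW_NORMAL) != 0;
}

// Disable Power Throttling for the current thread so it stays on performance
// cores. SetThreadInformation() and the power-throttling class only exist on
// newer Windows versions, which is why the second commit above guards these
// calls on older releases.
static bool disable_power_throttling_current_thread() {
    THREAD_POWER_THROTTLING_STATE state = {};
    state.Version     = THREAD_POWER_THROTTLING_CURRENT_VERSION;
    state.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED;
    state.StateMask   = 0;  // bit set in ControlMask, clear in StateMask = throttling off
    return SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling,
                                &state, sizeof(state)) != 0;
}
```

In a real integration these calls would be resolved or guarded at runtime so binaries still run on Windows versions that predate the APIs.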


@@ -22,8 +22,8 @@ AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: false
BinPackParameters: false # OnePerLine
BinPackArguments: true
BinPackParameters: true # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
BraceWrapping:
@@ -70,17 +70,14 @@ ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '".*"'
- Regex: '^<.*\.h>'
Priority: 1
SortPriority: 0
- Regex: '^<.*\.h>'
- Regex: '^<.*'
Priority: 2
SortPriority: 0
- Regex: '^<.*'
Priority: 3
SortPriority: 0
- Regex: '.*'
Priority: 4
Priority: 3
SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''


@@ -49,23 +49,19 @@ COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update && \
apt-get install -y \
git \
python3 \
python3-pip \
python3-venv && \
python3 -m venv /opt/venv && \
. /opt/venv/bin/activate && \
pip install --upgrade pip setuptools wheel && \
pip install -r requirements.txt && \
apt autoremove -y && \
apt clean -y && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENV PATH="/opt/venv/bin:$PATH"
ENTRYPOINT ["/app/tools.sh"]


@@ -47,7 +47,6 @@ let
inherit (lib)
cmakeBool
cmakeFeature
optionalAttrs
optionals
strings
;
@@ -198,7 +197,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
];
# Environment variables needed for ROCm
env = optionalAttrs useRocm {
env = optionals useRocm {
ROCM_PATH = "${rocmPackages.clr}";
HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
};


@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
set -e
# Read the first argument into a variable


@@ -40,7 +40,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
multiple: true
validations:
required: true


@@ -42,7 +42,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
multiple: true
validations:
required: true

.github/labeler.yml vendored

@@ -1,4 +1,10 @@
# https://github.com/actions/labeler
Kompute:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-kompute.h
- ggml/src/ggml-kompute/**
- README-kompute.md
Apple Metal:
- changed-files:
- any-glob-to-any-file:
@@ -80,15 +86,3 @@ nix:
embedding:
- changed-files:
- any-glob-to-any-file: examples/embedding/
Ascend NPU:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-cann.h
- ggml/src/ggml-cann/**
- docs/backend/CANN.md
OpenCL:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-opencl.h
- ggml/src/ggml-opencl/**


@@ -1,51 +0,0 @@
name: Build relocatable cmake package
on:
workflow_dispatch:
workflow_call:
jobs:
linux:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y build-essential tcl
- name: Build
run: |
PREFIX="$(pwd)"/inst
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
-DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release
cmake --install build --prefix "$PREFIX" --config Release
export LLAMA_CONFIG="$PREFIX"/lib/cmake/llama/llama-config.cmake
tclsh <<'EOF'
set build(commit) [string trim [exec git rev-parse --short HEAD]]
set build(number) [string trim [exec git rev-list --count HEAD]]
set build(version) "0.0.$build(number)"
set llamaconfig [read [open "$env(LLAMA_CONFIG)" r]]
set checks [list "set\\(LLAMA_VERSION \\s+$build(version)\\)" \
"set\\(LLAMA_BUILD_COMMIT\\s+$build(commit)\\)" \
"set\\(LLAMA_BUILD_NUMBER\\s+$build(number)\\)"]
puts -nonewline "Checking llama-config.cmake version... "
foreach check $checks {
if {![regexp -expanded -- $check $llamaconfig]} {
puts "\"$check\" failed!"
exit 1
}
}
puts "success."
EOF
cd examples/simple-cmake-pkg
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX"/lib/cmake
cmake --build build


@@ -48,98 +48,98 @@ jobs:
cmake --build build --config Release -j $(nproc)
# ubuntu-24-riscv64-vulkan-cross:
# runs-on: ubuntu-24.04
ubuntu-24-riscv64-vulkan-cross:
runs-on: ubuntu-24.04
# steps:
# - uses: actions/checkout@v4
# - name: Setup Riscv
# run: |
# sudo dpkg --add-architecture riscv64
steps:
- uses: actions/checkout@v4
- name: Setup Riscv
run: |
sudo dpkg --add-architecture riscv64
# # Add arch-specific repositories for non-amd64 architectures
# cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
# deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
# deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
# deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
# deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
# EOF
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF
# sudo apt-get update || true ;# Prevent failure due to missing URLs.
sudo apt-get update || true ;# Prevent failure due to missing URLs.
# sudo apt-get install -y --no-install-recommends \
# build-essential \
# glslc \
# gcc-14-riscv64-linux-gnu \
# g++-14-riscv64-linux-gnu \
# libvulkan-dev:riscv64
sudo apt-get install -y --no-install-recommends \
build-essential \
glslc \
gcc-14-riscv64-linux-gnu \
g++-14-riscv64-linux-gnu \
libvulkan-dev:riscv64
# - name: Build
# run: |
# cmake -B build -DLLAMA_CURL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_VULKAN=ON \
# -DGGML_OPENMP=OFF \
# -DLLAMA_BUILD_EXAMPLES=ON \
# -DLLAMA_BUILD_TOOLS=ON \
# -DLLAMA_BUILD_TESTS=OFF \
# -DCMAKE_SYSTEM_NAME=Linux \
# -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
# -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
# -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
# -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
# -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
# -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
# -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
# -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
# cmake --build build --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
# ubuntu-24-arm64-vulkan-cross:
# runs-on: ubuntu-24.04
ubuntu-24-arm64-vulkan-cross:
runs-on: ubuntu-24.04
# steps:
# - uses: actions/checkout@v4
# - name: Setup Arm64
# run: |
# sudo dpkg --add-architecture arm64
steps:
- uses: actions/checkout@v4
- name: Setup Arm64
run: |
sudo dpkg --add-architecture arm64
# # Add arch-specific repositories for non-amd64 architectures
# cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list
# deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
# deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
# deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
# deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
# EOF
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF
# sudo apt-get update || true ;# Prevent failure due to missing URLs.
sudo apt-get update || true ;# Prevent failure due to missing URLs.
# sudo apt-get install -y --no-install-recommends \
# build-essential \
# glslc \
# crossbuild-essential-arm64 \
# libvulkan-dev:arm64
sudo apt-get install -y --no-install-recommends \
build-essential \
glslc \
crossbuild-essential-arm64 \
libvulkan-dev:arm64
# - name: Build
# run: |
# cmake -B build -DLLAMA_CURL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_VULKAN=ON \
# -DGGML_OPENMP=OFF \
# -DLLAMA_BUILD_EXAMPLES=ON \
# -DLLAMA_BUILD_TOOLS=ON \
# -DLLAMA_BUILD_TESTS=OFF \
# -DCMAKE_SYSTEM_NAME=Linux \
# -DCMAKE_SYSTEM_PROCESSOR=aarch64 \
# -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
# -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
# -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
# -DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \
# -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
# -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
# -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=aarch64 \
-DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
-DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
# cmake --build build --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
ubuntu-24-ppc64el-cpu-cross:
runs-on: ubuntu-24.04
@@ -185,144 +185,31 @@ jobs:
cmake --build build --config Release -j $(nproc)
# ubuntu-24-ppc64el-vulkan-cross:
# runs-on: ubuntu-24.04
# steps:
# - uses: actions/checkout@v4
# - name: Setup PowerPC64le
# run: |
# sudo dpkg --add-architecture ppc64el
# # Add arch-specific repositories for non-amd64 architectures
# cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
# EOF
# sudo apt-get update || true ;# Prevent failure due to missing URLs.
# sudo apt-get install -y --no-install-recommends \
# build-essential \
# glslc \
# gcc-14-powerpc64le-linux-gnu \
# g++-14-powerpc64le-linux-gnu \
# libvulkan-dev:ppc64el
# - name: Build
# run: |
# cmake -B build -DLLAMA_CURL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_VULKAN=ON \
# -DGGML_OPENMP=OFF \
# -DLLAMA_BUILD_EXAMPLES=ON \
# -DLLAMA_BUILD_TOOLS=ON \
# -DLLAMA_BUILD_TESTS=OFF \
# -DCMAKE_SYSTEM_NAME=Linux \
# -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
# -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
# -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
# -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
# -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
# -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
# -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
# -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
# cmake --build build --config Release -j $(nproc)
debian-13-loongarch64-cpu-cross:
ubuntu-24-ppc64el-vulkan-cross:
runs-on: ubuntu-24.04
container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
steps:
- uses: actions/checkout@v4
- name: Setup LoongArch
- name: Setup PowerPC64le
run: |
rm -f /etc/apt/sources.list.d/*
cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
EOF
( echo 'quiet "true";'; \
echo 'APT::Get::Assume-Yes "true";'; \
echo 'APT::Install-Recommends "false";'; \
echo 'Acquire::Check-Valid-Until "false";'; \
echo 'Acquire::Retries "5";'; \
) > /etc/apt/apt.conf.d/99snapshot-repos
apt-get update
apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
dpkg --add-architecture loong64
sudo dpkg --add-architecture ppc64el
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF
apt-get update || true ;# Prevent failure due to missing URLs.
sudo apt-get update || true ;# Prevent failure due to missing URLs.
apt-get install -y --no-install-recommends \
build-essential \
gcc-14-loongarch64-linux-gnu \
g++-14-loongarch64-linux-gnu
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
-DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)
debian-13-loongarch64-vulkan-cross:
runs-on: ubuntu-24.04
container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
steps:
- uses: actions/checkout@v4
- name: Setup LoongArch
run: |
rm -f /etc/apt/sources.list.d/*
cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
EOF
( echo 'quiet "true";'; \
echo 'APT::Get::Assume-Yes "true";'; \
echo 'APT::Install-Recommends "false";'; \
echo 'Acquire::Check-Valid-Until "false";'; \
echo 'Acquire::Retries "5";'; \
) > /etc/apt/apt.conf.d/99snapshot-repos
apt-get update
apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
dpkg --add-architecture loong64
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
EOF
apt-get update || true ;# Prevent failure due to missing URLs.
apt-get install -y --no-install-recommends \
sudo apt-get install -y --no-install-recommends \
build-essential \
glslc \
gcc-14-loongarch64-linux-gnu \
g++-14-loongarch64-linux-gnu \
libvulkan-dev:loong64
gcc-14-powerpc64le-linux-gnu \
g++-14-powerpc64le-linux-gnu \
libvulkan-dev:ppc64el
- name: Build
run: |
@@ -334,11 +221,11 @@ jobs:
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
-DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
-DCMAKE_SYSTEM_PROCESSOR=ppc64 \
-DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH


@@ -5,43 +5,10 @@ on:
push:
branches:
- master
paths: [
'.github/workflows/build.yml',
'.github/workflows/build-linux-cross.yml',
'.github/workflows/build-cmake-pkg.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.cu',
'**/*.cuh',
'**/*.swift',
'**/*.m',
'**/*.metal',
'**/*.comp'
]
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build.yml',
'.github/workflows/build-linux-cross.yml',
'.github/workflows/build-cmake-pkg.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.cu',
'**/*.cuh',
'**/*.swift',
'**/*.m',
'**/*.metal',
'**/*.comp'
]
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -84,8 +51,7 @@ jobs:
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=OFF \
-DGGML_METAL_SHADER_DEBUG=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
@@ -135,69 +101,6 @@ jobs:
cd build
ctest -L main --verbose --timeout 900
macOS-latest-cmake-arm64-webgpu:
runs-on: macos-14
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: macOS-latest-cmake-arm64-webgpu
evict-old-files: 1d
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
brew install curl
- name: Dawn Dependency
id: dawn-depends
run: |
ARTIFACTS_JSON=$(curl -s -L \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"https://api.github.com/repos/google/dawn/actions/artifacts")
echo "Finding latest macos-latest-Release artifact..."
DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts
| sort_by(.created_at)
| reverse
| map(select(.name | test("macos-latest-Release$")))
| .[0].archive_download_url')
if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then
echo "No suitable Dawn artifact found!"
exit 1
fi
echo "Downloading from: $DOWNLOAD_URL"
curl -L \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-o artifact.zip "$DOWNLOAD_URL"
unzip artifact.zip
mkdir dawn
tar_file=$(find . -name '*.tar.gz' | head -n 1)
echo "Extracting: $tar_file"
tar -xvf "$tar_file" -C dawn --strip-components=1
- name: Build
id: cmake_build
run: |
export CMAKE_PREFIX_PATH=dawn
cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
ubuntu-cpu-cmake:
strategy:
matrix:
@@ -399,73 +302,6 @@ jobs:
-DGGML_VULKAN=ON
cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
export GGML_VK_VISIBLE_DEVICES=0
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 4200
ubuntu-22-cmake-webgpu:
runs-on: ubuntu-22.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
with:
key: ubuntu-22-cmake-webgpu
evict-old-files: 1d
- name: Vulkan SDK Dependencies
id: vulkan-depends
run: |
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
- name: Dawn Dependency
id: dawn-depends
run: |
sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
ARTIFACTS_JSON=$(curl -s -L \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"https://api.github.com/repos/google/dawn/actions/artifacts")
echo "Finding latest ubuntu-latest-Release artifact..."
DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts
| sort_by(.created_at)
| reverse
| map(select(.name | test("ubuntu-latest-Release$")))
| .[0].archive_download_url')
if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then
echo "No suitable Dawn artifact found!"
exit 1
fi
echo "Downloading from: $DOWNLOAD_URL"
curl -L \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-o artifact.zip "$DOWNLOAD_URL"
unzip artifact.zip
mkdir dawn
tar_file=$(find . -name '*.tar.gz' | head -n 1)
echo "Extracting: $tar_file"
tar -xvf "$tar_file" -C dawn --strip-components=1
- name: Build
id: cmake_build
run: |
export Dawn_DIR=dawn/lib64/cmake/Dawn
cmake -B build -DGGML_WEBGPU=ON
cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
@@ -641,9 +477,6 @@ jobs:
build-linux-cross:
uses: ./.github/workflows/build-linux-cross.yml
build-cmake-pkg:
uses: ./.github/workflows/build-cmake-pkg.yml
macOS-latest-cmake-ios:
runs-on: macos-latest
@@ -794,7 +627,7 @@ jobs:
./build-xcframework.sh
windows-msys2:
runs-on: windows-2025
runs-on: windows-latest
strategy:
fail-fast: false
@@ -844,31 +677,28 @@ jobs:
cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows-latest-cmake:
runs-on: windows-2025
runs-on: windows-latest
env:
OPENBLAS_VERSION: 0.3.23
SDE_VERSION: 9.33.0-2024-01-07
VULKAN_VERSION: 1.4.313.2
VULKAN_VERSION: 1.4.309.0
strategy:
matrix:
include:
- build: 'cpu-x64 (static)'
arch: 'x64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF'
- build: 'cpu-x64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
- build: 'openblas-x64'
arch: 'x64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'vulkan-x64'
arch: 'x64'
defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
- build: 'llvm-arm64'
arch: 'arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
- build: 'llvm-arm64-opencl-adreno'
arch: 'arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
# - build: 'kompute-x64'
# defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
steps:
- name: Clone
@@ -882,6 +712,12 @@ jobs:
variant: ccache
evict-old-files: 1d
- name: Clone Kompute submodule
id: clone_kompute
if: ${{ matrix.build == 'kompute-x64' }}
run: |
git submodule update --init ggml/src/ggml-kompute/kompute
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas-x64' }}
@@ -897,9 +733,9 @@ jobs:
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.build == 'vulkan-x64' }}
if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
@@ -932,8 +768,6 @@ jobs:
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
with:
architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
- name: Build
id: cmake_build
@@ -943,7 +777,6 @@ jobs:
cmake -S . -B build ${{ matrix.defines }} `
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release
- name: Add libopenblas.dll
id: add_libopenblas_dll
@@ -954,7 +787,7 @@ jobs:
- name: Test
id: cmake_test
if: ${{ matrix.arch == 'x64' }}
if: ${{ matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' }}
run: |
cd build
ctest -L main -C Release --verbose --timeout 900
@@ -1006,12 +839,12 @@ jobs:
-DGGML_CUDA=ON
cmake --build build
windows-2022-cmake-cuda:
runs-on: windows-2022
windows-2019-cmake-cuda:
runs-on: windows-2019
strategy:
matrix:
cuda: ['12.4']
cuda: ['12.4', '11.7']
steps:
- name: Clone
@@ -1045,7 +878,7 @@ jobs:
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -S . -B build -G "Ninja Multi-Config" ^
-DLLAMA_BUILD_SERVER=ON ^
-DGGML_NATIVE=OFF ^
@@ -1059,7 +892,7 @@ jobs:
cmake --build build --config Release
windows-latest-cmake-sycl:
runs-on: windows-2022
runs-on: windows-latest
defaults:
run:
@@ -1093,7 +926,7 @@ jobs:
windows-latest-cmake-hip:
if: ${{ github.event.inputs.create_release != 'true' }}
runs-on: windows-2022
runs-on: windows-latest
steps:
- name: Clone


@@ -49,8 +49,7 @@ jobs:
run: |
sysctl -a
cmake -B build \
-DCMAKE_INSTALL_RPATH='@loader_path' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
@@ -104,8 +103,7 @@ jobs:
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build \
-DCMAKE_INSTALL_RPATH='@loader_path' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON
@@ -133,9 +131,8 @@ jobs:
include:
- build: 'x64'
os: ubuntu-22.04
# GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
# - build: 'arm64'
# os: ubuntu-22.04-arm
- build: 'arm64'
os: ubuntu-22.04-arm
runs-on: ${{ matrix.os }}
@@ -162,11 +159,6 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
-DLLAMA_FATAL_WARNINGS=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)
@@ -215,11 +207,6 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
-DGGML_VULKAN=ON \
${{ env.CMAKE_ARGS }}
cmake --build build --config Release -j $(nproc)
@@ -241,7 +228,7 @@ jobs:
name: llama-bin-ubuntu-vulkan-x64.zip
windows-cpu:
runs-on: windows-2025
runs-on: windows-latest
strategy:
matrix:
@@ -277,7 +264,7 @@ jobs:
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch }}
cmake -S . -B build -G "Ninja Multi-Config" ^
-D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
-DGGML_NATIVE=OFF ^
@@ -294,7 +281,7 @@ jobs:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.42.34433\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
- name: Upload artifacts
@@ -304,11 +291,11 @@ jobs:
name: llama-bin-win-cpu-${{ matrix.arch }}.zip
windows:
runs-on: windows-2025
runs-on: windows-latest
env:
OPENBLAS_VERSION: 0.3.23
VULKAN_VERSION: 1.4.313.2
VULKAN_VERSION: 1.4.309.0
strategy:
matrix:
@@ -338,7 +325,7 @@ jobs:
id: get_vulkan
if: ${{ matrix.backend == 'vulkan' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
@@ -386,11 +373,11 @@ jobs:
name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
windows-cuda:
runs-on: windows-2022
runs-on: windows-2019
strategy:
matrix:
cuda: ['12.4']
cuda: ['12.4', '11.7']
steps:
- name: Clone
@@ -418,7 +405,7 @@ jobs:
id: cmake_build
shell: cmd
run: |
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -S . -B build -G "Ninja Multi-Config" ^
-DGGML_BACKEND_DL=ON ^
-DGGML_NATIVE=OFF ^
@@ -454,7 +441,7 @@ jobs:
name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
windows-sycl:
runs-on: windows-2022
runs-on: windows-latest
defaults:
run:
@@ -526,7 +513,7 @@ jobs:
name: llama-bin-win-sycl-x64.zip
windows-hip:
runs-on: windows-2022
runs-on: windows-latest
strategy:
matrix:


@@ -180,7 +180,7 @@ jobs:
server-windows:
runs-on: windows-2022
runs-on: windows-2019
steps:
- name: Clone


@@ -1,40 +0,0 @@
name: Update Operations Documentation
on:
push:
paths:
- 'docs/ops/**'
- 'scripts/create_ops_docs.py'
pull_request:
paths:
- 'docs/ops/**'
- 'scripts/create_ops_docs.py'
jobs:
update-ops-docs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.x'
- name: Generate operations documentation to temporary file
run: |
mkdir -p /tmp/ops_check
./scripts/create_ops_docs.py /tmp/ops_check/ops.md
- name: Check if docs/ops.md matches generated version
run: |
if ! diff -q docs/ops.md /tmp/ops_check/ops.md; then
echo "Operations documentation (docs/ops.md) is not up to date with the backend CSV files."
echo "To fix: run ./scripts/create_ops_docs.py and commit the updated docs/ops.md along with your changes"
echo "Differences found:"
diff docs/ops.md /tmp/ops_check/ops.md || true
exit 1
fi
echo "Operations documentation is up to date."

.gitmodules vendored

@@ -0,0 +1,3 @@
[submodule "kompute"]
path = ggml/src/ggml-kompute/kompute
url = https://github.com/nomic-ai/kompute.git


@@ -89,14 +89,6 @@ option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
if (NOT DEFINED LLAMA_BUILD_NUMBER)
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
endif()
if (NOT DEFINED LLAMA_BUILD_COMMIT)
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
endif()
set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
# override ggml options
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
@@ -120,6 +112,7 @@ endfunction()
llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
@@ -162,17 +155,10 @@ if (LLAMA_USE_SYSTEM_GGML)
endif()
if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
set(GGML_BUILD_NUMBER ${LLAMA_BUILD_NUMBER})
set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT})
add_subdirectory(ggml)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
if (MINGW)
# Target Windows 8 for PrefetchVirtualMemory
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
endif()
#
# build the library
#
@@ -213,6 +199,10 @@ endif()
include(GNUInstallDirs)
include(CMakePackageConfigHelpers)
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")


@@ -55,17 +55,6 @@
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
}
},
{
"name": "x64-linux-gcc", "hidden": true,
"cacheVariables": {
"CMAKE_C_COMPILER": "gcc",
"CMAKE_CXX_COMPILER": "g++"
}
},
{ "name": "x64-linux-gcc-debug", "inherits": [ "base", "x64-linux-gcc", "debug" ] },
{ "name": "x64-linux-gcc-release", "inherits": [ "base", "x64-linux-gcc", "release" ] },
{ "name": "x64-linux-gcc-reldbg", "inherits": [ "base", "x64-linux-gcc", "reldbg" ] },
{ "name": "x64-linux-gcc+static-release", "inherits": [ "base", "x64-linux-gcc", "release", "static" ] },
{ "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },


@@ -9,4 +9,3 @@
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/gguf.cpp @JohannesGaessler
/ggml/src/ggml-vulkan/ @0cc4m


@@ -367,7 +367,7 @@ ifdef LLAMA_SERVER_SSL
endif
ifndef GGML_NO_CPU_AARCH64
MK_CPPFLAGS += -DGGML_USE_CPU_REPACK
MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
endif
# warnings
@@ -970,7 +970,7 @@ OBJ_GGML = \
$(DIR_GGML)/src/ggml-threading.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
$(DIR_GGML)/src/ggml-cpu/repack.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \


@@ -3,12 +3,11 @@
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
[![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
[Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) / [ops](https://github.com/ggml-org/llama.cpp/blob/master/docs/ops.md)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
LLM inference in C/C++
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
## Recent API changes
@@ -17,9 +16,11 @@ LLM inference in C/C++
## Hot topics
- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
@@ -27,30 +28,6 @@ LLM inference in C/C++
----
## Quick start
Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
- Run with Docker - see our [Docker documentation](docs/docker.md)
- Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
- Build from source by cloning this repository - check out [our build guide](docs/build.md)
Once installed, you'll need a model to work with. Head to the [Obtaining and quantizing models](#obtaining-and-quantizing-models) section to learn more.
Example command:
```sh
# Use a local model file
llama-cli -m my_model.gguf
# Or download and run a model directly from Hugging Face
llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
# Launch OpenAI-compatible API server
llama-server -hf ggml-org/gemma-3-1b-it-GGUF
```
## Description
The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
@@ -133,7 +110,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
#### Multimodal
@@ -154,7 +130,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
<details>
<summary>Bindings</summary>
- Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama)
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
@@ -254,7 +229,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
</details>
## Supported backends
| Backend | Target devices |
@@ -269,9 +243,18 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [Vulkan](docs/build.md#vulkan) | GPU |
| [CANN](docs/build.md#cann) | Ascend NPU |
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
## Building the project
The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
- Clone this repository and build locally, see [how to build](docs/build.md)
- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
- Use a Docker image, see [documentation for Docker](docs/docker.md)
- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
## Obtaining and quantizing models
The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
@@ -279,11 +262,7 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
```sh
llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
```
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.
@@ -435,7 +414,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
## [`llama-perplexity`](tools/perplexity)
#### A tool for measuring the [perplexity](tools/perplexity/README.md) [^1] (and other quality metrics) of a model over a given text.
#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.
- <details open>
<summary>Measure the perplexity over a text file</summary>
@@ -458,7 +437,8 @@ To learn more about model quantization, [read this documentation](tools/quantize
</details>
[^1]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
[^1]: [tools/perplexity/README.md](./tools/perplexity/README.md)
[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
## [`llama-bench`](tools/llama-bench)


@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
#
# Options
IOS_MIN_OS_VERSION=16.4


@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
#
# sample usage:
#
@@ -16,9 +16,6 @@
# # with VULKAN support
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with WebGPU support
# GG_BUILD_WEBGPU=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with MUSA support
# GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
@@ -42,27 +39,14 @@ sd=`dirname $0`
cd $sd/../
SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF"
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
fi
if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON"
if command -v nvidia-smi >/dev/null 2>&1; then
CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')
if [[ -n "$CUDA_ARCH" && "$CUDA_ARCH" =~ ^[0-9]+$ ]]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH}"
else
echo "Warning: Using fallback CUDA architectures"
CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=61;70;75;80;86;89"
fi
else
echo "Error: nvidia-smi not found, cannot build with CUDA"
exit 1
fi
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -84,10 +68,6 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
fi
if [ ! -z ${GG_BUILD_WEBGPU} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"
fi
if [ ! -z ${GG_BUILD_MUSA} ]; then
# Use qy1 by default (MTT S80)
MUSA_ARCH=${MUSA_ARCH:-21}
@@ -786,7 +766,7 @@ function gg_run_rerank_tiny {
model_f16="${path_models}/ggml-model-f16.gguf"
# for this model, the SEP token is "</s>"
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
# sample output
# rerank score 0: 0.029


@@ -7,8 +7,8 @@ llama_add_compile_flags()
# Build info header
#
if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
# Is git submodule
if(NOT IS_DIRECTORY "${GIT_DIR}")
@@ -18,26 +18,36 @@ if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
if (SLASH_POS EQUAL 0)
set(GIT_DIR "${REAL_GIT_DIR}")
else()
set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
endif()
endif()
if(EXISTS "${GIT_DIR}/index")
# For build-info.cpp below
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index")
set(GIT_INDEX "${GIT_DIR}/index")
else()
message(WARNING "Git index not found in git repository.")
set(GIT_INDEX "")
endif()
else()
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
set(GIT_INDEX "")
endif()
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
# Add a custom command to rebuild build-info.cpp when .git/index changes
add_custom_command(
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
COMMENT "Generating build details from Git"
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
-P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
VERBATIM
)
set(TARGET build_info)
add_library(${TARGET} OBJECT ${OUTPUT_FILE})
add_library(${TARGET} OBJECT build-info.cpp)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
@@ -86,7 +96,8 @@ if (LLAMA_CURL)
endif()
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
find_library(CURL_LIBRARY curl REQUIRED)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
endif ()
if (LLAMA_LLGUIDANCE)
@@ -111,13 +122,13 @@ if (LLAMA_LLGUIDANCE)
ExternalProject_Add(llguidance_ext
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
# v1.0.1:
GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
# v0.7.20 (+ fix to build on GCC 15):
GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
PREFIX ${CMAKE_BINARY_DIR}/llguidance
SOURCE_DIR ${LLGUIDANCE_SRC}
BUILD_IN_SOURCE TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND cargo build --release --package llguidance
BUILD_COMMAND cargo build --release
INSTALL_COMMAND ""
BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
UPDATE_COMMAND ""


@@ -988,6 +988,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
params.tensor_buft_overrides.push_back({nullptr, nullptr});
}
if (params.reranking && params.embedding) {
throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
}
if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
throw std::runtime_error(string_format(
"error: the supplied chat template is not supported: %s%s\n",
@@ -1464,14 +1468,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.swa_full = true;
}
).set_env("LLAMA_ARG_SWA_FULL"));
add_opt(common_arg(
{"--kv-unified", "-kvu"},
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
[](common_params & params) {
params.kv_unified = true;
}
).set_env("LLAMA_ARG_KV_SPLIT"));
add_opt(common_arg(
{"--no-context-shift"},
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -1612,7 +1608,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.antiprompt.emplace_back(value);
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
{"-sp", "--special"},
string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
@@ -2655,13 +2651,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.i_chunk = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
{"--show-statistics"},
string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
[](common_params & params) {
params.show_statistics = true;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
{"--parse-special"},
string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
@@ -2721,13 +2710,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.embd_sep = value;
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
add_opt(common_arg(
{"--cls-separator"}, "STRING",
"separator of classification sequences (default \\t) for example \"<#seq#>\"",
[](common_params & params, const std::string & value) {
params.cls_sep = value;
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
add_opt(common_arg(
{"--host"}, "HOST",
string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
@@ -2749,13 +2731,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.public_path = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
add_opt(common_arg(
{"--api-prefix"}, "PREFIX",
string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
[](common_params & params, const std::string & value) {
params.api_prefix = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
add_opt(common_arg(
{"--no-webui"},
string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
@@ -2772,10 +2747,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
add_opt(common_arg(
{"--reranking", "--rerank"},
string_format("enable reranking endpoint on server (default: %s)", "disabled"),
string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
[](common_params & params) {
params.embedding = true;
params.pooling_type = LLAMA_POOLING_TYPE_RANK;
params.reranking = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
add_opt(common_arg(
@@ -2816,16 +2790,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.ssl_file_cert = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
add_opt(common_arg(
{"--chat-template-kwargs"}, "STRING",
string_format("sets additional params for the json template parser"),
[](common_params & params, const std::string & value) {
auto parsed = json::parse(value);
for (const auto & item : parsed.items()) {
params.default_template_kwargs[item.key()] = item.value().dump();
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
add_opt(common_arg(
{"-to", "--timeout"}, "N",
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@@ -2905,7 +2869,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"(default: deepseek)",
[](common_params & params, const std::string & value) {
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
else { throw std::invalid_argument("invalid value"); }
}
@@ -3249,32 +3212,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.model.path = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
add_opt(common_arg(
{"-ctkd", "--cache-type-k-draft"}, "TYPE",
string_format(
"KV cache data type for K for the draft model\n"
"allowed values: %s\n"
"(default: %s)",
get_all_kv_cache_types().c_str(),
ggml_type_name(params.speculative.cache_type_k)
),
[](common_params & params, const std::string & value) {
params.speculative.cache_type_k = kv_cache_type_from_str(value);
}
).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
add_opt(common_arg(
{"-ctvd", "--cache-type-v-draft"}, "TYPE",
string_format(
"KV cache data type for V for the draft model\n"
"allowed values: %s\n"
"(default: %s)",
get_all_kv_cache_types().c_str(),
ggml_type_name(params.speculative.cache_type_v)
),
[](common_params & params, const std::string & value) {
params.speculative.cache_type_v = kv_cache_type_from_str(value);
}
).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
add_opt(common_arg(
{"-mv", "--model-vocoder"}, "FNAME",
@@ -3438,34 +3375,5 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
// diffusion parameters
add_opt(common_arg(
{ "--diffusion-steps" }, "N",
string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
[](common_params & params, int value) { params.diffusion.steps = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-eps" }, "F",
string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
[](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-algorithm" }, "N",
string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
params.diffusion.algorithm),
[](common_params & params, int value) { params.diffusion.algorithm = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-alg-temp" }, "F",
string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
[](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-visual" },
string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
params.diffusion.visual_mode ? "true" : "false"),
[](common_params & params) { params.diffusion.visual_mode = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
return ctx_arg;
}

View File

@@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";

View File

@@ -49,7 +49,6 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
// LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
result_.tool_calls.emplace_back(tool_call);
return true;
}
bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
@@ -155,10 +154,9 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
if (!rest.empty()) {
handle_reasoning(rest, /* closed */ !is_partial());
}
// Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
// if (!syntax_.thinking_forced_open) {
// throw common_chat_msg_partial_exception(end_think);
// }
if (!syntax_.thinking_forced_open) {
throw common_chat_msg_partial_exception(end_think);
}
return true;
}
}
@@ -379,7 +377,3 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
/* .is_partial = */ found_healing_marker,
};
}
void common_chat_msg_parser::clear_tools() {
result_.tool_calls.clear();
}

View File

@@ -115,6 +115,4 @@ class common_chat_msg_parser {
const std::vector<std::vector<std::string>> & args_paths = {},
const std::vector<std::vector<std::string>> & content_paths = {}
);
void clear_tools();
};

View File

@@ -17,8 +17,6 @@
#include <string>
#include <vector>
using json = nlohmann::ordered_json;
static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
auto time = std::chrono::system_clock::to_time_t(now);
auto local_time = *std::localtime(&time);
@@ -84,10 +82,10 @@ json common_chat_msg::to_json_oaicompat() const
std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg) {
std::vector<common_chat_msg_diff> diffs;
if (previous_msg.reasoning_content != new_msg.reasoning_content) {
auto & diff = diffs.emplace_back();
diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, new_msg.reasoning_content);
}
// if (previous_msg.reasoning_content != current.reasoning_content) {
// auto & diff = diffs.emplace_back();
// diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, current.reasoning_content);
// }
if (previous_msg.content != new_msg.content) {
auto & diff = diffs.emplace_back();
diff.content_delta = string_diff(previous_msg.content, new_msg.content);
@@ -142,7 +140,6 @@ struct templates_params {
bool add_generation_prompt = true;
bool enable_thinking = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
json extra_context;
};
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -388,9 +385,9 @@ json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & t
template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
json delta = json::object();
if (!diff.reasoning_content_delta.empty()) {
delta["reasoning_content"] = diff.reasoning_content_delta;
}
// if (!diff.reasoning_content_delta.empty()) {
// delta["reasoning_content"] = msg.reasoning_content;
// }
if (!diff.content_delta.empty()) {
delta["content"] = diff.content_delta;
}
@@ -601,7 +598,6 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
switch (format) {
case COMMON_REASONING_FORMAT_NONE: return "none";
case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
default:
throw std::runtime_error("Unknown reasoning format");
}
@@ -723,23 +719,16 @@ static void foreach_function(const json & tools, const std::function<void(const
static std::string apply(
const common_chat_template & tmpl,
const struct templates_params & inputs,
const std::optional<json> & messages_override = std::nullopt,
const std::optional<json> & tools_override = std::nullopt,
const std::optional<json> & additional_context = std::nullopt)
const nlohmann::ordered_json & messages,
const nlohmann::ordered_json & tools,
bool add_generation_prompt,
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json())
{
minja::chat_template_inputs tmpl_inputs;
tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages;
if (tools_override) {
tmpl_inputs.tools = *tools_override;
} else {
tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools;
}
tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
tmpl_inputs.extra_context = inputs.extra_context;
if (additional_context) {
tmpl_inputs.extra_context.merge_patch(*additional_context);
}
tmpl_inputs.messages = messages;
tmpl_inputs.tools = tools;
tmpl_inputs.add_generation_prompt = add_generation_prompt;
tmpl_inputs.extra_context = extra_context;
// TODO: add flag to control date/time, if only for testing purposes.
// tmpl_inputs.now = std::chrono::system_clock::now();
@@ -838,7 +827,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
inputs.messages,
"Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_GENERIC;
return data;
}
@@ -914,7 +903,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
data.preserved_tokens = {
"[TOOL_CALLS]",
};
data.prompt = apply(tmpl, inputs);
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
return data;
}
@@ -944,7 +933,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
adjusted_messages.push_back(msg);
}
}
data.prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
if (!inputs.enable_thinking) {
@@ -1132,7 +1121,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
} else {
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
}
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
{"date_string", format_time(inputs.now, "%d %b %Y")},
{"tools_in_user_message", false},
{"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
@@ -1197,7 +1186,7 @@ static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool w
static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
auto prompt = apply(tmpl, inputs);
auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
// Hacks to fix the official (broken) prompt.
// It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,
@@ -1292,7 +1281,7 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
LOG_DBG("%s\n", __func__);
common_chat_params data;
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
{"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
{"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
});
@@ -1348,7 +1337,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
// Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
// If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code.
common_chat_params data;
data.prompt = apply(tmpl, inputs);
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
if (inputs.tools.is_array() && !inputs.tools.empty()) {
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -1475,7 +1464,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
}
data.prompt = apply(tmpl, inputs);
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
// TODO: if (has_raw_python)
return data;
}
@@ -1508,15 +1497,14 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
json extra_context = json {
json additional_context = {
{"enable_thinking", inputs.enable_thinking},
};
extra_context.update(inputs.extra_context);
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, extra_context);
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
if (string_ends_with(data.prompt, "<think>\n")) {
if (!extra_context["enable_thinking"]) {
if (!inputs.enable_thinking) {
data.prompt += "</think>";
} else {
data.thinking_forced_open = true;
@@ -1702,7 +1690,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
data.prompt = apply(tmpl, inputs);
data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
data.grammar_lazy = false;
if (!inputs.json_schema.is_null()) {
@@ -1733,12 +1721,6 @@ static common_chat_params common_chat_templates_apply_jinja(
params.enable_thinking = inputs.enable_thinking;
params.grammar = inputs.grammar;
params.now = inputs.now;
params.extra_context = json::object();
for (auto el : inputs.chat_template_kwargs) {
params.extra_context[el.first] = json::parse(el.second);
}
if (!inputs.json_schema.empty()) {
params.json_schema = json::parse(inputs.json_schema);
}
@@ -1855,7 +1837,7 @@ static common_chat_params common_chat_templates_apply_legacy(
if (res < 0) {
// if the custom "tmpl" is not supported, we throw an error
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
throw std::runtime_error("this custom template is not supported, try using --jinja");
throw std::runtime_error("this custom template is not supported");
}
// if it turns out that our buffer is too small, we resize it
@@ -1938,9 +1920,7 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
} catch (const common_chat_msg_partial_exception & ex) {
LOG_DBG("Partial parse: %s\n", ex.what());
if (!is_partial) {
builder.clear_tools();
builder.move_to(0);
common_chat_parse_content_only(builder);
throw std::runtime_error(ex.what());
}
}
auto msg = builder.result();

View File

@@ -7,7 +7,6 @@
#include <chrono>
#include <string>
#include <vector>
#include <map>
struct common_chat_templates;
@@ -71,7 +70,7 @@ struct common_chat_msg {
};
struct common_chat_msg_diff {
std::string reasoning_content_delta;
// std::string reasoning_content_delta;
std::string content_delta;
size_t tool_call_index = std::string::npos;
common_chat_tool_call tool_call_delta;
@@ -126,7 +125,6 @@ struct common_chat_templates_inputs {
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
bool enable_thinking = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
std::map<std::string, std::string> chat_template_kwargs;
};
struct common_chat_params {

View File

@@ -0,0 +1,24 @@
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
# Only write the build info if it changed
if(EXISTS ${OUTPUT_FILE})
file(READ ${OUTPUT_FILE} CONTENTS)
string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
set(OLD_COMMIT ${CMAKE_MATCH_1})
string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
set(OLD_COMPILER ${CMAKE_MATCH_1})
string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
set(OLD_TARGET ${CMAKE_MATCH_1})
if (
NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
NOT OLD_TARGET STREQUAL BUILD_TARGET
)
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
endif()
else()
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
endif()

View File

@@ -448,15 +448,6 @@ void string_replace_all(std::string & s, const std::string & search, const std::
bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
}
bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
bool has_suffix = string_ends_with(str, suffix);
if (has_suffix) {
str = str.substr(0, str.size() - suffix.size());
}
return has_suffix;
}
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
if (!str.empty() && !stop.empty()) {
const char text_last_char = str.back();
@@ -475,7 +466,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_
std::string regex_escape(const std::string & s) {
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
return std::regex_replace(s, special_chars, "\\$&");
return std::regex_replace(s, special_chars, "\\$0");
}
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
@@ -715,17 +706,11 @@ bool fs_validate_filename(const std::string & filename) {
// disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
#if defined(__clang__)
# pragma clang diagnostic pop
#elif defined(__GNUC__)
# pragma GCC diagnostic pop
#endif
filename_utf32 = converter.from_bytes(filename);
@@ -782,9 +767,6 @@ bool fs_validate_filename(const std::string & filename) {
return true;
}
#include <iostream>
// returns true if successful, false otherwise
bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
@@ -802,16 +784,9 @@ bool fs_create_directory_with_parents(const std::string & path) {
// process path from front to back, procedurally creating directories
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
const std::wstring subpath = wpath.substr(0, pos_slash);
const wchar_t * test = subpath.c_str();
pos_slash += 1;
// skip the drive letter, in some systems it can return an access denied error
if (subpath.length() == 2 && subpath[1] == ':') {
continue;
}
const bool success = CreateDirectoryW(subpath.c_str(), NULL);
const bool success = CreateDirectoryW(test, NULL);
if (!success) {
const DWORD error = GetLastError();
@@ -825,6 +800,8 @@ bool fs_create_directory_with_parents(const std::string & path) {
return false;
}
}
pos_slash += 1;
}
return true;
@@ -920,6 +897,34 @@ struct common_init_result common_init_from_params(common_params & params) {
const llama_vocab * vocab = llama_model_get_vocab(model);
if (params.reranking) {
bool ok = true;
if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
ok = false;
}
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
if (!has_eos && !has_sep) {
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
ok = false;
} else if (!has_eos) {
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
} else if (!has_sep) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
ok = false;
}
if (!ok) {
llama_model_free(model);
return iparams;
}
}
auto cparams = common_context_params_to_llama(params);
llama_context * lctx = llama_init_from_model(model, cparams);
@@ -929,7 +934,7 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}
if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
params.ctx_shift = false;
}
@@ -961,35 +966,6 @@ struct common_init_result common_init_from_params(common_params & params) {
}
}
if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
bool ok = true;
if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
ok = false;
}
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
if (!has_eos && !has_sep) {
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
ok = false;
} else if (!has_eos) {
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
} else if (!has_sep) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
ok = false;
}
if (!ok) {
llama_free(lctx);
llama_model_free(model);
return iparams;
}
}
// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
llama_adapter_lora_ptr lora;
@@ -1014,19 +990,13 @@ struct common_init_result common_init_from_params(common_params & params) {
params.sampling.ignore_eos = false;
}
// initialize once
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
if (llama_vocab_is_eog(vocab, i)) {
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
params.sampling.logit_bias_eog.push_back({i, -INFINITY});
}
}
if (params.sampling.ignore_eos) {
// add EOG biases to the active set of logit biases
params.sampling.logit_bias.insert(
params.sampling.logit_bias.end(),
params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
if (llama_vocab_is_eog(vocab, i)) {
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
params.sampling.logit_bias.push_back({i, -INFINITY});
}
}
}
if (params.sampling.penalty_last_n == -1) {
@@ -1071,7 +1041,7 @@ struct common_init_result common_init_from_params(common_params & params) {
if (llama_model_has_decoder(model)) {
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
}
llama_memory_clear(llama_get_memory(lctx), true);
llama_kv_self_clear(lctx);
llama_synchronize(lctx);
llama_perf_context_reset(lctx);
llama_set_warmup(lctx, false);
@@ -1172,7 +1142,11 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;
cparams.kv_unified = params.kv_unified;
if (params.reranking) {
cparams.embeddings = true;
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
}
cparams.type_k = params.cache_type_k;
cparams.type_v = params.cache_type_v;
@@ -1306,9 +1280,6 @@ std::vector<llama_token> common_tokenize(
int n_tokens = text.length() + 2 * add_special;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
if (n_tokens == std::numeric_limits<int32_t>::min()) {
throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
}
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);

View File

@@ -8,7 +8,6 @@
#include <string>
#include <string_view>
#include <vector>
#include <map>
#include <sstream>
#ifdef _WIN32
@@ -81,7 +80,6 @@ enum llama_example {
LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,
LLAMA_EXAMPLE_DIFFUSION,
LLAMA_EXAMPLE_COUNT,
};
@@ -178,8 +176,7 @@ struct common_params_sampling {
std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
std::set<llama_token> preserved_tokens;
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
// print the parameters into a string
std::string print() const;
@@ -202,9 +199,6 @@ struct common_params_speculative {
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
@@ -219,18 +213,9 @@ struct common_params_vocoder {
bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
};
struct common_params_diffusion {
int32_t steps = 64; // number of diffusion steps
float eps = 1e-3f; // epsilon for timesteps
int32_t algorithm = 0; // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
float alg_temp = 0.0f; // algorithm temperature
bool visual_mode = false; // show progressive diffusion on screen
};
enum common_reasoning_format {
COMMON_REASONING_FORMAT_NONE,
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
};
struct common_params {
@@ -278,7 +263,6 @@ struct common_params {
struct common_params_sampling sampling;
struct common_params_speculative speculative;
struct common_params_vocoder vocoder;
struct common_params_diffusion diffusion;
struct common_params_model model;
@@ -341,7 +325,6 @@ struct common_params {
bool no_perf = false; // disable performance metrics
bool ctx_shift = true; // context shift on infinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool kv_unified = false; // enable unified KV cache
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // use mmap for faster loads
@@ -371,7 +354,7 @@ struct common_params {
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
std::string embd_sep = "\n"; // separator of embeddings
std::string cls_sep = "\t"; // separator of classification sequences
bool reranking = false; // enable reranking support on server
// server params
int32_t port = 8080; // server listens on this network port
@@ -382,7 +365,6 @@ struct common_params {
std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT
std::string api_prefix = ""; // NOLINT
std::string chat_template = ""; // NOLINT
bool use_jinja = false; // NOLINT
bool enable_chat_template = true;
@@ -395,8 +377,6 @@ struct common_params {
std::string ssl_file_key = ""; // NOLINT
std::string ssl_file_cert = ""; // NOLINT
std::map<std::string, std::string> default_template_kwargs;
// "advanced" endpoints are disabled by default for better security
bool webui = true;
bool endpoint_slots = false;
@@ -432,10 +412,9 @@ struct common_params {
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
int32_t i_chunk = 0; // start processing from this chunk
bool process_output = false; // collect data for the output tensor
bool compute_ppl = true; // whether to compute perplexity
bool show_statistics = false; // show imatrix statistics per tensor
bool parse_special = false; // whether to parse special tokens during imatrix tokenization
bool process_output = false; // collect data for the output tensor
bool compute_ppl = true; // whether to compute perplexity
bool parse_special = false; // whether to parse special tokens during imatrix tokenization
// cvector-generator params
int n_pca_batch = 100;
@@ -535,7 +514,6 @@ static bool string_starts_with(const std::string & str,
// While we wait for C++20's std::string::ends_with...
bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
bool string_remove_suffix(std::string & str, const std::string_view & suffix);
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);

View File

@@ -41,6 +41,49 @@ static std::string build_repetition(const std::string & item_rule, int min_items
return result;
}
/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
class string_view {
const std::string & _str;
const size_t _start;
const size_t _end;
public:
string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
size_t size() const {
return _end - _start;
}
size_t length() const {
return size();
}
operator std::string() const {
return str();
}
std::string str() const {
return _str.substr(_start, _end - _start);
}
string_view substr(size_t pos, size_t len = std::string::npos) const {
return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
}
char operator[](size_t pos) const {
auto index = _start + pos;
if (index >= _end) {
throw std::out_of_range("string_view index out of range");
}
return _str[_start + pos];
}
bool operator==(const string_view & other) const {
std::string this_str = *this;
std::string other_str = other;
return this_str == other_str;
}
};
static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
auto has_min = min_value != std::numeric_limits<int>::min();
auto has_max = max_value != std::numeric_limits<int>::max();
@@ -69,14 +112,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
}
out << "}";
};
std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
[&](const std::string_view & from, const std::string_view & to) {
std::function<void(const string_view &, const string_view &)> uniform_range =
[&](const string_view & from, const string_view & to) {
size_t i = 0;
while (i < from.length() && i < to.length() && from[i] == to[i]) {
i++;
}
if (i > 0) {
out << "\"" << from.substr(0, i) << "\"";
out << "\"" << from.substr(0, i).str() << "\"";
}
if (i < from.length() && i < to.length()) {
if (i > 0) {

View File

@@ -144,8 +144,6 @@ llama_tokens common_speculative_gen_draft(
auto & smpl = spec->smpl;
auto & prompt = spec->prompt;
auto * mem = llama_get_memory(ctx);
int reuse_i = 0;
int reuse_n = 0;
@@ -175,7 +173,7 @@ llama_tokens common_speculative_gen_draft(
result.reserve(params.n_draft);
if (reuse_n == 0) {
llama_memory_clear(mem, false);
llama_kv_self_clear(ctx);
prompt.clear();
} else {
@@ -194,14 +192,14 @@ llama_tokens common_speculative_gen_draft(
}
if (reuse_i > 0) {
llama_memory_seq_rm (mem, 0, 0, reuse_i);
llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);
llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
}
if (reuse_n < (int) prompt.size()) {
llama_memory_seq_rm (mem, 0, reuse_n, -1);
llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
prompt.erase(prompt.begin() + reuse_n, prompt.end());
}

File diff suppressed because it is too large

View File

@@ -7,6 +7,7 @@ import pathlib
import re
import requests
import sys
import json
import shutil
import argparse
@@ -68,7 +69,8 @@ args = parser.parse_args()
hf_token = args.hf_token if args.hf_token is not None else hf_token
if hf_token is None:
logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token")
sys.exit(1)
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
@@ -126,10 +128,6 @@ models = [
{"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
{"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
{"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
{"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
{"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
{"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
]
# some models are known to be broken upstream, so we will skip them as exceptions
@@ -139,18 +137,11 @@ pre_computed_hashes = [
{"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
{"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
# falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
{"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
]
def download_file_with_auth(url, token, save_path):
headers = {"Authorization": f"Bearer {token}"} if token else None
headers = {"Authorization": f"Bearer {token}"}
response = sess.get(url, headers=headers)
response.raise_for_status()
os.makedirs(os.path.dirname(save_path), exist_ok=True)
@@ -231,7 +222,7 @@ for model in models:
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
src_ifs = ""
for model in [*pre_computed_hashes, *all_models]:
for model in [*all_models, *pre_computed_hashes]:
name = model["name"]
tokt = model["tokt"]
chkhsh = model.get("chkhsh")
@@ -239,6 +230,11 @@ for model in [*pre_computed_hashes, *all_models]:
if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
continue
# Skip if the tokenizer folder does not exist or there were other download issues earlier
if not os.path.exists(f"models/tokenizers/{name}"):
logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
continue
# create the tokenizer
if chkhsh is not None:
# if the model has a pre-computed hash, use it
@@ -248,19 +244,15 @@ for model in [*pre_computed_hashes, *all_models]:
chkhsh = existing_models[name]
else:
# otherwise, compute the hash of the tokenizer
# Fail if the tokenizer folder with config does not exist or there were other download issues earlier
if not os.path.isfile(f"models/tokenizers/{name}/tokenizer_config.json"):
raise OSError(f"Config for tokenizer {name} not found. The model may not exist or is not accessible with the provided token.")
try:
logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
if name == "t5":
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
else:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except Exception as e:
raise OSError(f"Error loading tokenizer for model {name}.") from e
except OSError as e:
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded
chktok = tokenizer.encode(CHK_TXT)
chkhsh = sha256(str(chktok).encode()).hexdigest()

View File

@@ -8,7 +8,6 @@
- [DataType Supports](#datatype-supports)
- [Docker](#docker)
- [Linux](#linux)
- [Environment variable setup](#environment-variable-setup)
- [TODO](#todo)
@@ -291,24 +290,5 @@ Authors from Peking University: Bizhao Shi (bshi@pku.edu.cn), Yuxin Yang (yxyang
We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers from Huawei Technologies Co., Ltd for their help during the code development and pull request.
## Environment variable setup
### GGML_CANN_ASYNC_MODE
Enables asynchronous operator submission. Disabled by default.
### GGML_CANN_MEM_POOL
Specifies the memory pool management strategy:
- vmm: Utilizes a virtual memory manager pool. If hardware support for VMM is unavailable, falls back to the legacy (leg) memory pool.
- prio: Employs a priority queue-based memory pool management.
- leg: Uses a fixed-size buffer pool.
### GGML_CANN_DISABLE_BUF_POOL_CLEAN
Controls automatic cleanup of the memory pool. This option is only effective when using the prio or leg memory pool strategies.
## TODO
- Support more models and data types.

View File

@@ -757,7 +757,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| Name | Value | Function |
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimized features for Intel GPUs. (Setting this to 1 is recommended for Intel devices older than Gen 10) |
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |

View File

@@ -1,246 +0,0 @@
> [!IMPORTANT]
> This build documentation is specific only to IBM Z & LinuxONE mainframes (s390x). You can find the build documentation for other architectures: [build.md](build.md).
# Build llama.cpp locally (for s390x)
The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](../include/llama.h).
The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server.
**To get the code:**
```bash
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```
## CPU Build with BLAS
Building llama.cpp with BLAS support is highly recommended as it has shown to provide performance improvements. Make sure to have OpenBLAS installed in your environment.
```bash
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS
cmake --build build --config Release -j $(nproc)
```
**Notes**:
- For faster repeated compilation, install [ccache](https://ccache.dev/)
- By default, VXE/VXE2 is enabled. To disable it (not recommended):
```bash
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS \
-DGGML_VXE=OFF
cmake --build build --config Release -j $(nproc)
```
- By default, NNPA is enabled when available. To disable it (not recommended):
```bash
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS \
-DGGML_NNPA=OFF
cmake --build build --config Release -j $(nproc)
```
- For debug builds:
```bash
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=Debug \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS
cmake --build build --config Debug -j $(nproc)
```
- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:
```bash
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS \
-DBUILD_SHARED_LIBS=OFF
cmake --build build --config Release -j $(nproc)
```
## Getting GGUF Models
All models need to be converted to Big-Endian. You can achieve this in one of three ways:
1. **Use pre-converted models verified for use on IBM Z & LinuxONE (easiest)**
![File Type - gguf](https://img.shields.io/badge/File_Type-gguf-fff)
You can find popular models pre-converted and verified at [s390x Ready Models](https://huggingface.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08).
These models have already been converted from `safetensors` to `GGUF Big-Endian` and their respective tokenizers verified to run correctly on IBM z15 and later systems.
2. **Convert safetensors model to GGUF Big-Endian directly (recommended)**
![File Type - safetensors](https://img.shields.io/badge/File_Type-safetensors-da1e28)
The model you are trying to convert must be in `safetensors` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct)). Make sure you have downloaded the model repository for this case.
```bash
python3 convert_hf_to_gguf.py \
--outfile model-name-be.f16.gguf \
--outtype f16 \
--bigendian \
model-directory/
```
For example,
```bash
python3 convert_hf_to_gguf.py \
--outfile granite-3.3-2b-instruct-be.f16.gguf \
--outtype f16 \
--bigendian \
granite-3.3-2b-instruct/
```
3. **Convert existing GGUF Little-Endian model to Big-Endian**
![File Type - gguf](https://img.shields.io/badge/File_Type-gguf-fff)
The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.
```bash
python3 gguf-py/gguf/scripts/gguf_convert_endian.py model-name.f16.gguf BIG
```
For example,
```bash
python3 gguf-py/gguf/scripts/gguf_convert_endian.py granite-3.3-2b-instruct-le.f16.gguf BIG
mv granite-3.3-2b-instruct-le.f16.gguf granite-3.3-2b-instruct-be.f16.gguf
```
**Notes:**
- The GGUF endian conversion script may not support all data types at the moment and may fail for some models/quantizations. When that happens, please try manually converting the safetensors model to GGUF Big-Endian via Step 2.
## IBM Accelerators
### 1. SIMD Acceleration
Only available on IBM z15 or later systems with the `-DGGML_VXE=ON` compile flag (turned on by default). No hardware acceleration is possible with llama.cpp on older systems, such as IBM z14/arch12. On such systems, the APIs can still run but will use a scalar implementation.
### 2. NNPA Vector Intrinsics Acceleration
Only available on IBM z16 or later systems with the `-DGGML_NNPA=ON` compile flag (turned on when available). No hardware acceleration is possible with llama.cpp on older systems, such as IBM z15/arch13. On such systems, the APIs can still run but will use a scalar implementation.
### 3. zDNN Accelerator
_Only available on IBM z16 or later systems. No direction at the moment._
### 4. Spyre Accelerator
_No direction at the moment._
## Performance Tuning
### 1. Virtualization Setup
It is strongly recommended to use only LPAR (Type-1) virtualization to get the most performance.
Note: Type-2 virtualization is not supported at the moment. While you can get it running, the performance will not be the best.
### 2. IFL (Core) Count
It is recommended to allocate a minimum of 8 shared IFLs to the LPAR. Increasing the IFL count past 8 shared IFLs will only improve Prompt Processing performance, not Token Generation.
Note: IFL count does not equate to vCPU count.
### 3. SMT vs NOSMT (Simultaneous Multithreading)
It is strongly recommended to disable SMT via the kernel boot parameters as it negatively affects performance. Please refer to your Linux distribution's guide on disabling SMT via kernel boot parameters.
### 4. BLAS vs NOBLAS
IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongly recommended to use BLAS.
## Frequently Asked Questions (FAQ)
1. I'm getting the following error message while trying to load a model: `gguf_init_from_file_impl: failed to load model: this GGUF file version 50331648 is extremely large, is there a mismatch between the host and model endianness?`
Answer: Please ensure that the model you have downloaded/converted is GGUFv3 Big-Endian. These models are usually denoted with the `-be` suffix, e.g., `granite-3.3-2b-instruct-be.F16.gguf`.
You may refer to the [Getting GGUF Models](#getting-gguf-models) section to manually convert a `safetensors` model to `GGUF` Big Endian.
2. I'm getting extremely poor performance when running inference on a model
Answer: Please refer to the [Appendix B: SIMD Support Matrix](#appendix-b-simd-support-matrix) to check if your model quantization is supported by SIMD acceleration.
3. I'm building on IBM z17 and getting the following error messages: `invalid switch -march=z17`
Answer: Please ensure that your GCC compiler is at least version 15.1.0, and that `binutils` is updated to the latest version. If this does not fix the problem, kindly open an issue.
## Getting Help on IBM Z & LinuxONE
1. **Bugs, Feature Requests**
Please file an issue in llama.cpp and ensure that the title contains "s390x".
2. **Other Questions**
Please reach out directly to [aionz@us.ibm.com](mailto:aionz@us.ibm.com).
## Appendix A: Hardware Support Matrix
| | Support | Minimum Compiler Version |
| ------- | ------- | ------------------------ |
| IBM z15 | ✅ | |
| IBM z16 | ✅ | |
| IBM z17 | ✅ | GCC 15.1.0 |
- ✅ - supported and verified to run as intended
- 🚫 - unsupported, we are unlikely able to provide support
## Appendix B: SIMD Support Matrix
| | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
| ---------- | ----------- | ---- | ---- | ----- |
| FP32 | ✅ | ✅ | ❓ | ❓ |
| FP16 | ✅ | ✅ | ❓ | ❓ |
| BF16 | 🚫 | 🚫 | ❓ | ❓ |
| Q4_0 | ✅ | ✅ | ❓ | ❓ |
| Q4_1 | ✅ | ✅ | ❓ | ❓ |
| Q5_0 | 🚫 | 🚫 | ❓ | ❓ |
| Q5_1 | 🚫 | 🚫 | ❓ | ❓ |
| Q8_0 | ✅ | ✅ | ❓ | ❓ |
| Q2_K | 🚫 | 🚫 | ❓ | ❓ |
| Q3_K | ✅ | ✅ | ❓ | ❓ |
| Q4_K | ✅ | ✅ | ❓ | ❓ |
| Q5_K | ✅ | ✅ | ❓ | ❓ |
| Q6_K | ✅ | ✅ | ❓ | ❓ |
| TQ1_0 | 🚫 | 🚫 | ❓ | ❓ |
| TQ2_0 | 🚫 | 🚫 | ❓ | ❓ |
| IQ2_XXS | 🚫 | 🚫 | ❓ | ❓ |
| IQ2_XS | 🚫 | 🚫 | ❓ | ❓ |
| IQ2_S | 🚫 | 🚫 | ❓ | ❓ |
| IQ3_XXS | 🚫 | 🚫 | ❓ | ❓ |
| IQ3_S | 🚫 | 🚫 | ❓ | ❓ |
| IQ1_S | 🚫 | 🚫 | ❓ | ❓ |
| IQ1_M | 🚫 | 🚫 | ❓ | ❓ |
| IQ4_NL | ✅ | ✅ | ❓ | ❓ |
| IQ4_XS | ✅ | ✅ | ❓ | ❓ |
| FP32->FP16 | 🚫 | ✅ | ❓ | ❓ |
| FP16->FP32 | 🚫 | ✅ | ❓ | ❓ |
- ✅ - acceleration available
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself

View File

@@ -1,9 +1,5 @@
# Build llama.cpp locally
The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](../include/llama.h).
The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server.
**To get the Code:**
```bash
@@ -67,7 +63,6 @@ cmake --build build --config Release
cmake --preset x64-windows-llvm-release
cmake --build build-x64-windows-llvm-release
```
- Curl usage is enabled by default and can be turned off with `-DLLAMA_CURL=OFF`. Otherwise you need to install development libraries for libcurl.
## BLAS Build
@@ -305,8 +300,9 @@ On Linux it is possible to use unified memory architecture (UMA) to share main m
## Vulkan
### For Windows Users:
**w64devkit**
**Windows**
### w64devkit
Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).
@@ -333,7 +329,7 @@ cmake -B build -DGGML_VULKAN=ON
cmake --build build --config Release
```
**Git Bash MINGW64**
### Git Bash MINGW64
Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings
@@ -356,8 +352,7 @@ Now you can load the model in conversation mode using `Vulkan`
build/bin/Release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
```
**MSYS2**
### MSYS2
Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
```sh
pacman -S git \
@@ -373,9 +368,9 @@ cmake -B build -DGGML_VULKAN=ON
cmake --build build --config Release
```
### For Docker users:
**With docker**:
You don't need to install the Vulkan SDK. It will be installed inside the container.
You don't need to install Vulkan SDK. It will be installed inside the container.
```sh
# Build the image
@@ -385,29 +380,32 @@ docker build -t llama-cpp-vulkan --target light -f .devops/vulkan.Dockerfile .
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
### For Linux users:
**Without docker**:
First, follow the official LunarG instructions for the installation and setup of the Vulkan SDK in the [Getting Started with the Linux Tarball Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html) guide.
Firstly, you need to make sure you have installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html)
> [!IMPORTANT]
> After completing the first step, ensure that you have used the `source` command on the `setup_env.sh` file inside of the Vulkan SDK in your current terminal session. Otherwise, the build won't work. Additionally, if you close out of your terminal, you must perform this step again if you intend to perform a build. However, there are ways to make this persistent. Refer to the Vulkan SDK guide linked in the first step for more information about any of this.
For example, on Ubuntu 22.04 (jammy), use the command below:
Second, after verifying that you have followed all of the SDK installation/setup steps, use this command to make sure before proceeding:
```bash
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
apt update -y
apt-get install -y vulkan-sdk
# To verify the installation, use the command below:
vulkaninfo
```
Then, assuming you have `cd` into your llama.cpp folder and there are no errors with running `vulkaninfo`, you can proceed to build llama.cpp using the CMake commands below:
Alternatively, your package manager might be able to provide the appropriate libraries.
For example, on Ubuntu 22.04 you can install `libvulkan-dev` instead.
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
Then, build llama.cpp using the cmake command below:
```bash
cmake -B build -DGGML_VULKAN=1
cmake --build build --config Release
```
Finally, after finishing your build, you should be able to do something like this:
```bash
# Test the output binary
# "-ngl 99" should offload all of the layers to GPU for most (if not all) models.
./build/bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -ngl 99
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
# You should see in the output, ggml_vulkan detected your GPU. For example:
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
@@ -554,27 +552,6 @@ ninja
To read documentation for how to build on Android, [click here](./android.md)
## WebGPU [In Progress]
The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `bed1a61`.
In the llama.cpp directory, build with CMake:
```
cmake -B build -DGGML_WEBGPU=ON
cmake --build build --config Release
```
### Browser Support
WebGPU allows cross-platform access to the GPU from supported browsers. We utilize [Emscripten](https://emscripten.org/) to compile ggml's WebGPU backend to WebAssembly. Emscripten does not officially support WebGPU bindings yet, but Dawn currently maintains its own WebGPU bindings called emdawnwebgpu.
Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/src/emdawnwebgpu/) to download or build the emdawnwebgpu package (note that it might be safer to build the emdawnwebgpu package locally, so that it stays in sync with the version of Dawn you have installed above). When building using CMake, the path to the emdawnwebgpu port file needs to be set with the flag `EMDAWNWEBGPU_DIR`.
## IBM Z & LinuxONE
To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md)
## Notes about GPU-accelerated backends
The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.

View File

@@ -83,22 +83,20 @@ NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the conv
### 2. Define the model architecture in `llama.cpp`
The model params and tensors layout must be defined in `llama.cpp` source files:
1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
2. In `src/llama-arch.cpp`:
- Add the architecture name to the `LLM_ARCH_NAMES` map.
- Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.
The model params and tensors layout must be defined in `llama.cpp`:
1. Define a new `llm_arch`
2. Define the tensors layout in `LLM_TENSOR_NAMES`
3. Add any non-standard metadata in `llm_load_hparams`
4. Create the tensors for inference in `llm_load_tensors`
5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
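
As a rough illustration of the steps above, the new entries might look something like the sketch below. The `LLM_ARCH_NEWMODEL` name and the tensor list are hypothetical placeholders; the real maps in `src/llama-arch.cpp` contain many more entries and may use slightly different value types.

```cpp
// src/llama-arch.h (sketch): add the new architecture to the enum
// enum llm_arch { ..., LLM_ARCH_NEWMODEL, LLM_ARCH_UNKNOWN };

// src/llama-arch.cpp (sketch): register its name and tensor layout
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    // ... existing architectures ...
    { LLM_ARCH_NEWMODEL, "newmodel" },
};

static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
    // ... existing architectures ...
    { LLM_ARCH_NEWMODEL,
        {
            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
            { LLM_TENSOR_OUTPUT,      "output" },
            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
        },
    },
};
```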
### 3. Build the GGML graph implementation
This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `src/llama-model.cpp`.
Create a new struct that inherits from `llm_graph_context` and implement the graph-building logic in its constructor.
Have a look at existing implementations like `llm_build_llama`, `llm_build_dbrx` or `llm_build_bert`.
Then, in the `llama_model::build_graph` method, add a case for your architecture to instantiate your new graph-building struct.
This is the funniest part: you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR.
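As a rough, standalone sketch of the pattern above (a builder struct whose constructor assembles the per-layer graph, plus an architecture dispatch), the code below uses simplified stand-in names (`graph_context`, `add_node`, `build_graph`); the real base class is `llm_graph_context`, and the actual builders in `src/llama-model.cpp` emit `ggml` tensors (`ggml_mul_mat`, `ggml_rope_ext`, ...) rather than strings.
```cpp
// Standalone sketch (simplified, hypothetical names) of the graph-builder pattern.
#include <cstdio>
#include <memory>
#include <string>
#include <vector>

struct graph_context {                      // stand-in for llm_graph_context
    std::vector<std::string> nodes;
    void add_node(const std::string & op) { nodes.push_back(op); }
    virtual ~graph_context() = default;
};

struct llm_build_mymodel : graph_context {  // one builder struct per architecture
    explicit llm_build_mymodel(int n_layer) {
        add_node("token_embd");             // input embeddings
        for (int il = 0; il < n_layer; ++il) {
            add_node("blk." + std::to_string(il) + ".attn");  // attention block
            add_node("blk." + std::to_string(il) + ".ffn");   // feed-forward block
        }
        add_node("output");                 // output head
    }
};

// Dispatch on the architecture, mirroring the case added to llama_model::build_graph
static std::unique_ptr<graph_context> build_graph(const std::string & arch, int n_layer) {
    if (arch == "mymodel") {                // hypothetical architecture name
        return std::make_unique<llm_build_mymodel>(n_layer);
    }
    return nullptr;
}

int main() {
    auto graph = build_graph("mymodel", 2);
    for (const auto & node : graph->nodes) {
        std::printf("%s\n", node.c_str());
    }
    return 0;
}
```
The strings here only stand in for the sequence of ops; in llama.cpp itself the constructor builds the `ggml` compute graph that decoding then executes.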

View File

@@ -25,9 +25,6 @@ Additionally, there the following images, similar to the above:
- `ghcr.io/ggml-org/llama.cpp:full-intel`: Same as `full` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-intel`: Same as `light` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-intel`: Same as `server` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-vulkan`: Same as `full` but compiled with Vulkan support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-vulkan`: Same as `light` but compiled with Vulkan support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-vulkan`: Same as `server` but compiled with Vulkan support. (platforms: `linux/amd64`)
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library), you'll need to build the images locally for now.

View File

@@ -11,7 +11,7 @@ Function calling is supported for all models (see https://github.com/ggml-org/ll
- Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
- Functionary v3.1 / v3.2
- Hermes 2/3, Qwen 2.5
- Qwen 2.5 Coder
- Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034)
- Mistral Nemo
- Firefunction v2
- Command R7B

View File

@@ -1,42 +1,28 @@
# Install pre-built version of llama.cpp
| Install via | Windows | Mac | Linux |
|-------------|---------|-----|-------|
| Winget | ✅ | | |
| Homebrew | | ✅ | ✅ |
| MacPorts | | ✅ | |
| Nix | | ✅ | ✅ |
## Homebrew
## Winget (Windows)
```sh
winget install llama.cpp
```
The package is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/issues/8188
## Homebrew (Mac and Linux)
On Mac and Linux, the homebrew package manager can be used via
```sh
brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668
## MacPorts (Mac)
## MacPorts
```sh
sudo port install llama.cpp
```
see also: https://ports.macports.org/port/llama.cpp/details/
See also: https://ports.macports.org/port/llama.cpp/details/
## Nix
## Nix (Mac and Linux)
On Mac and Linux, the Nix package manager can be used via
```sh
nix profile install nixpkgs#llama-cpp
```
For flake enabled installs.
Or
@@ -48,3 +34,13 @@ nix-env --file '<nixpkgs>' --install --attr llama-cpp
For non-flake enabled installs.
This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
## Flox
On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
```sh
flox install llama-cpp
```
Flox follows the nixpkgs build of llama.cpp.

View File

@@ -107,7 +107,3 @@ NOTE: some models may require large context window, for example: `-c 8192`
(tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF
(tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF
```
## Finding more models:
GGUF models on Huggingface with vision capabilities can be found here: https://huggingface.co/models?pipeline_tag=image-text-to-text&sort=trending&search=gguf

View File

@@ -1,95 +0,0 @@
# GGML Operations
List of GGML operations and backend support status.
Legend:
- ✅ Fully supported by this backend
- 🟡 Partially supported by this backend
- ❌ Not supported by this backend
| Operation | BLAS | CPU | CUDA | Metal |
|-----------|------|------|------|------|
| ABS | ❌ | ✅ | 🟡 | ❌ |
| ACC | ❌ | ✅ | ✅ | ✅ |
| ADD | ❌ | ✅ | ✅ | 🟡 |
| ADD1 | ❌ | ✅ | ✅ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ |
| CLAMP | ❌ | ✅ | ✅ | 🟡 |
| CONCAT | ❌ | ✅ | 🟡 | ✅ |
| CONT | ❌ | ✅ | 🟡 | ✅ |
| CONV_2D_DW | ❌ | ✅ | ✅ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ |
| CONV_TRANSPOSE_2D | ❌ | ✅ | ✅ | ❌ |
| COS | ❌ | ✅ | ✅ | 🟡 |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 |
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ❌ |
| CROSS_ENTROPY_LOSS_BACK | ❌ | ✅ | ✅ | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | 🟡 |
| DIV | ❌ | ✅ | ✅ | 🟡 |
| DUP | ❌ | ✅ | 🟡 | 🟡 |
| ELU | ❌ | ✅ | ❌ | 🟡 |
| EXP | ❌ | ✅ | 🟡 | ❌ |
| FLASH_ATTN_EXT | ❌ | ✅ | 🟡 | 🟡 |
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | 🟡 |
| GEGLU_ERF | ❌ | ✅ | ✅ | 🟡 |
| GEGLU_QUICK | ❌ | ✅ | ✅ | 🟡 |
| GELU | ❌ | ✅ | 🟡 | 🟡 |
| GELU_ERF | ❌ | ✅ | 🟡 | 🟡 |
| GELU_QUICK | ❌ | ✅ | 🟡 | 🟡 |
| GET_ROWS | ❌ | ✅ | 🟡 | ✅ |
| GET_ROWS_BACK | ❌ | 🟡 | 🟡 | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ |
| HARDSIGMOID | ❌ | ✅ | 🟡 | ❌ |
| HARDSWISH | ❌ | ✅ | 🟡 | ❌ |
| IM2COL | ❌ | ✅ | ✅ | 🟡 |
| L2_NORM | ❌ | ✅ | ✅ | ✅ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ |
| LOG | ❌ | ✅ | ✅ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ |
| MUL | ❌ | ✅ | ✅ | 🟡 |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | ✅ | ✅ | ✅ |
| NEG | ❌ | ✅ | 🟡 | 🟡 |
| NORM | ❌ | ✅ | ✅ | 🟡 |
| OPT_STEP_ADAMW | ❌ | ✅ | ✅ | ❌ |
| OUT_PROD | 🟡 | 🟡 | 🟡 | ❌ |
| PAD | ❌ | ✅ | ✅ | ✅ |
| PAD_REFLECT_1D | ❌ | ✅ | ❌ | ✅ |
| POOL_2D | ❌ | ✅ | ✅ | ✅ |
| REGLU | ❌ | ✅ | ✅ | 🟡 |
| RELU | ❌ | ✅ | 🟡 | 🟡 |
| REPEAT | ❌ | ✅ | 🟡 | ✅ |
| REPEAT_BACK | ❌ | ✅ | ✅ | ❌ |
| RMS_NORM | ❌ | ✅ | ✅ | 🟡 |
| RMS_NORM_BACK | ❌ | ✅ | ✅ | ❌ |
| RMS_NORM_MUL | ❌ | ✅ | ✅ | ✅ |
| ROPE | ❌ | ✅ | ✅ | ✅ |
| ROPE_BACK | ❌ | ✅ | ✅ | ❌ |
| RWKV_WKV6 | ❌ | ✅ | ✅ | ✅ |
| RWKV_WKV7 | ❌ | ✅ | ✅ | ✅ |
| SCALE | ❌ | ✅ | ✅ | ✅ |
| SET | ❌ | ✅ | ❌ | ✅ |
| SET_ROWS | ❌ | 🟡 | ❌ | 🟡 |
| SGN | ❌ | ✅ | 🟡 | ❌ |
| SIGMOID | ❌ | ✅ | 🟡 | 🟡 |
| SILU | ❌ | ✅ | 🟡 | 🟡 |
| SILU_BACK | ❌ | ✅ | ✅ | ❌ |
| SIN | ❌ | ✅ | ✅ | 🟡 |
| SOFT_MAX | ❌ | ✅ | ✅ | ✅ |
| SOFT_MAX_BACK | ❌ | 🟡 | 🟡 | ❌ |
| SQR | ❌ | ✅ | ✅ | 🟡 |
| SQRT | ❌ | ✅ | ✅ | 🟡 |
| SSM_CONV | ❌ | ✅ | ✅ | ✅ |
| SSM_SCAN | ❌ | ✅ | ✅ | ✅ |
| STEP | ❌ | ✅ | 🟡 | ❌ |
| SUB | ❌ | ✅ | ✅ | 🟡 |
| SUM | ❌ | ✅ | ✅ | ❌ |
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ |
| SWIGLU | ❌ | ✅ | ✅ | 🟡 |
| TANH | ❌ | ✅ | 🟡 | 🟡 |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ |
| UPSCALE | ❌ | ✅ | ✅ | 🟡 |

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -33,7 +33,6 @@ else()
add_subdirectory(speculative-simple)
add_subdirectory(gen-docs)
add_subdirectory(training)
add_subdirectory(diffusion)
if (NOT GGML_BACKEND_DL)
add_subdirectory(convert-llama2c-to-ggml)
# these examples use the backends directly and cannot be built with dynamic loading

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
set -e
AI_NAME="${AI_NAME:-Miku}"

View File

@@ -116,7 +116,7 @@ if llama_decode(context, batch) != 0 {
}
for i in 1 ..< n_parallel {
llama_memory_seq_cp(llama_get_memory(context), 0, Int32(i), 0, batch.n_tokens)
llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
}
if n_parallel > 1 {

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
set -e

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
set -euo pipefail

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
set -e

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
#
# Temporary script - will be removed in the future

View File

@@ -1,5 +0,0 @@
set(TARGET llama-diffusion-cli)
add_executable(${TARGET} diffusion-cli.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,507 +0,0 @@
#include "arg.h"
#include "chat.h"
#include "common.h"
#include "llama.h"
#include "log.h"
#include <limits.h>
#include <string>
#include <vector>
#include <algorithm>
#include <cmath>
#include <limits>
#include <random>
typedef bool (*diffusion_step_callback_t)(int32_t step,
int32_t total_steps,
const llama_token * tokens,
int32_t n_tokens,
void * user_data);
enum diffusion_alg {
DIFFUSION_ALG_ORIGIN = 0,
DIFFUSION_ALG_MASKGIT_PLUS = 1,
DIFFUSION_ALG_TOPK_MARGIN = 2,
DIFFUSION_ALG_ENTROPY = 3,
};
struct diffusion_params {
int32_t steps;
float eps;
float temperature;
float top_p;
int32_t top_k;
llama_token mask_token_id;
enum diffusion_alg algorithm;
float alg_temp;
diffusion_step_callback_t step_callback;
void * step_callback_user_data;
int32_t seed;
};
static diffusion_params diffusion_default_params() {
diffusion_params params = {};
params.steps = 64;
params.eps = 1e-3f;
params.temperature = 0.2f;
params.top_p = 0.95f;
params.top_k = 0;
params.mask_token_id = LLAMA_TOKEN_NULL;
params.algorithm = DIFFUSION_ALG_ORIGIN;
params.alg_temp = 0.0f;
params.step_callback = nullptr;
params.step_callback_user_data = nullptr;
params.seed = 0;
return params;
}
static void diffusion_generate(llama_context * ctx,
const llama_token * input_tokens,
llama_token * output_tokens,
int32_t n_input,
int32_t max_length,
struct diffusion_params params,
int32_t & n_generated) {
n_generated = 0;
if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || max_length <= n_input) {
return;
}
const llama_model * model = llama_get_model(ctx);
// Initialize with input and pad with mask tokens
std::copy(input_tokens, input_tokens + n_input, output_tokens);
std::fill(output_tokens + n_input, output_tokens + max_length, params.mask_token_id);
std::mt19937 rng(params.seed);
std::vector<float> timesteps(params.steps + 1);
for (int32_t i = 0; i <= params.steps; i++) {
timesteps[i] = 1.0f - (float) i / params.steps * (1.0f - params.eps);
}
llama_set_causal_attn(ctx, false);
int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
std::vector<llama_token_data> candidates(n_vocab);
std::vector<llama_token_data> conf_candidates;
conf_candidates.reserve(max_length);
std::vector<int32_t> mask_positions;
mask_positions.reserve(max_length);
struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
if (params.top_k > 0) {
llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
}
if (params.top_p < 1.0f) {
llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
}
if (params.temperature > 0.0f) {
llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
}
llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));
struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);
llama_batch batch = llama_batch_init(max_length, 0, 1);
batch.n_tokens = max_length;
int64_t total_sampling_time = 0;
int64_t total_time = 0;
int64_t time_start = ggml_time_us();
for (int32_t step = 0; step < params.steps; step++) {
if (params.step_callback) {
if (!params.step_callback(step, params.steps, output_tokens, max_length, params.step_callback_user_data)) {
break;
}
}
for (int32_t i = 0; i < max_length; i++) {
batch.token[i] = output_tokens[i];
batch.pos[i] = i;
batch.n_seq_id[i] = 1;
batch.seq_id[i][0] = 0;
batch.logits[i] = 1;
}
int ret = llama_decode(ctx, batch);
if (ret != 0) {
LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, step, ret);
break;
}
float * raw_logits = llama_get_logits(ctx);
if (!raw_logits) {
LOG_ERR("%s: failed to get logits at step %d\n", __func__, step);
break;
}
auto get_logits_for_pos = [&](int32_t pos) -> const float * {
return pos == 0 ? raw_logits : raw_logits + (pos - 1) * n_vocab;
};
int64_t time_start_sampling = ggml_time_us();
mask_positions.clear();
for (int32_t i = 0; i < max_length; i++) {
if (output_tokens[i] == params.mask_token_id) {
mask_positions.push_back(i);
}
}
if (mask_positions.empty()) {
break;
}
float t = timesteps[step];
float s = timesteps[step + 1];
if (params.algorithm == DIFFUSION_ALG_ORIGIN) {
float p_transfer = (step < params.steps - 1) ? (1.0f - s / t) : 1.0f;
for (int32_t pos : mask_positions) {
if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
const float * pos_logits = get_logits_for_pos(pos);
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
candidates[token_id].id = token_id;
candidates[token_id].logit = pos_logits[token_id];
candidates[token_id].p = 0.0f;
}
llama_token_data_array cur_p = {
/* .data = */ candidates.data(),
/* .size = */ (size_t) n_vocab, // Reset size to full vocab
/* .selected = */ -1,
/* .sorted = */ false,
};
llama_sampler_apply(sampler, &cur_p);
output_tokens[pos] = cur_p.data[cur_p.selected].id;
}
}
} else {
std::vector<std::pair<float, int32_t>> confidences;
std::vector<llama_token> sampled_tokens(mask_positions.size());
for (size_t i = 0; i < mask_positions.size(); i++) {
int32_t pos = mask_positions[i];
const float * pos_logits = get_logits_for_pos(pos);
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
candidates[token_id].logit = pos_logits[token_id];
candidates[token_id].p = 0.0f;
candidates[token_id].id = token_id;
}
llama_token_data_array cur_p = {
/* .data = */ candidates.data(),
/* .size = */ candidates.size(),
/* .selected = */ -1,
/* .sorted = */ false,
};
llama_sampler_apply(sampler, &cur_p);
llama_token sampled_token = cur_p.data[cur_p.selected].id;
float confidence = 0.0f;
if (params.algorithm == DIFFUSION_ALG_ENTROPY) {
const float epsilon = 1e-10f;
for (size_t j = 0; j < cur_p.size; j++) {
float prob = cur_p.data[j].p;
confidence += prob * logf(prob + epsilon);
}
} else if (params.algorithm == DIFFUSION_ALG_TOPK_MARGIN) {
confidence = cur_p.data[0].p - cur_p.data[1].p;
} else {
confidence = cur_p.data[cur_p.selected].p;
}
sampled_tokens[i] = sampled_token;
confidences.emplace_back(confidence, i);
}
int32_t num_transfer =
(step < params.steps - 1) ? (int32_t) (mask_positions.size() * (1.0f - s / t)) : mask_positions.size();
if (num_transfer > 0) {
if (params.alg_temp == 0.0f) {
std::partial_sort(confidences.begin(), confidences.begin() + num_transfer, confidences.end(),
[](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
if (a.first != b.first) {
return a.first > b.first;
}
return a.second < b.second;
});
} else {
conf_candidates.clear();
for (int32_t pos = 0; pos < max_length; pos++) {
float conf_logit = -std::numeric_limits<float>::infinity();
auto it = std::find(mask_positions.begin(), mask_positions.end(), pos);
if (it != mask_positions.end()) {
size_t mask_idx = std::distance(mask_positions.begin(), it);
conf_logit = confidences[mask_idx].first / params.alg_temp; // Apply temperature scaling
}
conf_candidates.emplace_back(llama_token_data{ pos, conf_logit, 0.0f });
}
llama_token_data_array conf_array = {
/* .data = */ conf_candidates.data(),
/* .size = */ conf_candidates.size(),
/* .selected = */ -1,
/* .sorted = */ false,
};
for (int32_t i = 0; i < num_transfer; i++) {
// Apply distribution sampler to get selected index
llama_sampler_apply(dist_sampler, &conf_array);
int selected_idx = conf_array.selected;
confidences[i].second = conf_candidates[selected_idx].id;
conf_candidates[selected_idx].p = 0.0f;
conf_array.selected = -1;
}
}
if (params.alg_temp == 0.0f) {
// Deterministic - use confidence order
for (int32_t i = 0; i < num_transfer; i++) {
int32_t mask_idx = confidences[i].second;
int32_t pos = mask_positions[mask_idx];
llama_token token = sampled_tokens[mask_idx];
output_tokens[pos] = token;
}
} else {
for (int32_t i = 0; i < num_transfer; i++) {
int32_t pos = confidences[i].second;
auto it = std::find(mask_positions.begin(), mask_positions.end(), pos);
if (it != mask_positions.end()) {
int32_t mask_idx = std::distance(mask_positions.begin(), it);
output_tokens[pos] = sampled_tokens[mask_idx];
}
}
}
}
}
int64_t time_end_sampling = ggml_time_us();
total_sampling_time += time_end_sampling - time_start_sampling;
}
int64_t time_end = ggml_time_us();
total_time += time_end - time_start;
LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
total_time / 1000.0, total_time / 1000.0 / params.steps, total_sampling_time / 1000.0 / params.steps);
llama_batch_free(batch);
llama_sampler_free(sampler);
llama_sampler_free(dist_sampler);
n_generated = max_length;
}
static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) {
if (!use_chat_template) {
return prompt;
}
auto chat_templates = common_chat_templates_init(model, "");
common_chat_templates_inputs inputs;
common_chat_msg user_msg;
user_msg.role = "user";
user_msg.content = prompt;
inputs.add_generation_prompt = true;
inputs.messages.push_back(user_msg);
auto result = common_chat_templates_apply(chat_templates.get(), inputs);
return result.prompt;
}
struct callback_data {
const common_params_diffusion * diff_params;
const llama_vocab * vocab;
int32_t n_input;
};
static bool diffusion_step_callback(int32_t step,
int32_t total_steps,
const llama_token * tokens,
int32_t n_tokens,
void * user_data) {
(void)user_data;
callback_data * data = static_cast<callback_data *>(user_data);
auto print_progress_bar = [](int32_t step, int32_t total_steps) {
int progress_percent = (step * 100) / total_steps;
int progress_bars = (step * 50) / total_steps;
LOG_INF("\rdiffusion step: %d/%d [%s%s] %d%%",
step,
total_steps,
std::string(progress_bars, '=').c_str(),
std::string(50 - progress_bars, ' ').c_str(),
progress_percent);
};
if (data->diff_params->visual_mode) {
// Visual mode: clear
LOG_INF("\033[2J\033[H"); // Clear screen and move cursor to top-left
print_progress_bar(step, total_steps);
LOG_INF("\n");
std::string current_text = " ";
for (int32_t i = data->n_input; i < n_tokens; i++) {
std::string token_str;
if (tokens[i] != llama_vocab_mask(data->vocab)) {
char piece[256];
int n_chars = llama_token_to_piece(data->vocab, tokens[i], piece, sizeof(piece), 0, false);
if (n_chars > 0) {
piece[n_chars] = '\0';
token_str = piece;
}
} else {
token_str = " ";
}
current_text += token_str;
}
LOG_INF("%s\n", current_text.c_str());
} else {
print_progress_bar(step, total_steps);
}
return true;
}
int main(int argc, char ** argv) {
ggml_time_init();
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DIFFUSION)) {
return 1;
}
const char * alg_names[] = { "ORIGIN", "MASKGIT_PLUS", "TOPK_MARGIN", "ENTROPY" };
const char * alg_name = (params.diffusion.algorithm >= 0 && params.diffusion.algorithm <= 3) ?
alg_names[params.diffusion.algorithm] :
"UNKNOWN";
common_init();
llama_backend_init();
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = params.n_gpu_layers;
model_params.devices = params.devices.data();
model_params.use_mmap = params.use_mmap;
model_params.use_mlock = params.use_mlock;
model_params.check_tensors = params.check_tensors;
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
if (!model) {
LOG_ERR("error: failed to load model '%s'\n", params.model.path.c_str());
return 1;
}
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = params.n_ctx;
ctx_params.n_batch = params.n_batch;
ctx_params.n_ubatch = params.n_ubatch;
ctx_params.flash_attn = params.flash_attn;
ctx_params.no_perf = params.no_perf;
ctx_params.type_k = params.cache_type_k;
ctx_params.type_v = params.cache_type_v;
llama_context * ctx = llama_init_from_model(model, ctx_params);
if (!ctx) {
LOG_ERR("error: failed to create context\n");
llama_model_free(model);
return 1;
}
llama_set_n_threads(ctx, params.cpuparams.n_threads, params.cpuparams_batch.n_threads);
const llama_vocab * vocab = llama_model_get_vocab(model);
std::string formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model);
std::vector<llama_token> input_tokens = common_tokenize(vocab, formatted_prompt,
/*add special tokens*/ true,
/*parse special*/ true);
int n_input = input_tokens.size();
if (n_input >= params.n_ctx) {
LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx);
llama_free(ctx);
llama_model_free(model);
return 1;
}
struct diffusion_params ldiff_params = diffusion_default_params();
ldiff_params.steps = params.diffusion.steps;
ldiff_params.eps = params.diffusion.eps;
ldiff_params.temperature = params.sampling.temp;
ldiff_params.top_p = params.sampling.top_p;
ldiff_params.top_k = params.sampling.top_k;
ldiff_params.algorithm = static_cast<enum diffusion_alg>(params.diffusion.algorithm);
ldiff_params.alg_temp = params.diffusion.alg_temp;
ldiff_params.seed = params.sampling.seed;
llama_token mask_token_id = llama_vocab_mask(vocab);
GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL);
LOG_INF("diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id);
LOG_INF("diffusion_params: - %-25s u32 = %d\n", "steps", params.diffusion.steps);
LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", params.diffusion.eps);
LOG_INF("diffusion_params: - %-25s u32 = %d (%s)\n", "algorithm", params.diffusion.algorithm,
alg_name);
LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", params.diffusion.alg_temp);
ldiff_params.mask_token_id = mask_token_id;
callback_data cb_data = { &params.diffusion, vocab, n_input };
ldiff_params.step_callback = diffusion_step_callback;
ldiff_params.step_callback_user_data = &cb_data;
int32_t n_generated = 0;
std::vector<llama_token> output_tokens(params.n_ubatch);
diffusion_generate(ctx, input_tokens.data(), output_tokens.data(), n_input, params.n_ubatch,
ldiff_params, n_generated);
if (n_generated > 0) {
if (params.diffusion.visual_mode) {
//clear screen and move cursor to top-left
LOG_INF("\033[2J\033[H");
}
output_tokens.erase(output_tokens.begin(), output_tokens.begin() + n_input);
std::string output_data = common_detokenize(vocab, output_tokens, false);
LOG_INF("\n%s\n", output_data.c_str());
} else {
LOG_INF("Error: diffusion generation failed\n");
}
llama_free(ctx);
llama_model_free(model);
llama_backend_free();
return 0;
}

View File

@@ -37,7 +37,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
// clear previous kv_cache values (irrelevant for embeddings)
llama_memory_clear(llama_get_memory(ctx), true);
llama_kv_self_clear(ctx);
// run model
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
@@ -107,7 +107,7 @@ int main(int argc, char ** argv) {
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_ctx_train = llama_model_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
const int n_ctx = llama_n_ctx(ctx);
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
@@ -133,36 +133,10 @@ int main(int argc, char ** argv) {
// max batch size
const uint64_t n_batch = params.n_batch;
// get added sep and eos token, if any
const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : "";
const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
// tokenize the prompts and trim
std::vector<std::vector<int32_t>> inputs;
for (const auto & prompt : prompts) {
std::vector<llama_token> inp;
// split classification pairs and insert expected separator tokens
if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
std::string final_prompt;
for (size_t i = 0; i < pairs.size(); i++) {
final_prompt += pairs[i];
if (i != pairs.size() - 1) {
if (!added_eos_token.empty()) {
final_prompt += added_eos_token;
}
if (!added_sep_token.empty()) {
final_prompt += added_sep_token;
}
}
}
inp = common_tokenize(ctx, final_prompt, true, true);
} else {
inp = common_tokenize(ctx, prompt, true, true);
}
auto inp = common_tokenize(ctx, prompt, true, true);
if (inp.size() > n_batch) {
LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
__func__, (long long int) inp.size(), (long long int) n_batch);
@@ -171,11 +145,11 @@ int main(int argc, char ** argv) {
inputs.push_back(inp);
}
// check if the last token is SEP/EOS
// check if the last token is SEP
// it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
for (auto & inp : inputs) {
if (inp.empty() || (inp.back() != llama_vocab_sep(vocab) && inp.back() != llama_vocab_eos(vocab))) {
LOG_WRN("%s: last token in the prompt is not SEP or EOS\n", __func__);
if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) {
LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
}
}
@@ -262,24 +236,9 @@ int main(int argc, char ** argv) {
LOG("\n");
}
} else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
const uint32_t n_cls_out = llama_model_n_cls_out(model);
std::vector<std::string> cls_out_labels;
for (uint32_t i = 0; i < n_cls_out; i++) {
const char * label = llama_model_cls_label(model, i);
const std::string label_i(label == nullptr ? "" : label);
cls_out_labels.emplace_back(label_i.empty() ? std::to_string(i) : label_i);
}
for (int j = 0; j < n_embd_count; j++) {
for (uint32_t i = 0; i < n_cls_out; i++) {
// NOTE: if you change this log - update the tests in ci/run.sh
if (n_cls_out == 1) {
LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
} else {
LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str());
}
}
// NOTE: if you change this log - update the tests in ci/run.sh
LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
}
} else {
// print the first part of the embeddings or for a single prompt, the full embedding

View File

@@ -55,8 +55,6 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) {
v = *(float *) &data[i];
} else if (type == GGML_TYPE_I64) {
v = (float) *(int64_t *) &data[i];
} else if (type == GGML_TYPE_I32) {
v = (float) *(int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) {
@@ -136,11 +134,6 @@ static bool run(llama_context * ctx, const common_params & params) {
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
if (tokens.empty()) {
LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
return false;
}
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;

View File

@@ -41,11 +41,12 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
// add input to batch (this increments n_tokens)
for (int32_t j = 0; j < n_toks; j++) {
common_batch_add(batch, inputs[j], j, { 0 }, true);
common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
}
// clear previous kv_cache values (irrelevant for embeddings)
llama_memory_clear(llama_get_memory(ctx), true);
llama_kv_self_clear(ctx);
llama_set_embeddings(ctx, true);
llama_set_causal_attn(ctx, false);
// run model
@@ -101,7 +102,8 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
llama_token eos_token = llama_vocab_eos(vocab);
llama_memory_clear(llama_get_memory(ctx), true);
llama_kv_self_clear(ctx);
llama_set_embeddings(ctx, false);
llama_set_causal_attn(ctx, true);
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
@@ -164,8 +166,6 @@ int main(int argc, char * argv[]) {
llama_model_params mparams = common_model_params_to_llama(params);
llama_context_params cparams = common_context_params_to_llama(params);
cparams.embeddings = true;
llama_backend_init();
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
@@ -213,8 +213,6 @@ int main(int argc, char * argv[]) {
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
}
llama_set_embeddings(ctx, false);
// ### Generation ###
// GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
{

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
set -e
MODEL=./models/ggml-vicuna-13b-1.1-q4_0.bin

View File

@@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
}
batch->logits[batch->n_tokens - 1] = true;
llama_memory_clear(llama_get_memory(context), false);
llama_kv_self_clear(context);
const auto t_pp_start = ggml_time_us();
if (llama_decode(context, *batch) != 0) {
@@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
LOGi("Benchmark text generation (tg)");
llama_memory_clear(llama_get_memory(context), false);
llama_kv_self_clear(context);
const auto t_tg_start = ggml_time_us();
for (i = 0; i < tg; i++) {
@@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
const auto t_tg_end = ggml_time_us();
llama_memory_clear(llama_get_memory(context), false);
llama_kv_self_clear(context);
const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
@@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
llama_memory_clear(llama_get_memory(reinterpret_cast<llama_context *>(context)), true);
llama_kv_self_clear(reinterpret_cast<llama_context *>(context));
}

View File

@@ -210,7 +210,7 @@ actor LlamaContext {
}
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
llama_memory_clear(llama_get_memory(context), false)
llama_kv_self_clear(context)
let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;
@@ -223,7 +223,7 @@ actor LlamaContext {
// bench text generation
llama_memory_clear(llama_get_memory(context), false)
llama_kv_self_clear(context)
let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;
@@ -242,7 +242,7 @@ actor LlamaContext {
let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;
llama_memory_clear(llama_get_memory(context), false)
llama_kv_self_clear(context)
let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
@@ -292,7 +292,7 @@ actor LlamaContext {
func clear() {
tokens_list.removeAll()
temporary_invalid_cchars.removeAll()
llama_memory_clear(llama_get_memory(context), true)
llama_kv_self_clear(context)
}
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {

View File

@@ -60,8 +60,6 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model.get();
llama_context * ctx = llama_init.context.get();
auto * mem = llama_get_memory(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
// Tokenize the prompt
@@ -96,7 +94,7 @@ int main(int argc, char ** argv) {
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
for (int s = 1; s < W + G + 1; ++s) {
llama_memory_seq_cp(mem, 0, s, -1, -1);
llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
}
const auto t_enc_end = ggml_time_us();
@@ -429,17 +427,17 @@ int main(int argc, char ** argv) {
// KV cache management
// if no verification token matched, we simply remove all cells from this batch -> no fragmentation
llama_memory_seq_rm(mem, -1, n_past, -1);
llama_kv_self_seq_rm(ctx, -1, n_past, -1);
if (seq_id_best != 0) {
// if a verification token matched, we keep the best sequence and remove the rest
// this leads to some KV cache fragmentation
llama_memory_seq_keep(mem, seq_id_best);
llama_memory_seq_cp (mem, seq_id_best, 0, -1, -1);
llama_memory_seq_rm (mem, seq_id_best, -1, -1);
llama_kv_self_seq_keep(ctx, seq_id_best);
llama_kv_self_seq_cp (ctx, seq_id_best, 0, -1, -1);
llama_kv_self_seq_rm (ctx, seq_id_best, -1, -1);
for (int s = 1; s < W + G + 1; ++s) {
llama_memory_seq_cp(mem, 0, s, -1, -1);
llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
}
}
}

View File

@@ -181,7 +181,7 @@ int main(int argc, char ** argv){
// KV cache management
// clean the cache of draft tokens that weren't accepted
llama_memory_seq_rm(llama_get_memory(ctx), 0, n_past, -1);
llama_kv_self_seq_rm(ctx, 0, n_past, -1);
common_batch_clear(batch_tgt);
common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);

View File

@@ -158,7 +158,7 @@ int main(int argc, char ** argv) {
common_params params;
params.n_predict = 128;
params.n_junk = 1;
params.n_junk = 0;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
return 1;
@@ -182,10 +182,7 @@ int main(int argc, char ** argv) {
const bool is_sp_shared = params.is_pp_shared;
// extra text to insert in each client's prompt in order to make it larger
const int32_t n_junk = std::max(1, params.n_junk);
// signed seed, use negative values to indicate different seeds for the different clients
const int32_t & sseed = params.sampling.seed;
const int32_t n_junk = params.n_junk;
// init llama.cpp
llama_backend_init();
@@ -197,8 +194,6 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model.get();
llama_context * ctx = llama_init.context.get();
auto * mem = llama_get_memory(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
// load the prompts from an external file if there are any
@@ -222,21 +217,11 @@ int main(int argc, char ** argv) {
const int n_ctx = llama_n_ctx(ctx);
if (sseed >= 0) {
LOG_INF("%s: initializing all samplers with the same RNG seed: %d (use a negative seed to have different seeds)\n", __func__, sseed);
} else {
LOG_INF("%s: initializing samplers with different RNG seeds, starting from %d\n", __func__, sseed);
}
std::vector<client> clients(n_clients);
for (size_t i = 0; i < clients.size(); ++i) {
auto & client = clients[i];
client.id = i;
client.smpl = common_sampler_init(model, params.sampling);
if (sseed < 0) {
params.sampling.seed--;
}
}
std::vector<llama_token> tokens_system;
@@ -274,7 +259,7 @@ int main(int argc, char ** argv) {
// assign the system KV cache to all parallel sequences
for (int32_t i = 1; i <= n_clients; ++i) {
llama_memory_seq_cp(mem, 0, i, -1, -1);
llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
}
LOG_INF("\n");
@@ -301,9 +286,9 @@ int main(int argc, char ** argv) {
if (batch.n_tokens == 0) {
// all sequences have ended - clear the entire KV cache
for (int i = 1; i <= n_clients; ++i) {
llama_memory_seq_rm(mem, i, -1, -1);
llama_kv_self_seq_rm(ctx, i, -1, -1);
// but keep the system prompt
llama_memory_seq_cp(mem, 0, i, -1, -1);
llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
}
LOG_INF("%s: clearing the KV cache\n", __func__);
@@ -358,7 +343,7 @@ int main(int argc, char ** argv) {
client.n_decoded = 0;
client.i_batch = batch.n_tokens - 1;
LOG_INF("\033[31mClient %3d, seq %4d, junk = %4d, prompt = %d, started decoding ...\033[0m\n", client.id, client.seq_id, n_junk_cur, client.n_prompt);
LOG_INF("\033[31mClient %3d, seq %4d, junk = %4d, started decoding ...\033[0m\n", client.id, client.seq_id, n_junk_cur);
g_seq_id += 1;
@@ -377,9 +362,7 @@ int main(int argc, char ** argv) {
// process in chunks of params.n_batch
int32_t n_batch = params.n_batch;
int32_t i_next = 0;
for (int32_t i = 0; i < batch.n_tokens; i = i_next) {
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
// experiment: process in powers of 2
//if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) {
// n_batch /= 2;
@@ -387,7 +370,7 @@ int main(int argc, char ** argv) {
// continue;
//}
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
llama_batch batch_view = {
n_tokens,
@@ -407,24 +390,19 @@ int main(int argc, char ** argv) {
return 1;
}
LOG_WRN("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
n_cache_miss += 1;
// retry with half the batch size to try to find a free slot in the KV cache
n_batch /= 2;
i -= n_batch;
continue;
}
LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
// move the head of the batch forward with the number of tokens we just processed
i_next = i + n_tokens;
// on successful decode, restore the original batch size
n_batch = params.n_batch;
for (auto & client : clients) {
if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
continue;
@@ -462,8 +440,8 @@ int main(int argc, char ** argv) {
}
// delete only the generated part of the sequence, i.e. keep the system prompt in the cache
llama_memory_seq_rm(mem, client.id + 1, -1, -1);
llama_memory_seq_cp(mem, 0, client.id + 1, -1, -1);
llama_kv_self_seq_rm(ctx, client.id + 1, -1, -1);
llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1);
const auto t_main_end = ggml_time_us();

View File

@@ -126,8 +126,6 @@ int main(int argc, char ** argv) {
int n_past = 0;
auto * mem = llama_get_memory(ctx);
// fill the KV cache
for (int i = 0; i < n_ctx; i += n_batch) {
if (i > 0 && n_grp > 1) {
@@ -135,10 +133,11 @@ int main(int argc, char ** argv) {
const int ib = i/n_batch - 1;
const int bd = n_batch_grp*(n_grp - 1);
llama_memory_seq_add(mem, 0, n_past - n_batch, n_past, ib*bd);
llama_memory_seq_div(mem, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
llama_kv_self_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
llama_kv_self_update (ctx);
n_past = llama_memory_seq_pos_max(mem, 0) + 1;
n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
}
common_batch_clear(batch);
@@ -168,10 +167,12 @@ int main(int argc, char ** argv) {
LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
llama_memory_seq_rm (mem, 0, n_keep , n_keep + n_discard);
llama_memory_seq_add(mem, 0, n_keep + n_discard, n_ctx, -n_discard);
llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
//llama_kv_self_defrag (ctx);
llama_kv_self_update (ctx);
n_past = llama_memory_seq_pos_max(mem, 0) + 1;
n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
common_batch_clear(batch);
@@ -197,10 +198,12 @@ int main(int argc, char ** argv) {
if (n_discard > 0) {
LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
llama_memory_seq_rm (mem, 0, n_keep , n_keep + n_discard);
llama_memory_seq_add(mem, 0, n_keep + n_discard, n_ctx, -n_discard);
llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
//llama_kv_self_defrag (ctx);
llama_kv_self_update (ctx);
n_past = llama_memory_seq_pos_max(mem, 0) + 1;
n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
}
}

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
cd `dirname $0`
cd ..

View File

@@ -83,7 +83,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
static void batch_process(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
// clear previous kv_cache values (irrelevant for embeddings)
llama_memory_clear(llama_get_memory(ctx), false);
llama_kv_self_clear(ctx);
// run model
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);

View File

@@ -196,7 +196,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
// erase whole kv
llama_memory_clear(llama_get_memory(ctx3), true);
llama_kv_self_clear(ctx3);
fprintf(stderr, "%s : kv cache cleared\n", __func__);
// restore kv into seq 1

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
set -e

View File

@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
auto generate = [&](const std::string & prompt) {
std::string response;
const bool is_first = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) == -1;
const bool is_first = llama_kv_self_seq_pos_max(ctx, 0) == 0;
// tokenize the prompt
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
@@ -113,16 +113,15 @@ int main(int argc, char ** argv) {
while (true) {
// check if we have enough space in the context to evaluate this batch
int n_ctx = llama_n_ctx(ctx);
int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) + 1;
int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0);
if (n_ctx_used + batch.n_tokens > n_ctx) {
printf("\033[0m\n");
fprintf(stderr, "context size exceeded\n");
exit(0);
}
int ret = llama_decode(ctx, batch);
if (ret != 0) {
GGML_ABORT("failed to decode, ret = %d\n", ret);
if (llama_decode(ctx, batch)) {
GGML_ABORT("failed to decode\n");
}
// sample the next token

View File

@@ -217,7 +217,7 @@ int main(int argc, char ** argv) {
{
LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
llama_memory_seq_rm(llama_get_memory(ctx_tgt), 0, n_past, -1);
llama_kv_self_seq_rm(ctx_tgt, 0, n_past, -1);
}
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {

View File

@@ -142,8 +142,6 @@ int main(int argc, char ** argv) {
}
}
auto * mem_tgt = llama_get_memory(ctx_tgt);
auto * mem_dft = llama_get_memory(ctx_dft);
// Tokenize the prompt
std::vector<llama_token> inp;
@@ -422,14 +420,14 @@ int main(int argc, char ** argv) {
{
LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
llama_memory_seq_keep(mem_dft, s_keep);
llama_memory_seq_cp (mem_dft, s_keep, 0, -1, -1);
llama_memory_seq_keep(mem_dft, 0);
llama_kv_self_seq_keep(ctx_dft, s_keep);
llama_kv_self_seq_cp (ctx_dft, s_keep, 0, -1, -1);
llama_kv_self_seq_keep(ctx_dft, 0);
llama_memory_seq_rm (mem_tgt, s_keep, n_past_tgt, -1);
llama_memory_seq_keep(mem_tgt, s_keep);
llama_memory_seq_cp (mem_tgt, s_keep, 0, -1, -1);
llama_memory_seq_keep(mem_tgt, 0);
llama_kv_self_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1);
llama_kv_self_seq_keep(ctx_tgt, s_keep);
llama_kv_self_seq_cp (ctx_tgt, s_keep, 0, -1, -1);
llama_kv_self_seq_keep(ctx_tgt, 0);
}
for (int s = 0; s < n_seq_dft; ++s) {
@@ -446,7 +444,7 @@ int main(int argc, char ** argv) {
common_batch_clear(batch_dft);
common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
llama_memory_seq_rm(mem_dft, 0, n_past_dft, -1);
llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1);
// LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
llama_decode(ctx_dft, batch_dft);
@@ -505,8 +503,8 @@ int main(int argc, char ** argv) {
if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
llama_memory_seq_rm(mem_dft, n_seq_cur, -1, -1);
llama_memory_seq_cp(mem_dft, s, n_seq_cur, -1, -1);
llama_kv_self_seq_rm(ctx_dft, n_seq_cur, -1, -1);
llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
// all previous tokens from this branch are now also part of the new branch
for (int t = 0; t < batch_tgt.n_tokens; ++t) {
@@ -587,9 +585,9 @@ int main(int argc, char ** argv) {
// evaluate the target model on the drafted tokens
{
llama_memory_seq_keep(mem_tgt, 0);
llama_kv_self_seq_keep(ctx_tgt, 0);
for (int s = 1; s < n_seq_dft; ++s) {
llama_memory_seq_cp(mem_tgt, 0, s, -1, -1);
llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1);
}
// LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
# MIT license
# Copyright (C) 2024 Intel Corporation

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
# MIT license
# Copyright (C) 2025 Intel Corporation

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
#
# ./examples/ts-type-to-grammar.sh "{a:string,b:string,c?:string}"
# python examples/json_schema_to_grammar.py https://json.schemastore.org/tsconfig.json

View File

@@ -105,7 +105,7 @@ message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
message(DEBUG "INS_ENB : ${INS_ENB}")
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_REPACK "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
option(GGML_SSE42 "ggml: enable SSE 4.2" ${INS_ENB})
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
@@ -131,14 +131,13 @@ option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
option(GGML_VXE "ggml: enable vxe" ON)
option(GGML_NNPA "ggml: enable nnpa" ON)
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
if (MINGW)
if (WIN32)
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
endif()
@@ -173,7 +172,6 @@ option(GGML_HIP "ggml: use HIP"
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF)
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
@@ -181,8 +179,7 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_WEBGPU "ggml: use WebGPU" OFF)
option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@@ -267,12 +264,12 @@ set(GGML_PUBLIC_HEADERS
include/ggml-cann.h
include/ggml-cpp.h
include/ggml-cuda.h
include/ggml-kompute.h
include/ggml-opt.h
include/ggml-metal.h
include/ggml-rpc.h
include/ggml-sycl.h
include/ggml-vulkan.h
include/ggml-webgpu.h
include/gguf.h)
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
@@ -361,13 +358,6 @@ write_basic_package_version_file(
VERSION ${GGML_INSTALL_VERSION}
COMPATIBILITY SameMajorVersion)
target_compile_definitions(ggml-base PRIVATE
GGML_VERSION="${GGML_INSTALL_VERSION}"
GGML_COMMIT="${GGML_BUILD_COMMIT}"
)
message(STATUS "ggml version: ${GGML_INSTALL_VERSION}")
message(STATUS "ggml commit: ${GGML_BUILD_COMMIT}")
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
@@ -377,8 +367,6 @@ if (MSVC)
/wd4005 # Macro redefinition
/wd4244 # Conversion from one type to another type, possible loss of data
/wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data
/wd4305 # Conversion from 'type1' to 'type2', possible loss of data
/wd4566 # Conversion from 'char' to 'wchar_t', possible loss of data
/wd4996 # Disable POSIX deprecation warnings
/wd4702 # Unreachable code warnings
)
@@ -398,46 +386,4 @@ if (MSVC)
disable_msvc_warnings(ggml-cpu-skylakex)
disable_msvc_warnings(ggml-cpu-icelake)
disable_msvc_warnings(ggml-cpu-alderlake)
if (GGML_BUILD_EXAMPLES)
disable_msvc_warnings(common-ggml)
disable_msvc_warnings(common)
disable_msvc_warnings(mnist-common)
disable_msvc_warnings(mnist-eval)
disable_msvc_warnings(mnist-train)
disable_msvc_warnings(gpt-2-ctx)
disable_msvc_warnings(gpt-2-alloc)
disable_msvc_warnings(gpt-2-backend)
disable_msvc_warnings(gpt-2-sched)
disable_msvc_warnings(gpt-2-quantize)
disable_msvc_warnings(gpt-2-batched)
disable_msvc_warnings(gpt-j)
disable_msvc_warnings(gpt-j-quantize)
disable_msvc_warnings(magika)
disable_msvc_warnings(yolov3-tiny)
disable_msvc_warnings(sam)
disable_msvc_warnings(simple-ctx)
disable_msvc_warnings(simple-backend)
endif()
if (GGML_BUILD_TESTS)
disable_msvc_warnings(test-mul-mat)
disable_msvc_warnings(test-arange)
disable_msvc_warnings(test-backend-ops)
disable_msvc_warnings(test-cont)
disable_msvc_warnings(test-conv-transpose)
disable_msvc_warnings(test-conv-transpose-1d)
disable_msvc_warnings(test-conv1d)
disable_msvc_warnings(test-conv2d)
disable_msvc_warnings(test-conv2d-dw)
disable_msvc_warnings(test-customop)
disable_msvc_warnings(test-dup)
disable_msvc_warnings(test-opt)
disable_msvc_warnings(test-pool)
endif ()
endif()

View File

@@ -36,7 +36,8 @@ function(ggml_get_system_arch)
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc|power")
elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR
"${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
set(GGML_SYSTEM_ARCH "loongarch64" PARENT_SCOPE)

View File

@@ -339,7 +339,7 @@ extern "C" {
typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
// Compare the output of two backends
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node);
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
// Tensor initialization
GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);

View File

@@ -101,7 +101,6 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
@@ -134,7 +133,6 @@ extern "C" {
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);

View File

@@ -0,0 +1,50 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_KOMPUTE_MAX_DEVICES 16
struct ggml_vk_device {
int index;
int type; // same as VkPhysicalDeviceType
size_t heapSize;
const char * name;
const char * vendor;
int subgroupSize;
uint64_t bufferAlignment;
uint64_t maxAlloc;
};
struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
bool ggml_vk_has_vulkan(void);
bool ggml_vk_has_device(void);
struct ggml_vk_device ggml_vk_current_device(void);
//
// backend API
//
// forward declaration
typedef struct ggml_backend * ggml_backend_t;
GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);
GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
#ifdef __cplusplus
}
#endif

View File

@@ -1,19 +0,0 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_WEBGPU_NAME "WebGPU"
// Needed for examples in ggml
GGML_BACKEND_API ggml_backend_t ggml_backend_webgpu_init(void);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_webgpu_reg(void);
#ifdef __cplusplus
}
#endif

View File

@@ -314,13 +314,6 @@
extern "C" {
#endif
// Function type used in fatal error callbacks
typedef void (*ggml_abort_callback_t)(const char * error_message);
// Set the abort callback (passing null will restore original abort functionality: printing a message to stdout)
// Returns the old callback for chaining
GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback);
GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
@@ -477,7 +470,6 @@ extern "C" {
GGML_OP_TRANSPOSE,
GGML_OP_GET_ROWS,
GGML_OP_GET_ROWS_BACK,
GGML_OP_SET_ROWS,
GGML_OP_DIAG,
GGML_OP_DIAG_MASK_INF,
GGML_OP_DIAG_MASK_ZERO,
@@ -489,16 +481,14 @@ extern "C" {
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_CONV_2D,
GGML_OP_CONV_2D_DW,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
GGML_OP_POOL_2D,
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
GGML_OP_ROLL,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -528,8 +518,6 @@ extern "C" {
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
GGML_OP_OPT_STEP_ADAMW,
GGML_OP_GLU,
GGML_OP_COUNT,
};
@@ -553,16 +541,6 @@ extern "C" {
GGML_UNARY_OP_COUNT,
};
enum ggml_glu_op {
GGML_GLU_OP_REGLU,
GGML_GLU_OP_GEGLU,
GGML_GLU_OP_SWIGLU,
GGML_GLU_OP_GEGLU_ERF,
GGML_GLU_OP_GEGLU_QUICK,
GGML_GLU_OP_COUNT,
};
enum ggml_object_type {
GGML_OBJECT_TYPE_TENSOR,
GGML_OBJECT_TYPE_GRAPH,
@@ -648,9 +626,6 @@ extern "C" {
// misc
GGML_API const char * ggml_version(void);
GGML_API const char * ggml_commit(void);
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
GGML_API int64_t ggml_time_ms(void);
GGML_API int64_t ggml_time_us(void);
@@ -681,7 +656,6 @@ extern "C" {
GGML_API const char * ggml_op_symbol(enum ggml_op op);
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
GGML_API const char * ggml_glu_op_name(enum ggml_glu_op op);
GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
@@ -712,9 +686,6 @@ extern "C" {
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
// true if the elements in dimension 0 are contiguous, or there is just 1 block of elements
GGML_API bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor);
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
@@ -786,7 +757,6 @@ extern "C" {
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
GGML_API enum ggml_glu_op ggml_get_glu_op(const struct ggml_tensor * tensor);
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
@@ -1115,89 +1085,6 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
// gated linear unit ops
// A: n columns, r rows,
// result is n / 2 columns, r rows,
// expects gate in second half of row, unless swapped is true
GGML_API struct ggml_tensor * ggml_glu(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_glu_op op,
bool swapped);
GGML_API struct ggml_tensor * ggml_reglu(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_reglu_swapped(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_geglu(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_geglu_swapped(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_swiglu(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_swiglu_swapped(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_geglu_erf(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_geglu_erf_swapped(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_geglu_quick(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_geglu_quick_swapped(
struct ggml_context * ctx,
struct ggml_tensor * a);
// A: n columns, r rows,
// B: n columns, r rows,
GGML_API struct ggml_tensor * ggml_glu_split(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
enum ggml_glu_op op);
GGML_API struct ggml_tensor * ggml_reglu_split(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_geglu_split(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_swiglu_split(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_geglu_erf_split(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_geglu_quick_split(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
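A minimal usage sketch for the gated ops above, assuming an initialized ggml_context * ctx (the sizes n and r are made up for illustration). The fused form halves the row width; the _split form takes gate and value as separate tensors:

    const int64_t n = 8, r = 4;
    struct ggml_tensor * a     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, r); // n columns, r rows
    struct ggml_tensor * gated = ggml_swiglu(ctx, a);                          // [n/2, r], gate read from the second half of each row
    struct ggml_tensor * gate  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, r);
    struct ggml_tensor * val   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, r);
    struct ggml_tensor * out   = ggml_swiglu_split(ctx, gate, val);            // gate and value as separate tensors, result keeps n columns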
// normalize along rows
GGML_API struct ggml_tensor * ggml_norm(
struct ggml_context * ctx,
@@ -1297,19 +1184,6 @@ extern "C" {
struct ggml_tensor * a,
float s);
// x = s * a + b
GGML_API struct ggml_tensor * ggml_scale_bias(
struct ggml_context * ctx,
struct ggml_tensor * a,
float s,
float b);
GGML_API struct ggml_tensor * ggml_scale_bias_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
float s,
float b);
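For example, with an existing F32 tensor x (a sketch, assuming an initialized ctx):

    struct ggml_tensor * y = ggml_scale_bias(ctx, x, 2.0f, 1.0f); // y = 2*x + 1, elementwise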
// b -> view(a,offset,nb1,nb2,3), return modified a
GGML_API struct ggml_tensor * ggml_set(
struct ggml_context * ctx,
@@ -1500,23 +1374,6 @@ extern "C" {
struct ggml_tensor * b, // row indices
struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape
// a TD [n_embd, ne1, ne2, ne3]
// b TS [n_embd, n_rows, ne02, ne03] | ne02 == ne2, ne03 == ne3
// c I64 [n_rows, ne11, ne12, 1] | c[i] in [0, ne1)
//
// undefined behavior if destination rows overlap
//
// broadcast:
// ne2 % ne11 == 0
// ne3 % ne12 == 0
//
// return view(a)
GGML_API struct ggml_tensor * ggml_set_rows(
struct ggml_context * ctx,
struct ggml_tensor * a, // destination
struct ggml_tensor * b, // source
struct ggml_tensor * c); // row indices
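A sketch of the scatter described above, assuming an initialized ctx; idx holds the destination row for each source row:

    const int64_t n_embd = 8, ne1 = 16, n_rows = 4;
    struct ggml_tensor * dst = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ne1);    // destination
    struct ggml_tensor * src = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_rows); // rows to write
    struct ggml_tensor * idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I64,  n_rows);        // each index in [0, ne1)
    struct ggml_tensor * out = ggml_set_rows(ctx, dst, src, idx);                      // returns a view of dst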
GGML_API struct ggml_tensor * ggml_diag(
struct ggml_context * ctx,
struct ggml_tensor * a);
@@ -1554,14 +1411,8 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
// a [ne0, ne01, ne02, ne03]
// mask [ne0, ne11, ne12, ne13] | ne11 >= ne01, F16 or F32, optional
//
// broadcast:
// ne02 % ne12 == 0
// ne03 % ne13 == 0
//
// fused soft_max(a*scale + mask*(ALiBi slope))
// mask is optional
// max_bias = 0.0f for no ALiBi
GGML_API struct ggml_tensor * ggml_soft_max_ext(
struct ggml_context * ctx,
@@ -1871,17 +1722,6 @@ extern "C" {
struct ggml_tensor * b,
int stride);
GGML_API struct ggml_tensor * ggml_conv_2d_direct(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC]
struct ggml_tensor * b, // input data [W, H, C, N]
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1); // dilation dimension 1
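For instance, a 3x3 convolution from 16 to 32 channels with stride 1, padding 1 and dilation 1 (sketch, assuming an initialized ctx; the sizes are illustrative):

    struct ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F32,  3,  3, 16, 32); // [KW, KH, IC, OC]
    struct ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 64, 64, 16,  1); // [W, H, C, N]
    struct ggml_tensor * y = ggml_conv_2d_direct(ctx, k, x, 1, 1, 1, 1, 1, 1);       // output [64, 64, 32, 1]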
enum ggml_op_pool {
GGML_OP_POOL_MAX,
GGML_OP_POOL_AVG,
@@ -1924,12 +1764,6 @@ extern "C" {
enum ggml_scale_mode {
GGML_SCALE_MODE_NEAREST = 0,
GGML_SCALE_MODE_BILINEAR = 1,
GGML_SCALE_MODE_COUNT
};
enum ggml_scale_flag {
GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8)
};
// interpolate
@@ -1942,26 +1776,14 @@ extern "C" {
// interpolate
// interpolate scale to specified dimensions
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_upscale_ext(
GGML_API struct ggml_tensor * ggml_upscale_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
int ne0,
int ne1,
int ne2,
int ne3,
enum ggml_scale_mode mode),
"use ggml_interpolate instead");
// Up- or downsamples the input to the specified size.
// 2D scale modes (eg. bilinear) are applied to the first two dimensions.
GGML_API struct ggml_tensor * ggml_interpolate(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int64_t ne3,
uint32_t mode); // ggml_scale_mode [ | ggml_scale_flag...]
enum ggml_scale_mode mode);
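On the side of this diff that provides ggml_interpolate, a 2x bilinear upscale of a [W, H, C, N] tensor looks like this (sketch, assuming ctx and img already exist):

    struct ggml_tensor * up = ggml_interpolate(ctx, img,
                                               2*img->ne[0], 2*img->ne[1], img->ne[2], img->ne[3],
                                               GGML_SCALE_MODE_BILINEAR); // optionally OR in GGML_SCALE_FLAG_ALIGN_CORNERS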
// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
GGML_API struct ggml_tensor * ggml_pad(
@@ -1979,17 +1801,6 @@ extern "C" {
int p0,
int p1);
// Move tensor elements by an offset given for each dimension. Elements that
// are shifted beyond the last position are wrapped around to the beginning.
GGML_API struct ggml_tensor * ggml_roll(
struct ggml_context * ctx,
struct ggml_tensor * a,
int shift0,
int shift1,
int shift2,
int shift3);
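E.g., to shift every row one element to the right, wrapping the last element around to the front (sketch, assuming ctx and a tensor x):

    struct ggml_tensor * rolled = ggml_roll(ctx, x, 1, 0, 0, 0);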
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
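A sketch of the corresponding call, assuming the declaration that follows this comment keeps the usual (ctx, timesteps, dim, max_period) signature:

    struct ggml_tensor * t   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 32);  // N = 32 timesteps
    struct ggml_tensor * emb = ggml_timestep_embedding(ctx, t, 256, 10000); // one 256-dim sinusoidal embedding per timestep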
@@ -2024,17 +1835,11 @@ extern "C" {
#define GGML_KQ_MASK_PAD 64
// q: [n_embd_k, n_batch, n_head, ne3 ]
// k: [n_embd_k, n_kv, n_head_kv, ne3 ]
// v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
// mask: [n_kv, n_batch_pad, ne32, ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
// res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
//
// broadcast:
// n_head % n_head_kv == 0
// n_head % ne32 == 0
// ne3 % ne33 == 0
//
// q: [n_embd_k, n_batch, n_head, 1]
// k: [n_embd_k, n_kv, n_head_kv, 1]
// v: [n_embd_v, n_kv, n_head_kv, 1] !! not transposed !!
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
// res: [n_embd_v, n_head, n_batch, 1] !! permuted !!
GGML_API struct ggml_tensor * ggml_flash_attn_ext(
struct ggml_context * ctx,
struct ggml_tensor * q,
@@ -2073,8 +1878,7 @@ extern "C" {
struct ggml_tensor * dt,
struct ggml_tensor * A,
struct ggml_tensor * B,
struct ggml_tensor * C,
struct ggml_tensor * ids);
struct ggml_tensor * C);
// partition into non-overlapping windows with padding if needed
// example:
@@ -2291,6 +2095,9 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
// print info and performance information for the graph
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);

View File

@@ -125,6 +125,7 @@ if (NOT MSVC)
endif()
if (MINGW)
# Target Windows 8 for PrefetchVirtualMemory
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
endif()
@@ -195,7 +196,6 @@ add_library(ggml-base
../include/ggml-opt.h
../include/gguf.h
ggml.c
ggml.cpp
ggml-alloc.c
ggml-backend.cpp
ggml-opt.cpp
@@ -212,7 +212,6 @@ endif()
add_library(ggml
ggml-backend-reg.cpp)
add_library(ggml::ggml ALIAS ggml)
target_link_libraries(ggml PUBLIC ggml-base)
@@ -227,7 +226,6 @@ function(ggml_add_backend_library backend)
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
add_dependencies(ggml ${backend})
install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
else()
add_library(${backend} ${ARGN})
target_link_libraries(ggml PUBLIC ${backend})
@@ -270,27 +268,17 @@ endfunction()
function(ggml_add_cpu_backend_variant tag_name)
set(GGML_CPU_TAG_NAME ${tag_name})
# other: OPENMP LLAMAFILE CPU_HBM
if (GGML_SYSTEM_ARCH STREQUAL "x86")
foreach (feat NATIVE
SSE42
AVX AVX2 BMI2 AVX_VNNI FMA F16C
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
AMX_TILE AMX_INT8 AMX_BF16)
set(GGML_${feat} OFF)
endforeach()
foreach (feat NATIVE
SSE42
AVX AVX2 BMI2 AVX_VNNI FMA F16C
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
AMX_TILE AMX_INT8 AMX_BF16)
set(GGML_${feat} OFF)
endforeach()
foreach (feat ${ARGN})
set(GGML_${feat} ON)
endforeach()
elseif (GGML_SYSTEM_ARCH STREQUAL "ARM")
foreach (feat ${ARGN})
set(GGML_INTERNAL_${feat} ON)
endforeach()
elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
foreach (feat ${ARGN})
set(GGML_INTERNAL_${feat} ON)
endforeach()
endif()
foreach (feat ${ARGN})
set(GGML_${feat} ON)
endforeach()
ggml_add_cpu_backend_variant_impl(${tag_name})
endfunction()
@@ -300,8 +288,6 @@ ggml_add_backend(CPU)
if (GGML_CPU_ALL_VARIANTS)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
elseif (GGML_CPU_ARM_ARCH)
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
endif()
if (GGML_SYSTEM_ARCH STREQUAL "x86")
ggml_add_cpu_backend_variant(x64)
@@ -315,47 +301,8 @@ if (GGML_CPU_ALL_VARIANTS)
# MSVC doesn't support AMX
ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
endif()
elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
# Many of these features are optional so we build versions with popular
# combinations and name the backends based on the version they were
# first released with
ggml_add_cpu_backend_variant(armv8.0_1)
ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD)
ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
ggml_add_cpu_backend_variant(armv8.2_3 DOTPROD FP16_VECTOR_ARITHMETIC SVE)
ggml_add_cpu_backend_variant(armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8)
ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2)
ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME)
ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME)
elseif (CMAKE_SYSTEM_NAME MATCHES "Android")
# Android-specific backends with SoC-compatible feature sets
ggml_add_cpu_backend_variant(android_armv8.0_1)
ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
elseif (APPLE)
ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
ggml_add_cpu_backend_variant(apple_m4 DOTPROD MATMUL_INT8 NOSVE SME)
else()
message(FATAL_ERROR "Unsupported ARM target OS: ${CMAKE_SYSTEM_NAME}")
endif()
elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
ggml_add_cpu_backend_variant(power0)
ggml_add_cpu_backend_variant(power7_1 POWER7)
ggml_add_cpu_backend_variant(power7_2 POWER7 VSX)
ggml_add_cpu_backend_variant(power8_1 POWER8)
ggml_add_cpu_backend_variant(power8_2 POWER8 VSX)
ggml_add_cpu_backend_variant(power9 POWER9 VSX)
ggml_add_cpu_backend_variant(power10 POWER10 VSX)
ggml_add_cpu_backend_variant(power11 POWER11 VSX)
else()
message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
endif()
else()
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported on ${GGML_SYSTEM_ARCH}")
endif()
elseif (GGML_CPU)
ggml_add_cpu_backend_variant_impl("")
@@ -365,12 +312,12 @@ ggml_add_backend(BLAS)
ggml_add_backend(CANN)
ggml_add_backend(CUDA)
ggml_add_backend(HIP)
ggml_add_backend(Kompute)
ggml_add_backend(METAL)
ggml_add_backend(MUSA)
ggml_add_backend(RPC)
ggml_add_backend(SYCL)
ggml_add_backend(Vulkan)
ggml_add_backend(WebGPU)
ggml_add_backend(OpenCL)
foreach (target ggml-base ggml)

View File

@@ -22,6 +22,21 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
return t->view_src != NULL;
}
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
if (a->type != b->type) {
return false;
}
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if (a->ne[i] != b->ne[i]) {
return false;
}
if (a->nb[i] != b->nb[i]) {
return false;
}
}
return true;
}
// ops that return true for this function must not use restrict pointers for their backend implementations
static bool ggml_op_can_inplace(enum ggml_op op) {
switch (op) {

View File

@@ -45,10 +45,6 @@
#include "ggml-vulkan.h"
#endif
#ifdef GGML_USE_WEBGPU
#include "ggml-webgpu.h"
#endif
#ifdef GGML_USE_OPENCL
#include "ggml-opencl.h"
#endif
@@ -65,13 +61,14 @@
#include "ggml-cann.h"
#endif
#ifdef GGML_USE_KOMPUTE
#include "ggml-kompute.h"
#endif
// disable C++17 deprecation warning for std::codecvt_utf8
#if defined(__clang__)
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
#elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
namespace fs = std::filesystem;
@@ -94,8 +91,6 @@ static std::string path_str(const fs::path & path) {
#if defined(__clang__)
# pragma clang diagnostic pop
#elif defined(__GNUC__)
# pragma GCC diagnostic pop
#endif
#ifdef _WIN32
@@ -177,9 +172,6 @@ struct ggml_backend_registry {
#ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg());
#endif
#ifdef GGML_USE_WEBGPU
register_backend(ggml_backend_webgpu_reg());
#endif
#ifdef GGML_USE_OPENCL
register_backend(ggml_backend_opencl_reg());
#endif
@@ -192,6 +184,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
#ifdef GGML_USE_KOMPUTE
register_backend(ggml_backend_kompute_reg());
#endif
#ifdef GGML_USE_CPU
register_backend(ggml_backend_cpu_reg());
#endif
@@ -575,6 +570,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("cann", silent, dir_path);
ggml_backend_load_best("cuda", silent, dir_path);
ggml_backend_load_best("hip", silent, dir_path);
ggml_backend_load_best("kompute", silent, dir_path);
ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent, dir_path);

View File

@@ -352,6 +352,21 @@ ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
// backend copy
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
if (a->type != b->type) {
return false;
}
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if (a->ne[i] != b->ne[i]) {
return false;
}
if (a->nb[i] != b->nb[i]) {
return false;
}
}
return true;
}
void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
@@ -802,9 +817,8 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
}
if (sched->debug > 1) {
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d:", i, ggml_op_name(node->op), node->name,
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node),
graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)]);
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
@@ -1812,7 +1826,7 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
ggml_free(copy.ctx_unallocated);
}
bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node) {
bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
if (copy.buffer == NULL) {
return false;
@@ -1823,45 +1837,28 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
assert(g1->n_nodes == g2->n_nodes);
if (test_node != nullptr) {
// Compute the whole graph and only test the output for a specific tensor
ggml_backend_graph_compute(backend1, g1);
ggml_backend_graph_compute(backend2, g2);
for (int i = 0; i < g1->n_nodes; i++) {
struct ggml_tensor * t1 = g1->nodes[i];
struct ggml_tensor * t2 = g2->nodes[i];
int test_node_idx = -1;
for (int i = 0; i < g1->n_nodes; i++) {
struct ggml_tensor * t1 = g1->nodes[i];
if (t1 == test_node) {
test_node_idx = i;
break;
}
assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
ggml_backend_graph_compute(backend1, &g1v);
ggml_backend_graph_compute(backend2, &g2v);
if (ggml_is_view_op(t1->op)) {
continue;
}
GGML_ASSERT(test_node_idx != -1);
callback(test_node_idx, g1->nodes[test_node_idx], g2->nodes[test_node_idx], user_data);
} else {
for (int i = 0; i < g1->n_nodes; i++) {
struct ggml_tensor * t1 = g1->nodes[i];
struct ggml_tensor * t2 = g2->nodes[i];
assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
ggml_backend_graph_compute(backend1, &g1v);
ggml_backend_graph_compute(backend2, &g2v);
if (ggml_is_view_op(t1->op)) {
continue;
}
// compare results, calculate rms etc
if (!callback(i, t1, t2, user_data)) {
break;
}
// compare results, calculate rms etc
if (!callback(i, t1, t2, user_data)) {
break;
}
}
ggml_backend_graph_copy_free(copy);
return true;
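A sketch of a matching per-node callback; the ggml_backend_eval_callback signature is inferred from the calls above, and backend1, backend2 and graph are assumed to exist. One side of this diff adds a trailing test_node argument — passing NULL there compares every node:

    static bool cmp_nodes(int i, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
        (void) i; (void) user_data;
        // compare t1 (backend1 result) against t2 (backend2 result) here, e.g. max abs diff
        return true; // return false to stop the comparison at this node
    }
    // ggml_backend_compare_graph_backend(backend1, backend2, graph, cmp_nodes, NULL /*, test_node */);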

View File

@@ -81,7 +81,7 @@ if (BLAS_FOUND)
target_link_libraries (ggml-blas PRIVATE ${BLAS_LIBRARIES})
target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
else()
message(FATAL_ERROR "BLAS not found, please refer to "
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
" to set correct GGML_BLAS_VENDOR")
message(ERROR "BLAS not found, please refer to "
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
" to set correct GGML_BLAS_VENDOR")
endif()

View File

@@ -65,9 +65,8 @@
#include <aclnnop/aclnn_eq_tensor.h>
#include <aclnnop/aclnn_gt_scalar.h>
#include <aclnnop/aclnn_pow.h>
#include <aclnnop/aclnn_grouped_matmul_v3.h>
#include <aclnnop/aclnn_grouped_matmul_v2.h>
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
#include <aclnnop/aclnn_zero.h>
#include <float.h>
#include <cmath>
@@ -805,11 +804,10 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
nb[i] = nb[i - 1] * ne[i - 1];
}
ggml_cann_async_memset(ctx, buffer, n_bytes, 0);
aclTensor* zero =
ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, zero);
return zero;
GGML_UNUSED(n_bytes);
}
/**
@@ -2656,67 +2654,6 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
}
#ifdef ASCEND_310P
ggml_tensor src0_row = *src0;
ggml_tensor src1_row = *src1;
ggml_tensor dst_row = *dst;
if (src0->type == GGML_TYPE_F16) {
src0_row.type = GGML_TYPE_F32;
}
// src0_row [D, M, 1, 1] weight without permute
src0_row.ne[2] = 1;
src0_row.ne[3] = 1;
src0_row.nb[0] = ori_src0_nb[0];
src0_row.nb[1] = ori_src0_nb[1];
src0_row.nb[2] = ori_src0_nb[1];
src0_row.nb[3] = ori_src0_nb[1];
// src1_row [D, 1, 1, 1] -> input
src1_row.ne[1] = 1;
src1_row.ne[2] = 1;
src1_row.ne[3] = 1;
src1_row.nb[2] = nb11;
src1_row.nb[3] = nb11;
// dst_row [M, 1, 1, 1] -> out
dst_row.ne[1] = 1;
dst_row.ne[2] = 1;
dst_row.ne[3] = 1;
dst_row.nb[2] = nb1;
dst_row.nb[3] = nb1;
//create weight for one row
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
for (int64_t id = 0; id < n_ids; id++) {
// expert index
int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(i02 >= 0 && i02 < n_as);
// If B = 1 (broadcast), always use 0; otherwise, use id.
int64_t i11 = (ne11 == 1 ? 0 : id);
int64_t i12 = iid1;
int64_t i1 = id;
int64_t i2 = i12;
void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
src0_row.data = src0_tmp_ptr;
src1_row.data = src1_tmp_ptr;
dst_row.data = dst_tmp_ptr;
dst_row.src[0] = &src0_row;
dst_row.src[1] = &src1_row;
ggml_cann_mul_mat(ctx, &dst_row);
}
}
return;
#endif
std::vector<aclTensor*> src0_tensor_vec;
std::vector<aclTensor*> src1_tensor_vec;
std::vector<aclTensor*> dst_tensor_vec;
@@ -2764,9 +2701,9 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
}
size_t GROUP_SIZE = 128;
// GroupedMatmulV3 required tensor_list.size < 128
// GroupedMatmulV2 required tensor_list.size < 128
for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
// split and call GroupedMatmulV3
// split and call GroupedMatmulV2
size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
@@ -2776,7 +2713,7 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV3, src1_tensor_list, src0_tensor_list,
GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV2, src1_tensor_list, src0_tensor_list,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);

View File

@@ -37,7 +37,6 @@
#include <thread>
#include <unistd.h>
#include <functional>
#include <optional>
#include "../include/ggml-cann.h"
#include "../include/ggml.h"
@@ -104,9 +103,6 @@ const ggml_cann_device_info& ggml_cann_info();
void ggml_cann_set_device(int32_t device);
int32_t ggml_cann_get_device();
std::optional<std::string> get_env(const std::string& name);
bool parse_bool(const std::string& value);
/**
* @brief Abstract base class for memory pools used by CANN.
*/
@@ -358,8 +354,7 @@ struct ggml_backend_cann_context {
: device(device), name("CANN" + std::to_string(device)), task_queue(1024, device) {
ggml_cann_set_device(device);
description = aclrtGetSocName();
async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
async_mode = (getenv("GGML_CANN_ASYNC_MODE") != nullptr);
GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
device, async_mode ? "ON" : "OFF");
}

View File

@@ -31,8 +31,6 @@
#include <mutex>
#include <queue>
#include <chrono>
#include <unordered_set>
#include <optional>
#include "ggml-impl.h"
#include "ggml-backend-impl.h"
@@ -95,26 +93,6 @@ int32_t ggml_cann_get_device() {
return id;
}
/**
* @brief Get the value of the specified environment variable (name).
* if not empty, return a std::string object
*/
std::optional<std::string> get_env(const std::string& name) {
const char* val = std::getenv(name.c_str());
if (!val) return std::nullopt;
std::string res = std::string(val);
std::transform(res.begin(), res.end(), res.begin(), ::tolower);
return res;
}
/**
* @brief Verify whether the environment variable is a valid value.
*/
bool parse_bool(const std::string& value) {
std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
return valid_values.find(value) != valid_values.end();
}
/**
* @brief Initialize the CANN device information.
*
@@ -236,7 +214,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
* @param device The device ID to associate with this buffer pool.
*/
explicit ggml_cann_pool_buf_prio(int device) : device(device) {
disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
}
/**
@@ -432,7 +410,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
* @param device The device ID to associate with this buffer pool.
*/
explicit ggml_cann_pool_buf(int device) : device(device) {
disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
}
/**
@@ -753,18 +731,16 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
*/
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
int device) {
std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
if (mem_pool_type == "prio") {
GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
}
if (ggml_cann_info().devices[device].vmm && mem_pool_type != "leg") {
bool disable_vmm = (getenv("GGML_CANN_DISABLE_VMM_POOL") != nullptr);
if (!disable_vmm && ggml_cann_info().devices[device].vmm) {
GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
}
bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr);
if (enable_buf_prio) {
GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
}
GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
}
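As a usage note (sketch, assuming a POSIX environment and setenv() from <stdlib.h>): the pool type is read from GGML_CANN_MEM_POOL when the context builds its pool, so the variable has to be set before backend initialization:

    setenv("GGML_CANN_MEM_POOL", "prio", 1); // "prio" = priority-queue buffer pool, "leg" = plain buffer pool, otherwise VMM when available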
@@ -2086,13 +2062,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
return false;
}
} break;
case GGML_OP_SET_ROWS:
{
// TODO: add support
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
#pragma message("TODO: implement F32, F16, BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
return false;
} break;
case GGML_OP_CPY: {
ggml_tensor *src = op->src[0];
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
@@ -2189,10 +2158,12 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_RMS_NORM:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_CLAMP:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT:
case GGML_OP_ACC:
@@ -2210,14 +2181,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_COUNT_EQUAL:
return true;
case GGML_OP_SCALE:
float bias;
memcpy(&bias, (float*)op->op_params + 1, sizeof(float));
return bias == 0.0f; // TODO: support bias != 0.0f
case GGML_OP_SOFT_MAX:
// TODO: support broadcast
// ref: https://github.com/ggml-org/llama.cpp/pull/14435
return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
case GGML_OP_FLASH_ATTN_EXT:{
// derived from [ggml-cuda.cu]
if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
@@ -2240,8 +2203,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
// DeepSeek MLA
return false;
}
// TODO: support broadcast
// ref: https://github.com/ggml-org/llama.cpp/pull/14435
if (op->src[0]->ne[3] != 1) {
return false;
}

View File

@@ -1074,10 +1074,6 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
GGML_TABLE_END()
GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
GGML_TABLE_END()
#define NGRID_IQ1S 2048
#define IQ1S_DELTA 0.125f
#define IQ1M_DELTA 0.125f

View File

@@ -1,17 +1,3 @@
function(ggml_add_cpu_backend_features cpu_name arch)
# The feature detection code is compiled as a separate target so that
# it can be built without the architecture flags
# Since multiple variants of the CPU backend may be included in the same
# build, using set_source_files_properties() to set the arch flags is not possible
set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . ../include)
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME})
endfunction()
function(ggml_add_cpu_backend_variant_impl tag_name)
if (tag_name)
set(GGML_CPU_NAME ggml-cpu-${tag_name})
@@ -24,14 +10,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
list (APPEND GGML_CPU_SOURCES
ggml-cpu/ggml-cpu.c
ggml-cpu/ggml-cpu.cpp
ggml-cpu/repack.cpp
ggml-cpu/repack.h
ggml-cpu/hbm.cpp
ggml-cpu/hbm.h
ggml-cpu/quants.c
ggml-cpu/quants.h
ggml-cpu/traits.cpp
ggml-cpu/traits.h
ggml-cpu/ggml-cpu-aarch64.cpp
ggml-cpu/ggml-cpu-aarch64.h
ggml-cpu/ggml-cpu-hbm.cpp
ggml-cpu/ggml-cpu-hbm.h
ggml-cpu/ggml-cpu-quants.c
ggml-cpu/ggml-cpu-quants.h
ggml-cpu/ggml-cpu-traits.cpp
ggml-cpu/ggml-cpu-traits.h
ggml-cpu/amx/amx.cpp
ggml-cpu/amx/amx.h
ggml-cpu/amx/mmq.cpp
@@ -98,11 +84,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
if (GGML_SYSTEM_ARCH STREQUAL "ARM")
message(STATUS "ARM detected")
list(APPEND GGML_CPU_SOURCES
ggml-cpu/arch/arm/quants.c
ggml-cpu/arch/arm/repack.cpp
)
if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
else()
@@ -157,49 +138,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
else()
if (GGML_CPU_ARM_ARCH)
list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
elseif(GGML_CPU_ALL_VARIANTS)
# Begin with the lowest baseline
set(ARM_MCPU "armv8-a")
set(ARCH_TAGS "")
set(ARCH_DEFINITIONS "")
# When a feature is selected, bump the MCPU to the first
# version that supported it
if (GGML_INTERNAL_DOTPROD)
set(ARM_MCPU "armv8.2-a")
set(ARCH_TAGS "${ARCH_TAGS}+dotprod")
list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD)
endif()
if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC)
set(ARM_MCPU "armv8.2-a")
set(ARCH_TAGS "${ARCH_TAGS}+fp16")
list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC)
endif()
if (GGML_INTERNAL_SVE)
set(ARM_MCPU "armv8.2-a")
set(ARCH_TAGS "${ARCH_TAGS}+sve")
list(APPEND ARCH_DEFINITIONS GGML_USE_SVE)
endif()
if (GGML_INTERNAL_MATMUL_INT8)
set(ARM_MCPU "armv8.6-a")
set(ARCH_TAGS "${ARCH_TAGS}+i8mm")
list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8)
endif()
if (GGML_INTERNAL_SVE2)
set(ARM_MCPU "armv8.6-a")
set(ARCH_TAGS "${ARCH_TAGS}+sve2")
list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2)
endif()
if (GGML_INTERNAL_NOSVE)
set(ARCH_TAGS "${ARCH_TAGS}+nosve")
endif()
if (GGML_INTERNAL_SME)
set(ARM_MCPU "armv9.2-a")
set(ARCH_TAGS "${ARCH_TAGS}+sme")
list(APPEND ARCH_DEFINITIONS GGML_USE_SME)
endif()
list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}")
ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS})
endif()
endif()
@@ -229,11 +167,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif()
elseif (GGML_SYSTEM_ARCH STREQUAL "x86")
message(STATUS "x86 detected")
list(APPEND GGML_CPU_SOURCES
ggml-cpu/arch/x86/quants.c
ggml-cpu/arch/x86/repack.cpp
)
if (MSVC)
# instruction set detection for MSVC only
if (GGML_NATIVE)
@@ -363,11 +296,21 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
endif()
ggml_add_cpu_backend_features(${GGML_CPU_NAME} x86 ${ARCH_DEFINITIONS})
# The feature detection code is compiled as a separate target so that
# it can be built without the architecture flags
# Since multiple variants of the CPU backend may be included in the same
# build, using set_source_files_properties() to set the arch flags is not possible
set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
endif()
elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
message(STATUS "PowerPC detected")
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/powerpc/quants.c)
if (GGML_NATIVE)
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
file(READ "/proc/cpuinfo" POWER10_M)
@@ -375,8 +318,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
endif()
string(TOUPPER "${POWER10_M}" POWER10_M_UPPER)
string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}")
string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
if (EXTRACTED_NUMBER GREATER_EQUAL 10)
@@ -388,27 +330,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
else()
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
endif()
elseif(GGML_CPU_ALL_VARIANTS)
# Begin with the lowest baseline
set(ARCH_DEFINITIONS "")
# When a feature is selected, bump the MCPU to the first
# version that supported it
foreach(PVER RANGE 7 11)
if(DEFINED GGML_INTERNAL_POWER${PVER})
set(POWERPC_MCPU "power${PVER}")
list(APPEND ARCH_DEFINITIONS GGML_USE_POWER${PVER})
endif()
endforeach()
if (GGML_INTERNAL_VSX)
list(APPEND ARCH_DEFINITIONS GGML_USE_VSX)
list(APPEND ARCH_FLAGS -mvsx)
endif()
if (DEFINED POWERPC_MCPU)
list(APPEND ARCH_FLAGS -mcpu=${POWERPC_MCPU})
endif()
ggml_add_cpu_backend_features(${GGML_CPU_NAME} powerpc ${ARCH_DEFINITIONS})
else()
if (GGML_CPU_POWERPC_CPUTYPE)
list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
@@ -416,8 +337,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif()
elseif (GGML_SYSTEM_ARCH STREQUAL "loongarch64")
message(STATUS "loongarch64 detected")
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/loongarch/quants.c)
list(APPEND ARCH_FLAGS -march=loongarch64)
if (GGML_LASX)
list(APPEND ARCH_FLAGS -mlasx)
@@ -427,10 +346,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif()
elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
message(STATUS "riscv64 detected")
list(APPEND GGML_CPU_SOURCES
ggml-cpu/arch/riscv/quants.c
ggml-cpu/arch/riscv/repack.cpp
)
if (GGML_RVV)
if (GGML_XTHEADVECTOR)
list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
@@ -442,13 +357,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif()
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
message(STATUS "s390x detected")
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
# TODO: Separation to determine activation of VX/VXE/VXE2
if (${S390X_M} MATCHES "8561|8562")
set(GGML_NNPA OFF)
message(STATUS "z15 target")
list(APPEND ARCH_FLAGS -march=z15)
elseif (${S390X_M} MATCHES "3931")
@@ -465,25 +378,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif()
if (GGML_VXE)
message(STATUS "VX/VXE/VXE2 enabled")
list(APPEND ARCH_FLAGS -mvx -mzvector)
list(APPEND ARCH_DEFINITIONS GGML_VXE)
endif()
if (GGML_NNPA)
message(STATUS "NNPA enabled")
list(APPEND ARCH_DEFINITIONS GGML_NNPA)
endif()
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
message(STATUS "Wasm detected")
list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
else()
message(WARNING "Unknown CPU architecture. Falling back to generic implementations.")
list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC)
message(STATUS "Unknown architecture")
endif()
if (GGML_CPU_REPACK)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_REPACK)
if (GGML_CPU_AARCH64)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64)
endif()
if (GGML_CPU_KLEIDIAI)
@@ -494,9 +396,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# Fetch KleidiAI sources:
include(FetchContent)
set(KLEIDIAI_COMMIT_TAG "v1.11.0")
set(KLEIDIAI_COMMIT_TAG "v1.6.0")
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
set(KLEIDIAI_ARCHIVE_MD5 "3fe9e5ab964c375c53839296eb71eaa2")
set(KLEIDIAI_ARCHIVE_MD5 "75b4ad68f25ab673dcc01065e5a0b05f")
if (POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)
@@ -589,9 +491,4 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
if (EMSCRIPTEN)
set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
endif()
if (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
# The compiler automatically enables "-ffast-math" which can cause NaNs in tests due to "-fassociative-math"
target_compile_options(${GGML_CPU_NAME} PRIVATE "-fno-associative-math")
endif()
endfunction()

View File

@@ -5,7 +5,7 @@
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "traits.h"
#include "ggml-cpu-traits.h"
#if defined(__gnu_linux__)
#include <sys/syscall.h>

View File

@@ -8,8 +8,7 @@
#include "mmq.h"
#include "ggml-impl.h"
#include "ggml-cpu-impl.h"
#include "simd-mappings.h"
#include "quants.h"
#include "ggml-cpu-quants.h"
#include "ggml-quants.h"
#include <algorithm>
#include <type_traits>
@@ -454,7 +453,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_
// Quantize these floats
const float iscale = 127.f / amax;
y[i].d = GGML_CPU_FP32_TO_FP16(1 / iscale);
y[i].d = GGML_FP32_TO_FP16(1 / iscale);
const float id = ( amax != 0.0f ) ? iscale : 0.f;
const __m512 vscale = _mm512_set1_ps(id);
@@ -1091,7 +1090,7 @@ struct acc_C<block_q8_0, block_q4_0, is_acc> {
const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
for (int m = 0; m < nr; ++m) {
const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
__m512 vsum;
@@ -1114,8 +1113,8 @@ struct acc_C<block_q8_1, block_q4_1, is_acc> {
const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half))));
for (int m = 0; m < nr; ++m) {
const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
const __m512 vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].s));
const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
const __m512 vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].s));
const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
__m512 vsum;
@@ -1138,7 +1137,7 @@ struct acc_C<block_q8_0, block_q8_0, is_acc> {
const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
for (int m = 0; m < nr; ++m) {
const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
__m512 vsum;
@@ -1438,7 +1437,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
va[k] = _mm512_set1_epi32(a_ptr[k]);
vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
}
vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
}
// load b
@@ -1499,8 +1498,8 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
for (int k = 0; k < 8; ++k) {
va[k] = _mm512_set1_epi32(a_ptr[k]);
}
vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s));
vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].s));
}
// load b
@@ -1572,7 +1571,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
va[k] = _mm512_set1_epi32(a_ptr[k]);
va[k] = _mm512_add_epi8(va[k], off);
}
vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
}
// load b

View File

@@ -1,184 +0,0 @@
#pragma once
// Rename `_generic` functions if no native implementation is available.
// This effectively selects the generic implementation.
#if defined(GGML_CPU_GENERIC)
// quants.c
#define quantize_row_q8_0_generic quantize_row_q8_0
#define quantize_row_q8_1_generic quantize_row_q8_1
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_q4_0_q8_0_generic ggml_vec_dot_q4_0_q8_0
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
#define ggml_vec_dot_q3_K_q8_K_generic ggml_vec_dot_q3_K_q8_K
#define ggml_vec_dot_q4_K_q8_K_generic ggml_vec_dot_q4_K_q8_K
#define ggml_vec_dot_q5_K_q8_K_generic ggml_vec_dot_q5_K_q8_K
#define ggml_vec_dot_q6_K_q8_K_generic ggml_vec_dot_q6_K_q8_K
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
// repack.cpp
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__POWERPC__) || defined(__powerpc__)
// ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__loongarch64)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__riscv)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__s390x__)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__wasm__)
// quants.c
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#endif

View File

@@ -1,94 +0,0 @@
#include "ggml-backend-impl.h"
#if defined(__aarch64__)
#if defined(__linux__)
#include <sys/auxv.h>
#elif defined(__APPLE__)
#include <sys/sysctl.h>
#endif
#if !defined(HWCAP2_I8MM)
#define HWCAP2_I8MM (1 << 13)
#endif
#if !defined(HWCAP2_SME)
#define HWCAP2_SME (1 << 23)
#endif
struct aarch64_features {
// has_neon not needed, aarch64 has NEON guaranteed
bool has_dotprod = false;
bool has_fp16_va = false;
bool has_sve = false;
bool has_sve2 = false;
bool has_i8mm = false;
bool has_sme = false;
aarch64_features() {
#if defined(__linux__)
uint32_t hwcap = getauxval(AT_HWCAP);
uint32_t hwcap2 = getauxval(AT_HWCAP2);
has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
has_fp16_va = !!(hwcap & HWCAP_FPHP);
has_sve = !!(hwcap & HWCAP_SVE);
has_sve2 = !!(hwcap2 & HWCAP2_SVE2);
has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
has_sme = !!(hwcap2 & HWCAP2_SME);
#elif defined(__APPLE__)
int oldp = 0;
size_t size = sizeof(oldp);
if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) == 0) {
has_dotprod = static_cast<bool>(oldp);
}
if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) == 0) {
has_i8mm = static_cast<bool>(oldp);
}
if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) == 0) {
has_sme = static_cast<bool>(oldp);
}
// Apple apparently does not implement SVE yet
#endif
}
};
static int ggml_backend_cpu_aarch64_score() {
int score = 1;
aarch64_features af;
#ifdef GGML_USE_DOTPROD
if (!af.has_dotprod) { return 0; }
score += 1<<1;
#endif
#ifdef GGML_USE_FP16_VECTOR_ARITHMETIC
if (!af.has_fp16_va) { return 0; }
score += 1<<2;
#endif
#ifdef GGML_USE_SVE
if (!af.has_sve) { return 0; }
score += 1<<3;
#endif
#ifdef GGML_USE_MATMUL_INT8
if (!af.has_i8mm) { return 0; }
score += 1<<4;
#endif
#ifdef GGML_USE_SVE2
if (!af.has_sve2) { return 0; }
score += 1<<5;
#endif
#ifdef GGML_USE_SME
if (!af.has_sme) { return 0; }
score += 1<<6;
#endif
return score;
}
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_aarch64_score)
# endif // defined(__aarch64__)

Some files were not shown because too many files have changed in this diff.