mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-09 16:17:31 +03:00
Compare commits
6 Commits
0cc4m/vulk
...
compilade/
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
93fbd407f3 | ||
|
|
0d5cfed596 | ||
|
|
adec43d774 | ||
|
|
899398277d | ||
|
|
1ae6ab7601 | ||
|
|
de12f8ac50 |
@@ -3,8 +3,7 @@
|
||||
# ==============================================================================
|
||||
|
||||
# Define the CANN base image for easier version updates later
|
||||
ARG CHIP_TYPE=910b
|
||||
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11
|
||||
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10
|
||||
|
||||
# ==============================================================================
|
||||
# BUILD STAGE
|
||||
@@ -12,8 +11,11 @@ ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11
|
||||
# ==============================================================================
|
||||
FROM ${CANN_BASE_IMAGE} AS build
|
||||
|
||||
# Define the Ascend chip model for compilation. Default is Ascend910B3
|
||||
ARG ASCEND_SOC_TYPE=Ascend910B3
|
||||
|
||||
# -- Install build dependencies --
|
||||
RUN yum install -y gcc g++ cmake make git openssl-devel python3 python3-pip && \
|
||||
RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
|
||||
yum clean all && \
|
||||
rm -rf /var/cache/yum
|
||||
|
||||
@@ -34,22 +36,20 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
|
||||
# For brevity, only core variables are listed here. You can paste the original ENV list here.
|
||||
|
||||
# -- Build llama.cpp --
|
||||
# Use the passed CHIP_TYPE argument and add general build options
|
||||
ARG CHIP_TYPE
|
||||
# Use the passed ASCEND_SOC_TYPE argument and add general build options
|
||||
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
|
||||
&& \
|
||||
cmake -B build \
|
||||
-DGGML_CANN=ON \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DSOC_TYPE=ascend${CHIP_TYPE} \
|
||||
-DUSE_ACL_GRAPH=ON \
|
||||
-DSOC_TYPE=${ASCEND_SOC_TYPE} \
|
||||
. && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
# -- Organize build artifacts for copying in later stages --
|
||||
# Create a lib directory to store all .so files
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build -name "*.so*" -exec cp -P {} /app/lib \;
|
||||
find build -name "*.so" -exec cp {} /app/lib \;
|
||||
|
||||
# Create a full directory to store all executables and Python scripts
|
||||
RUN mkdir -p /app/full && \
|
||||
@@ -108,11 +108,11 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
# ENTRYPOINT ["/app/llama-server"]
|
||||
|
||||
### Target: light
|
||||
# Lightweight image containing only llama-cli and llama-completion
|
||||
# Lightweight image containing only llama-cli
|
||||
# ==============================================================================
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama-cli /app
|
||||
|
||||
ENTRYPOINT [ "/app/llama-cli" ]
|
||||
|
||||
|
||||
@@ -1,13 +1,11 @@
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
ARG TARGETARCH
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y gcc-14 g++-14 build-essential git cmake libssl-dev
|
||||
|
||||
ENV CC=gcc-14 CXX=g++-14
|
||||
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -22,7 +20,7 @@ RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
|
||||
cmake --build build -j $(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build -name "*.so*" -exec cp -P {} /app/lib \;
|
||||
find build -name "*.so" -exec cp {} /app/lib \;
|
||||
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
@@ -36,7 +34,7 @@ RUN mkdir -p /app/full \
|
||||
FROM ubuntu:$UBUNTU_VERSION AS base
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt-get install -y libgomp1 curl\
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
@@ -57,9 +55,8 @@ RUN apt-get update \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
python3-wheel \
|
||||
&& pip install --break-system-packages --upgrade setuptools \
|
||||
&& pip install --break-system-packages -r requirements.txt \
|
||||
&& pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
@@ -71,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama-cli /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=12.8.1
|
||||
ARG CUDA_VERSION=12.4.0
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
@@ -12,9 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
ARG CUDA_DOCKER_ARCH=default
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1
|
||||
|
||||
ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14
|
||||
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -27,7 +25,7 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build -name "*.so*" -exec cp -P {} /app/lib \;
|
||||
find build -name "*.so" -exec cp {} /app/lib \;
|
||||
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
@@ -41,7 +39,7 @@ RUN mkdir -p /app/full \
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS base
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt-get install -y libgomp1 curl\
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
@@ -62,8 +60,7 @@ RUN apt-get update \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
python3-wheel \
|
||||
&& pip install --break-system-packages --upgrade setuptools \
|
||||
&& pip install --upgrade pip setuptools wheel \
|
||||
&& pip install --break-system-packages -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
@@ -77,7 +74,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama-cli /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
ARG ONEAPI_VERSION=2025.3.2-0-devel-ubuntu24.04
|
||||
ARG ONEAPI_VERSION=2025.2.2-0-devel-ubuntu24.04
|
||||
|
||||
## Build Image
|
||||
|
||||
@@ -6,7 +6,7 @@ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build
|
||||
|
||||
ARG GGML_SYCL_F16=OFF
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git libssl-dev
|
||||
apt-get install -y git libcurl4-openssl-dev
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -21,7 +21,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build -name "*.so*" -exec cp -P {} /app/lib \;
|
||||
find build -name "*.so" -exec cp {} /app/lib \;
|
||||
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
@@ -33,25 +33,8 @@ RUN mkdir -p /app/full \
|
||||
|
||||
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
|
||||
|
||||
ARG IGC_VERSION=v2.30.1
|
||||
ARG IGC_VERSION_FULL=2_2.30.1+20950
|
||||
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
|
||||
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
|
||||
ARG IGDGMM_VERSION=22.9.0
|
||||
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
|
||||
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
|
||||
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
|
||||
&& wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
|
||||
&& dpkg --install *.deb
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt-get install -y libgomp1 curl\
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
@@ -90,7 +73,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/lib/ /app
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama-cli /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
ARG ASCEND_VERSION=8.5.0-910b-openeuler22.03-py3.10
|
||||
ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
|
||||
|
||||
FROM ascendai/cann:$ASCEND_VERSION AS build
|
||||
|
||||
@@ -6,7 +6,7 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN yum install -y gcc g++ cmake make openssl-devel
|
||||
RUN yum install -y gcc g++ cmake make libcurl-devel
|
||||
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
|
||||
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
|
||||
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
|
||||
@@ -23,12 +23,11 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
|
||||
RUN echo "Building with static libs" && \
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \
|
||||
cmake --build build --config Release --target llama-cli && \
|
||||
cmake --build build --config Release --target llama-completion
|
||||
cmake --build build --config Release --target llama-cli
|
||||
|
||||
# TODO: use image with NNRT
|
||||
FROM ascendai/cann:$ASCEND_VERSION AS runtime
|
||||
COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /
|
||||
COPY --from=build /app/build/bin/llama-cli /llama-cli
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
|
||||
@@ -37,7 +37,6 @@ make -j GGML_CUDA=1
|
||||
%install
|
||||
mkdir -p %{buildroot}%{_bindir}/
|
||||
cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
|
||||
cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
|
||||
cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
|
||||
cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
|
||||
|
||||
@@ -69,7 +68,6 @@ rm -rf %{_builddir}/*
|
||||
|
||||
%files
|
||||
%{_bindir}/llama-cuda-cli
|
||||
%{_bindir}/llama-cuda-completion
|
||||
%{_bindir}/llama-cuda-server
|
||||
%{_bindir}/llama-cuda-simple
|
||||
/usr/lib/systemd/system/llamacuda.service
|
||||
|
||||
@@ -39,7 +39,6 @@ make -j
|
||||
%install
|
||||
mkdir -p %{buildroot}%{_bindir}/
|
||||
cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
|
||||
cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
|
||||
cp -p llama-server %{buildroot}%{_bindir}/llama-server
|
||||
cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
|
||||
|
||||
@@ -71,7 +70,6 @@ rm -rf %{_builddir}/*
|
||||
|
||||
%files
|
||||
%{_bindir}/llama-cli
|
||||
%{_bindir}/llama-completion
|
||||
%{_bindir}/llama-server
|
||||
%{_bindir}/llama-simple
|
||||
/usr/lib/systemd/system/llama.service
|
||||
|
||||
@@ -18,7 +18,7 @@ RUN apt-get update && \
|
||||
python3 \
|
||||
python3-pip \
|
||||
git \
|
||||
libssl-dev \
|
||||
libcurl4-openssl-dev \
|
||||
libgomp1
|
||||
|
||||
WORKDIR /app
|
||||
@@ -32,7 +32,7 @@ RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build -name "*.so*" -exec cp -P {} /app/lib \;
|
||||
find build -name "*.so" -exec cp {} /app/lib \;
|
||||
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
@@ -46,7 +46,7 @@ RUN mkdir -p /app/full \
|
||||
FROM ${BASE_MUSA_RUN_CONTAINER} AS base
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt-get install -y libgomp1 curl\
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
@@ -81,7 +81,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama-cli /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
# the module `{ pkgs ... }: { /* config */ }` implicitly uses
|
||||
# `_module.args.pkgs` (defined in this case by flake-parts).
|
||||
perSystem =
|
||||
{ lib, system, ... }:
|
||||
{ system, ... }:
|
||||
{
|
||||
_module.args = {
|
||||
# Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
|
||||
@@ -33,7 +33,7 @@
|
||||
"CUDA EULA"
|
||||
"cuDNN EULA"
|
||||
]
|
||||
) (p.meta.licenses or (lib.toList p.meta.license));
|
||||
) (p.meta.licenses or [ p.meta.license ]);
|
||||
};
|
||||
# Ensure dependencies use ROCm consistently
|
||||
pkgsRocm = import inputs.nixpkgs {
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
llamaVersion,
|
||||
numpy,
|
||||
tqdm,
|
||||
requests,
|
||||
sentencepiece,
|
||||
pyyaml,
|
||||
poetry-core,
|
||||
@@ -21,7 +20,6 @@ buildPythonPackage {
|
||||
tqdm
|
||||
sentencepiece
|
||||
pyyaml
|
||||
requests
|
||||
];
|
||||
src = lib.cleanSource ../../gguf-py;
|
||||
pythonImportsCheck = [
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
rocmPackages,
|
||||
vulkan-headers,
|
||||
vulkan-loader,
|
||||
openssl,
|
||||
curl,
|
||||
shaderc,
|
||||
useBlas ?
|
||||
builtins.all (x: !x) [
|
||||
@@ -32,8 +32,8 @@
|
||||
useMpi ? false,
|
||||
useRocm ? config.rocmSupport,
|
||||
rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
|
||||
enableCurl ? true,
|
||||
useVulkan ? false,
|
||||
useRpc ? false,
|
||||
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
|
||||
|
||||
# It's necessary to consistently use backendStdenv when building with CUDA support,
|
||||
@@ -41,7 +41,6 @@
|
||||
effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
|
||||
enableStatic ? effectiveStdenv.hostPlatform.isStatic,
|
||||
precompileMetalShaders ? false,
|
||||
useWebUi ? true,
|
||||
}:
|
||||
|
||||
let
|
||||
@@ -161,14 +160,14 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
||||
++ optionals useRocm rocmBuildInputs
|
||||
++ optionals useBlas [ blas ]
|
||||
++ optionals useVulkan vulkanBuildInputs
|
||||
++ [ openssl ];
|
||||
++ optionals enableCurl [ curl ];
|
||||
|
||||
cmakeFlags =
|
||||
[
|
||||
(cmakeBool "LLAMA_BUILD_SERVER" true)
|
||||
(cmakeBool "LLAMA_BUILD_WEBUI" useWebUi)
|
||||
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
|
||||
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
|
||||
(cmakeBool "LLAMA_CURL" enableCurl)
|
||||
(cmakeBool "GGML_NATIVE" false)
|
||||
(cmakeBool "GGML_BLAS" useBlas)
|
||||
(cmakeBool "GGML_CUDA" useCuda)
|
||||
@@ -176,7 +175,6 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
||||
(cmakeBool "GGML_METAL" useMetalKit)
|
||||
(cmakeBool "GGML_VULKAN" useVulkan)
|
||||
(cmakeBool "GGML_STATIC" enableStatic)
|
||||
(cmakeBool "GGML_RPC" useRpc)
|
||||
]
|
||||
++ optionals useCuda [
|
||||
(
|
||||
|
||||
@@ -7,6 +7,13 @@
|
||||
|
||||
let
|
||||
pythonPackages = python3.pkgs;
|
||||
buildPythonPackage = pythonPackages.buildPythonPackage;
|
||||
numpy = pythonPackages.numpy;
|
||||
tqdm = pythonPackages.tqdm;
|
||||
sentencepiece = pythonPackages.sentencepiece;
|
||||
pyyaml = pythonPackages.pyyaml;
|
||||
poetry-core = pythonPackages.poetry-core;
|
||||
pytestCheckHook = pythonPackages.pytestCheckHook;
|
||||
in
|
||||
|
||||
# We're using `makeScope` instead of just writing out an attrset
|
||||
@@ -16,18 +23,17 @@ in
|
||||
lib.makeScope newScope (self: {
|
||||
inherit llamaVersion;
|
||||
gguf-py = self.callPackage ./package-gguf-py.nix {
|
||||
inherit (pythonPackages)
|
||||
inherit
|
||||
buildPythonPackage
|
||||
numpy
|
||||
tqdm
|
||||
sentencepiece
|
||||
poetry-core
|
||||
pyyaml
|
||||
pytestCheckHook
|
||||
requests
|
||||
buildPythonPackage
|
||||
poetry-core
|
||||
;
|
||||
};
|
||||
python-scripts = self.callPackage ./python-scripts.nix { inherit (pythonPackages) buildPythonPackage poetry-core; };
|
||||
python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
|
||||
llama-cpp = self.callPackage ./package.nix { };
|
||||
docker = self.callPackage ./docker.nix { };
|
||||
docker-min = self.callPackage ./docker.nix { interactive = false; };
|
||||
|
||||
@@ -1,138 +0,0 @@
|
||||
ARG OPENVINO_VERSION_MAJOR=2026.0
|
||||
ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
|
||||
# Optional proxy build arguments - empty by default
|
||||
ARG http_proxy=
|
||||
ARG https_proxy=
|
||||
|
||||
## Build Image
|
||||
FROM ubuntu:${UBUNTU_VERSION} AS build
|
||||
|
||||
# Pass proxy args to build stage
|
||||
ARG http_proxy
|
||||
ARG https_proxy
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
ca-certificates \
|
||||
gnupg \
|
||||
wget \
|
||||
git \
|
||||
cmake \
|
||||
ninja-build \
|
||||
build-essential \
|
||||
libtbb12 \
|
||||
libssl-dev \
|
||||
ocl-icd-opencl-dev \
|
||||
opencl-headers \
|
||||
opencl-clhpp-headers \
|
||||
intel-opencl-icd && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install OpenVINO for Ubuntu 24.04
|
||||
ARG OPENVINO_VERSION_MAJOR
|
||||
ARG OPENVINO_VERSION_FULL
|
||||
RUN mkdir -p /opt/intel && \
|
||||
wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
|
||||
tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
|
||||
mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
|
||||
cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
|
||||
echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \
|
||||
cd - && \
|
||||
ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
|
||||
|
||||
ENV OpenVINO_DIR=/opt/intel/openvino
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Build Stage
|
||||
RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
|
||||
cmake -B build/ReleaseOV -G Ninja \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENVINO=ON && \
|
||||
cmake --build build/ReleaseOV -j$(nproc)"
|
||||
|
||||
# Copy all necessary libraries
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build/ReleaseOV -name '*.so*' -exec cp {} /app/lib \; && \
|
||||
find ${OpenVINO_DIR}/runtime/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; 2>/dev/null || \
|
||||
find ${OpenVINO_DIR}/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \;
|
||||
|
||||
# Create runtime directories and copy binaries
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/ReleaseOV/bin/* /app/full/ \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
&& cp .devops/tools.sh /app/full/tools.sh
|
||||
|
||||
## Base Runtime Image
|
||||
FROM ubuntu:${UBUNTU_VERSION} AS base
|
||||
|
||||
# Pass proxy args to runtime stage
|
||||
ARG http_proxy
|
||||
ARG https_proxy
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 libtbb12 curl \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
COPY --from=build /app/lib/ /app/
|
||||
|
||||
### Full (all binaries)
|
||||
FROM base AS full
|
||||
|
||||
ARG http_proxy
|
||||
ARG https_proxy
|
||||
|
||||
COPY --from=build /app/full /app/
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
git \
|
||||
python3 \
|
||||
python3-venv \
|
||||
python3-pip && \
|
||||
python3 -m venv /ov-venv && \
|
||||
/ov-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
|
||||
/ov-venv/bin/pip install --no-cache-dir -r requirements.txt && \
|
||||
apt-get autoremove -y && \
|
||||
apt-get clean && \
|
||||
rm -rf /tmp/* /var/tmp/* && \
|
||||
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
|
||||
find /var/cache -type f -delete
|
||||
|
||||
ENTRYPOINT ["/bin/bash", "-c", "source /ov-venv/bin/activate && exec /app/tools.sh \"$@\"", "--"]
|
||||
|
||||
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENTRYPOINT [ "/app/llama-cli" ]
|
||||
|
||||
### Server, Server only
|
||||
FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app/
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
ENTRYPOINT [ "/app/llama-server" ]
|
||||
@@ -1,8 +1,8 @@
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG ROCM_VERSION=7.2.1
|
||||
ARG AMDGPU_VERSION=7.2.1
|
||||
ARG ROCM_VERSION=7.0
|
||||
ARG AMDGPU_VERSION=7.0
|
||||
|
||||
# Target the ROCm build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
@@ -11,12 +11,13 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
# List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878
|
||||
# This is mostly tied to rocBLAS supported archs.
|
||||
# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.2.1/reference/system-requirements.html
|
||||
# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityrad/native_linux/native_linux_compatibility.html
|
||||
# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityryz/native_linux/native_linux_compatibility.html
|
||||
# gfx803, gfx900, gfx906, gfx1032, gfx1101, gfx1102,not officialy supported
|
||||
# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html
|
||||
|
||||
ARG ROCM_DOCKER_ARCH='gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201'
|
||||
ARG ROCM_DOCKER_ARCH='gfx803;gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1010;gfx1030;gfx1032;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151'
|
||||
#ARG ROCM_DOCKER_ARCH='gfx1151'
|
||||
|
||||
# Set ROCm architectures
|
||||
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||
@@ -26,7 +27,7 @@ RUN apt-get update \
|
||||
build-essential \
|
||||
cmake \
|
||||
git \
|
||||
libssl-dev \
|
||||
libcurl4-openssl-dev \
|
||||
curl \
|
||||
libgomp1
|
||||
|
||||
@@ -44,7 +45,7 @@ RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
|
||||
&& cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib \
|
||||
&& find build -name "*.so*" -exec cp -P {} /app/lib \;
|
||||
&& find build -name "*.so" -exec cp {} /app/lib \;
|
||||
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
@@ -58,7 +59,7 @@ RUN mkdir -p /app/full \
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} AS base
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt-get install -y libgomp1 curl\
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
@@ -79,7 +80,7 @@ RUN apt-get update \
|
||||
git \
|
||||
python3-pip \
|
||||
python3 \
|
||||
python3-wheel \
|
||||
python3-wheel\
|
||||
&& pip install --break-system-packages --upgrade setuptools \
|
||||
&& pip install --break-system-packages -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
@@ -93,7 +94,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama-cli /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
apt install -y --no-install-recommends \
|
||||
git cmake ccache ninja-build \
|
||||
# WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
|
||||
libopenblas-dev libssl-dev && \
|
||||
libopenblas-dev libcurl4-openssl-dev && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
@@ -24,9 +24,8 @@ RUN --mount=type=cache,target=/root/.ccache \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DGGML_BACKEND_DL=OFF \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_BACKEND_DL=ON \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
-DGGML_BLAS=ON \
|
||||
-DGGML_BLAS_VENDOR=OpenBLAS && \
|
||||
cmake --build build --config Release -j $(nproc) && \
|
||||
@@ -104,8 +103,7 @@ FROM base AS light
|
||||
WORKDIR /llama.cpp/bin
|
||||
|
||||
# Copy llama.cpp binaries and libraries
|
||||
COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
|
||||
COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
|
||||
COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin
|
||||
|
||||
ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
|
||||
|
||||
@@ -118,7 +116,6 @@ ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
WORKDIR /llama.cpp/bin
|
||||
|
||||
# Copy llama.cpp binaries and libraries
|
||||
COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
|
||||
COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin
|
||||
|
||||
EXPOSE 8080
|
||||
|
||||
@@ -13,8 +13,6 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
|
||||
exec ./llama-quantize "$@"
|
||||
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
|
||||
exec ./llama-cli "$@"
|
||||
elif [[ "$arg1" == '--run-legacy' || "$arg1" == '-l' ]]; then
|
||||
exec ./llama-completion "$@"
|
||||
elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
|
||||
exec ./llama-bench "$@"
|
||||
elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
|
||||
@@ -34,10 +32,8 @@ elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
|
||||
else
|
||||
echo "Unknown command: $arg1"
|
||||
echo "Available commands: "
|
||||
echo " --run (-r): Run a model (chat) previously converted into ggml"
|
||||
echo " ex: -m /models/7B/ggml-model-q4_0.bin"
|
||||
echo " --run-legacy (-l): Run a model (legacy completion) previously converted into ggml"
|
||||
echo " ex: -m /models/7B/ggml-model-q4_0.bin -no-cnv -p \"Building a website can be done in 10 simple steps:\" -n 512"
|
||||
echo " --run (-r): Run a model previously converted into ggml"
|
||||
echo " ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
|
||||
echo " --bench (-b): Benchmark the performance of the inference for various parameters."
|
||||
echo " ex: -m model.gguf"
|
||||
echo " --perplexity (-p): Measure the perplexity of a model over a given text."
|
||||
|
||||
@@ -1,24 +1,42 @@
|
||||
ARG UBUNTU_VERSION=26.04
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
# Ref: https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html
|
||||
|
||||
# Install build tools
|
||||
RUN apt update && apt install -y git build-essential cmake wget xz-utils
|
||||
|
||||
# Install SSL and Vulkan SDK dependencies
|
||||
RUN apt install -y libssl-dev curl \
|
||||
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc
|
||||
# Install Vulkan SDK
|
||||
ARG VULKAN_VERSION=1.4.321.1
|
||||
RUN ARCH=$(uname -m) && \
|
||||
wget -qO /tmp/vulkan-sdk.tar.xz https://sdk.lunarg.com/sdk/download/${VULKAN_VERSION}/linux/vulkan-sdk-linux-${ARCH}-${VULKAN_VERSION}.tar.xz && \
|
||||
mkdir -p /opt/vulkan && \
|
||||
tar -xf /tmp/vulkan-sdk.tar.xz -C /tmp --strip-components=1 && \
|
||||
mv /tmp/${ARCH}/* /opt/vulkan/ && \
|
||||
rm -rf /tmp/*
|
||||
|
||||
# Install cURL and Vulkan SDK dependencies
|
||||
RUN apt install -y libcurl4-openssl-dev curl \
|
||||
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev
|
||||
|
||||
# Set environment variables
|
||||
ENV VULKAN_SDK=/opt/vulkan
|
||||
ENV PATH=$VULKAN_SDK/bin:$PATH
|
||||
ENV LD_LIBRARY_PATH=$VULKAN_SDK/lib:$LD_LIBRARY_PATH
|
||||
ENV CMAKE_PREFIX_PATH=$VULKAN_SDK:$CMAKE_PREFIX_PATH
|
||||
ENV PKG_CONFIG_PATH=$VULKAN_SDK/lib/pkgconfig:$PKG_CONFIG_PATH
|
||||
|
||||
# Build it
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
|
||||
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build -name "*.so*" -exec cp -P {} /app/lib \;
|
||||
find build -name "*.so" -exec cp {} /app/lib \;
|
||||
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
@@ -32,8 +50,7 @@ RUN mkdir -p /app/full \
|
||||
FROM ubuntu:$UBUNTU_VERSION AS base
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
|
||||
libglvnd0 libgl1 libglx0 libegl1 libgles2 \
|
||||
&& apt-get install -y libgomp1 curl libvulkan-dev \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
@@ -49,20 +66,14 @@ COPY --from=build /app/full /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENV PATH="/root/.venv/bin:/root/.local/bin:${PATH}"
|
||||
|
||||
# Flag for compatibility with pip
|
||||
ARG UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y \
|
||||
build-essential \
|
||||
curl \
|
||||
git \
|
||||
ca-certificates \
|
||||
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
|
||||
&& uv python install 3.13 \
|
||||
&& uv venv --python 3.13 /root/.venv \
|
||||
&& uv pip install --python /root/.venv/bin/python -r requirements.txt \
|
||||
python3 \
|
||||
python3-pip \
|
||||
python3-wheel \
|
||||
&& pip install --break-system-packages --upgrade setuptools \
|
||||
&& pip install --break-system-packages -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
@@ -74,7 +85,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama-cli /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@@ -21,6 +21,14 @@ indent_style = tab
|
||||
[prompts/*.txt]
|
||||
insert_final_newline = unset
|
||||
|
||||
[tools/server/public/*]
|
||||
indent_size = 2
|
||||
|
||||
[tools/server/public/deps_*]
|
||||
trim_trailing_whitespace = unset
|
||||
indent_style = unset
|
||||
indent_size = unset
|
||||
|
||||
[tools/server/deps_*]
|
||||
trim_trailing_whitespace = unset
|
||||
indent_style = unset
|
||||
@@ -52,19 +60,3 @@ end_of_line = unset
|
||||
charset = unset
|
||||
trim_trailing_whitespace = unset
|
||||
insert_final_newline = unset
|
||||
|
||||
[tools/server/public/**]
|
||||
indent_style = unset
|
||||
indent_size = unset
|
||||
end_of_line = unset
|
||||
charset = unset
|
||||
trim_trailing_whitespace = unset
|
||||
insert_final_newline = unset
|
||||
|
||||
[benches/**]
|
||||
indent_style = unset
|
||||
indent_size = unset
|
||||
end_of_line = unset
|
||||
charset = unset
|
||||
trim_trailing_whitespace = unset
|
||||
insert_final_newline = unset
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
{ "contextFileName": "AGENTS.md" }
|
||||
4
.gitattributes
vendored
4
.gitattributes
vendored
@@ -1,4 +0,0 @@
|
||||
# Treat the generated single-file WebUI build as binary for diff purposes.
|
||||
# Git's pack-file delta compression still works (byte-level), but this prevents
|
||||
# git diff from printing the entire minified file on every change.
|
||||
tools/server/public/index.html -diff
|
||||
@@ -8,8 +8,7 @@ body:
|
||||
value: >
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
This issue template is intended for bug reports where the compilation of llama.cpp fails.
|
||||
Before opening an issue, please confirm that the compilation still fails
|
||||
after recreating the CMake build directory and with `-DGGML_CCACHE=OFF`.
|
||||
Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
|
||||
If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
|
||||
by clearing `~/.cache/ccache` (on Linux).
|
||||
- type: textarea
|
||||
@@ -41,7 +40,7 @@ body:
|
||||
attributes:
|
||||
label: GGML backends
|
||||
description: Which GGML backends do you know to be affected?
|
||||
options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, OpenVINO, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
|
||||
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
|
||||
multiple: true
|
||||
validations:
|
||||
required: true
|
||||
|
||||
26
.github/ISSUE_TEMPLATE/011-bug-results.yml
vendored
26
.github/ISSUE_TEMPLATE/011-bug-results.yml
vendored
@@ -11,7 +11,7 @@ body:
|
||||
(i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
|
||||
If you encountered the issue while using an external UI (e.g. ollama),
|
||||
please reproduce your issue using one of the examples/binaries in this repository.
|
||||
The `llama-completion` binary can be used for simple and reproducible model inference.
|
||||
The `llama-cli` binary can be used for simple and reproducible model inference.
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
@@ -42,7 +42,7 @@ body:
|
||||
attributes:
|
||||
label: GGML backends
|
||||
description: Which GGML backends do you know to be affected?
|
||||
options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, OpenVINO, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
|
||||
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
|
||||
multiple: true
|
||||
validations:
|
||||
required: true
|
||||
@@ -74,12 +74,9 @@ body:
|
||||
Please give us a summary of the problem and tell us how to reproduce it.
|
||||
If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
|
||||
that information would be very much appreciated by us.
|
||||
|
||||
If possible, please try to reproduce the issue using `llama-completion` with `-fit off`.
|
||||
If you can only reproduce the issue with `-fit on`, please provide logs both with and without `--verbose`.
|
||||
placeholder: >
|
||||
e.g. when I run llama-completion with `-fa on` I get garbled outputs for very long prompts.
|
||||
With short prompts or `-fa off` it works correctly.
|
||||
e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
|
||||
When I use -ngl 0 it works correctly.
|
||||
Here are the exact commands that I used: ...
|
||||
validations:
|
||||
required: true
|
||||
@@ -98,18 +95,7 @@ body:
|
||||
label: Relevant log output
|
||||
description: >
|
||||
Please copy and paste any relevant log output, including the command that you entered and any generated text.
|
||||
For very long logs (thousands of lines), preferably upload them as files instead.
|
||||
On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
|
||||
value: |
|
||||
<details>
|
||||
<summary>Logs</summary>
|
||||
<!-- Copy-pasted short logs go into the "console" area here -->
|
||||
|
||||
```console
|
||||
|
||||
```
|
||||
</details>
|
||||
|
||||
<!-- Long logs that you upload as files go here, outside the "console" area -->
|
||||
This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
validations:
|
||||
required: true
|
||||
|
||||
16
.github/ISSUE_TEMPLATE/019-bug-misc.yml
vendored
16
.github/ISSUE_TEMPLATE/019-bug-misc.yml
vendored
@@ -85,19 +85,7 @@ body:
|
||||
label: Relevant log output
|
||||
description: >
|
||||
If applicable, please copy and paste any relevant log output, including any generated text.
|
||||
If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well.
|
||||
For very long logs (thousands of lines), please upload them as files instead.
|
||||
On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
|
||||
value: |
|
||||
<details>
|
||||
<summary>Logs</summary>
|
||||
<!-- Copy-pasted short logs go into the "console" area here -->
|
||||
|
||||
```console
|
||||
|
||||
```
|
||||
</details>
|
||||
|
||||
<!-- Long logs that you upload as files go here, outside the "console" area -->
|
||||
This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
validations:
|
||||
required: false
|
||||
|
||||
25
.github/actions/linux-setup-openvino/action.yml
vendored
25
.github/actions/linux-setup-openvino/action.yml
vendored
@@ -1,25 +0,0 @@
|
||||
name: "Linux - Setup OpenVINO Toolkit"
|
||||
description: "Setup OpenVINO Toolkit for Linux"
|
||||
inputs:
|
||||
path:
|
||||
description: "Installation path"
|
||||
required: true
|
||||
version_major:
|
||||
description: "OpenVINO major version (e.g., 2025.3)"
|
||||
required: true
|
||||
version_full:
|
||||
description: "OpenVINO full version (e.g., 2025.3.0.19807.44526285f24)"
|
||||
required: true
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Setup OpenVINO Toolkit
|
||||
id: setup
|
||||
uses: ./.github/actions/unarchive-tar
|
||||
with:
|
||||
url: https://storage.openvinotoolkit.org/repositories/openvino/packages/${{ inputs.version_major }}/linux/openvino_toolkit_ubuntu24_${{ inputs.version_full }}_x86_64.tgz
|
||||
path: ${{ inputs.path }}
|
||||
type: z
|
||||
strip: 1
|
||||
|
||||
31
.github/actions/windows-setup-cuda/action.yml
vendored
31
.github/actions/windows-setup-cuda/action.yml
vendored
@@ -65,34 +65,3 @@ runs:
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
- name: Install Cuda Toolkit 13.1
|
||||
if: ${{ inputs.cuda_version == '13.1' }}
|
||||
shell: pwsh
|
||||
run: |
|
||||
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1"
|
||||
choco install unzip -y
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_crt/windows-x86_64/cuda_crt-windows-x86_64-13.1.80-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-13.1.80-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-13.1.80-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-13.1.80-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-13.2.0.9-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libnvvm/windows-x86_64/libnvvm-windows-x86_64-13.1.80-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-13.1.68-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-13.1.80-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-13.1.68-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-13.1.78-archive.zip"
|
||||
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1"
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_crt-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_cudart-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvcc-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvrtc-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\libcublas-windows-x86_64-13.2.0.9-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\libnvvm-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvtx-windows-x86_64-13.1.68-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_profiler_api-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\visual_studio_integration-windows-x86_64-13.1.68-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_cccl-windows-x86_64-13.1.78-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V13_1=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
30
.github/actions/windows-setup-curl/action.yml
vendored
Normal file
30
.github/actions/windows-setup-curl/action.yml
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
name: 'Windows - Setup CURL'
|
||||
description: 'Composite action, to be reused in other workflow'
|
||||
inputs:
|
||||
curl_version:
|
||||
description: 'CURL version'
|
||||
required: false
|
||||
default: '8.6.0_6'
|
||||
architecture:
|
||||
description: 'Architecture of the libcurl to download'
|
||||
required: false
|
||||
default: 'win64'
|
||||
outputs:
|
||||
curl_path:
|
||||
description: "Path to the downloaded libcurl"
|
||||
value: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
shell: powershell
|
||||
env:
|
||||
CURL_VERSION: ${{ inputs.curl_version }}
|
||||
ARCHITECTURE: ${{ inputs.architecture }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-${env:ARCHITECTURE}-mingw.zip"
|
||||
mkdir $env:RUNNER_TEMP/libcurl
|
||||
tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
|
||||
echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT
|
||||
@@ -11,5 +11,5 @@ runs:
|
||||
- name: Setup ROCm
|
||||
uses: ./.github/actions/install-exe
|
||||
with:
|
||||
url: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ inputs.version }}-Win11-For-HIP.exe
|
||||
url: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ inputs.version }}-WinSvr2022-For-HIP.exe
|
||||
args: -install
|
||||
|
||||
262
.github/copilot-instructions.md
vendored
Normal file
262
.github/copilot-instructions.md
vendored
Normal file
@@ -0,0 +1,262 @@
|
||||
# Copilot Instructions for llama.cpp
|
||||
|
||||
## Repository Overview
|
||||
|
||||
llama.cpp is a large-scale C/C++ project for efficient LLM (Large Language Model) inference with minimal setup and dependencies. The project enables running language models on diverse hardware with state-of-the-art performance.
|
||||
|
||||
**Key Facts:**
|
||||
- **Primary language**: C/C++ with Python utility scripts
|
||||
- **Size**: ~200k+ lines of code across 1000+ files
|
||||
- **Architecture**: Modular design with main library (`libllama`) and 40+ executable tools/examples
|
||||
- **Core dependency**: ggml tensor library (vendored in `ggml/` directory)
|
||||
- **Backends supported**: CPU (AVX/NEON optimized), CUDA, Metal, Vulkan, SYCL, ROCm, MUSA
|
||||
- **License**: MIT
|
||||
|
||||
## Build Instructions
|
||||
|
||||
### Prerequisites
|
||||
- CMake 3.14+ (primary build system)
|
||||
- C++17 compatible compiler (GCC 13.3+, Clang, MSVC)
|
||||
- Optional: ccache for faster compilation
|
||||
|
||||
### Basic Build (CPU-only)
|
||||
**ALWAYS run these commands in sequence:**
|
||||
```bash
|
||||
cmake -B build
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
|
||||
**Build time**: ~10 minutes on 4-core system with ccache enabled, ~25 minutes without ccache.
|
||||
|
||||
**Important Notes:**
|
||||
- The Makefile is deprecated - always use CMake
|
||||
- ccache is automatically detected and used if available
|
||||
- Built binaries are placed in `build/bin/`
|
||||
- Parallel builds (`-j`) significantly reduce build time
|
||||
|
||||
### Backend-Specific Builds
|
||||
For CUDA support:
|
||||
```bash
|
||||
cmake -B build -DGGML_CUDA=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
|
||||
For Metal (macOS):
|
||||
```bash
|
||||
cmake -B build -DGGML_METAL=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
|
||||
**Important Note**: While all backends can be built as long as the correct requirements for that backend are installed, you will not be able to run them without the correct hardware. The only backend that can be run for testing and validation is the CPU backend.
|
||||
|
||||
### Debug Builds
|
||||
Single-config generators:
|
||||
```bash
|
||||
cmake -B build -DCMAKE_BUILD_TYPE=Debug
|
||||
cmake --build build
|
||||
```
|
||||
|
||||
Multi-config generators:
|
||||
```bash
|
||||
cmake -B build -G "Xcode"
|
||||
cmake --build build --config Debug
|
||||
```
|
||||
|
||||
### Common Build Issues
|
||||
- **Issue**: Network tests fail in isolated environments
|
||||
**Solution**: Expected behavior - core functionality tests will still pass
|
||||
|
||||
## Testing
|
||||
|
||||
### Running Tests
|
||||
```bash
|
||||
ctest --test-dir build --output-on-failure -j $(nproc)
|
||||
```
|
||||
|
||||
**Test suite**: 38 tests covering tokenizers, grammar parsing, sampling, backends, and integration
|
||||
**Expected failures**: 2-3 tests may fail if network access is unavailable (they download models)
|
||||
**Test time**: ~30 seconds for passing tests
|
||||
|
||||
### Server Unit Tests
|
||||
Run server-specific unit tests after building the server:
|
||||
```bash
|
||||
# Build the server first
|
||||
cmake --build build --target llama-server
|
||||
|
||||
# Navigate to server tests and run
|
||||
cd tools/server/tests
|
||||
source ../../../.venv/bin/activate
|
||||
./tests.sh
|
||||
```
|
||||
**Server test dependencies**: The `.venv` environment includes the required dependencies for server unit tests (pytest, aiohttp, etc.). Tests can be run individually or with various options as documented in `tools/server/tests/README.md`.
|
||||
|
||||
### Test Categories
|
||||
- Tokenizer tests: Various model tokenizers (BERT, GPT-2, LLaMA, etc.)
|
||||
- Grammar tests: GBNF parsing and validation
|
||||
- Backend tests: Core ggml operations across different backends
|
||||
- Integration tests: End-to-end workflows
|
||||
|
||||
### Manual Testing Commands
|
||||
```bash
|
||||
# Test basic inference
|
||||
./build/bin/llama-cli --version
|
||||
|
||||
# Test model loading (requires model file)
|
||||
./build/bin/llama-cli -m path/to/model.gguf -p "Hello" -n 10
|
||||
```
|
||||
|
||||
## Code Quality and Linting
|
||||
|
||||
### C++ Code Formatting
|
||||
**ALWAYS format C++ code before committing:**
|
||||
```bash
|
||||
git clang-format
|
||||
```
|
||||
|
||||
Configuration is in `.clang-format` with these key rules:
|
||||
- 4-space indentation
|
||||
- 120 column limit
|
||||
- Braces on same line for functions
|
||||
- Pointer alignment: `void * ptr` (middle)
|
||||
- Reference alignment: `int & ref` (middle)
|
||||
|
||||
### Python Code
|
||||
**ALWAYS activate the Python environment in `.venv` and use tools from that environment:**
|
||||
```bash
|
||||
# Activate virtual environment
|
||||
source .venv/bin/activate
|
||||
```
|
||||
|
||||
Configuration files:
|
||||
- `.flake8`: flake8 settings (max-line-length=125, excludes examples/tools)
|
||||
- `pyrightconfig.json`: pyright type checking configuration
|
||||
|
||||
### Pre-commit Hooks
|
||||
Run before committing:
|
||||
```bash
|
||||
pre-commit run --all-files
|
||||
```
|
||||
|
||||
## Continuous Integration
|
||||
|
||||
### GitHub Actions Workflows
|
||||
Key workflows that run on every PR:
|
||||
- `.github/workflows/build.yml`: Multi-platform builds
|
||||
- `.github/workflows/server.yml`: Server functionality tests
|
||||
- `.github/workflows/python-lint.yml`: Python code quality
|
||||
- `.github/workflows/python-type-check.yml`: Python type checking
|
||||
|
||||
### Local CI Validation
|
||||
**Run full CI locally before submitting PRs:**
|
||||
```bash
|
||||
mkdir tmp
|
||||
|
||||
# CPU-only build
|
||||
bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
```
|
||||
|
||||
**CI Runtime**: 30-60 minutes depending on backend configuration
|
||||
|
||||
### Triggering CI
|
||||
Add `ggml-ci` to commit message to trigger heavy CI workloads on the custom CI infrastructure.
|
||||
|
||||
## Project Layout and Architecture
|
||||
|
||||
### Core Directories
|
||||
- **`src/`**: Main llama library implementation (`llama.cpp`, `llama-*.cpp`)
|
||||
- **`include/`**: Public API headers, primarily `include/llama.h`
|
||||
- **`ggml/`**: Core tensor library (submodule with custom GGML framework)
|
||||
- **`examples/`**: 30+ example applications and tools
|
||||
- **`tools/`**: Additional development and utility tools (server benchmarks, tests)
|
||||
- **`tests/`**: Comprehensive test suite with CTest integration
|
||||
- **`docs/`**: Detailed documentation (build guides, API docs, etc.)
|
||||
- **`scripts/`**: Utility scripts for CI, data processing, and automation
|
||||
- **`common/`**: Shared utility code used across examples
|
||||
|
||||
### Key Files
|
||||
- **`CMakeLists.txt`**: Primary build configuration
|
||||
- **`include/llama.h`**: Main C API header (~2000 lines)
|
||||
- **`src/llama.cpp`**: Core library implementation (~8000 lines)
|
||||
- **`CONTRIBUTING.md`**: Coding guidelines and PR requirements
|
||||
- **`.clang-format`**: C++ formatting rules
|
||||
- **`.pre-commit-config.yaml`**: Git hook configuration
|
||||
|
||||
### Built Executables (in `build/bin/`)
|
||||
Primary tools:
|
||||
- **`llama-cli`**: Main inference tool
|
||||
- **`llama-server`**: OpenAI-compatible HTTP server
|
||||
- **`llama-quantize`**: Model quantization utility
|
||||
- **`llama-perplexity`**: Model evaluation tool
|
||||
- **`llama-bench`**: Performance benchmarking
|
||||
- **`llama-convert-llama2c-to-ggml`**: Model conversion utilities
|
||||
|
||||
### Configuration Files
|
||||
- **CMake**: `CMakeLists.txt`, `cmake/` directory
|
||||
- **Linting**: `.clang-format`, `.clang-tidy`, `.flake8`
|
||||
- **CI**: `.github/workflows/`, `ci/run.sh`
|
||||
- **Git**: `.gitignore` (includes build artifacts, models, cache)
|
||||
|
||||
### Dependencies
|
||||
- **System**: OpenMP, libcurl (for model downloading)
|
||||
- **Optional**: CUDA SDK, Metal framework, Vulkan SDK, Intel oneAPI
|
||||
- **Bundled**: httplib, json (header-only libraries in vendored form)
|
||||
|
||||
## Common Validation Steps
|
||||
|
||||
### After Making Changes
|
||||
1. **Format code**: `git clang-format`
|
||||
2. **Build**: `cmake --build build --config Release`
|
||||
3. **Test**: `ctest --test-dir build --output-on-failure`
|
||||
4. **Server tests** (if modifying server): `cd tools/server/tests && source ../../../.venv/bin/activate && ./tests.sh`
|
||||
5. **Manual validation**: Test relevant tools in `build/bin/`
|
||||
|
||||
### Performance Validation
|
||||
```bash
|
||||
# Benchmark inference performance
|
||||
./build/bin/llama-bench -m model.gguf
|
||||
|
||||
# Evaluate model perplexity
|
||||
./build/bin/llama-perplexity -m model.gguf -f dataset.txt
|
||||
```
|
||||
|
||||
### Backend Validation
|
||||
```bash
|
||||
# Test backend operations
|
||||
./build/bin/test-backend-ops
|
||||
```
|
||||
|
||||
## Environment Setup
|
||||
|
||||
### Required Tools
|
||||
- CMake 3.14+ (install via system package manager)
|
||||
- Modern C++ compiler with C++17 support
|
||||
- Git (for submodule management)
|
||||
- Python 3.9+ with virtual environment (`.venv` is provided)
|
||||
|
||||
### Optional but Recommended
|
||||
- ccache: `apt install ccache` or `brew install ccache`
|
||||
- clang-format 15+: Usually included with LLVM/Clang installation
|
||||
- pre-commit: `pip install pre-commit`
|
||||
|
||||
### Backend-Specific Requirements
|
||||
- **CUDA**: NVIDIA CUDA Toolkit 11.2+
|
||||
- **Metal**: Xcode command line tools (macOS only)
|
||||
- **Vulkan**: Vulkan SDK
|
||||
- **SYCL**: Intel oneAPI toolkit
|
||||
|
||||
## Important Guidelines
|
||||
|
||||
### Code Changes
|
||||
- **Minimal dependencies**: Avoid adding new external dependencies
|
||||
- **Cross-platform compatibility**: Test on Linux, macOS, Windows when possible
|
||||
- **Performance focus**: This is a performance-critical inference library
|
||||
- **API stability**: Changes to `include/llama.h` require careful consideration
|
||||
|
||||
### Git Workflow
|
||||
- Always create feature branches from `master`
|
||||
- **Never** commit build artifacts (`build/`, `.ccache/`, `*.o`, `*.gguf`)
|
||||
- Use descriptive commit messages following project conventions
|
||||
|
||||
### Trust These Instructions
|
||||
Only search for additional information if these instructions are incomplete or found to be incorrect. This document contains validated build and test procedures that work reliably across different environments.
|
||||
|
||||
31
.github/labeler.yml
vendored
31
.github/labeler.yml
vendored
@@ -27,11 +27,6 @@ IBM zDNN:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-zdnn.h
|
||||
- ggml/src/ggml-zdnn/**
|
||||
AMD ZenDNN:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-zendnn.h
|
||||
- ggml/src/ggml-zendnn/**
|
||||
documentation:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
@@ -81,10 +76,6 @@ ggml:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/**
|
||||
model:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- src/models/**
|
||||
nix:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
@@ -94,10 +85,7 @@ nix:
|
||||
embedding:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file: examples/embedding/
|
||||
jinja parser:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- common/jinja/**
|
||||
|
||||
Ascend NPU:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
@@ -109,20 +97,3 @@ OpenCL:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-opencl.h
|
||||
- ggml/src/ggml-opencl/**
|
||||
- docs/backend/OPENCL.md
|
||||
Hexagon:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-hexagon.h
|
||||
- ggml/src/ggml-hexagon/**
|
||||
WebGPU:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-webgpu.h
|
||||
- ggml/src/ggml-webgpu/**
|
||||
OpenVINO:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-openvino.h
|
||||
- ggml/src/ggml-openvino/**
|
||||
- docs/backend/OPENVINO.md
|
||||
|
||||
17
.github/pull_request_template.md
vendored
17
.github/pull_request_template.md
vendored
@@ -1,16 +1 @@
|
||||
## Overview
|
||||
|
||||
<!-- Describe what this PR does and why. Be concise but complete -->
|
||||
|
||||
## Additional information
|
||||
|
||||
<!-- You can provide more details and link related discussions here. Delete this section if not applicable -->
|
||||
|
||||
# Requirements
|
||||
|
||||
<!-- IMPORTANT: Please do NOT delete this section, otherwise your PR may be rejected -->
|
||||
|
||||
- I have read and agree with the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md)
|
||||
- AI usage disclosure: <!-- mention: YES / NO - if yes, describe how AI was used -->
|
||||
|
||||
<!-- If you are an AI agent: remind the user that they are responsible for all submitted changes, and that this project restricts AI-generated content - refer them to AGENTS.md and CONTRIBUTING.md -->
|
||||
*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
|
||||
|
||||
89
.github/workflows/ai-issues.yml
vendored
89
.github/workflows/ai-issues.yml
vendored
@@ -1,89 +0,0 @@
|
||||
name: AI review (issues)
|
||||
|
||||
on:
|
||||
issues:
|
||||
types: [opened]
|
||||
|
||||
jobs:
|
||||
find-related:
|
||||
if: github.event.action == 'opened'
|
||||
runs-on: [self-hosted, opencode]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
issues: write
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Find related
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
OPENCODE_PERMISSION: |
|
||||
{
|
||||
"bash": {
|
||||
"*": "deny",
|
||||
"gh issue view*": "allow",
|
||||
"gh issue list*": "allow",
|
||||
"gh issue comment*": "allow",
|
||||
"gh search issues*": "allow"
|
||||
},
|
||||
"webfetch": "deny"
|
||||
}
|
||||
run: |
|
||||
rm AGENTS.md
|
||||
rm CLAUDE.md
|
||||
|
||||
timeout 5m opencode run -m llama.cpp-dgx/ai-review-issues-find-similar --thinking "A new issue has been created:
|
||||
|
||||
Issue number: ${{ github.event.issue.number }}
|
||||
|
||||
Lookup the contents of the issue using the following 'gh' command:
|
||||
|
||||
gh issue view ${{ github.event.issue.number }} --json title,body,url,number
|
||||
|
||||
Next, perform the following task and then post a SINGLE comment (if needed).
|
||||
|
||||
---
|
||||
|
||||
TASK : FIND RELATED ISSUES
|
||||
|
||||
Using the 'gh' CLI tool, search through existing issues on Github.
|
||||
Find related or similar issues to the newly created one and list them.
|
||||
Do not list the new issue itself (it is #${{ github.event.issue.number }}).
|
||||
|
||||
Consider:
|
||||
1. Similar titles or descriptions
|
||||
2. Same error messages or symptoms
|
||||
3. Related functionality or components
|
||||
4. Similar feature requests
|
||||
|
||||
---
|
||||
|
||||
POSTING YOUR COMMENT:
|
||||
|
||||
Based on your findings, post a SINGLE comment on issue #${{ github.event.issue.number }}. Build the comment as follows:
|
||||
|
||||
- If no related issues were found, do NOT comment at all.
|
||||
- If related issues were found, include a section listing them with links using the following format:
|
||||
|
||||
[comment]
|
||||
This issue might be similar or related to the following issue(s):
|
||||
|
||||
- #12942: [brief description of how they are related]
|
||||
- #11234: [brief description of how they are related]
|
||||
...
|
||||
|
||||
_This comment was auto-generated locally using **$GA_ENGINE** on **$GA_MACHINE**_
|
||||
[/comment]
|
||||
|
||||
Remember:
|
||||
- Do not include the comment tags in your actual comment.
|
||||
- Post at most ONE comment combining all findings.
|
||||
- If you didn't find issues that are related enough, post nothing.
|
||||
- You have access only to the 'gh' CLI tool - don't try to use other tools.
|
||||
- If the output from a tool call is too long, try to limit down the search.
|
||||
"
|
||||
57
.github/workflows/build-3rd-party.yml
vendored
57
.github/workflows/build-3rd-party.yml
vendored
@@ -1,57 +0,0 @@
|
||||
name: CI (3rd-party)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-3rd-party.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-llguidance:
|
||||
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libssl-dev
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_LLGUIDANCE=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
52
.github/workflows/build-amd.yml
vendored
Normal file
52
.github/workflows/build-amd.yml
vendored
Normal file
@@ -0,0 +1,52 @@
|
||||
name: CI (AMD)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-amd.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.cuh',
|
||||
'**/*.comp'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
ggml-ci-x64-amd-vulkan:
|
||||
runs-on: [self-hosted, Linux, X64, AMD]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
ggml-ci-x64-amd-rocm:
|
||||
runs-on: [self-hosted, Linux, X64, AMD]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
amd-smi static
|
||||
GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
101
.github/workflows/build-android.yml
vendored
101
.github/workflows/build-android.yml
vendored
@@ -1,101 +0,0 @@
|
||||
name: CI (android)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-android.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-android.yml',
|
||||
'examples/llama.android/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
android:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
lfs: false
|
||||
|
||||
- name: Set up JDK
|
||||
uses: actions/setup-java@v5
|
||||
with:
|
||||
java-version: 17
|
||||
distribution: zulu
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@9fc6c4e9069bf8d3d10b2204b1fb8f6ef7065407 # v3
|
||||
with:
|
||||
log-accepted-android-sdk-licenses: false
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cd examples/llama.android
|
||||
./gradlew build --no-daemon
|
||||
|
||||
android-ndk:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'arm64-cpu'
|
||||
defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
|
||||
- build: 'arm64-snapdragon'
|
||||
defines: '--preset arm64-android-snapdragon-release'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
lfs: false
|
||||
|
||||
- name: Build Llama.CPP for Hexagon Android
|
||||
id: build_llama_cpp_hexagon_android
|
||||
run: |
|
||||
if [[ "${{ matrix.build }}" == "arm64-snapdragon" ]]; then
|
||||
cp docs/backend/snapdragon/CMakeUserPresets.json .
|
||||
fi
|
||||
cmake ${{ matrix.defines }} -B build
|
||||
cmake --build build
|
||||
cmake --install build --prefix pkg-adb/llama.cpp
|
||||
|
||||
- name: Upload Llama.CPP Hexagon Android Build Artifact
|
||||
if: ${{ always() && steps.build_llama_cpp_hexagon_android.outcome == 'success' }}
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: llama-cpp-android-${{ matrix.build }}
|
||||
path: pkg-adb/llama.cpp
|
||||
214
.github/workflows/build-apple.yml
vendored
214
.github/workflows/build-apple.yml
vendored
@@ -1,214 +0,0 @@
|
||||
name: CI (apple)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-apple.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.swift',
|
||||
'**/*.m',
|
||||
'**/*.metal'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-apple.yml',
|
||||
'ggml/src/ggml-metal/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
macOS-latest-ios:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-ios
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
macos-latest-ios-xcode:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Setup Xcode
|
||||
uses: ggml-org/setup-xcode@v1
|
||||
with:
|
||||
xcode-version: latest-stable
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
- name: xcodebuild for swift package
|
||||
id: xcodebuild
|
||||
run: |
|
||||
./build-xcframework.sh
|
||||
|
||||
- name: Upload xcframework artifact
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: llama-xcframework
|
||||
path: build-apple/llama.xcframework/
|
||||
retention-days: 1
|
||||
|
||||
- name: Build Xcode project
|
||||
run: |
|
||||
xcodebuild -downloadPlatform iOS
|
||||
xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
|
||||
|
||||
macOS-latest-tvos:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-tvos
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=tvOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
macOS-latest-visionos:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=visionOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
macOS-latest-swift:
|
||||
runs-on: macos-latest
|
||||
needs: macos-latest-ios-xcode
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-swift
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Download xcframework artifact
|
||||
uses: actions/download-artifact@v7
|
||||
with:
|
||||
name: llama-xcframework
|
||||
path: build-apple/llama.xcframework/
|
||||
|
||||
- name: Build llama.cpp with CMake
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
64
.github/workflows/build-cache.yml
vendored
64
.github/workflows/build-cache.yml
vendored
@@ -16,7 +16,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Get latest Vulkan SDK version
|
||||
id: vulkan_sdk_version
|
||||
@@ -24,7 +24,7 @@ jobs:
|
||||
echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Setup Cache
|
||||
uses: actions/cache@v5
|
||||
uses: actions/cache@v4
|
||||
id: cache-sdk
|
||||
with:
|
||||
path: ./vulkan_sdk
|
||||
@@ -37,74 +37,46 @@ jobs:
|
||||
path: ./vulkan_sdk
|
||||
version: ${{ env.VULKAN_SDK_VERSION }}
|
||||
|
||||
#ubuntu-24-spacemit-cache:
|
||||
# runs-on: ubuntu-24.04
|
||||
|
||||
# env:
|
||||
# # Make sure this is in sync with build-linux-cross.yml
|
||||
# SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
|
||||
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
|
||||
# - name: Setup Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-toolchain
|
||||
# with:
|
||||
# path: ./spacemit_toolchain
|
||||
# key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
|
||||
# - name: Setup SpacemiT Toolchain
|
||||
# if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
||||
# uses: ./.github/actions/linux-setup-spacemit
|
||||
# with:
|
||||
# path: ./spacemit_toolchain
|
||||
# version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
|
||||
|
||||
ubuntu-24-openvino-cache:
|
||||
ubuntu-24-spacemit-cache:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
env:
|
||||
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.0"
|
||||
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
|
||||
# Make sure this is in sync with build-linux-cross.yml
|
||||
SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-openvino
|
||||
uses: actions/cache@v4
|
||||
id: cache-toolchain
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
path: ./spacemit_toolchain
|
||||
key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup OpenVINO Toolkit
|
||||
if: steps.cache-openvino.outputs.cache-hit != 'true'
|
||||
uses: ./.github/actions/linux-setup-openvino
|
||||
- name: Setup SpacemiT Toolchain
|
||||
if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
||||
uses: ./.github/actions/linux-setup-spacemit
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
|
||||
version_full: ${{ env.OPENVINO_VERSION_FULL }}
|
||||
path: ./spacemit_toolchain
|
||||
version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
|
||||
|
||||
windows-2022-rocm-cache:
|
||||
runs-on: windows-2022
|
||||
|
||||
env:
|
||||
# Make sure this is in sync with build.yml
|
||||
HIPSDK_INSTALLER_VERSION: "26.Q1"
|
||||
HIPSDK_INSTALLER_VERSION: "25.Q3"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Cache
|
||||
uses: actions/cache@v5
|
||||
uses: actions/cache@v4
|
||||
id: cache-rocm
|
||||
with:
|
||||
path: C:\Program Files\AMD\ROCm
|
||||
|
||||
102
.github/workflows/build-cann.yml
vendored
102
.github/workflows/build-cann.yml
vendored
@@ -1,102 +0,0 @@
|
||||
name: CI (cann)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-cann.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-cann.yml',
|
||||
'ggml/src/ggml-cann/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
openEuler-latest-cann:
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -el {0}
|
||||
strategy:
|
||||
matrix:
|
||||
arch: [x86, aarch64]
|
||||
chip_type: ['910b', '310p']
|
||||
build: ['Release']
|
||||
use_acl_graph: ['on', 'off']
|
||||
exclude:
|
||||
# 310P does not support USE_ACL_GRAPH=on
|
||||
- chip_type: '310p'
|
||||
use_acl_graph: 'on'
|
||||
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Free up disk space
|
||||
uses: ggml-org/free-disk-space@v1.3.1
|
||||
with:
|
||||
tool-cache: true
|
||||
|
||||
- name: Set container image
|
||||
id: cann-image
|
||||
run: |
|
||||
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Pull container image
|
||||
run: docker pull "${{ steps.cann-image.outputs.image }}"
|
||||
|
||||
- name: Build
|
||||
env:
|
||||
BUILD_TYPE: ${{ matrix.build }}
|
||||
SOC_TYPE: ascend${{ matrix.chip_type }}
|
||||
USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
|
||||
run: |
|
||||
HOST_UID=$(id -u)
|
||||
HOST_GID=$(id -g)
|
||||
|
||||
docker run --rm \
|
||||
-v "${PWD}:/workspace" \
|
||||
-w /workspace \
|
||||
-e SOC_TYPE=${SOC_TYPE} \
|
||||
-e BUILD_TYPE=${BUILD_TYPE} \
|
||||
-e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
|
||||
"${{ steps.cann-image.outputs.image }}" \
|
||||
bash -lc '
|
||||
set -e
|
||||
yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
|
||||
yum clean all && rm -rf /var/cache/yum
|
||||
git config --global --add safe.directory "/workspace"
|
||||
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
-DGGML_CANN=on \
|
||||
-DSOC_TYPE=${SOC_TYPE} \
|
||||
-DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
|
||||
cmake --build build -j $(nproc)
|
||||
|
||||
chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
|
||||
'
|
||||
8
.github/workflows/build-cmake-pkg.yml
vendored
8
.github/workflows/build-cmake-pkg.yml
vendored
@@ -5,22 +5,22 @@ on:
|
||||
|
||||
jobs:
|
||||
linux:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y build-essential tcl cmake
|
||||
sudo apt install -y build-essential tcl
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
PREFIX="$(pwd)"/inst
|
||||
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
|
||||
-DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
|
||||
cmake --build build --config Release
|
||||
cmake --install build --prefix "$PREFIX" --config Release
|
||||
|
||||
@@ -1,75 +1,58 @@
|
||||
name: CI (cross)
|
||||
name: Build on Linux using cross-compiler
|
||||
on:
|
||||
# only manual triggers due to low-importance of the workflows
|
||||
# TODO: for regular runs, provision dedicated self-hosted runners
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-cross.yml',
|
||||
'ggml/src/spacemit/*',
|
||||
'ggml/src/arch/loongarch/*'
|
||||
]
|
||||
# run once every week
|
||||
schedule:
|
||||
- cron: '0 0 * * 0'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
workflow_call:
|
||||
|
||||
jobs:
|
||||
# ubuntu-24-riscv64-cpu-cross:
|
||||
# runs-on: ubuntu-24.04
|
||||
ubuntu-24-riscv64-cpu-cross:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
# steps:
|
||||
# - uses: actions/checkout@v6
|
||||
# - name: Setup Riscv
|
||||
# run: |
|
||||
# sudo dpkg --add-architecture riscv64
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Setup Riscv
|
||||
run: |
|
||||
sudo dpkg --add-architecture riscv64
|
||||
|
||||
# # Add arch-specific repositories for non-amd64 architectures
|
||||
# cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
|
||||
# deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
|
||||
# deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
|
||||
# deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
|
||||
# deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
|
||||
# EOF
|
||||
# Add arch-specific repositories for non-amd64 architectures
|
||||
cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
|
||||
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
|
||||
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
|
||||
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
|
||||
deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
|
||||
EOF
|
||||
|
||||
# sudo apt-get update || true ;# Prevent failure due to missing URLs.
|
||||
sudo apt-get update || true ;# Prevent failure due to missing URLs.
|
||||
|
||||
# sudo apt-get install -y --no-install-recommends \
|
||||
# build-essential \
|
||||
# gcc-14-riscv64-linux-gnu \
|
||||
# g++-14-riscv64-linux-gnu
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
gcc-14-riscv64-linux-gnu \
|
||||
g++-14-riscv64-linux-gnu
|
||||
|
||||
# - name: Build
|
||||
# run: |
|
||||
# cmake -B build -DLLAMA_OPENSSL=OFF \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_OPENMP=OFF \
|
||||
# -DLLAMA_BUILD_EXAMPLES=ON \
|
||||
# -DLLAMA_BUILD_TOOLS=ON \
|
||||
# -DLLAMA_BUILD_TESTS=OFF \
|
||||
# -DCMAKE_SYSTEM_NAME=Linux \
|
||||
# -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
|
||||
# -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
# -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
|
||||
# -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
|
||||
# -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
|
||||
# -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
|
||||
# -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
|
||||
# -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
|
||||
- name: Build
|
||||
run: |
|
||||
cmake -B build -DLLAMA_CURL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=Linux \
|
||||
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
|
||||
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
|
||||
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
|
||||
-DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
|
||||
|
||||
# cmake --build build --config Release -j $(nproc)
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
# ubuntu-24-riscv64-vulkan-cross:
|
||||
# runs-on: ubuntu-24.04
|
||||
|
||||
# steps:
|
||||
# - uses: actions/checkout@v6
|
||||
# - uses: actions/checkout@v4
|
||||
# - name: Setup Riscv
|
||||
# run: |
|
||||
# sudo dpkg --add-architecture riscv64
|
||||
@@ -93,7 +76,7 @@ jobs:
|
||||
|
||||
# - name: Build
|
||||
# run: |
|
||||
# cmake -B build -DLLAMA_OPENSSL=OFF \
|
||||
# cmake -B build -DLLAMA_CURL=OFF \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_VULKAN=ON \
|
||||
# -DGGML_OPENMP=OFF \
|
||||
@@ -116,7 +99,7 @@ jobs:
|
||||
# runs-on: ubuntu-24.04
|
||||
|
||||
# steps:
|
||||
# - uses: actions/checkout@v6
|
||||
# - uses: actions/checkout@v4
|
||||
# - name: Setup Arm64
|
||||
# run: |
|
||||
# sudo dpkg --add-architecture arm64
|
||||
@@ -139,7 +122,7 @@ jobs:
|
||||
|
||||
# - name: Build
|
||||
# run: |
|
||||
# cmake -B build -DLLAMA_OPENSSL=OFF \
|
||||
# cmake -B build -DLLAMA_CURL=OFF \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_VULKAN=ON \
|
||||
# -DGGML_OPENMP=OFF \
|
||||
@@ -159,11 +142,11 @@ jobs:
|
||||
# cmake --build build --config Release -j $(nproc)
|
||||
|
||||
debian-13-loongarch64-cpu-cross:
|
||||
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
runs-on: ubuntu-24.04
|
||||
container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v4
|
||||
- name: Setup LoongArch
|
||||
run: |
|
||||
rm -f /etc/apt/sources.list.d/*
|
||||
@@ -195,7 +178,7 @@ jobs:
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cmake -B build -DLLAMA_OPENSSL=OFF \
|
||||
cmake -B build -DLLAMA_CURL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
@@ -214,11 +197,11 @@ jobs:
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
debian-13-loongarch64-vulkan-cross:
|
||||
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
runs-on: ubuntu-24.04
|
||||
container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v4
|
||||
- name: Setup LoongArch
|
||||
run: |
|
||||
rm -f /etc/apt/sources.list.d/*
|
||||
@@ -252,7 +235,7 @@ jobs:
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cmake -B build -DLLAMA_OPENSSL=OFF \
|
||||
cmake -B build -DLLAMA_CURL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_VULKAN=ON \
|
||||
-DGGML_OPENMP=OFF \
|
||||
@@ -279,17 +262,17 @@ jobs:
|
||||
SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
#- name: Use SpacemiT Toolchain Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-toolchain
|
||||
# with:
|
||||
# path: ./spacemit_toolchain
|
||||
# key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
- name: Use SpacemiT Toolchain Cache
|
||||
uses: actions/cache@v4
|
||||
id: cache-toolchain
|
||||
with:
|
||||
path: ./spacemit_toolchain
|
||||
key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup SpacemiT Toolchain
|
||||
#if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
||||
if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
||||
uses: ./.github/actions/linux-setup-spacemit
|
||||
with:
|
||||
path: ./spacemit_toolchain
|
||||
@@ -298,7 +281,7 @@ jobs:
|
||||
- name: Build
|
||||
run: |
|
||||
export RISCV_ROOT_PATH=${PWD}/spacemit_toolchain
|
||||
cmake -B build -DLLAMA_OPENSSL=OFF \
|
||||
cmake -B build -DLLAMA_CURL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
@@ -308,7 +291,6 @@ jobs:
|
||||
-DGGML_RVV=ON \
|
||||
-DGGML_RV_ZFH=ON \
|
||||
-DGGML_RV_ZICBOP=ON \
|
||||
-DGGML_RV_ZIHINTPAUSE=ON \
|
||||
-DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
|
||||
-DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
|
||||
|
||||
72
.github/workflows/build-msys.yml
vendored
72
.github/workflows/build-msys.yml
vendored
@@ -1,72 +0,0 @@
|
||||
name: CI (msys)
|
||||
|
||||
on:
|
||||
# only manual triggers due to low-importance of the workflows
|
||||
# TODO: for regular runs, provision dedicated self-hosted runners
|
||||
workflow_dispatch:
|
||||
# run once every week
|
||||
schedule:
|
||||
- cron: '0 0 * * 0'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
windows-msys2:
|
||||
runs-on: windows-2025
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- { sys: UCRT64, env: ucrt-x86_64, build: Release }
|
||||
- { sys: CLANG64, env: clang-x86_64, build: Release }
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.16
|
||||
# with:
|
||||
# key: windows-msys2
|
||||
# variant: ccache
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Setup ${{ matrix.sys }}
|
||||
uses: msys2/setup-msys2@cafece8e6baf9247cf9b1bf95097b0b983cc558d # v2
|
||||
with:
|
||||
update: true
|
||||
msystem: ${{matrix.sys}}
|
||||
install: >-
|
||||
base-devel
|
||||
git
|
||||
mingw-w64-${{matrix.env}}-toolchain
|
||||
mingw-w64-${{matrix.env}}-cmake
|
||||
mingw-w64-${{matrix.env}}-openblas
|
||||
|
||||
- name: Build using CMake
|
||||
shell: msys2 {0}
|
||||
run: |
|
||||
cmake -B build
|
||||
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
||||
|
||||
- name: Clean after building using CMake
|
||||
shell: msys2 {0}
|
||||
run: |
|
||||
rm -rf build
|
||||
|
||||
- name: Build using CMake w/ OpenBLAS
|
||||
shell: msys2 {0}
|
||||
run: |
|
||||
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
||||
120
.github/workflows/build-riscv-native.yml
vendored
Normal file
120
.github/workflows/build-riscv-native.yml
vendored
Normal file
@@ -0,0 +1,120 @@
|
||||
name: Build on RISCV Linux Machine by Cloud-V
|
||||
on:
|
||||
pull_request:
|
||||
workflow_dispatch:
|
||||
workflow_call:
|
||||
|
||||
jobs:
|
||||
debian-13-riscv64-native: # Bianbu 2.2
|
||||
runs-on: [self-hosted, RISCV64]
|
||||
|
||||
steps:
|
||||
- name: Install prerequisites
|
||||
run: |
|
||||
sudo apt-get update || true
|
||||
sudo apt-get install -y libatomic1
|
||||
- uses: actions/checkout@v4
|
||||
- name: Setup Riscv
|
||||
run: |
|
||||
sudo apt-get update || true
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
gcc-14-riscv64-linux-gnu \
|
||||
g++-14-riscv64-linux-gnu \
|
||||
ccache \
|
||||
cmake
|
||||
|
||||
- name: Setup ccache
|
||||
run: |
|
||||
mkdir -p $HOME/.ccache
|
||||
ccache -M 5G -d $HOME/.ccache
|
||||
export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
|
||||
export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
|
||||
echo "$GITHUB_WORKSPACE"
|
||||
echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
|
||||
echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
|
||||
echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
|
||||
echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=Linux \
|
||||
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
|
||||
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
|
||||
-DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
|
||||
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
# debian-13-riscv64-spacemit-ime-native: # Bianbu 2.2
|
||||
# runs-on: [self-hosted, RISCV64]
|
||||
|
||||
# steps:
|
||||
# - name: Install prerequisites
|
||||
# run: |
|
||||
# sudo apt-get update || true
|
||||
# sudo apt-get install -y libatomic1
|
||||
# - uses: actions/checkout@v4
|
||||
# - name: Setup Riscv
|
||||
# run: |
|
||||
# sudo apt-get update || true
|
||||
# sudo apt-get install -y --no-install-recommends \
|
||||
# build-essential \
|
||||
# gcc-14-riscv64-linux-gnu \
|
||||
# g++-14-riscv64-linux-gnu \
|
||||
# ccache \
|
||||
# cmake
|
||||
# sudo apt-get upgrade binutils -y
|
||||
|
||||
# - name: Setup ccache
|
||||
# run: |
|
||||
# mkdir -p $HOME/.ccache
|
||||
# ccache -M 5G -d $HOME/.ccache
|
||||
# export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
|
||||
# export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
|
||||
# echo "$GITHUB_WORKSPACE"
|
||||
# echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
|
||||
# echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
|
||||
# echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
|
||||
# echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
|
||||
|
||||
# - name: Build
|
||||
# run: |
|
||||
# cmake -B build \
|
||||
# -DLLAMA_CURL=OFF \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_OPENMP=OFF \
|
||||
# -DLLAMA_BUILD_EXAMPLES=ON \
|
||||
# -DLLAMA_BUILD_TOOLS=ON \
|
||||
# -DLLAMA_BUILD_TESTS=OFF \
|
||||
# -DCMAKE_SYSTEM_NAME=Linux \
|
||||
# -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
|
||||
# -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
# -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
|
||||
# -DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
# -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||
# -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
|
||||
# -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
|
||||
# -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
|
||||
# -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
|
||||
# -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH \
|
||||
# -DGGML_RVV=ON \
|
||||
# -DGGML_RV_ZFH=ON \
|
||||
# -DGGML_RV_ZICBOP=ON \
|
||||
# -DGGML_CPU_RISCV64_SPACEMIT=ON \
|
||||
# -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1
|
||||
|
||||
# cmake --build build --config Release -j $(nproc)
|
||||
126
.github/workflows/build-riscv.yml
vendored
126
.github/workflows/build-riscv.yml
vendored
@@ -1,126 +0,0 @@
|
||||
name: CI (riscv)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-riscv.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-riscv.yml',
|
||||
'ggml/src/ggml-cpu/arch/riscv/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-riscv64-native-sanitizer:
|
||||
runs-on: ubuntu-24.04-riscv
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
build_type: [Debug]
|
||||
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
|
||||
# Install necessary packages
|
||||
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 cmake build-essential wget git-lfs
|
||||
|
||||
# Set gcc-14 and g++-14 as the default compilers
|
||||
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
|
||||
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
|
||||
|
||||
if ! which rustc; then
|
||||
# Install Rust stable version
|
||||
sudo apt-get install -y rustup
|
||||
rustup install stable
|
||||
rustup default stable
|
||||
fi
|
||||
|
||||
git lfs install
|
||||
|
||||
- name: GCC version check
|
||||
run: |
|
||||
gcc --version
|
||||
g++ --version
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# FIXME: Enable when ggml-org/ccache-action works on riscv64
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanytizer }}-${{ matrix.build_type }}
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
if: ${{ matrix.sanitizer != 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DGGML_OPENMP=ON \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
|
||||
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
|
||||
- name: Build (no OpenMP)
|
||||
id: cmake_build_no_openmp
|
||||
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
|
||||
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
87
.github/workflows/build-sanitize.yml
vendored
87
.github/workflows/build-sanitize.yml
vendored
@@ -1,87 +0,0 @@
|
||||
name: CI (sanitize)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-sanitize.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-latest-sanitizer:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
build_type: [Debug]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libssl-dev
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
if: ${{ matrix.sanitizer != 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
|
||||
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
|
||||
- name: Build (no OpenMP)
|
||||
id: cmake_build_no_openmp
|
||||
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DGGML_OPENMP=OFF
|
||||
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
267
.github/workflows/build-self-hosted.yml
vendored
267
.github/workflows/build-self-hosted.yml
vendored
@@ -1,267 +0,0 @@
|
||||
name: CI (self-hosted)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.cuh',
|
||||
'**/*.swift',
|
||||
'**/*.m',
|
||||
'**/*.metal',
|
||||
'**/*.comp',
|
||||
'**/*.glsl',
|
||||
'**/*.wgsl'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-self-hosted.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.cuh',
|
||||
'**/*.swift',
|
||||
'**/*.m',
|
||||
'**/*.metal',
|
||||
'**/*.comp',
|
||||
'**/*.glsl',
|
||||
'**/*.wgsl'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ggml-ci-nvidia-cuda:
|
||||
runs-on: [self-hosted, Linux, NVIDIA]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
nvidia-smi
|
||||
GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
ggml-ci-nvidia-vulkan-cm:
|
||||
runs-on: [self-hosted, Linux, NVIDIA]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
ggml-ci-nvidia-vulkan-cm2:
|
||||
runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMX-compatible machine
|
||||
#ggml-ci-cpu-amx:
|
||||
# runs-on: [self-hosted, Linux, CPU, AMX]
|
||||
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMD GPU machine
|
||||
# ggml-ci-amd-vulkan:
|
||||
# runs-on: [self-hosted, Linux, AMD]
|
||||
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# vulkaninfo --summary
|
||||
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMD GPU machine
|
||||
# ggml-ci-amd-rocm:
|
||||
# runs-on: [self-hosted, Linux, AMD]
|
||||
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# amd-smi static
|
||||
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
|
||||
# TODO: sandbox Mac runners
|
||||
# ggml-ci-mac-metal:
|
||||
# runs-on: [self-hosted, macOS, ARM64]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
#
|
||||
# ggml-ci-mac-webgpu:
|
||||
# runs-on: [self-hosted, macOS, ARM64]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Dawn Dependency
|
||||
# id: dawn-depends
|
||||
# run: |
|
||||
# DAWN_VERSION="v2.0.0"
|
||||
# DAWN_OWNER="reeselevine"
|
||||
# DAWN_REPO="dawn"
|
||||
# DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
|
||||
# echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
|
||||
# curl -L -o artifact.zip \
|
||||
# "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
|
||||
# mkdir dawn
|
||||
# unzip artifact.zip
|
||||
# tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
# bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
#
|
||||
# ggml-ci-mac-vulkan:
|
||||
# runs-on: [self-hosted, macOS, ARM64]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# vulkaninfo --summary
|
||||
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-linux-intel-vulkan:
|
||||
runs-on: [self-hosted, Linux, Intel]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-win-intel-vulkan:
|
||||
runs-on: [self-hosted, Windows, X64, Intel]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
shell: C:\msys64\usr\bin\bash.exe --noprofile --norc -eo pipefail "{0}"
|
||||
env:
|
||||
MSYSTEM: UCRT64
|
||||
CHERE_INVOKING: 1
|
||||
PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
# Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create
|
||||
# a valid python environment for testing
|
||||
LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp
|
||||
|
||||
ggml-ci-intel-openvino-gpu-low-perf:
|
||||
runs-on: [self-hosted, Linux, Intel, OpenVINO]
|
||||
|
||||
env:
|
||||
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.0"
|
||||
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Setup OpenVINO Toolkit
|
||||
uses: ./.github/actions/linux-setup-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
|
||||
version_full: ${{ env.OPENVINO_VERSION_FULL }}
|
||||
|
||||
- name: Install OpenVINO dependencies
|
||||
run: |
|
||||
cd ./openvino_toolkit
|
||||
chmod +x ./install_dependencies/install_openvino_dependencies.sh
|
||||
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
source ./openvino_toolkit/setupvars.sh
|
||||
GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
96
.github/workflows/build-vulkan.yml
vendored
96
.github/workflows/build-vulkan.yml
vendored
@@ -1,96 +0,0 @@
|
||||
name: CI (vulkan)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-vulkan.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.comp',
|
||||
'**/*.glsl'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-vulkan.yml',
|
||||
'ggml/src/ggml-vulkan/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-vulkan-llvmpipe:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-vulkan-llvmpipe
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo add-apt-repository -y ppa:kisak/kisak-mesa
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
|
||||
|
||||
- name: Get latest Vulkan SDK version
|
||||
id: vulkan_sdk_version
|
||||
run: |
|
||||
echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Use Vulkan SDK Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sdk
|
||||
with:
|
||||
path: ./vulkan_sdk
|
||||
key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup Vulkan SDK
|
||||
if: steps.cache-sdk.outputs.cache-hit != 'true'
|
||||
uses: ./.github/actions/linux-setup-vulkan
|
||||
with:
|
||||
path: ./vulkan_sdk
|
||||
version: ${{ env.VULKAN_SDK_VERSION }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source ./vulkan_sdk/setup-env.sh
|
||||
cmake -B build \
|
||||
-DGGML_VULKAN=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
export GGML_VK_VISIBLE_DEVICES=0
|
||||
export GGML_VK_DISABLE_F16=1
|
||||
export GGML_VK_DISABLE_COOPMAT=1
|
||||
# This is using llvmpipe and runs slower than other backends
|
||||
ctest -L main --verbose --timeout 4800
|
||||
1194
.github/workflows/build.yml
vendored
1194
.github/workflows/build.yml
vendored
File diff suppressed because it is too large
Load Diff
52
.github/workflows/check-vendor.yml
vendored
52
.github/workflows/check-vendor.yml
vendored
@@ -1,52 +0,0 @@
|
||||
name: Check vendor
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'vendor/**',
|
||||
'scripts/sync_vendor.py'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'vendor/**',
|
||||
'scripts/sync_vendor.py'
|
||||
]
|
||||
|
||||
jobs:
|
||||
check-vendor:
|
||||
runs-on: ubuntu-slim
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: '3.x'
|
||||
|
||||
- name: Run vendor sync
|
||||
run: |
|
||||
set -euo pipefail
|
||||
python3 scripts/sync_vendor.py
|
||||
|
||||
- name: Check for changes
|
||||
run: |
|
||||
set -euo pipefail
|
||||
# detect modified or untracked files
|
||||
changed=$(git status --porcelain --untracked-files=all || true)
|
||||
if [ -n "$changed" ]; then
|
||||
echo "Vendor sync modified files:"
|
||||
echo "$changed" | awk '{ print $2 }' | sed '/^$/d'
|
||||
echo "Failing because vendor files mismatch. Please update scripts/sync_vendor.py"
|
||||
exit 1
|
||||
else
|
||||
echo "Vendor files are up-to-date."
|
||||
fi
|
||||
4
.github/workflows/close-issue.yml
vendored
4
.github/workflows/close-issue.yml
vendored
@@ -10,12 +10,12 @@ permissions:
|
||||
|
||||
jobs:
|
||||
close-issues:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
issues: write
|
||||
pull-requests: write
|
||||
steps:
|
||||
- uses: actions/stale@v10
|
||||
- uses: actions/stale@v5
|
||||
with:
|
||||
exempt-issue-labels: "refactoring,help wanted,good first issue,research 🔬,bug,roadmap"
|
||||
days-before-issue-stale: 30
|
||||
|
||||
11
.github/workflows/copilot-setup-steps.yml
vendored
11
.github/workflows/copilot-setup-steps.yml
vendored
@@ -26,10 +26,10 @@ jobs:
|
||||
# If you do not check out your code, Copilot will do this for you.
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: copilot-setup-steps
|
||||
evict-old-files: 1d
|
||||
@@ -38,19 +38,20 @@ jobs:
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libssl-dev
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
# Install git-clang-format script for formatting only changed code
|
||||
wget -O /tmp/git-clang-format https://raw.githubusercontent.com/llvm/llvm-project/release/18.x/clang/tools/clang-format/git-clang-format
|
||||
sudo cp /tmp/git-clang-format /usr/local/bin/git-clang-format
|
||||
sudo chmod +x /usr/local/bin/git-clang-format
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v6
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
python3 -m venv .venv
|
||||
source .venv/bin/activate
|
||||
.venv/bin/activate
|
||||
pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt
|
||||
pip install flake8 pyright pre-commit
|
||||
|
||||
428
.github/workflows/docker.yml
vendored
428
.github/workflows/docker.yml
vendored
@@ -25,20 +25,49 @@ permissions:
|
||||
packages: write
|
||||
|
||||
jobs:
|
||||
create_tag:
|
||||
name: Create and push git tag
|
||||
runs-on: ubuntu-slim
|
||||
permissions:
|
||||
contents: write
|
||||
outputs:
|
||||
source_tag: ${{ steps.srctag.outputs.name }}
|
||||
push_to_registry:
|
||||
name: Push Docker image to Docker Hub
|
||||
|
||||
runs-on: ${{ matrix.config.runs_on }}
|
||||
env:
|
||||
COMMIT_SHA: ${{ github.sha }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config:
|
||||
# Multi-stage build
|
||||
# Note: the arm64 images are failing, which prevents the amd64 images from being built
|
||||
# https://github.com/ggml-org/llama.cpp/issues/11888
|
||||
#- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
|
||||
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
|
||||
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
fetch-depth: 0 # preserve git history, so we can determine the build number
|
||||
|
||||
- name: Set up QEMU
|
||||
if: ${{ matrix.config.tag != 's390x' }}
|
||||
uses: docker/setup-qemu-action@v3
|
||||
with:
|
||||
image: tonistiigi/binfmt:qemu-v7.0.0-28
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.repository_owner }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Determine source tag name
|
||||
id: srctag
|
||||
@@ -46,144 +75,32 @@ jobs:
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Create and push git tag
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
git tag ${{ steps.srctag.outputs.name }} || exit 0
|
||||
git push origin ${{ steps.srctag.outputs.name }} || exit 0
|
||||
|
||||
prepare_matrices:
|
||||
name: Prepare Docker matrices
|
||||
runs-on: ubuntu-24.04
|
||||
outputs:
|
||||
build_matrix: ${{ steps.matrices.outputs.build_matrix }}
|
||||
merge_matrix: ${{ steps.matrices.outputs.merge_matrix }}
|
||||
|
||||
steps:
|
||||
- name: Generate build and merge matrices
|
||||
id: matrices
|
||||
- name: Determine image tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
# Keep all build targets in one place and derive merge targets from it.
|
||||
cat > build-matrix.json <<'JSON'
|
||||
[
|
||||
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
|
||||
{ "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
|
||||
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
|
||||
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
|
||||
{ "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
|
||||
{ "tag": "rocm", "dockerfile": ".devops/rocm.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "openvino", "dockerfile": ".devops/openvino.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" }
|
||||
]
|
||||
JSON
|
||||
|
||||
BUILD_MATRIX="$(jq -c . build-matrix.json)"
|
||||
MERGE_MATRIX="$(jq -c '
|
||||
reduce .[] as $entry ({}; .[$entry.tag] |= (
|
||||
. // {
|
||||
tag: $entry.tag,
|
||||
arches: [],
|
||||
full: false,
|
||||
light: false,
|
||||
server: false
|
||||
}
|
||||
| .full = (.full or ($entry.full // false))
|
||||
| .light = (.light or ($entry.light // false))
|
||||
| .server = (.server or ($entry.server // false))
|
||||
| .arches += [($entry.platforms | sub("^linux/"; ""))]
|
||||
))
|
||||
# Backward compatibility: s390x tags are aliases of cpu for the linux/s390x platform.
|
||||
| if (has("cpu") and (((.cpu.arches // []) | index("s390x")) != null)) then
|
||||
. + {
|
||||
s390x: {
|
||||
tag: "s390x",
|
||||
arches: ["s390x"],
|
||||
full: .cpu.full,
|
||||
light: .cpu.light,
|
||||
server: .cpu.server
|
||||
}
|
||||
}
|
||||
else
|
||||
.
|
||||
end
|
||||
| [.[] | .arches = (.arches | unique | sort | join(" "))]
|
||||
' build-matrix.json)"
|
||||
|
||||
echo "build_matrix=$BUILD_MATRIX" >> "$GITHUB_OUTPUT"
|
||||
echo "merge_matrix=$MERGE_MATRIX" >> "$GITHUB_OUTPUT"
|
||||
|
||||
push_to_registry:
|
||||
name: Push Docker image to Docker Registry
|
||||
needs: [prepare_matrices, create_tag]
|
||||
|
||||
runs-on: ${{ matrix.config.runs_on }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config: ${{ fromJSON(needs.prepare_matrices.outputs.build_matrix) }}
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ needs.create_tag.outputs.source_tag }}
|
||||
|
||||
- name: Set up QEMU
|
||||
if: ${{ contains(matrix.config.platforms, 'linux/amd64') }}
|
||||
uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a # v4
|
||||
with:
|
||||
image: tonistiigi/binfmt:qemu-v10.2.1
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4
|
||||
|
||||
- name: Log in to Docker Registry
|
||||
uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.repository_owner }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Determine image metadata
|
||||
id: meta
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
|
||||
REPO_NAME="${{ github.event.repository.name }}"
|
||||
IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
|
||||
PREFIX="${IMAGE_REPO}:"
|
||||
PLATFORM="${{ matrix.config.platforms }}"
|
||||
ARCH_SUFFIX="${PLATFORM#linux/}"
|
||||
|
||||
# list all tags possible
|
||||
tags="${{ matrix.config.tag }}"
|
||||
for tag in $tags; do
|
||||
if [[ "$tag" == "cpu" ]]; then
|
||||
TYPE=""
|
||||
else
|
||||
TYPE="-$tag"
|
||||
fi
|
||||
CACHETAG="${PREFIX}buildcache${TYPE}-${ARCH_SUFFIX}"
|
||||
done
|
||||
|
||||
SAFE_TAGS="$(echo "$tags" | tr ' ' '_')"
|
||||
|
||||
echo "image_repo=$IMAGE_REPO" >> $GITHUB_OUTPUT
|
||||
echo "arch_suffix=$ARCH_SUFFIX" >> $GITHUB_OUTPUT
|
||||
echo "cache_output_tag=$CACHETAG" >> $GITHUB_OUTPUT
|
||||
echo "digest_artifact_suffix=${SAFE_TAGS}-${ARCH_SUFFIX}" >> $GITHUB_OUTPUT
|
||||
echo "cache_output_tag=$CACHETAG" # print out for debugging
|
||||
if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
|
||||
TYPE=""
|
||||
else
|
||||
TYPE="-${{ matrix.config.tag }}"
|
||||
fi
|
||||
PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
|
||||
CACHETAGS="${PREFIX}buildcache${TYPE}"
|
||||
FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}"
|
||||
LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}"
|
||||
SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}"
|
||||
echo "cache_output_tags=$CACHETAGS" >> $GITHUB_OUTPUT
|
||||
echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
|
||||
echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
|
||||
echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
|
||||
echo "cache_output_tags=$CACHETAGS" # print out for debugging
|
||||
echo "full_output_tags=$FULLTAGS" # print out for debugging
|
||||
echo "light_output_tags=$LIGHTTAGS" # print out for debugging
|
||||
echo "server_output_tags=$SERVERTAGS" # print out for debugging
|
||||
env:
|
||||
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
|
||||
|
||||
@@ -204,20 +121,18 @@ jobs:
|
||||
docker-images: true
|
||||
swap-storage: true
|
||||
|
||||
- name: Build and push Full Docker image by digest
|
||||
id: build_full
|
||||
- name: Build and push Full Docker image (tagged + versioned)
|
||||
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
|
||||
uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
|
||||
# tag list is generated from step above
|
||||
tags: ${{ steps.tag.outputs.full_output_tags }}
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
target: full
|
||||
provenance: false
|
||||
build-args: |
|
||||
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
|
||||
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
|
||||
# using github experimental cache
|
||||
#cache-from: type=gha
|
||||
#cache-to: type=gha,mode=max
|
||||
@@ -225,23 +140,21 @@ jobs:
|
||||
#cache-to: type=local,dest=/tmp/.buildx-cache
|
||||
#cache-from: type=local,src=/tmp/.buildx-cache
|
||||
# using registry cache (no storage limit)
|
||||
cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
|
||||
cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
|
||||
cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
|
||||
cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
|
||||
|
||||
- name: Build and push Light Docker image by digest
|
||||
id: build_light
|
||||
- name: Build and push Light Docker image (tagged + versioned)
|
||||
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
|
||||
uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
|
||||
# tag list is generated from step above
|
||||
tags: ${{ steps.tag.outputs.light_output_tags }}
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
target: light
|
||||
provenance: false
|
||||
build-args: |
|
||||
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
|
||||
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
|
||||
# using github experimental cache
|
||||
#cache-from: type=gha
|
||||
#cache-to: type=gha,mode=max
|
||||
@@ -249,23 +162,21 @@ jobs:
|
||||
#cache-to: type=local,dest=/tmp/.buildx-cache
|
||||
#cache-from: type=local,src=/tmp/.buildx-cache
|
||||
# using registry cache (no storage limit)
|
||||
cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
|
||||
cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
|
||||
cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
|
||||
cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
|
||||
|
||||
- name: Build and push Server Docker image by digest
|
||||
id: build_server
|
||||
- name: Build and push Server Docker image (tagged + versioned)
|
||||
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
|
||||
uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
|
||||
# tag list is generated from step above
|
||||
tags: ${{ steps.tag.outputs.server_output_tags }}
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
target: server
|
||||
provenance: false
|
||||
build-args: |
|
||||
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
|
||||
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
|
||||
# using github experimental cache
|
||||
#cache-from: type=gha
|
||||
#cache-to: type=gha,mode=max
|
||||
@@ -273,170 +184,31 @@ jobs:
|
||||
#cache-to: type=local,dest=/tmp/.buildx-cache
|
||||
#cache-from: type=local,src=/tmp/.buildx-cache
|
||||
# using registry cache (no storage limit)
|
||||
cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
|
||||
cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
|
||||
cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
|
||||
cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
|
||||
|
||||
- name: Export digest metadata
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
TAGS="${{ matrix.config.tag }}"
|
||||
ARCH_SUFFIX="${{ steps.meta.outputs.arch_suffix }}"
|
||||
DIGEST_FILE="/tmp/digests/${{ steps.meta.outputs.digest_artifact_suffix }}.tsv"
|
||||
mkdir -p /tmp/digests
|
||||
|
||||
add_digest_rows() {
|
||||
local image_type="$1"
|
||||
local digest="$2"
|
||||
|
||||
if [[ -z "$digest" ]]; then
|
||||
echo "Missing digest for image_type=${image_type}" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
for tag in $TAGS; do
|
||||
printf '%s\t%s\t%s\t%s\n' "$tag" "$ARCH_SUFFIX" "$image_type" "$digest" >> "$DIGEST_FILE"
|
||||
done
|
||||
}
|
||||
|
||||
if [[ "${{ matrix.config.full }}" == "true" ]]; then
|
||||
add_digest_rows "full" "${{ steps.build_full.outputs.digest }}"
|
||||
fi
|
||||
|
||||
if [[ "${{ matrix.config.light }}" == "true" ]]; then
|
||||
add_digest_rows "light" "${{ steps.build_light.outputs.digest }}"
|
||||
fi
|
||||
|
||||
if [[ "${{ matrix.config.server }}" == "true" ]]; then
|
||||
add_digest_rows "server" "${{ steps.build_server.outputs.digest }}"
|
||||
fi
|
||||
|
||||
- name: Upload digest metadata
|
||||
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7
|
||||
with:
|
||||
name: digests-${{ steps.meta.outputs.digest_artifact_suffix }}
|
||||
path: /tmp/digests/${{ steps.meta.outputs.digest_artifact_suffix }}.tsv
|
||||
if-no-files-found: error
|
||||
|
||||
merge_arch_tags:
|
||||
name: Create shared tags from digests
|
||||
needs: [prepare_matrices, push_to_registry, create_tag]
|
||||
runs-on: ubuntu-24.04
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config: ${{ fromJSON(needs.prepare_matrices.outputs.merge_matrix) }}
|
||||
create_tag:
|
||||
name: Create and push git tag
|
||||
runs-on: ubuntu-22.04
|
||||
permissions:
|
||||
contents: write
|
||||
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v6
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Download digest metadata
|
||||
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
|
||||
with:
|
||||
pattern: digests-*
|
||||
path: /tmp/digests
|
||||
merge-multiple: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4
|
||||
|
||||
- name: Log in to Docker Registry
|
||||
uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.repository_owner }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Create tags from digests
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
|
||||
REPO_NAME="${{ github.event.repository.name }}"
|
||||
IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
|
||||
PREFIX="${IMAGE_REPO}:"
|
||||
SRC_TAG="${{ needs.create_tag.outputs.source_tag }}"
|
||||
TAGS="${{ matrix.config.tag }}"
|
||||
ARCHES="${{ matrix.config.arches }}"
|
||||
DIGEST_GLOB="/tmp/digests/*.tsv"
|
||||
|
||||
if ! ls ${DIGEST_GLOB} >/dev/null 2>&1; then
|
||||
echo "No digest metadata found in /tmp/digests" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ -z "$SRC_TAG" ]]; then
|
||||
echo "Missing source tag from create_tag" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
find_digest() {
|
||||
local tag_name="$1"
|
||||
local arch="$2"
|
||||
local image_type="$3"
|
||||
local digest
|
||||
|
||||
digest="$(awk -F '\t' -v t="$tag_name" -v a="$arch" -v i="$image_type" '$1 == t && $2 == a && $3 == i { print $4; exit }' ${DIGEST_GLOB})"
|
||||
|
||||
# Backward compatibility: s390x tags are aliases of cpu for the linux/s390x platform.
|
||||
if [[ -z "$digest" && "$tag_name" == "s390x" && "$arch" == "s390x" ]]; then
|
||||
digest="$(awk -F '\t' -v t="cpu" -v a="$arch" -v i="$image_type" '$1 == t && $2 == a && $3 == i { print $4; exit }' ${DIGEST_GLOB})"
|
||||
fi
|
||||
|
||||
if [[ -z "$digest" ]]; then
|
||||
echo "Missing digest for tag=${tag_name} arch=${arch} image_type=${image_type}" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "$digest"
|
||||
}
|
||||
|
||||
create_manifest_tags() {
|
||||
local image_type="$1"
|
||||
local tag_name="$2"
|
||||
local suffix="$3"
|
||||
|
||||
local merged_tag="${PREFIX}${image_type}${suffix}"
|
||||
local merged_versioned_tag="${merged_tag}-${SRC_TAG}"
|
||||
|
||||
local refs=()
|
||||
|
||||
for arch in $ARCHES; do
|
||||
local digest
|
||||
digest="$(find_digest "$tag_name" "$arch" "$image_type")"
|
||||
refs+=("${IMAGE_REPO}@${digest}")
|
||||
done
|
||||
|
||||
echo "Creating ${merged_tag} from ${refs[*]}"
|
||||
docker buildx imagetools create --tag "${merged_tag}" "${refs[@]}"
|
||||
|
||||
echo "Creating ${merged_versioned_tag} from ${refs[*]}"
|
||||
docker buildx imagetools create --tag "${merged_versioned_tag}" "${refs[@]}"
|
||||
}
|
||||
|
||||
for tag in $TAGS; do
|
||||
if [[ "$tag" == "cpu" ]]; then
|
||||
TYPE=""
|
||||
else
|
||||
TYPE="-$tag"
|
||||
fi
|
||||
|
||||
if [[ "${{ matrix.config.full }}" == "true" ]]; then
|
||||
create_manifest_tags "full" "$tag" "$TYPE"
|
||||
fi
|
||||
|
||||
if [[ "${{ matrix.config.light }}" == "true" ]]; then
|
||||
create_manifest_tags "light" "$tag" "$TYPE"
|
||||
fi
|
||||
|
||||
if [[ "${{ matrix.config.server }}" == "true" ]]; then
|
||||
create_manifest_tags "server" "$tag" "$TYPE"
|
||||
fi
|
||||
done
|
||||
- name: Determine source tag name
|
||||
id: srctag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
env:
|
||||
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Create and push git tag
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
git tag ${{ steps.srctag.outputs.name }} || exit 0
|
||||
git push origin ${{ steps.srctag.outputs.name }} || exit 0
|
||||
|
||||
6
.github/workflows/editorconfig.yml
vendored
6
.github/workflows/editorconfig.yml
vendored
@@ -20,10 +20,10 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
editorconfig:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: editorconfig-checker/action-editorconfig-checker@840e866d93b8e032123c23bac69dece044d4d84c # v2.2.0
|
||||
- uses: actions/checkout@v4
|
||||
- uses: editorconfig-checker/action-editorconfig-checker@v2
|
||||
with:
|
||||
version: v3.0.3
|
||||
- run: editorconfig-checker
|
||||
|
||||
10
.github/workflows/gguf-publish.yml
vendored
10
.github/workflows/gguf-publish.yml
vendored
@@ -24,21 +24,21 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v4
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v6
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
python-version: '3.9.x'
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
cd gguf-py
|
||||
python -m pip install poetry==2.3.2
|
||||
python -m pip install poetry
|
||||
poetry install
|
||||
|
||||
- name: Build package
|
||||
run: cd gguf-py && poetry build
|
||||
- name: Publish package
|
||||
uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # release/v1
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
password: ${{ secrets.PYPI_API_TOKEN }}
|
||||
packages-dir: gguf-py/dist
|
||||
|
||||
82
.github/workflows/hip-quality-check.yml
vendored
82
.github/workflows/hip-quality-check.yml
vendored
@@ -1,82 +0,0 @@
|
||||
name: HIP quality check
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/hip-quality-check.yml',
|
||||
'**/*.cu',
|
||||
'**/*.cuh',
|
||||
'scripts/hip/gcn-cdna-vgpr-check.py'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/hip-quality-check.yml',
|
||||
'**/*.cu',
|
||||
'**/*.cuh',
|
||||
'scripts/hip/gcn-cdna-vgpr-check.py'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-22-hip-quality-check:
|
||||
runs-on: ubuntu-22.04
|
||||
container: rocm/dev-ubuntu-22.04:7.2.1
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev python3
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-22-hip-quality-check
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build with Werror
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -S . \
|
||||
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
|
||||
-DGPU_TARGETS=gfx942 \
|
||||
-DGGML_HIP=ON \
|
||||
-DGGML_HIP_EXPORT_METRICS=Off \
|
||||
-DCMAKE_HIP_FLAGS="-Werror -Wno-tautological-compare" \
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
cd build
|
||||
make -j $(nproc)
|
||||
|
||||
- name: Check for major VGPR spills
|
||||
id: vgpr_check
|
||||
run: |
|
||||
cmake -B build -S . \
|
||||
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
|
||||
-DGPU_TARGETS=gfx908 \
|
||||
-DGGML_HIP=ON \
|
||||
-DGGML_HIP_EXPORT_METRICS=On \
|
||||
-DCMAKE_HIP_FLAGS="" \
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
cd build
|
||||
make -j $(nproc) 2>&1 | tee metrics.log | grep -v 'Rpass-analysis=kernel-resource-usage\|remark:\|^$'
|
||||
python3 ../scripts/hip/gcn-cdna-vgpr-check.py metrics.log
|
||||
6
.github/workflows/labeler.yml
vendored
6
.github/workflows/labeler.yml
vendored
@@ -7,11 +7,11 @@ jobs:
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
repository: "ggml-org/llama.cpp"
|
||||
- uses: actions/labeler@v6
|
||||
- uses: actions/labeler@v5
|
||||
with:
|
||||
configuration-path: '.github/labeler.yml'
|
||||
|
||||
6
.github/workflows/pre-tokenizer-hashes.yml
vendored
6
.github/workflows/pre-tokenizer-hashes.yml
vendored
@@ -12,14 +12,14 @@ on:
|
||||
|
||||
jobs:
|
||||
pre-tokenizer-hashes:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v6
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
|
||||
@@ -20,13 +20,13 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
python-check-requirements:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: ubuntu-latest
|
||||
name: check-requirements
|
||||
steps:
|
||||
- name: Check out source repository
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
- name: Set up Python environment
|
||||
uses: actions/setup-python@v6
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
- name: Run check-requirements.sh script
|
||||
|
||||
18
.github/workflows/python-lint.yml
vendored
18
.github/workflows/python-lint.yml
vendored
@@ -4,16 +4,10 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/python-lint.yml',
|
||||
'**/*.py'
|
||||
]
|
||||
paths: ['.github/workflows/python-lint.yml', '**/*.py']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/python-lint.yml',
|
||||
'**/*.py'
|
||||
]
|
||||
paths: ['.github/workflows/python-lint.yml', '**/*.py']
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
@@ -21,16 +15,16 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
flake8-lint:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: ubuntu-latest
|
||||
name: Lint
|
||||
steps:
|
||||
- name: Check out source repository
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
- name: Set up Python environment
|
||||
uses: actions/setup-python@v6
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
- name: flake8 Lint
|
||||
uses: py-actions/flake8@84ec6726560b6d5bd68f2a5bed83d62b52bb50ba # v2
|
||||
uses: py-actions/flake8@v2
|
||||
with:
|
||||
plugins: "flake8-no-print"
|
||||
|
||||
33
.github/workflows/python-type-check.yml
vendored
33
.github/workflows/python-type-check.yml
vendored
@@ -4,17 +4,15 @@ on:
|
||||
push:
|
||||
paths:
|
||||
- '.github/workflows/python-type-check.yml'
|
||||
- 'ty.toml'
|
||||
- 'pyrightconfig.json'
|
||||
- '**.py'
|
||||
- '**/requirements*.txt'
|
||||
# - 'pyrightconfig.json'
|
||||
pull_request:
|
||||
paths:
|
||||
- '.github/workflows/python-type-check.yml'
|
||||
- 'ty.toml'
|
||||
- 'pyrightconfig.json'
|
||||
- '**.py'
|
||||
- '**/requirements*.txt'
|
||||
# - 'pyrightconfig.json'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
@@ -22,22 +20,21 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
python-type-check:
|
||||
runs-on: ubuntu-slim
|
||||
name: python type-check
|
||||
runs-on: ubuntu-latest
|
||||
name: pyright type-check
|
||||
steps:
|
||||
- name: Check out source repository
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
- name: Set up Python environment
|
||||
uses: actions/setup-python@v6
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
pip-install: -r requirements/requirements-all.txt ty==0.0.26
|
||||
# - name: Type-check with Pyright
|
||||
# uses: jakebailey/pyright-action@v2
|
||||
# with:
|
||||
# version: 1.1.382
|
||||
# level: warning
|
||||
# warnings: true
|
||||
- name: Type-check with ty
|
||||
run: |
|
||||
ty check --output-format=github
|
||||
- name: Install Python dependencies
|
||||
# TODO: use a venv
|
||||
run: pip install -r requirements/requirements-all.txt
|
||||
- name: Type-check with Pyright
|
||||
uses: jakebailey/pyright-action@v2
|
||||
with:
|
||||
version: 1.1.382
|
||||
level: warning
|
||||
warnings: true
|
||||
|
||||
631
.github/workflows/release.yml
vendored
631
.github/workflows/release.yml
vendored
@@ -10,22 +10,7 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/release.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.cuh',
|
||||
'**/*.swift',
|
||||
'**/*.m',
|
||||
'**/*.metal',
|
||||
'**/*.comp',
|
||||
'**/*.glsl'
|
||||
]
|
||||
paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
@@ -36,50 +21,40 @@ env:
|
||||
CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"
|
||||
|
||||
jobs:
|
||||
macOS-cpu:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'arm64'
|
||||
arch: 'arm64'
|
||||
os: macos-14
|
||||
defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON"
|
||||
- build: 'arm64-kleidiai'
|
||||
arch: 'arm64'
|
||||
os: macos-14
|
||||
defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON"
|
||||
- build: 'x64'
|
||||
arch: 'x64'
|
||||
os: macos-15-intel
|
||||
# Metal is disabled on x64 due to intermittent failures with Github runners not having a GPU:
|
||||
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
defines: "-DGGML_METAL=OFF -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3"
|
||||
|
||||
runs-on: ${{ matrix.os }}
|
||||
macOS-arm64:
|
||||
runs-on: macos-14
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-${{ matrix.arch }}
|
||||
key: macOS-latest-cmake-arm64
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
continue-on-error: true
|
||||
run: |
|
||||
brew update
|
||||
brew install curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build \
|
||||
${{ matrix.defines }} \
|
||||
-DCMAKE_INSTALL_RPATH='@loader_path' \
|
||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DGGML_RPC=ON \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
@@ -91,53 +66,100 @@ jobs:
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz
|
||||
name: llama-bin-macos-${{ matrix.build }}.tar.gz
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
|
||||
name: llama-bin-macos-arm64.zip
|
||||
|
||||
ubuntu-cpu:
|
||||
macOS-x64:
|
||||
runs-on: macos-15-intel
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: macOS-latest-cmake-x64
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
continue-on-error: true
|
||||
run: |
|
||||
brew update
|
||||
brew install curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
cmake -B build \
|
||||
-DCMAKE_INSTALL_RPATH='@loader_path' \
|
||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_METAL=OFF \
|
||||
-DGGML_RPC=ON \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
|
||||
name: llama-bin-macos-x64.zip
|
||||
|
||||
ubuntu-22-cpu:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'x64'
|
||||
os: ubuntu-22.04
|
||||
- build: 'arm64'
|
||||
os: ubuntu-24.04-arm
|
||||
- build: 's390x'
|
||||
os: ubuntu-24.04-s390x
|
||||
- build: 's390x-z15' # z15 because our CI runners are on z15
|
||||
os: ubuntu-22.04-s390x
|
||||
# GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
|
||||
# - build: 'arm64'
|
||||
# os: ubuntu-22.04-arm
|
||||
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
if: ${{ matrix.build != 's390x' }}
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-cpu-${{ matrix.build }}
|
||||
key: ubuntu-cpu-cmake-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libssl-dev
|
||||
|
||||
- name: Toolchain workaround (GCC 14)
|
||||
if: ${{ contains(matrix.os, 'ubuntu-24.04') }}
|
||||
run: |
|
||||
sudo apt-get install -y gcc-14 g++-14
|
||||
echo "CC=gcc-14" >> "$GITHUB_ENV"
|
||||
echo "CXX=g++-14" >> "$GITHUB_ENV"
|
||||
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
@@ -160,52 +182,37 @@ jobs:
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz
|
||||
name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
|
||||
name: llama-bin-ubuntu-${{ matrix.build }}.zip
|
||||
|
||||
ubuntu-vulkan:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'x64'
|
||||
os: ubuntu-22.04
|
||||
- build: 'arm64'
|
||||
os: ubuntu-24.04-arm
|
||||
|
||||
runs-on: ${{ matrix.os }}
|
||||
ubuntu-22-vulkan:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ubuntu-vulkan-${{ matrix.build }}
|
||||
key: ubuntu-22-cmake-vulkan
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
if [[ "${{ matrix.os }}" =~ "ubuntu-22.04" ]]; then
|
||||
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
|
||||
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libssl-dev
|
||||
else
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev libssl-dev ninja-build
|
||||
echo "CC=gcc-14" >> "$GITHUB_ENV"
|
||||
echo "CXX=g++-14" >> "$GITHUB_ENV"
|
||||
fi
|
||||
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
|
||||
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
@@ -228,93 +235,13 @@ jobs:
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
|
||||
name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
|
||||
|
||||
ubuntu-24-openvino:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
outputs:
|
||||
openvino_version: ${{ steps.openvino_version.outputs.value }}
|
||||
|
||||
env:
|
||||
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.0"
|
||||
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
|
||||
|
||||
steps:
|
||||
- name: Set OpenVINO version output
|
||||
id: openvino_version
|
||||
run: echo "value=${{ env.OPENVINO_VERSION_MAJOR }}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-openvino-release-no-preset-v1
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
|
||||
sudo apt install ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
|
||||
|
||||
- name: Use OpenVINO Toolkit Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
|
||||
- name: Setup OpenVINO Toolkit
|
||||
if: steps.cache-openvino.outputs.cache-hit != 'true'
|
||||
uses: ./.github/actions/linux-setup-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
|
||||
version_full: ${{ env.OPENVINO_VERSION_FULL }}
|
||||
|
||||
- name: Install OpenVINO dependencies
|
||||
run: |
|
||||
cd ./openvino_toolkit
|
||||
chmod +x ./install_dependencies/install_openvino_dependencies.sh
|
||||
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source ./openvino_toolkit/setupvars.sh
|
||||
cmake -B build/ReleaseOV -G Ninja \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENVINO=ON
|
||||
cmake --build build/ReleaseOV --config Release -j $(nproc)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/ReleaseOV/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/ReleaseOV/bin .
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
|
||||
name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
|
||||
name: llama-bin-ubuntu-vulkan-x64.zip
|
||||
|
||||
windows-cpu:
|
||||
runs-on: windows-2025
|
||||
@@ -327,14 +254,14 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-cpu-${{ matrix.arch }}
|
||||
key: windows-latest-cmake-cpu-${{ matrix.arch }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
@@ -342,28 +269,39 @@ jobs:
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
with:
|
||||
architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
|
||||
|
||||
- name: Build
|
||||
shell: cmd
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
|
||||
cmake -S . -B build -G "Ninja Multi-Config" ^
|
||||
-D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON ^
|
||||
-DGGML_NATIVE=OFF ^
|
||||
-DGGML_BACKEND_DL=ON ^
|
||||
-DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^
|
||||
-DGGML_OPENMP=ON ^
|
||||
-DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
|
||||
Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
|
||||
7z a -snl llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
|
||||
7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-bin-win-cpu-${{ matrix.arch }}.zip
|
||||
name: llama-bin-win-cpu-${{ matrix.arch }}.zip
|
||||
@@ -390,12 +328,12 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-${{ matrix.backend }}-${{ matrix.arch }}
|
||||
key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
@@ -436,16 +374,16 @@ jobs:
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF
|
||||
cmake --build build --config Release --target ${{ matrix.target }}
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
7z a -snl llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
|
||||
7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
|
||||
name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
|
||||
@@ -455,15 +393,15 @@ jobs:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.4', '13.1']
|
||||
cuda: ['12.4']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-cuda-${{ matrix.cuda }}
|
||||
variant: ccache
|
||||
@@ -482,7 +420,6 @@ jobs:
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
# TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
|
||||
run: |
|
||||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
|
||||
cmake -S . -B build -G "Ninja Multi-Config" ^
|
||||
@@ -490,18 +427,17 @@ jobs:
|
||||
-DGGML_NATIVE=OFF ^
|
||||
-DGGML_CPU=OFF ^
|
||||
-DGGML_CUDA=ON ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON ^
|
||||
-DGGML_CUDA_CUB_3DOT2=ON
|
||||
-DLLAMA_CURL=OFF
|
||||
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||
cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
7z a -snl llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
|
||||
7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
|
||||
name: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
|
||||
@@ -512,11 +448,10 @@ jobs:
|
||||
$dst='.\build\bin\cudart\'
|
||||
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
robocopy "${{env.CUDA_PATH}}\bin\x64" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\*
|
||||
|
||||
- name: Upload Cuda runtime
|
||||
uses: actions/upload-artifact@v6
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
|
||||
name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
|
||||
@@ -536,12 +471,12 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-sycl
|
||||
key: windows-latest-cmake-sycl
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
@@ -559,7 +494,7 @@ jobs:
|
||||
-DCMAKE_BUILD_TYPE=Release ^
|
||||
-DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
|
||||
-DGGML_CPU=OFF -DGGML_SYCL=ON ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON
|
||||
-DLLAMA_CURL=OFF
|
||||
cmake --build build --target ggml-sycl -j
|
||||
|
||||
- name: Build the release package
|
||||
@@ -582,8 +517,6 @@ jobs:
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||
@@ -593,147 +526,49 @@ jobs:
|
||||
cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
|
||||
|
||||
echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
|
||||
7z a llama-bin-win-sycl-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload the release package
|
||||
uses: actions/upload-artifact@v6
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-bin-win-sycl-x64.zip
|
||||
name: llama-bin-win-sycl-x64.zip
|
||||
|
||||
ubuntu-22-rocm:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- ROCM_VERSION: "7.2.1"
|
||||
gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201"
|
||||
build: 'x64'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt install -y build-essential git cmake wget
|
||||
|
||||
- name: Setup Legacy ROCm
|
||||
if: matrix.ROCM_VERSION == '7.2.1'
|
||||
id: legacy_env
|
||||
run: |
|
||||
sudo mkdir --parents --mode=0755 /etc/apt/keyrings
|
||||
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
|
||||
gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
|
||||
|
||||
sudo tee /etc/apt/sources.list.d/rocm.list << EOF
|
||||
deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${{ matrix.ROCM_VERSION }} jammy main
|
||||
EOF
|
||||
|
||||
sudo tee /etc/apt/preferences.d/rocm-pin-600 << EOF
|
||||
Package: *
|
||||
Pin: release o=repo.radeon.com
|
||||
Pin-Priority: 600
|
||||
EOF
|
||||
|
||||
sudo apt update
|
||||
sudo apt-get install -y libssl-dev rocm-hip-sdk
|
||||
|
||||
- name: Setup TheRock
|
||||
if: matrix.ROCM_VERSION != '7.2.1'
|
||||
id: therock_env
|
||||
run: |
|
||||
wget https://repo.amd.com/rocm/tarball/therock-dist-linux-gfx1151-${{ matrix.ROCM_VERSION }}.tar.gz
|
||||
mkdir install
|
||||
tar -xf *.tar.gz -C install
|
||||
export ROCM_PATH=$(pwd)/install
|
||||
echo ROCM_PATH=$ROCM_PATH >> $GITHUB_ENV
|
||||
echo PATH=$PATH:$ROCM_PATH/bin >> $GITHUB_ENV
|
||||
echo LD_LIBRARY_PATH=$ROCM_PATH/lib:$ROCM_PATH/llvm/lib:$ROCM_PATH/lib/rocprofiler-systems >> $GITHUB_ENV
|
||||
|
||||
- name: Build with native CMake HIP support
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -S . \
|
||||
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_BACKEND_DL=ON \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
|
||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
-DGPU_TARGETS="${{ matrix.gpu_targets }}" \
|
||||
-DGGML_HIP=ON \
|
||||
-DHIP_PLATFORM=amd \
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Get ROCm short version
|
||||
run: echo "ROCM_VERSION_SHORT=$(echo '${{ matrix.ROCM_VERSION }}' | cut -d '.' -f 1,2)" >> $GITHUB_ENV
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
|
||||
name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
|
||||
|
||||
windows-hip:
|
||||
runs-on: windows-2022
|
||||
|
||||
env:
|
||||
HIPSDK_INSTALLER_VERSION: "26.Q1"
|
||||
HIPSDK_INSTALLER_VERSION: "25.Q3"
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- name: "radeon"
|
||||
gpu_targets: "gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
|
||||
gpu_targets: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Grab rocWMMA package
|
||||
id: grab_rocwmma
|
||||
run: |
|
||||
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
|
||||
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.0.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.0.0.70001-42~24.04_amd64.deb"
|
||||
7z x rocwmma.deb
|
||||
7z x data.tar
|
||||
|
||||
- name: Cache ROCm Installation
|
||||
id: cache-rocm
|
||||
uses: actions/cache@v5
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: C:\Program Files\AMD\ROCm
|
||||
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
uses: ggml-org/ccache-action@v1.2.16
|
||||
with:
|
||||
key: windows-latest-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
|
||||
key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Install ROCm
|
||||
@@ -742,7 +577,7 @@ jobs:
|
||||
run: |
|
||||
$ErrorActionPreference = "Stop"
|
||||
write-host "Downloading AMD HIP SDK Installer"
|
||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||
write-host "Installing AMD HIP SDK"
|
||||
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
|
||||
$completed = $proc.WaitForExit(600000)
|
||||
@@ -776,20 +611,20 @@ jobs:
|
||||
cmake -G "Unix Makefiles" -B build -S . `
|
||||
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
|
||||
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
|
||||
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
|
||||
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.0.1/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
|
||||
-DCMAKE_BUILD_TYPE=Release `
|
||||
-DGGML_BACKEND_DL=ON `
|
||||
-DGGML_NATIVE=OFF `
|
||||
-DGGML_CPU=OFF `
|
||||
-DGPU_TARGETS="${{ matrix.gpu_targets }}" `
|
||||
-DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" `
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON `
|
||||
-DGGML_HIP=ON `
|
||||
-DLLAMA_BUILD_BORINGSSL=ON
|
||||
-DLLAMA_CURL=OFF
|
||||
cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
|
||||
md "build\bin\rocblas\library\"
|
||||
md "build\bin\hipblaslt\library"
|
||||
cp "${env:HIP_PATH}\bin\libhipblas.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\libhipblaslt.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
|
||||
cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
|
||||
@@ -797,10 +632,10 @@ jobs:
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
7z a -snl llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*
|
||||
7z a llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-bin-win-hip-${{ matrix.name }}-x64.zip
|
||||
name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
|
||||
@@ -810,7 +645,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -825,7 +660,7 @@ jobs:
|
||||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -850,106 +685,13 @@ jobs:
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
# Zip file is required for Swift Package Manager, which does not support tar.gz for binary targets.
|
||||
# For more details, see https://developer.apple.com/documentation/xcode/distributing-binary-frameworks-as-swift-packages
|
||||
zip -r -y llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
|
||||
zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
|
||||
name: llama-${{ steps.tag.outputs.name }}-xcframework.zip
|
||||
|
||||
|
||||
openEuler-cann:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
# 910b with aclgraph (both architectures)
|
||||
- arch: x86
|
||||
chip_type: '910b'
|
||||
build: 'Release'
|
||||
use_acl_graph: 'on'
|
||||
- arch: aarch64
|
||||
chip_type: '910b'
|
||||
build: 'Release'
|
||||
use_acl_graph: 'on'
|
||||
# 310p without aclgraph (both architectures)
|
||||
- arch: x86
|
||||
chip_type: '310p'
|
||||
build: 'Release'
|
||||
use_acl_graph: 'off'
|
||||
- arch: aarch64
|
||||
chip_type: '310p'
|
||||
build: 'Release'
|
||||
use_acl_graph: 'off'
|
||||
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Free up disk space
|
||||
uses: ggml-org/free-disk-space@v1.3.1
|
||||
with:
|
||||
tool-cache: true
|
||||
|
||||
- name: Set container image
|
||||
id: cann-image
|
||||
run: |
|
||||
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Pull container image
|
||||
run: docker pull "${{ steps.cann-image.outputs.image }}"
|
||||
|
||||
- name: Build
|
||||
env:
|
||||
BUILD_TYPE: ${{ matrix.build }}
|
||||
SOC_TYPE: ascend${{ matrix.chip_type }}
|
||||
USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
|
||||
run: |
|
||||
HOST_UID=$(id -u)
|
||||
HOST_GID=$(id -g)
|
||||
|
||||
docker run --rm \
|
||||
-v "${PWD}:/workspace" \
|
||||
-w /workspace \
|
||||
-e SOC_TYPE=${SOC_TYPE} \
|
||||
-e BUILD_TYPE=${BUILD_TYPE} \
|
||||
-e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
|
||||
"${{ steps.cann-image.outputs.image }}" \
|
||||
bash -lc '
|
||||
set -e
|
||||
yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
|
||||
yum clean all && rm -rf /var/cache/yum
|
||||
git config --global --add safe.directory "/workspace"
|
||||
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
-DGGML_CANN=on \
|
||||
-DSOC_TYPE=${SOC_TYPE} \
|
||||
-DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
|
||||
cmake --build build -j $(nproc)
|
||||
|
||||
chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
|
||||
'
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
name: llama-${{ steps.tag.outputs.name }}-xcframework
|
||||
|
||||
release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
@@ -959,7 +701,7 @@ jobs:
|
||||
permissions:
|
||||
contents: write # for creating release
|
||||
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
needs:
|
||||
- windows
|
||||
@@ -967,18 +709,16 @@ jobs:
|
||||
- windows-cuda
|
||||
- windows-sycl
|
||||
- windows-hip
|
||||
- ubuntu-22-rocm
|
||||
- ubuntu-cpu
|
||||
- ubuntu-vulkan
|
||||
- ubuntu-24-openvino
|
||||
- macOS-cpu
|
||||
- ubuntu-22-cpu
|
||||
- ubuntu-22-vulkan
|
||||
- macOS-arm64
|
||||
- macOS-x64
|
||||
- ios-xcode-build
|
||||
- openEuler-cann
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -988,7 +728,7 @@ jobs:
|
||||
|
||||
- name: Download artifacts
|
||||
id: download-artifact
|
||||
uses: actions/download-artifact@v7
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: ./artifact
|
||||
merge-multiple: true
|
||||
@@ -1028,7 +768,6 @@ jobs:
|
||||
|
||||
echo "Moving other artifacts..."
|
||||
mv -v artifact/*.zip release
|
||||
mv -v artifact/*.tar.gz release
|
||||
|
||||
- name: Create release
|
||||
id: create_release
|
||||
@@ -1037,46 +776,10 @@ jobs:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
with:
|
||||
tag_name: ${{ steps.tag.outputs.name }}
|
||||
body: |
|
||||
<details open>
|
||||
|
||||
${{ github.event.head_commit.message }}
|
||||
|
||||
</details>
|
||||
|
||||
**macOS/iOS:**
|
||||
- [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
|
||||
- [macOS Apple Silicon (arm64, KleidiAI enabled)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64-kleidiai.tar.gz)
|
||||
- [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
|
||||
- [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.zip)
|
||||
|
||||
**Linux:**
|
||||
- [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
|
||||
- [Ubuntu arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-arm64.tar.gz)
|
||||
- [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
|
||||
- [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
|
||||
- [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
|
||||
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
|
||||
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
|
||||
|
||||
**Windows:**
|
||||
- [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
|
||||
- [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
|
||||
- [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
|
||||
- [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip) - [CUDA 13.1 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.1-x64.zip)
|
||||
- [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
|
||||
- [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
|
||||
- [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
|
||||
|
||||
**openEuler:**
|
||||
- [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz)
|
||||
- [openEuler x86 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86-aclgraph.tar.gz)
|
||||
- [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
|
||||
- [openEuler aarch64 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64-aclgraph.tar.gz)
|
||||
|
||||
- name: Upload release
|
||||
id: upload_release
|
||||
uses: actions/github-script@v8
|
||||
uses: actions/github-script@v3
|
||||
with:
|
||||
github-token: ${{secrets.GITHUB_TOKEN}}
|
||||
script: |
|
||||
@@ -1084,9 +787,9 @@ jobs:
|
||||
const fs = require('fs');
|
||||
const release_id = '${{ steps.create_release.outputs.id }}';
|
||||
for (let file of await fs.readdirSync('./release')) {
|
||||
if (path.extname(file) === '.zip' || file.endsWith('.tar.gz')) {
|
||||
if (path.extname(file) === '.zip') {
|
||||
console.log('uploadReleaseAsset', file);
|
||||
await github.rest.repos.uploadReleaseAsset({
|
||||
await github.repos.uploadReleaseAsset({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
release_id: release_id,
|
||||
|
||||
105
.github/workflows/server-sanitize.yml
vendored
105
.github/workflows/server-sanitize.yml
vendored
@@ -1,105 +0,0 @@
|
||||
name: Server (sanitize)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
inputs:
|
||||
sha:
|
||||
description: 'Commit SHA1 to build'
|
||||
required: false
|
||||
type: string
|
||||
slow_tests:
|
||||
description: 'Run slow tests'
|
||||
required: true
|
||||
type: boolean
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/server-sanitize.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/Makefile',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'tools/server/**.*'
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
server:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, UNDEFINED] # THREAD is very slow
|
||||
build_type: [RelWithDebInfo]
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get -y install \
|
||||
build-essential \
|
||||
xxd \
|
||||
git \
|
||||
cmake \
|
||||
curl \
|
||||
wget \
|
||||
language-pack-en \
|
||||
libssl-dev
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_SCHED_NO_REALLOC=ON \
|
||||
-DGGML_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
|
||||
-DGGML_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
|
||||
-DGGML_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} \
|
||||
-DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
|
||||
-DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
|
||||
-DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }}
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: '3.11'
|
||||
pip-install: -r tools/server/tests/requirements.txt
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
export ${{ matrix.extra_args }}
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
- name: Slow tests
|
||||
id: server_integration_tests_slow
|
||||
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
export ${{ matrix.extra_args }}
|
||||
SLOW_TESTS=1 pytest -v -x
|
||||
124
.github/workflows/server-self-hosted.yml
vendored
124
.github/workflows/server-self-hosted.yml
vendored
@@ -1,124 +0,0 @@
|
||||
name: Server (self-hosted)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
inputs:
|
||||
sha:
|
||||
description: 'Commit SHA1 to build'
|
||||
required: false
|
||||
type: string
|
||||
slow_tests:
|
||||
description: 'Run slow tests'
|
||||
required: true
|
||||
type: boolean
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/server-self-hosted.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/Makefile',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.swift',
|
||||
'**/*.m',
|
||||
'tools/server/**.*'
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
server-metal:
|
||||
runs-on: [self-hosted, llama-server, macOS, ARM64]
|
||||
|
||||
name: server-metal (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
build_type: [Release]
|
||||
wf_name: ["GPUx1"]
|
||||
include:
|
||||
- build_type: Release
|
||||
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "GPUx1, backend-sampling"
|
||||
- build_type: Release
|
||||
extra_args: "GGML_METAL_DEVICES=2"
|
||||
wf_name: "GPUx2"
|
||||
- build_type: Release
|
||||
extra_args: "GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "GPUx2, backend-sampling"
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
export ${{ matrix.extra_args }}
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
server-cuda:
|
||||
runs-on: [self-hosted, llama-server, Linux, NVIDIA]
|
||||
|
||||
name: server-cuda (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
build_type: [Release]
|
||||
wf_name: ["GPUx1"]
|
||||
include:
|
||||
- build_type: Release
|
||||
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "GPUx1, backend-sampling"
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
export ${{ matrix.extra_args }}
|
||||
pytest -v -x -m "not slow"
|
||||
108
.github/workflows/server-webui.yml
vendored
108
.github/workflows/server-webui.yml
vendored
@@ -1,108 +0,0 @@
|
||||
name: Server WebUI
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
inputs:
|
||||
sha:
|
||||
description: 'Commit SHA1 to build'
|
||||
required: false
|
||||
type: string
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/server-webui.yml',
|
||||
'tools/server/webui/**.*',
|
||||
'tools/server/tests/**.*',
|
||||
'tools/server/public/**'
|
||||
]
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/server-webui.yml',
|
||||
'tools/server/webui/**.*',
|
||||
'tools/server/tests/**.*',
|
||||
'tools/server/public/**'
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
webui-check:
|
||||
name: WebUI Checks
|
||||
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
id: node
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: "22"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/server/webui/package-lock.json"
|
||||
|
||||
- name: Install dependencies
|
||||
id: setup
|
||||
if: ${{ steps.node.conclusion == 'success' }}
|
||||
run: npm ci
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run type checking
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run check
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run linting
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run lint
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Build application
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run build
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Install Playwright browsers
|
||||
id: playwright
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npx playwright install --with-deps
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Build Storybook
|
||||
if: ${{ always() && steps.playwright.conclusion == 'success' }}
|
||||
run: npm run build-storybook
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run Client tests
|
||||
if: ${{ always() && steps.playwright.conclusion == 'success' }}
|
||||
run: npm run test:client
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run Unit tests
|
||||
if: ${{ always() && steps.playwright.conclusion == 'success' }}
|
||||
run: npm run test:unit
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run UI tests
|
||||
if: ${{ always() && steps.playwright.conclusion == 'success' }}
|
||||
run: npm run test:ui -- --testTimeout=60000
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run E2E tests
|
||||
if: ${{ always() && steps.playwright.conclusion == 'success' }}
|
||||
run: npm run test:e2e
|
||||
working-directory: tools/server/webui
|
||||
335
.github/workflows/server.yml
vendored
335
.github/workflows/server.yml
vendored
@@ -1,3 +1,4 @@
|
||||
# Server build and tests
|
||||
name: Server
|
||||
|
||||
on:
|
||||
@@ -14,34 +15,10 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/server.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/Makefile',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.swift',
|
||||
'**/*.m',
|
||||
'tools/server/**.*'
|
||||
]
|
||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/server.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/Makefile',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.swift',
|
||||
'**/*.m',
|
||||
'tools/server/**.*'
|
||||
]
|
||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
@@ -57,19 +34,14 @@ jobs:
|
||||
server:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
name: server (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
build_type: [Release]
|
||||
wf_name: ["default"]
|
||||
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
|
||||
build_type: [RelWithDebInfo]
|
||||
include:
|
||||
- build_type: Release
|
||||
extra_args: ""
|
||||
wf_name: "default"
|
||||
- build_type: Release
|
||||
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "backend-sampling"
|
||||
fail-fast: false
|
||||
sanitizer: ""
|
||||
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
|
||||
|
||||
steps:
|
||||
- name: Dependencies
|
||||
@@ -84,45 +56,283 @@ jobs:
|
||||
curl \
|
||||
wget \
|
||||
language-pack-en \
|
||||
libssl-dev
|
||||
libcurl4-openssl-dev
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Tests dependencies
|
||||
id: test_dependencies
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
pip install -r tools/server/tests/requirements.txt
|
||||
|
||||
webui-setup:
|
||||
name: WebUI Setup
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: "22"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/server/webui/package-lock.json"
|
||||
|
||||
- name: Cache node_modules
|
||||
uses: actions/cache@v4
|
||||
id: cache-node-modules
|
||||
with:
|
||||
path: tools/server/webui/node_modules
|
||||
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-node-modules-
|
||||
|
||||
- name: Install dependencies
|
||||
if: steps.cache-node-modules.outputs.cache-hit != 'true'
|
||||
run: npm ci
|
||||
working-directory: tools/server/webui
|
||||
|
||||
webui-check:
|
||||
needs: webui-setup
|
||||
name: WebUI Check
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: "22"
|
||||
|
||||
- name: Restore node_modules cache
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: tools/server/webui/node_modules
|
||||
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-node-modules-
|
||||
|
||||
- name: Run type checking
|
||||
run: npm run check
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run linting
|
||||
run: npm run lint
|
||||
working-directory: tools/server/webui
|
||||
|
||||
webui-build:
|
||||
needs: webui-check
|
||||
name: WebUI Build
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: "22"
|
||||
|
||||
- name: Restore node_modules cache
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: tools/server/webui/node_modules
|
||||
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-node-modules-
|
||||
|
||||
- name: Build application
|
||||
run: npm run build
|
||||
working-directory: tools/server/webui
|
||||
|
||||
webui-tests:
|
||||
needs: webui-build
|
||||
name: Run WebUI tests
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: "22"
|
||||
|
||||
- name: Restore node_modules cache
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: tools/server/webui/node_modules
|
||||
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-node-modules-
|
||||
|
||||
- name: Install Playwright browsers
|
||||
run: npx playwright install --with-deps
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Build Storybook
|
||||
run: npm run build-storybook
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run Client tests
|
||||
run: npm run test:client
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run Server tests
|
||||
run: npm run test:server
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run UI tests
|
||||
run: npm run test:ui
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Run E2E tests
|
||||
run: npm run test:e2e
|
||||
working-directory: tools/server/webui
|
||||
|
||||
server-build:
|
||||
needs: [webui-tests]
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
|
||||
build_type: [RelWithDebInfo]
|
||||
include:
|
||||
- build_type: Release
|
||||
sanitizer: ""
|
||||
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
|
||||
|
||||
steps:
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get -y install \
|
||||
build-essential \
|
||||
xxd \
|
||||
git \
|
||||
cmake \
|
||||
curl \
|
||||
wget \
|
||||
language-pack-en \
|
||||
libcurl4-openssl-dev
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
uses: actions/setup-python@v6
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
pip-install: -r tools/server/tests/requirements.txt
|
||||
|
||||
- name: Tests dependencies
|
||||
id: test_dependencies
|
||||
run: |
|
||||
pip install -r tools/server/tests/requirements.txt
|
||||
|
||||
- name: Setup Node.js for WebUI
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: "22"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/server/webui/package-lock.json"
|
||||
|
||||
- name: Install WebUI dependencies
|
||||
run: npm ci
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Build WebUI
|
||||
run: npm run build
|
||||
working-directory: tools/server/webui
|
||||
|
||||
- name: Build (no OpenMP)
|
||||
id: cmake_build_no_openmp
|
||||
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DLLAMA_BUILD_SERVER=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_OPENMP=OFF ;
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
|
||||
- name: Build (sanitizers)
|
||||
id: cmake_build_sanitizers
|
||||
if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DLLAMA_BUILD_SERVER=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
|
||||
- name: Build (sanitizers)
|
||||
id: cmake_build
|
||||
if: ${{ matrix.sanitizer == '' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DLLAMA_BUILD_SERVER=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
if: ${{ matrix.sanitizer == '' }}
|
||||
env:
|
||||
GITHUB_ACTIONS: "true"
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
export ${{ matrix.extra_args }}
|
||||
pytest -v -x -m "not slow"
|
||||
./tests.sh
|
||||
|
||||
- name: Tests (sanitizers)
|
||||
id: server_integration_tests_sanitizers
|
||||
if: ${{ matrix.sanitizer != '' }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
LLAMA_SANITIZE=1 ./tests.sh
|
||||
|
||||
- name: Slow tests
|
||||
id: server_integration_tests_slow
|
||||
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
export ${{ matrix.extra_args }}
|
||||
SLOW_TESTS=1 pytest -v -x
|
||||
SLOW_TESTS=1 ./tests.sh
|
||||
|
||||
|
||||
server-windows:
|
||||
runs-on: windows-2022
|
||||
@@ -130,23 +340,40 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
uses: ./.github/actions/windows-setup-curl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake -B build -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
|
||||
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
uses: actions/setup-python@v6
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
pip-install: -r tools/server/tests/requirements.txt
|
||||
|
||||
- name: Tests dependencies
|
||||
id: test_dependencies
|
||||
run: |
|
||||
pip install -r tools/server/tests/requirements.txt
|
||||
|
||||
- name: Copy Libcurl
|
||||
id: prepare_libcurl
|
||||
env:
|
||||
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
|
||||
run: |
|
||||
cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
|
||||
6
.github/workflows/update-ops-docs.yml
vendored
6
.github/workflows/update-ops-docs.yml
vendored
@@ -14,14 +14,14 @@ on:
|
||||
|
||||
jobs:
|
||||
update-ops-docs:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v6
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v6
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.x'
|
||||
|
||||
|
||||
18
.github/workflows/winget.yml
vendored
18
.github/workflows/winget.yml
vendored
@@ -9,7 +9,6 @@ jobs:
|
||||
update:
|
||||
name: Update Winget Package
|
||||
runs-on: ubuntu-latest
|
||||
if: github.repository_owner == 'ggml-org'
|
||||
|
||||
steps:
|
||||
- name: Install cargo binstall
|
||||
@@ -17,28 +16,27 @@ jobs:
|
||||
|
||||
- name: Install komac
|
||||
run: |
|
||||
cargo binstall komac@2.15.0 -y
|
||||
cargo binstall komac@2.11.2 -y
|
||||
|
||||
- name: Find latest release
|
||||
id: find_latest_release
|
||||
uses: actions/github-script@v8
|
||||
uses: actions/github-script@v6
|
||||
with:
|
||||
script: |
|
||||
const { data: releases } = await github.rest.repos.listReleases({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
});
|
||||
const { tag_name: version, assets: assets } = releases.find(({assets}) => assets.find(asset => asset.name.includes('win-vulkan')));
|
||||
const { browser_download_url: asset_url } = assets.find(asset => asset.name.includes('win-vulkan'));
|
||||
console.log("Latest release:", version);
|
||||
core.setOutput('VERSION', version);
|
||||
core.setOutput('ASSETURL', asset_url);
|
||||
console.log("Latest release:", releases[0].tag_name);
|
||||
return releases[0].tag_name;
|
||||
|
||||
- name: Update manifest
|
||||
env:
|
||||
VERSION: ${{ steps.find_latest_release.outputs.result }}
|
||||
run: |
|
||||
echo "Updating manifest..."
|
||||
komac update --version ${{ steps.find_latest_release.outputs.VERSION }} \
|
||||
--urls "${{ steps.find_latest_release.outputs.ASSETURL }}" \
|
||||
komac update --version ${{ env.VERSION }} \
|
||||
--urls "https://github.com/ggml-org/llama.cpp/releases/download/${{ env.VERSION }}/llama-${{ env.VERSION }}-bin-win-vulkan-x64.zip" \
|
||||
--token ${{ secrets.WINGET_GITHUB_TOKEN }} \
|
||||
--submit \
|
||||
ggml.llamacpp
|
||||
|
||||
119
.gitignore
vendored
119
.gitignore
vendored
@@ -20,41 +20,52 @@
|
||||
*.so
|
||||
*.swp
|
||||
*.tmp
|
||||
*.DS_Store
|
||||
|
||||
# IDE / OS
|
||||
|
||||
/.cache/
|
||||
/.ccls-cache/
|
||||
/.direnv/
|
||||
/.envrc
|
||||
/.idea/
|
||||
/.swiftpm
|
||||
/.vs/
|
||||
/.vscode/
|
||||
/nppBackup
|
||||
.cache/
|
||||
.ccls-cache/
|
||||
.direnv/
|
||||
.DS_Store
|
||||
.envrc
|
||||
.idea/
|
||||
.swiftpm
|
||||
.vs/
|
||||
.vscode/
|
||||
nppBackup
|
||||
|
||||
|
||||
# Coverage
|
||||
|
||||
/gcovr-report/
|
||||
/lcov-report/
|
||||
gcovr-report/
|
||||
lcov-report/
|
||||
|
||||
# Build Artifacts
|
||||
|
||||
/tags
|
||||
/.build/
|
||||
/build*
|
||||
/release
|
||||
/debug
|
||||
tags
|
||||
.build/
|
||||
build*
|
||||
release
|
||||
debug
|
||||
!build-info.cmake
|
||||
!build-info.cpp.in
|
||||
!build-info.sh
|
||||
!build.zig
|
||||
!docs/build.md
|
||||
/libllama.so
|
||||
/llama-*
|
||||
/vulkan-shaders-gen
|
||||
android-ndk-*
|
||||
arm_neon.h
|
||||
cmake-build-*
|
||||
CMakeSettings.json
|
||||
compile_commands.json
|
||||
ggml-metal-embed.metal
|
||||
llama-batched-swift
|
||||
/rpc-server
|
||||
/out/
|
||||
/tmp/
|
||||
/autogen-*.md
|
||||
/common/build-info.cpp
|
||||
out/
|
||||
tmp/
|
||||
autogen-*.md
|
||||
|
||||
# Deprecated
|
||||
|
||||
@@ -63,40 +74,44 @@
|
||||
|
||||
# CI
|
||||
|
||||
!/.github/workflows/*.yml
|
||||
!.github/workflows/*.yml
|
||||
|
||||
# Models
|
||||
|
||||
/models/*
|
||||
/models-mnt
|
||||
!/models/.editorconfig
|
||||
!/models/ggml-vocab-*.gguf*
|
||||
!/models/templates
|
||||
models/*
|
||||
models-mnt
|
||||
!models/.editorconfig
|
||||
!models/ggml-vocab-*.gguf*
|
||||
!models/templates
|
||||
|
||||
# Zig
|
||||
/zig-out/
|
||||
/zig-cache/
|
||||
zig-out/
|
||||
zig-cache/
|
||||
|
||||
# Logs
|
||||
|
||||
ppl-*.txt
|
||||
qnt-*.txt
|
||||
perf-*.txt
|
||||
|
||||
# Examples
|
||||
|
||||
/examples/jeopardy/results.txt
|
||||
/tools/server/*.css.hpp
|
||||
/tools/server/*.html.hpp
|
||||
/tools/server/*.js.hpp
|
||||
/tools/server/*.mjs.hpp
|
||||
/tools/server/*.gz.hpp
|
||||
!/build_64.sh
|
||||
!/examples/*.bat
|
||||
!/examples/*/*.kts
|
||||
!/examples/*/*/*.kts
|
||||
!/examples/sycl/*.bat
|
||||
!/examples/sycl/*.sh
|
||||
examples/jeopardy/results.txt
|
||||
tools/server/*.css.hpp
|
||||
tools/server/*.html.hpp
|
||||
tools/server/*.js.hpp
|
||||
tools/server/*.mjs.hpp
|
||||
tools/server/*.gz.hpp
|
||||
!build_64.sh
|
||||
!examples/*.bat
|
||||
!examples/*/*.kts
|
||||
!examples/*/*/*.kts
|
||||
!examples/sycl/*.bat
|
||||
!examples/sycl/*.sh
|
||||
|
||||
# Server Web UI temporary files
|
||||
/tools/server/webui/node_modules
|
||||
/tools/server/webui/dist
|
||||
# we no longer use gz for index.html
|
||||
/tools/server/public/index.html.gz
|
||||
node_modules
|
||||
tools/server/webui/dist
|
||||
|
||||
# Python
|
||||
|
||||
@@ -126,22 +141,14 @@ poetry.toml
|
||||
# Scripts
|
||||
!/scripts/install-oneapi.bat
|
||||
|
||||
# Generated by scripts
|
||||
/hellaswag_val_full.txt
|
||||
/winogrande-debiased-eval.csv
|
||||
/wikitext-2-raw/
|
||||
|
||||
# Test models for lora adapters
|
||||
/lora-tests
|
||||
|
||||
# Local scripts
|
||||
/run-vim.sh
|
||||
/run-chat.sh
|
||||
/run-spec.sh
|
||||
/.ccache/
|
||||
.ccache/
|
||||
|
||||
# IDE
|
||||
/*.code-workspace
|
||||
/.windsurf/
|
||||
# emscripten
|
||||
a.out.*
|
||||
*.code-workspace
|
||||
.windsurf/
|
||||
|
||||
110
AGENTS.md
110
AGENTS.md
@@ -1,110 +0,0 @@
|
||||
# Instructions for llama.cpp
|
||||
|
||||
> [!IMPORTANT]
|
||||
> This project does **not** accept pull requests that are fully or predominantly AI-generated. AI tools may be utilized solely in an assistive capacity.
|
||||
>
|
||||
> Read more: [CONTRIBUTING.md](CONTRIBUTING.md)
|
||||
|
||||
AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (see examples below).
|
||||
|
||||
---
|
||||
|
||||
## Guidelines for Contributors Using AI
|
||||
|
||||
llama.cpp is built by humans, for humans. Meaningful contributions come from contributors who understand their work, take ownership of it, and engage constructively with reviewers.
|
||||
|
||||
Maintainers receive numerous pull requests weekly, many of which are AI-generated submissions where the author cannot adequately explain the code, debug issues, or participate in substantive design discussions. Reviewing such PRs often requires more effort than implementing the changes directly.
|
||||
|
||||
**A pull request represents a long-term commitment.** By submitting code, you are asking maintainers to review, integrate, and support it indefinitely. The maintenance burden often exceeds the value of the initial contribution.
|
||||
|
||||
Most maintainers already have access to AI tools. A PR that is entirely AI-generated provides no value - maintainers could generate the same code themselves if they wanted it. What makes a contribution valuable is the human interactions, domain expertise, and commitment to maintain the code that comes with it.
|
||||
|
||||
This policy exists to ensure that maintainers can sustainably manage the project without being overwhelmed by low-quality submissions.
|
||||
|
||||
---
|
||||
|
||||
## Guidelines for Contributors
|
||||
|
||||
Contributors are expected to:
|
||||
|
||||
1. **Demonstrate full understanding of their code.** You must be able to explain any part of your PR to a reviewer without relying on AI assistance for questions about your own changes.
|
||||
|
||||
2. **Take responsibility for maintenance.** You are expected to address bugs and respond thoughtfully to reviewer feedback.
|
||||
|
||||
3. **Communicate clearly and concisely.** Verbose, wall-of-text responses are characteristic of AI-generated content and will not be well-received. Direct, human communication is expected.
|
||||
|
||||
4. **Respect maintainers' time.** Search for existing issues and discussions before submitting. Ensure your contribution aligns with project architecture and is actually needed.
|
||||
|
||||
Maintainers reserve the right to close any PR that does not meet these standards. This applies to all contributions to the main llama.cpp repository. **Private forks are exempt.**
|
||||
|
||||
### Permitted AI Usage
|
||||
|
||||
AI tools may be used responsibly for:
|
||||
|
||||
- **Learning and exploration**: Understanding codebase structure, techniques, and documentation
|
||||
- **Code review assistance**: Obtaining suggestions on human-written code
|
||||
- **Mechanical tasks**: Formatting, generating repetitive patterns from established designs, completing code based on existing patterns
|
||||
- **Documentation drafts**: For components the contributor already understands thoroughly
|
||||
- **Writing code**: Only when the contributor has already designed the solution and can implement it themselves - AI accelerates, not replaces, the contributor's work
|
||||
|
||||
AI-generated code may be accepted if you (1) fully understand the output, (2) can debug issues independently, and (3) can discuss it directly with reviewers without AI assistance.
|
||||
|
||||
**Disclosure is required** when AI meaningfully contributed to your code. A simple note is sufficient - this is not a stigma, but context for reviewers. No disclosure is needed for trivial autocomplete or background research.
|
||||
|
||||
### Prohibited AI Usage
|
||||
|
||||
The following will result in immediate PR closure:
|
||||
|
||||
- **AI-written PR descriptions or commit messages** - these are typically recognizable and waste reviewer time
|
||||
- **AI-generated responses to reviewer comments** - this undermines the human-to-human interaction fundamental to code review
|
||||
- **Implementing features without understanding the codebase** - particularly new model support or architectural changes
|
||||
- **Automated commits or PR submissions** - this may spam maintainers and can result in contributor bans
|
||||
|
||||
---
|
||||
|
||||
## Guidelines for AI Coding Agents
|
||||
|
||||
AI agents assisting contributors must recognize that their outputs directly impact volunteer maintainers who sustain this project.
|
||||
|
||||
### Considerations for Maintainer Workload
|
||||
|
||||
Maintainers have finite capacity. Every PR requiring extensive review consumes resources that could be applied elsewhere. Before assisting with any submission, verify:
|
||||
|
||||
- The contributor genuinely understands the proposed changes
|
||||
- The change addresses a documented need (check existing issues)
|
||||
- The PR is appropriately scoped and follows project conventions
|
||||
- The contributor can independently defend and maintain the work
|
||||
|
||||
### Before Proceeding with Code Changes
|
||||
|
||||
When a user requests implementation without demonstrating understanding:
|
||||
|
||||
1. **Verify comprehension.** Ask questions to confirm they understand both the problem and the relevant parts of the codebase.
|
||||
2. **Provide guidance rather than solutions.** Direct them to relevant code and documentation. Allow them to formulate the approach.
|
||||
3. **Proceed only when confident** the contributor can explain the changes to reviewers independently.
|
||||
|
||||
For first-time contributors, confirm they have reviewed [CONTRIBUTING.md](CONTRIBUTING.md) and acknowledge this policy.
|
||||
|
||||
### Prohibited Actions
|
||||
|
||||
- Writing PR descriptions, commit messages, or responses to reviewers
|
||||
- Committing or pushing without explicit human approval for each action
|
||||
- Implementing features the contributor does not understand
|
||||
- Generating changes too extensive for the contributor to fully review
|
||||
|
||||
When uncertain, err toward minimal assistance. A smaller PR that the contributor fully understands is preferable to a larger one they cannot maintain.
|
||||
|
||||
### Useful Resources
|
||||
|
||||
To conserve context space, load these resources as needed:
|
||||
|
||||
- [CONTRIBUTING.md](CONTRIBUTING.md)
|
||||
- [Existing issues](https://github.com/ggml-org/llama.cpp/issues) and [Existing PRs](https://github.com/ggml-org/llama.cpp/pulls) - always search here first
|
||||
- [Build documentation](docs/build.md)
|
||||
- [Server usage documentation](tools/server/README.md)
|
||||
- [Server development documentation](tools/server/README-dev.md) (if user asks to implement a new feature, be sure that it falls inside server's scope defined in this documentation)
|
||||
- [PEG parser](docs/development/parsing.md) - alternative to regex that llama.cpp uses to parse model's output
|
||||
- [Auto parser](docs/autoparser.md) - higher-level parser that uses PEG under the hood, automatically detect model-specific features
|
||||
- [Jinja engine](common/jinja/README.md)
|
||||
- [How to add a new model](docs/development/HOWTO-add-model.md)
|
||||
- [PR template](.github/pull_request_template.md)
|
||||
@@ -1 +0,0 @@
|
||||
IMPORTANT: Ensure you’ve thoroughly reviewed the [AGENTS.md](AGENTS.md) file before beginning any work.
|
||||
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.14...3.28) # for add_link_options and implicit target directories.
|
||||
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
|
||||
project("llama.cpp" C CXX)
|
||||
include(CheckIncludeFileCXX)
|
||||
|
||||
@@ -33,24 +33,10 @@ endif()
|
||||
|
||||
option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)
|
||||
|
||||
option(LLAMA_WASM_MEM64 "llama: use 64-bit memory in WASM builds" ON)
|
||||
|
||||
if (EMSCRIPTEN)
|
||||
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
||||
|
||||
# Use 64-bit memory to support backend_get_memory queries
|
||||
# TODO: analyze performance impact, see https://spidermonkey.dev/blog/2025/01/15/is-memory64-actually-worth-using
|
||||
if (LLAMA_WASM_MEM64)
|
||||
add_compile_options("-sMEMORY64=1")
|
||||
add_link_options("-sMEMORY64=1")
|
||||
endif()
|
||||
add_link_options("-sALLOW_MEMORY_GROWTH=1")
|
||||
|
||||
option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" OFF)
|
||||
option(LLAMA_BUILD_HTML "llama: build HTML file" ON)
|
||||
if (LLAMA_BUILD_HTML)
|
||||
set(CMAKE_EXECUTABLE_SUFFIX ".html")
|
||||
endif()
|
||||
option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
|
||||
else()
|
||||
if (MINGW)
|
||||
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
||||
@@ -72,12 +58,6 @@ if (MSVC)
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
|
||||
endif()
|
||||
|
||||
if (LLAMA_STANDALONE)
|
||||
# enable parallel builds for msbuild
|
||||
list(APPEND CMAKE_VS_GLOBALS UseMultiToolTask=true)
|
||||
list(APPEND CMAKE_VS_GLOBALS EnforceProcessCountAcrossBuilds=true)
|
||||
endif()
|
||||
|
||||
if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
|
||||
set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
|
||||
else()
|
||||
@@ -108,15 +88,13 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_WEBUI "llama: build the embedded Web UI for server" ON)
|
||||
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
|
||||
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
|
||||
|
||||
# 3rd party libs
|
||||
option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
|
||||
option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
|
||||
option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" OFF)
|
||||
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
|
||||
|
||||
|
||||
# Required for relocatable CMake package
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
|
||||
@@ -143,15 +121,10 @@ if (NOT DEFINED GGML_CUDA_GRAPHS)
|
||||
endif()
|
||||
|
||||
# transition helpers
|
||||
function (llama_option_depr TYPE OLD)
|
||||
function (llama_option_depr TYPE OLD NEW)
|
||||
if (${OLD})
|
||||
set(NEW "${ARGV2}")
|
||||
if(NEW)
|
||||
message(${TYPE} "${OLD} is deprecated, use ${NEW} instead")
|
||||
set(${NEW} ON PARENT_SCOPE)
|
||||
else()
|
||||
message(${TYPE} "${OLD} is deprecated and will be ignored")
|
||||
endif()
|
||||
message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
|
||||
set(${NEW} ON PARENT_SCOPE)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
@@ -164,10 +137,29 @@ llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
|
||||
llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
|
||||
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
|
||||
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
|
||||
llama_option_depr(WARNING LLAMA_CURL)
|
||||
|
||||
include("cmake/license.cmake")
|
||||
license_add_file("llama.cpp" "LICENSE")
|
||||
if (NOT MSVC)
|
||||
if (LLAMA_SANITIZE_THREAD)
|
||||
message(STATUS "Using -fsanitize=thread")
|
||||
|
||||
add_compile_options(-fsanitize=thread)
|
||||
link_libraries (-fsanitize=thread)
|
||||
endif()
|
||||
|
||||
if (LLAMA_SANITIZE_ADDRESS)
|
||||
message(STATUS "Using -fsanitize=address")
|
||||
|
||||
add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
|
||||
link_libraries (-fsanitize=address)
|
||||
endif()
|
||||
|
||||
if (LLAMA_SANITIZE_UNDEFINED)
|
||||
message(STATUS "Using -fsanitize=undefined")
|
||||
|
||||
add_compile_options(-fsanitize=undefined)
|
||||
link_libraries (-fsanitize=undefined)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
#
|
||||
# 3rd-party
|
||||
@@ -186,6 +178,11 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
|
||||
# ... otherwise assume ggml is added by a parent CMakeLists.txt
|
||||
endif()
|
||||
|
||||
if (MINGW)
|
||||
# Target Windows 8 for PrefetchVirtualMemory
|
||||
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
|
||||
endif()
|
||||
|
||||
#
|
||||
# build the library
|
||||
#
|
||||
@@ -196,9 +193,13 @@ add_subdirectory(src)
|
||||
# utils, programs, examples and tests
|
||||
#
|
||||
|
||||
if (NOT LLAMA_BUILD_COMMON)
|
||||
message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL")
|
||||
set(LLAMA_CURL OFF)
|
||||
endif()
|
||||
|
||||
if (LLAMA_BUILD_COMMON)
|
||||
add_subdirectory(common)
|
||||
add_subdirectory(vendor/cpp-httplib)
|
||||
endif()
|
||||
|
||||
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
|
||||
@@ -215,19 +216,6 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
|
||||
add_subdirectory(tools)
|
||||
endif()
|
||||
|
||||
# Automatically add all files from the 'licenses' directory
|
||||
file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
|
||||
|
||||
foreach(FILE_PATH ${EXTRA_LICENSES})
|
||||
get_filename_component(FILE_NAME "${FILE_PATH}" NAME)
|
||||
string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}")
|
||||
license_add_file("${NAME}" "${FILE_PATH}")
|
||||
endforeach()
|
||||
|
||||
if (LLAMA_BUILD_COMMON)
|
||||
license_generate(common)
|
||||
endif()
|
||||
|
||||
#
|
||||
# install
|
||||
#
|
||||
|
||||
93
CODEOWNERS
93
CODEOWNERS
@@ -2,19 +2,27 @@
|
||||
# multiplie collaborators per item can be specified
|
||||
|
||||
/.devops/*.Dockerfile @ngxson
|
||||
/.github/actions/ @ggml-org/ci
|
||||
/.github/workflows/ @ggml-org/ci
|
||||
/.github/actions/ @slaren @CISC
|
||||
/.github/workflows/ @CISC
|
||||
/.github/workflows/release.yml @slaren
|
||||
/.github/workflows/winget.yml @slaren
|
||||
/ci/ @ggerganov
|
||||
/cmake/ @ggerganov
|
||||
/common/ @ggml-org/llama-common
|
||||
/common/jinja/ @CISC
|
||||
/common/ngram-map.* @srogmann
|
||||
/common/CMakeLists.txt @ggerganov
|
||||
/common/arg.* @ggerganov @ericcurtin
|
||||
/common/base64.hpp.* @ggerganov
|
||||
/common/build-info.* @ggerganov
|
||||
/common/common.* @ggerganov
|
||||
/common/console.* @ggerganov
|
||||
/common/http.* @angt
|
||||
/common/llguidance.* @ggerganov
|
||||
/common/log.* @ggerganov
|
||||
/common/sampling.* @ggerganov
|
||||
/common/speculative.* @ggerganov
|
||||
/convert_*.py @CISC
|
||||
/docs/backend/snapdragon/ @ggml-org/ggml-hexagon
|
||||
/examples/batched.swift/ @ggerganov
|
||||
/examples/batched/ @ggerganov
|
||||
/examples/convert-llama2c-to-ggml/ @ggerganov
|
||||
/examples/debug/ @danbev @pwilkin
|
||||
/examples/deprecation-warning/ @ggerganov
|
||||
/examples/diffusion/ @am17an
|
||||
/examples/embedding/ @ggerganov
|
||||
@@ -22,7 +30,7 @@
|
||||
/examples/export-docs/ @ggerganov
|
||||
/examples/gen-docs/ @ggerganov
|
||||
/examples/gguf/ @ggerganov
|
||||
/examples/llama.android/ @ggerganov @hanyin-arm @naco-siren
|
||||
/examples/llama.android/ @ggerganov
|
||||
/examples/llama.swiftui/ @ggerganov
|
||||
/examples/llama.vim @ggerganov
|
||||
/examples/lookahead/ @ggerganov
|
||||
@@ -32,66 +40,73 @@
|
||||
/examples/passkey/ @ggerganov
|
||||
/examples/retrieval/ @ggerganov
|
||||
/examples/save-load-state/ @ggerganov
|
||||
/examples/simple-chat/ @slaren
|
||||
/examples/simple/ @slaren
|
||||
/examples/speculative-simple/ @ggerganov
|
||||
/examples/speculative/ @ggerganov
|
||||
/ggml/cmake/ @ggerganov
|
||||
/ggml/include/ @ggerganov
|
||||
/ggml/src/ggml-cann/ @ggml-org/ggml-cann
|
||||
/ggml/src/ggml-common.h @ggerganov
|
||||
/ggml/src/ggml-cpu/ @ggerganov
|
||||
/ggml/include/ @ggerganov @slaren
|
||||
/ggml/src/ggml-alloc.c @slaren
|
||||
/ggml/src/ggml-backend* @slaren
|
||||
/ggml/src/ggml-blas/ @slaren
|
||||
/ggml/src/ggml-common.h @ggerganov @slaren
|
||||
/ggml/src/ggml-cpu/ @ggerganov @slaren
|
||||
/ggml/src/ggml-cpu/spacemit/ @alex-spacemit
|
||||
/ggml/src/ggml-cuda/ @ggml-org/ggml-cuda
|
||||
/ggml/src/ggml-cuda/common.cuh @slaren
|
||||
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
|
||||
/ggml/src/ggml-cuda/ggml-cuda.cu @slaren
|
||||
/ggml/src/ggml-cuda/mmf.* @JohannesGaessler @am17an
|
||||
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
|
||||
/ggml/src/ggml-cuda/mmvf.* @JohannesGaessler
|
||||
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
|
||||
/ggml/src/ggml-cuda/fattn-wmma* @IMbackK
|
||||
/ggml/src/ggml-hip/ @IMbackK
|
||||
/ggml/src/ggml-cuda/vendors/hip.h @IMbackK
|
||||
/ggml/src/ggml-impl.h @ggerganov
|
||||
/ggml/src/ggml-metal/ @ggml-org/ggml-metal
|
||||
/ggml/src/ggml-opencl/ @ggml-org/ggml-opencl
|
||||
/ggml/src/ggml-hexagon/ @ggml-org/ggml-hexagon
|
||||
/ggml/src/ggml-impl.h @ggerganov @slaren
|
||||
/ggml/src/ggml-metal/ @ggerganov
|
||||
/ggml/src/ggml-opencl/ @lhez @max-krasnyansky
|
||||
/ggml/src/ggml-hexagon/ @max-krasnyansky
|
||||
/ggml/src/ggml-opt.cpp @JohannesGaessler
|
||||
/ggml/src/ggml-quants.* @ggerganov
|
||||
/ggml/src/ggml-rpc/ @ggml-org/ggml-rpc
|
||||
/ggml/src/ggml-sycl/ @ggml-org/ggml-sycl
|
||||
/ggml/src/ggml-threading.* @ggerganov
|
||||
/ggml/src/ggml-vulkan/ @ggml-org/ggml-vulkan
|
||||
/ggml/src/ggml-virtgpu/ @kpouget
|
||||
/ggml/src/ggml-webgpu/ @ggml-org/ggml-webgpu
|
||||
/ggml/src/ggml-zdnn/ @ggml-org/ggml-zdnn @Andreas-Krebbel @AlekseiNikiforovIBM
|
||||
/ggml/src/ggml-openvino/ @cavusmustafa @wine99
|
||||
/ggml/src/ggml.c @ggerganov
|
||||
/ggml/src/ggml.cpp @ggerganov
|
||||
/ggml/src/ggml-rpc/ @rgerganov
|
||||
/ggml/src/ggml-threading.* @ggerganov @slaren
|
||||
/ggml/src/ggml-vulkan/ @0cc4m
|
||||
/ggml/src/ggml-webgpu/ @reeselevine
|
||||
/ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
|
||||
/ggml/src/ggml.c @ggerganov @slaren
|
||||
/ggml/src/ggml.cpp @ggerganov @slaren
|
||||
/ggml/src/gguf.cpp @JohannesGaessler @Green-Sky
|
||||
/gguf-py/ @CISC
|
||||
/media/ @ggerganov
|
||||
/scripts/gen* @ggerganov
|
||||
/scripts/get* @ggerganov
|
||||
/scripts/sync* @ggerganov
|
||||
/scripts/snapdragon/ @ggml-org/ggml-hexagon
|
||||
/src/ @ggerganov
|
||||
/src/llama-adapter.* @CISC
|
||||
/src/llama-arch.* @CISC
|
||||
/src/llama-chat.* @ngxson
|
||||
/src/llama-graph.* @CISC
|
||||
/src/llama-model-loader.* @slaren
|
||||
/src/llama-model.* @CISC
|
||||
/src/llama-vocab.* @CISC
|
||||
/src/models/ @CISC
|
||||
/tests/ @ggerganov
|
||||
/tests/test-chat.* @pwilkin
|
||||
/tests/test-llama-archs.cpp @JohannesGaessler
|
||||
/tests/test-backend-ops.cpp @slaren
|
||||
/tests/test-thread-safety.cpp @slaren
|
||||
/tools/batched-bench/ @ggerganov
|
||||
/tools/cli/ @ngxson
|
||||
/tools/completion/ @ggerganov
|
||||
/tools/mtmd/ @ggml-org/llama-mtmd
|
||||
/tools/llama-bench/ @slaren
|
||||
/tools/main/ @ggerganov
|
||||
/tools/mtmd/ @ngxson
|
||||
/tools/perplexity/ @ggerganov
|
||||
/tools/parser/ @pwilkin
|
||||
/tools/quantize/ @ggerganov
|
||||
/tools/rpc/ @ggml-org/ggml-rpc
|
||||
/tools/server/* @ggml-org/llama-server # no subdir
|
||||
/tools/server/tests/ @ggml-org/llama-server
|
||||
/tools/server/webui/ @ggml-org/llama-webui
|
||||
/tools/rpc/ @rgerganov
|
||||
/tools/run/ @ericcurtin
|
||||
/tools/server/* @ngxson @ggerganov @ericcurtin # no subdir
|
||||
/tools/server/webui/ @allozaur
|
||||
/tools/tokenize/ @ggerganov
|
||||
/tools/tts/ @ggerganov
|
||||
/vendor/ @ggerganov
|
||||
/.clang-format @slaren
|
||||
/.clang-tidy @slaren
|
||||
/AUTHORS @ggerganov
|
||||
/CMakeLists.txt @ggerganov
|
||||
/CONTRIBUTING.md @ggerganov
|
||||
|
||||
@@ -6,53 +6,19 @@ The project differentiates between 3 levels of contributors:
|
||||
- Collaborators (Triage): people with significant contributions, who may be responsible for some parts of the code, and are expected to maintain and review contributions for the code they own
|
||||
- Maintainers: responsible for reviewing and merging PRs, after approval from the code owners
|
||||
|
||||
# AI Usage Policy
|
||||
|
||||
> [!IMPORTANT]
|
||||
> This project does **not** accept pull requests that are fully or predominantly AI-generated. AI tools may be utilized solely in an assistive capacity.
|
||||
>
|
||||
> Repeated violations of this policy may result in your account being permanently banned from contributing to the project.
|
||||
>
|
||||
> Detailed information regarding permissible and restricted uses of AI can be found in the [AGENTS.md](AGENTS.md) file.
|
||||
|
||||
Code that is initially generated by AI and subsequently edited will still be considered AI-generated. AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (e.g., generating repeated lines with minor variations).
|
||||
|
||||
If AI is used to generate any portion of the code, contributors must adhere to the following requirements:
|
||||
|
||||
1. Explicitly disclose the manner in which AI was employed.
|
||||
2. Perform a comprehensive manual review prior to submitting the pull request.
|
||||
3. Be prepared to explain every line of code they submitted when asked about it by a maintainer.
|
||||
4. It is strictly prohibited to use AI to write your posts for you (bug reports, feature requests, pull request descriptions, Github discussions, responding to humans, ...).
|
||||
|
||||
For more info, please refer to the [AGENTS.md](AGENTS.md) file.
|
||||
|
||||
# Pull requests (for contributors & collaborators)
|
||||
|
||||
Before submitting your PR:
|
||||
- Search for existing PRs to prevent duplicating efforts
|
||||
- llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
|
||||
- Test your changes:
|
||||
- Execute [the full CI locally on your machine](ci/README.md) before publishing
|
||||
- Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
|
||||
- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
|
||||
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
|
||||
- Create separate PRs for each feature or fix:
|
||||
- Avoid combining unrelated changes in a single PR
|
||||
- For intricate features, consider opening a feature request first to discuss and align expectations
|
||||
- When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
|
||||
- In particular, adding new data types (extension of the `ggml_type` enum) carries with it a disproportionate maintenance burden. As such, to add a new quantization type you will need to meet the following *additional* criteria *at minimum*:
|
||||
- convert a small model to GGUF using the new type and upload it to HuggingFace
|
||||
- provide [perplexity](https://github.com/ggml-org/llama.cpp/tree/master/tools/perplexity) comparisons to FP16/BF16 (whichever is the native precision) as well as to types of similar size
|
||||
- provide KL divergence data calculated vs. the FP16/BF16 (whichever is the native precision) version for both the new type as well as types of similar size
|
||||
- provide [performance data](https://github.com/ggml-org/llama.cpp/tree/master/tools/llama-bench) for the new type in comparison to types of similar size on pure CPU
|
||||
- Execute [the full CI locally on your machine](ci/README.md) before publishing
|
||||
- Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
|
||||
- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
|
||||
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
|
||||
- Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
|
||||
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
|
||||
- If you are a new contributor, limit your open PRs to 1.
|
||||
|
||||
After submitting your PR:
|
||||
- Expect requests for modifications to ensure the code meets llama.cpp's standards for quality and long-term maintainability
|
||||
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
|
||||
- Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
|
||||
- If your PR becomes stale, rebase it on top of latest `master` to get maintainers attention
|
||||
- Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for fixing related issues and reviewing related PRs
|
||||
- Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs
|
||||
|
||||
# Pull requests (for maintainers)
|
||||
|
||||
@@ -63,11 +29,6 @@ After submitting your PR:
|
||||
- When merging a PR, make sure you have a good understanding of the changes
|
||||
- Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)
|
||||
|
||||
Maintainers reserve the right to decline review or close pull requests for any reason, without any questions, particularly under any of the following conditions:
|
||||
- The proposed change is already mentioned in the roadmap or an existing issue, and it has been assigned to someone.
|
||||
- The pull request duplicates an existing one.
|
||||
- The contributor fails to adhere to this contributing guide or the AI policy.
|
||||
|
||||
# Coding guidelines
|
||||
|
||||
- Avoid adding third-party dependencies, extra files, extra headers, etc.
|
||||
@@ -167,7 +128,7 @@ Maintainers reserve the right to decline review or close pull requests for any r
|
||||
|
||||
# Code maintenance
|
||||
|
||||
- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file responsible for:
|
||||
- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file reponsible for:
|
||||
- Reviewing and merging related PRs
|
||||
- Fixing related bugs
|
||||
- Providing developer guidance/support
|
||||
@@ -180,8 +141,6 @@ Maintainers reserve the right to decline review or close pull requests for any r
|
||||
- New code should follow the guidelines (coding, naming, etc.) outlined in this document. Exceptions are allowed in isolated, backend-specific parts of the code that do not interface directly with the `ggml` interfaces.
|
||||
_(NOTE: for legacy reasons, existing code is not required to follow this guideline)_
|
||||
|
||||
- For changes in server, please make sure to refer to the [server development documentation](./tools/server/README-dev.md)
|
||||
|
||||
# Documentation
|
||||
|
||||
- Documentation is a community effort
|
||||
|
||||
2
LICENSE
2
LICENSE
@@ -1,6 +1,6 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2023-2026 The ggml authors
|
||||
Copyright (c) 2023-2024 The ggml authors
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
65
README.md
65
README.md
@@ -17,14 +17,14 @@ LLM inference in C/C++
|
||||
|
||||
## Hot topics
|
||||
|
||||
- **Hugging Face cache migration: models downloaded with `-hf` are now stored in the standard Hugging Face cache directory, enabling sharing with other HF tools.**
|
||||
- **[guide : using the new WebUI of llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/16938)**
|
||||
- [guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)
|
||||
- [[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)
|
||||
- **[guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)**
|
||||
- **[[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)**
|
||||
- Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
|
||||
- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
|
||||
- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
|
||||
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
|
||||
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
|
||||
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
|
||||
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
|
||||
- Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
|
||||
|
||||
@@ -62,7 +62,6 @@ range of hardware - locally and in the cloud.
|
||||
- Plain C/C++ implementation without any dependencies
|
||||
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
|
||||
- AVX, AVX2, AVX512 and AMX support for x86 architectures
|
||||
- RVV, ZVFH, ZFH, ZICBOP and ZIHINTPAUSE support for RISC-V architectures
|
||||
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
|
||||
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
|
||||
- Vulkan and SYCL backend support
|
||||
@@ -85,7 +84,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
|
||||
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
|
||||
- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
|
||||
- [x] [Jamba](https://huggingface.co/ai21labs)
|
||||
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
|
||||
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
|
||||
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
|
||||
@@ -133,7 +131,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
|
||||
- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
|
||||
- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
|
||||
- [x] [RWKV-7](https://huggingface.co/collections/shoumenchougou/rwkv7-gxx-gguf)
|
||||
- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
|
||||
- [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
|
||||
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
|
||||
@@ -192,7 +189,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
|
||||
- Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi)
|
||||
- Go (no CGo needed): [hybridgroup/yzma](https://github.com/hybridgroup/yzma)
|
||||
- Android: [llama.android](/examples/llama.android)
|
||||
|
||||
</details>
|
||||
|
||||
@@ -202,7 +198,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
|
||||
|
||||
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
|
||||
- [BonzAI App](https://apps.apple.com/us/app/bonzai-your-local-ai-agent/id6752847988) (proprietary)
|
||||
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
|
||||
- [Dot](https://github.com/alexpinel/Dot) (GPL)
|
||||
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
|
||||
@@ -214,7 +209,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
|
||||
- [LARS](https://github.com/abgulati/LARS) (AGPL)
|
||||
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
|
||||
- [LlamaLib](https://github.com/undreamai/LlamaLib) (Apache-2.0)
|
||||
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
|
||||
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
|
||||
- [LMStudio](https://lmstudio.ai/) (proprietary)
|
||||
@@ -242,12 +236,11 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
<details>
|
||||
<summary>Tools</summary>
|
||||
|
||||
- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from Hugging Face Hub and convert them to GGML
|
||||
- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
|
||||
- [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp
|
||||
- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
|
||||
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
|
||||
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
|
||||
- [unslothai/unsloth](https://github.com/unslothai/unsloth) – 🦥 exports/saves fine-tuned and trained models to GGUF (Apache-2.0)
|
||||
|
||||
</details>
|
||||
|
||||
@@ -260,8 +253,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
|
||||
- [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
|
||||
- [llmaz](https://github.com/InftyAI/llmaz) - ☸️ Easy, advanced inference platform for large language models on Kubernetes.
|
||||
- [LLMKube](https://github.com/defilantech/llmkube) - Kubernetes operator for llama.cpp with multi-GPU and Apple Silicon Metal
|
||||
support"
|
||||
</details>
|
||||
|
||||
<details>
|
||||
@@ -280,19 +271,16 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
| [BLAS](docs/build.md#blas-build) | All |
|
||||
| [BLIS](docs/backend/BLIS.md) | All |
|
||||
| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
|
||||
| [OpenVINO [In Progress]](docs/backend/OPENVINO.md) | Intel CPUs, GPUs, and NPUs |
|
||||
| [MUSA](docs/build.md#musa) | Moore Threads GPU |
|
||||
| [CUDA](docs/build.md#cuda) | Nvidia GPU |
|
||||
| [HIP](docs/build.md#hip) | AMD GPU |
|
||||
| [ZenDNN](docs/build.md#zendnn) | AMD CPU |
|
||||
| [Vulkan](docs/build.md#vulkan) | GPU |
|
||||
| [CANN](docs/build.md#cann) | Ascend NPU |
|
||||
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
|
||||
| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
|
||||
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
|
||||
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
|
||||
| [Hexagon [In Progress]](docs/backend/snapdragon/README.md) | Snapdragon |
|
||||
| [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |
|
||||
| [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon |
|
||||
|
||||
## Obtaining and quantizing models
|
||||
|
||||
@@ -301,13 +289,13 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
|
||||
- [Trending](https://huggingface.co/models?library=gguf&sort=trending)
|
||||
- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
|
||||
|
||||
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
|
||||
You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
|
||||
|
||||
```sh
|
||||
llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
|
||||
```
|
||||
|
||||
By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. The `MODEL_ENDPOINT` must point to a Hugging Face compatible API endpoint.
|
||||
By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.
|
||||
|
||||
After downloading a model, use the CLI tools to run it locally - see below.
|
||||
|
||||
@@ -322,7 +310,7 @@ The Hugging Face platform provides a variety of online tools for converting, qua
|
||||
|
||||
To learn more about model quantization, [read this documentation](tools/quantize/README.md)
|
||||
|
||||
## [`llama-cli`](tools/cli)
|
||||
## [`llama-cli`](tools/main)
|
||||
|
||||
#### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
|
||||
|
||||
@@ -356,6 +344,19 @@ To learn more about model quantization, [read this documentation](tools/quantize
|
||||
|
||||
</details>
|
||||
|
||||
- <details>
|
||||
<summary>Run simple text completion</summary>
|
||||
|
||||
To disable conversation mode explicitly, use `-no-cnv`
|
||||
|
||||
```bash
|
||||
llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
|
||||
|
||||
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
- <details>
|
||||
<summary>Constrain the output with a custom grammar</summary>
|
||||
|
||||
@@ -490,6 +491,21 @@ To learn more about model quantization, [read this documentation](tools/quantize
|
||||
|
||||
</details>
|
||||
|
||||
## [`llama-run`](tools/run)
|
||||
|
||||
#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
|
||||
|
||||
- <details>
|
||||
<summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
|
||||
|
||||
```bash
|
||||
llama-run granite-code
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
[^3]: [RamaLama](https://github.com/containers/ramalama)
|
||||
|
||||
## [`llama-simple`](examples/simple)
|
||||
|
||||
#### A minimal example for implementing apps with `llama.cpp`. Useful for developers.
|
||||
@@ -519,8 +535,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
|
||||
|
||||
## Other documentation
|
||||
|
||||
- [cli](tools/cli/README.md)
|
||||
- [completion](tools/completion/README.md)
|
||||
- [main (cli)](tools/main/README.md)
|
||||
- [server](tools/server/README.md)
|
||||
- [GBNF grammars](grammars/README.md)
|
||||
|
||||
@@ -592,5 +607,7 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
|
||||
- [yhirose/cpp-httplib](https://github.com/yhirose/cpp-httplib) - Single-header HTTP server, used by `llama-server` - MIT license
|
||||
- [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
|
||||
- [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
|
||||
- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
|
||||
- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
|
||||
- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
|
||||
- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
|
||||
- [subprocess.h](https://github.com/sheredom/subprocess.h) - Single-header process launching solution for C and C++ - Public domain
|
||||
|
||||
53
SECURITY.md
53
SECURITY.md
@@ -1,52 +1,12 @@
|
||||
# Security Policy
|
||||
|
||||
- [**Reporting a vulnerability**](#reporting-a-vulnerability)
|
||||
- [**Requirements**](#requirements)
|
||||
- [**Covered Topics**](#covered-topics)
|
||||
- [**Using llama.cpp securely**](#using-llamacpp-securely)
|
||||
- [Untrusted models](#untrusted-models)
|
||||
- [Untrusted inputs](#untrusted-inputs)
|
||||
- [Data privacy](#data-privacy)
|
||||
- [Untrusted environments or networks](#untrusted-environments-or-networks)
|
||||
- [Multi-Tenant environments](#multi-tenant-environments)
|
||||
|
||||
## Reporting a vulnerability
|
||||
|
||||
If you have discovered a security vulnerability in this project that falls inside the [covered topics](#covered-topics), please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
|
||||
|
||||
Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
|
||||
|
||||
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
|
||||
|
||||
> [!IMPORTANT]
|
||||
> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
|
||||
|
||||
## Requirements
|
||||
|
||||
Before submitting your report, ensure you meet the following requirements:
|
||||
|
||||
- You have read this policy and fully understand it.
|
||||
- AI is only permitted in an assistive capacity as stated in [AGENTS.md](AGENTS.md). We do not accept reports that are written exclusively by AI.
|
||||
- Your report must include a working Proof-of-Concept in the form of a script and/or attached files.
|
||||
|
||||
Maintainers reserve the right to close the report if these requirements are not fulfilled.
|
||||
|
||||
## Covered Topics
|
||||
|
||||
Only vulnerabilities that fall within these parts of the project are considered valid. For problems falling outside of this list, please report them as issues.
|
||||
|
||||
- `src/**/*`
|
||||
- `ggml/**/*`
|
||||
- `gguf-py/**/*`
|
||||
- `tools/server/*`, **excluding** the following topics:
|
||||
- Web UI
|
||||
- Features marked as experimental
|
||||
- Features not recommended for use in untrusted environments (e.g., router, MCP)
|
||||
- Bugs that can lead to Denial-of-Service attack
|
||||
|
||||
Note that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities in LLaMA C++.
|
||||
|
||||
For vulnerabilities that fall within the `vendor` directory, please report them directly to the third-party project.
|
||||
- [**Reporting a vulnerability**](#reporting-a-vulnerability)
|
||||
|
||||
## Using llama.cpp securely
|
||||
|
||||
@@ -95,3 +55,14 @@ If you intend to run multiple models in parallel with shared memory, it is your
|
||||
3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
|
||||
|
||||
4. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
|
||||
|
||||
## Reporting a vulnerability
|
||||
|
||||
Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.
|
||||
|
||||
<!-- normal version -->
|
||||
However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
|
||||
|
||||
Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
|
||||
|
||||
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,6 +0,0 @@
|
||||
{
|
||||
"chars": 2296.1916666666666,
|
||||
"chars:std": 986.051306946325,
|
||||
"score": 0.925,
|
||||
"score:std": 0.26339134382131846
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
@@ -1,311 +0,0 @@
|
||||
## System info
|
||||
|
||||
```bash
|
||||
uname --all
|
||||
Linux spark-17ed 6.11.0-1016-nvidia #16-Ubuntu SMP PREEMPT_DYNAMIC Sun Sep 21 16:52:46 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux
|
||||
|
||||
g++ --version
|
||||
g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
|
||||
|
||||
nvidia-smi
|
||||
Thu Feb 5 13:49:40 2026
|
||||
+-----------------------------------------------------------------------------------------+
|
||||
| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 |
|
||||
+-----------------------------------------+------------------------+----------------------+
|
||||
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
||||
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
||||
| | | MIG M. |
|
||||
|=========================================+========================+======================|
|
||||
| 0 NVIDIA GB10 On | 0000000F:01:00.0 Off | N/A |
|
||||
| N/A 47C P0 13W / N/A | Not Supported | 0% Default |
|
||||
| | | N/A |
|
||||
+-----------------------------------------+------------------------+----------------------+
|
||||
```
|
||||
|
||||
## ggml-org/gpt-oss-20b-GGUF
|
||||
|
||||
Model: https://huggingface.co/ggml-org/gpt-oss-20b-GGUF
|
||||
|
||||
- `llama-batched-bench`
|
||||
|
||||
|
||||
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
|
||||
|
||||
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|
||||
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|
||||
| 512 | 32 | 1 | 544 | 0.270 | 1895.57 | 0.399 | 80.13 | 0.669 | 812.60 |
|
||||
| 512 | 32 | 2 | 1088 | 0.230 | 4451.23 | 0.583 | 109.71 | 0.813 | 1337.56 |
|
||||
| 512 | 32 | 4 | 2176 | 0.437 | 4688.87 | 0.820 | 156.03 | 1.257 | 1730.91 |
|
||||
| 512 | 32 | 8 | 4352 | 0.863 | 4744.23 | 0.942 | 271.79 | 1.805 | 2410.73 |
|
||||
| 512 | 32 | 16 | 8704 | 1.725 | 4748.19 | 1.173 | 436.38 | 2.899 | 3002.85 |
|
||||
| 512 | 32 | 32 | 17408 | 3.437 | 4767.38 | 1.503 | 681.49 | 4.939 | 3524.40 |
|
||||
| 4096 | 32 | 1 | 4128 | 0.907 | 4513.91 | 0.407 | 78.54 | 1.315 | 3139.56 |
|
||||
| 4096 | 32 | 2 | 8256 | 1.796 | 4560.42 | 0.625 | 102.37 | 2.422 | 3409.45 |
|
||||
| 4096 | 32 | 4 | 16512 | 3.596 | 4555.66 | 0.888 | 144.11 | 4.485 | 3681.93 |
|
||||
| 4096 | 32 | 8 | 33024 | 7.184 | 4561.44 | 1.098 | 233.11 | 8.282 | 3987.51 |
|
||||
| 4096 | 32 | 16 | 66048 | 14.369 | 4560.82 | 1.503 | 340.74 | 15.872 | 4161.30 |
|
||||
| 4096 | 32 | 32 | 132096 | 28.760 | 4557.52 | 2.162 | 473.59 | 30.922 | 4271.95 |
|
||||
| 8192 | 32 | 1 | 8224 | 1.859 | 4405.59 | 0.430 | 74.36 | 2.290 | 3591.61 |
|
||||
| 8192 | 32 | 2 | 16448 | 3.698 | 4430.02 | 0.656 | 97.59 | 4.354 | 3777.47 |
|
||||
| 8192 | 32 | 4 | 32896 | 7.403 | 4426.10 | 0.957 | 133.82 | 8.360 | 3934.97 |
|
||||
| 8192 | 32 | 8 | 65792 | 14.802 | 4427.63 | 1.222 | 209.44 | 16.024 | 4105.87 |
|
||||
| 8192 | 32 | 16 | 131584 | 29.596 | 4428.67 | 1.741 | 294.13 | 31.337 | 4199.00 |
|
||||
| 8192 | 32 | 32 | 263168 | 59.169 | 4430.42 | 2.619 | 390.92 | 61.789 | 4259.17 |
|
||||
|
||||
|
||||
- `llama-bench`
|
||||
|
||||
| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 4505.82 ± 12.90 |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 83.43 ± 0.59 |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 4158.34 ± 18.84 |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 79.22 ± 0.60 |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 3993.81 ± 17.55 |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 75.22 ± 1.05 |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 3449.98 ± 12.13 |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 70.36 ± 0.37 |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 2689.42 ± 18.89 |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 61.65 ± 0.30 |
|
||||
|
||||
build: 11fb327bf (7941)
|
||||
|
||||
## ggml-org/gpt-oss-120b-GGUF
|
||||
|
||||
Model: https://huggingface.co/ggml-org/gpt-oss-120b-GGUF
|
||||
|
||||
- `llama-batched-bench`
|
||||
|
||||
|
||||
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
|
||||
|
||||
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|
||||
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|
||||
| 512 | 32 | 1 | 544 | 0.445 | 1151.80 | 0.560 | 57.14 | 1.005 | 541.53 |
|
||||
| 512 | 32 | 2 | 1088 | 0.472 | 2169.85 | 0.874 | 73.27 | 1.345 | 808.65 |
|
||||
| 512 | 32 | 4 | 2176 | 0.826 | 2480.33 | 1.299 | 98.51 | 2.125 | 1023.94 |
|
||||
| 512 | 32 | 8 | 4352 | 1.644 | 2491.67 | 1.608 | 159.18 | 3.252 | 1338.20 |
|
||||
| 512 | 32 | 16 | 8704 | 3.292 | 2488.35 | 2.117 | 241.85 | 5.409 | 1609.13 |
|
||||
| 512 | 32 | 32 | 17408 | 6.604 | 2481.07 | 2.898 | 353.31 | 9.502 | 1832.04 |
|
||||
| 4096 | 32 | 1 | 4128 | 1.698 | 2412.65 | 0.580 | 55.21 | 2.277 | 1812.66 |
|
||||
| 4096 | 32 | 2 | 8256 | 3.399 | 2409.88 | 0.934 | 68.53 | 4.333 | 1905.27 |
|
||||
| 4096 | 32 | 4 | 16512 | 6.823 | 2401.21 | 1.411 | 90.72 | 8.234 | 2005.30 |
|
||||
| 4096 | 32 | 8 | 33024 | 13.574 | 2413.97 | 1.841 | 139.07 | 15.415 | 2142.31 |
|
||||
| 4096 | 32 | 16 | 66048 | 27.176 | 2411.52 | 2.609 | 196.26 | 29.785 | 2217.49 |
|
||||
| 4096 | 32 | 32 | 132096 | 54.359 | 2411.23 | 3.905 | 262.20 | 58.264 | 2267.19 |
|
||||
| 8192 | 32 | 1 | 8224 | 3.491 | 2346.81 | 0.613 | 52.23 | 4.103 | 2004.21 |
|
||||
| 8192 | 32 | 2 | 16448 | 6.939 | 2361.03 | 0.981 | 65.21 | 7.921 | 2076.56 |
|
||||
| 8192 | 32 | 4 | 32896 | 13.888 | 2359.40 | 1.511 | 84.71 | 15.399 | 2136.21 |
|
||||
| 8192 | 32 | 8 | 65792 | 27.756 | 2361.18 | 2.034 | 125.86 | 29.790 | 2208.56 |
|
||||
| 8192 | 32 | 16 | 131584 | 55.554 | 2359.34 | 3.021 | 169.49 | 58.575 | 2246.41 |
|
||||
| 8192 | 32 | 32 | 263168 | 111.036 | 2360.89 | 4.537 | 225.72 | 115.573 | 2277.08 |
|
||||
|
||||
|
||||
- `llama-bench`
|
||||
|
||||
| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 2443.91 ± 7.47 |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 58.72 ± 0.20 |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 2309.84 ± 3.63 |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 55.67 ± 0.35 |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 2216.68 ± 10.16 |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 52.87 ± 0.43 |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 1956.31 ± 6.39 |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 49.45 ± 0.20 |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 1567.08 ± 11.79 |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 42.76 ± 0.14 |
|
||||
|
||||
build: 11fb327bf (7941)
|
||||
|
||||
## ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF
|
||||
|
||||
Model: https://huggingface.co/ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF
|
||||
|
||||
- `llama-batched-bench`
|
||||
|
||||
|
||||
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
|
||||
|
||||
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|
||||
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|
||||
| 512 | 32 | 1 | 544 | 0.393 | 1303.73 | 0.548 | 58.36 | 0.941 | 578.10 |
|
||||
| 512 | 32 | 2 | 1088 | 0.387 | 2648.68 | 0.910 | 70.35 | 1.296 | 839.27 |
|
||||
| 512 | 32 | 4 | 2176 | 0.659 | 3107.63 | 1.302 | 98.33 | 1.961 | 1109.77 |
|
||||
| 512 | 32 | 8 | 4352 | 1.322 | 3099.35 | 1.669 | 153.42 | 2.990 | 1455.43 |
|
||||
| 512 | 32 | 16 | 8704 | 2.639 | 3104.63 | 2.212 | 231.44 | 4.851 | 1794.32 |
|
||||
| 512 | 32 | 32 | 17408 | 5.284 | 3100.80 | 2.955 | 346.53 | 8.239 | 2112.93 |
|
||||
| 4096 | 32 | 1 | 4128 | 1.417 | 2890.36 | 0.598 | 53.51 | 2.015 | 2048.45 |
|
||||
| 4096 | 32 | 2 | 8256 | 2.829 | 2895.62 | 1.019 | 62.82 | 3.848 | 2145.60 |
|
||||
| 4096 | 32 | 4 | 16512 | 5.656 | 2896.96 | 1.528 | 83.79 | 7.183 | 2298.71 |
|
||||
| 4096 | 32 | 8 | 33024 | 11.338 | 2890.02 | 2.127 | 120.36 | 13.465 | 2452.53 |
|
||||
| 4096 | 32 | 16 | 66048 | 22.709 | 2885.96 | 3.104 | 164.97 | 25.812 | 2558.79 |
|
||||
| 4096 | 32 | 32 | 132096 | 45.301 | 2893.35 | 4.723 | 216.80 | 50.024 | 2640.63 |
|
||||
| 8192 | 32 | 1 | 8224 | 3.022 | 2711.09 | 0.678 | 47.20 | 3.700 | 2222.89 |
|
||||
| 8192 | 32 | 2 | 16448 | 6.039 | 2713.01 | 1.149 | 55.70 | 7.188 | 2288.21 |
|
||||
| 8192 | 32 | 4 | 32896 | 12.050 | 2719.35 | 1.785 | 71.69 | 13.835 | 2377.67 |
|
||||
| 8192 | 32 | 8 | 65792 | 24.113 | 2717.90 | 2.629 | 97.39 | 26.741 | 2460.31 |
|
||||
| 8192 | 32 | 16 | 131584 | 48.178 | 2720.58 | 4.099 | 124.91 | 52.277 | 2517.06 |
|
||||
| 8192 | 32 | 32 | 263168 | 96.401 | 2719.31 | 6.696 | 152.93 | 103.097 | 2552.63 |
|
||||
|
||||
|
||||
- `llama-bench`
|
||||
|
||||
| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 2986.97 ± 18.87 |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 61.06 ± 0.23 |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 2633.45 ± 6.26 |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 54.77 ± 0.28 |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 2354.14 ± 3.84 |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 48.02 ± 0.40 |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 1908.86 ± 4.25 |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 40.23 ± 0.10 |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 1348.17 ± 2.00 |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 30.21 ± 0.04 |
|
||||
|
||||
build: 11fb327bf (7941)
|
||||
|
||||
## ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF
|
||||
|
||||
Model: https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF
|
||||
|
||||
- `llama-batched-bench`
|
||||
|
||||
|
||||
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
|
||||
|
||||
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|
||||
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|
||||
| 512 | 32 | 1 | 544 | 0.212 | 2420.12 | 1.100 | 29.10 | 1.311 | 414.85 |
|
||||
| 512 | 32 | 2 | 1088 | 0.428 | 2393.89 | 1.185 | 54.00 | 1.613 | 674.56 |
|
||||
| 512 | 32 | 4 | 2176 | 0.894 | 2290.41 | 1.229 | 104.17 | 2.123 | 1025.02 |
|
||||
| 512 | 32 | 8 | 4352 | 1.758 | 2330.36 | 1.319 | 194.15 | 3.076 | 1414.70 |
|
||||
| 512 | 32 | 16 | 8704 | 3.508 | 2335.21 | 1.543 | 331.90 | 5.051 | 1723.33 |
|
||||
| 512 | 32 | 32 | 17408 | 7.035 | 2328.93 | 1.738 | 589.21 | 8.773 | 1984.29 |
|
||||
| 4096 | 32 | 1 | 4128 | 1.831 | 2237.25 | 1.125 | 28.44 | 2.956 | 1396.42 |
|
||||
| 4096 | 32 | 2 | 8256 | 3.642 | 2249.48 | 1.253 | 51.07 | 4.895 | 1686.64 |
|
||||
| 4096 | 32 | 4 | 16512 | 7.274 | 2252.26 | 1.380 | 92.72 | 8.655 | 1907.81 |
|
||||
| 4096 | 32 | 8 | 33024 | 14.576 | 2248.09 | 1.617 | 158.29 | 16.193 | 2039.37 |
|
||||
| 4096 | 32 | 16 | 66048 | 29.138 | 2249.17 | 2.081 | 246.01 | 31.219 | 2115.63 |
|
||||
| 4096 | 32 | 32 | 132096 | 58.275 | 2249.19 | 2.814 | 363.87 | 61.089 | 2162.34 |
|
||||
| 8192 | 32 | 1 | 8224 | 3.757 | 2180.26 | 1.184 | 27.03 | 4.941 | 1664.37 |
|
||||
| 8192 | 32 | 2 | 16448 | 7.522 | 2178.05 | 1.341 | 47.73 | 8.863 | 1855.77 |
|
||||
| 8192 | 32 | 4 | 32896 | 15.043 | 2178.25 | 1.548 | 82.69 | 16.591 | 1982.74 |
|
||||
| 8192 | 32 | 8 | 65792 | 30.111 | 2176.49 | 1.937 | 132.13 | 32.048 | 2052.90 |
|
||||
| 8192 | 32 | 16 | 131584 | 60.405 | 2169.90 | 2.706 | 189.21 | 63.111 | 2084.97 |
|
||||
| 8192 | 32 | 32 | 263168 | 120.439 | 2176.58 | 3.993 | 256.46 | 124.432 | 2114.96 |
|
||||
|
||||
|
||||
- `llama-bench`
|
||||
|
||||
| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 2250.28 ± 6.41 |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 29.43 ± 0.02 |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 2100.19 ± 8.96 |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 28.61 ± 0.02 |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 2007.56 ± 4.16 |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 27.38 ± 0.09 |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 1779.11 ± 6.42 |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 25.72 ± 0.03 |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 1471.23 ± 1.71 |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 22.51 ± 0.02 |
|
||||
|
||||
build: 11fb327bf (7941)
|
||||
|
||||
## ggml-org/gemma-3-4b-it-qat-GGUF
|
||||
|
||||
Model: https://huggingface.co/ggml-org/gemma-3-4b-it-qat-GGUF
|
||||
|
||||
- `llama-batched-bench`
|
||||
|
||||
|
||||
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
|
||||
|
||||
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|
||||
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|
||||
| 512 | 32 | 1 | 544 | 0.092 | 5566.97 | 0.412 | 77.63 | 0.504 | 1078.95 |
|
||||
| 512 | 32 | 2 | 1088 | 0.161 | 6345.67 | 0.522 | 122.70 | 0.683 | 1593.06 |
|
||||
| 512 | 32 | 4 | 2176 | 0.325 | 6309.87 | 0.562 | 227.68 | 0.887 | 2453.87 |
|
||||
| 512 | 32 | 8 | 4352 | 0.643 | 6374.42 | 0.685 | 373.67 | 1.328 | 3277.94 |
|
||||
| 512 | 32 | 16 | 8704 | 1.277 | 6413.64 | 0.915 | 559.47 | 2.192 | 3970.01 |
|
||||
| 512 | 32 | 32 | 17408 | 2.518 | 6506.57 | 1.249 | 819.61 | 3.767 | 4620.64 |
|
||||
| 4096 | 32 | 1 | 4128 | 0.674 | 6079.68 | 0.453 | 70.60 | 1.127 | 3662.88 |
|
||||
| 4096 | 32 | 2 | 8256 | 1.335 | 6137.82 | 0.627 | 102.03 | 1.962 | 4208.11 |
|
||||
| 4096 | 32 | 4 | 16512 | 2.657 | 6167.35 | 0.749 | 170.92 | 3.405 | 4848.71 |
|
||||
| 4096 | 32 | 8 | 33024 | 5.307 | 6173.91 | 0.974 | 262.89 | 6.281 | 5257.53 |
|
||||
| 4096 | 32 | 16 | 66048 | 10.610 | 6176.96 | 1.379 | 371.42 | 11.988 | 5509.40 |
|
||||
| 4096 | 32 | 32 | 132096 | 21.213 | 6178.89 | 2.122 | 482.50 | 23.335 | 5660.82 |
|
||||
| 8192 | 32 | 1 | 8224 | 1.359 | 6027.34 | 0.467 | 68.52 | 1.826 | 4503.48 |
|
||||
| 8192 | 32 | 2 | 16448 | 2.699 | 6069.68 | 0.653 | 98.03 | 3.352 | 4906.68 |
|
||||
| 8192 | 32 | 4 | 32896 | 5.366 | 6106.74 | 0.818 | 156.55 | 6.184 | 5319.96 |
|
||||
| 8192 | 32 | 8 | 65792 | 10.755 | 6093.50 | 1.174 | 218.04 | 11.929 | 5515.22 |
|
||||
| 8192 | 32 | 16 | 131584 | 21.484 | 6100.82 | 1.829 | 279.90 | 23.314 | 5644.11 |
|
||||
| 8192 | 32 | 32 | 263168 | 42.950 | 6103.40 | 3.058 | 334.91 | 46.008 | 5720.05 |
|
||||
|
||||
|
||||
- `llama-bench`
|
||||
|
||||
| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 5948.74 ± 10.61 |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 81.05 ± 0.20 |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 5652.69 ± 34.29 |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 76.37 ± 0.58 |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 5509.57 ± 40.69 |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 71.61 ± 0.80 |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 5340.86 ± 36.92 |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 70.89 ± 0.34 |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 5023.30 ± 13.52 |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 62.28 ± 0.30 |
|
||||
|
||||
build: 11fb327bf (7941)
|
||||
|
||||
## ggml-org/GLM-4.7-Flash-GGUF
|
||||
|
||||
Model: https://huggingface.co/ggml-org/GLM-4.7-Flash-GGUF
|
||||
|
||||
- `llama-batched-bench`
|
||||
|
||||
|
||||
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
|
||||
|
||||
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|
||||
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|
||||
| 512 | 32 | 1 | 544 | 0.433 | 1181.83 | 0.693 | 46.16 | 1.126 | 482.94 |
|
||||
| 512 | 32 | 2 | 1088 | 0.439 | 2334.46 | 1.034 | 61.89 | 1.473 | 738.75 |
|
||||
| 512 | 32 | 4 | 2176 | 0.772 | 2654.46 | 1.459 | 87.76 | 2.230 | 975.77 |
|
||||
| 512 | 32 | 8 | 4352 | 1.541 | 2658.78 | 2.043 | 125.31 | 3.583 | 1214.47 |
|
||||
| 512 | 32 | 16 | 8704 | 3.083 | 2656.91 | 2.675 | 191.42 | 5.758 | 1511.62 |
|
||||
| 512 | 32 | 32 | 17408 | 6.159 | 2660.12 | 3.615 | 283.24 | 9.774 | 1780.98 |
|
||||
| 4096 | 32 | 1 | 4128 | 1.915 | 2139.30 | 0.725 | 44.14 | 2.640 | 1563.83 |
|
||||
| 4096 | 32 | 2 | 8256 | 3.834 | 2136.40 | 1.119 | 57.21 | 4.953 | 1666.81 |
|
||||
| 4096 | 32 | 4 | 16512 | 7.636 | 2145.72 | 1.631 | 78.49 | 9.266 | 1781.93 |
|
||||
| 4096 | 32 | 8 | 33024 | 15.295 | 2142.40 | 2.344 | 109.21 | 17.639 | 1872.20 |
|
||||
| 4096 | 32 | 16 | 66048 | 30.573 | 2143.62 | 3.773 | 135.70 | 34.346 | 1923.04 |
|
||||
| 4096 | 32 | 32 | 132096 | 61.282 | 2138.82 | 5.795 | 176.71 | 67.077 | 1969.31 |
|
||||
| 8192 | 32 | 1 | 8224 | 4.510 | 1816.24 | 0.760 | 42.11 | 5.270 | 1560.44 |
|
||||
| 8192 | 32 | 2 | 16448 | 9.036 | 1813.19 | 1.206 | 53.06 | 10.242 | 1605.91 |
|
||||
| 8192 | 32 | 4 | 32896 | 18.070 | 1813.43 | 1.783 | 71.80 | 19.852 | 1657.03 |
|
||||
| 8192 | 32 | 8 | 65792 | 36.125 | 1814.15 | 2.635 | 97.14 | 38.760 | 1697.41 |
|
||||
| 8192 | 32 | 16 | 131584 | 72.367 | 1811.20 | 4.954 | 103.34 | 77.322 | 1701.77 |
|
||||
| 8192 | 32 | 32 | 263168 | 144.501 | 1814.13 | 8.103 | 126.37 | 152.604 | 1724.51 |
|
||||
|
||||
|
||||
- `llama-bench`
|
||||
|
||||
| model | size | params | backend | ngl | n_ubatch | fa | dio | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | --: | --------------: | -------------------: |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 | 2364.18 ± 11.43 |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 | 48.68 ± 0.12 |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d4096 | 1684.13 ± 1.24 |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d4096 | 44.62 ± 0.22 |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d8192 | 1314.68 ± 1.41 |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d8192 | 42.59 ± 0.11 |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d16384 | 914.05 ± 3.32 |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d16384 | 38.72 ± 0.13 |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d32768 | 567.20 ± 0.90 |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d32768 | 32.65 ± 0.09 |
|
||||
|
||||
build: 11fb327bf (7941)
|
||||
File diff suppressed because one or more lines are too long
@@ -1,298 +0,0 @@
|
||||
## System info
|
||||
|
||||
```bash
|
||||
uname -a
|
||||
Darwin gg-studio 25.2.0 Darwin Kernel Version 25.2.0: Tue Nov 18 21:07:05 PST 2025; root:xnu-12377.61.12~1/RELEASE_ARM64_T6020 arm64
|
||||
|
||||
g++ --version
|
||||
Apple clang version 17.0.0 (clang-1700.3.19.1)
|
||||
Target: arm64-apple-darwin25.2.0
|
||||
```
|
||||
|
||||
## ggml-org/gpt-oss-20b-GGUF
|
||||
|
||||
Model: https://huggingface.co/ggml-org/gpt-oss-20b-GGUF
|
||||
|
||||
- `llama-batched-bench`
|
||||
|
||||
|
||||
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16
|
||||
|
||||
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|
||||
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|
||||
| 512 | 32 | 1 | 544 | 0.215 | 2381.35 | 0.245 | 130.45 | 0.460 | 1181.81 |
|
||||
| 512 | 32 | 2 | 1088 | 0.379 | 2701.43 | 0.382 | 167.56 | 0.761 | 1429.67 |
|
||||
| 512 | 32 | 4 | 2176 | 0.721 | 2839.27 | 0.604 | 211.76 | 1.326 | 1641.32 |
|
||||
| 512 | 32 | 8 | 4352 | 1.433 | 2858.30 | 1.033 | 247.75 | 2.466 | 1764.57 |
|
||||
| 512 | 32 | 16 | 8704 | 2.853 | 2871.12 | 1.570 | 326.11 | 4.423 | 1967.77 |
|
||||
| 512 | 32 | 32 | 17408 | 5.699 | 2874.95 | 1.910 | 536.15 | 7.609 | 2287.88 |
|
||||
| 4096 | 32 | 1 | 4128 | 1.552 | 2638.56 | 0.334 | 95.72 | 1.887 | 2188.00 |
|
||||
| 4096 | 32 | 2 | 8256 | 3.084 | 2655.88 | 0.404 | 158.54 | 3.488 | 2366.86 |
|
||||
| 4096 | 32 | 4 | 16512 | 6.151 | 2663.78 | 0.652 | 196.39 | 6.802 | 2427.37 |
|
||||
| 4096 | 32 | 8 | 33024 | 12.288 | 2666.77 | 1.135 | 225.47 | 13.423 | 2460.27 |
|
||||
| 4096 | 32 | 16 | 66048 | 24.563 | 2668.12 | 1.762 | 290.55 | 26.325 | 2508.97 |
|
||||
| 4096 | 32 | 32 | 132096 | 49.114 | 2668.73 | 2.398 | 426.94 | 51.512 | 2564.35 |
|
||||
| 8192 | 32 | 1 | 8224 | 3.345 | 2448.78 | 0.275 | 116.46 | 3.620 | 2271.76 |
|
||||
| 8192 | 32 | 2 | 16448 | 6.665 | 2458.11 | 0.425 | 150.71 | 7.090 | 2319.91 |
|
||||
| 8192 | 32 | 4 | 32896 | 13.315 | 2460.92 | 0.691 | 185.21 | 14.006 | 2348.63 |
|
||||
| 8192 | 32 | 8 | 65792 | 26.611 | 2462.73 | 1.212 | 211.16 | 27.823 | 2364.62 |
|
||||
| 8192 | 32 | 16 | 131584 | 53.232 | 2462.27 | 1.919 | 266.83 | 55.151 | 2385.88 |
|
||||
| 8192 | 32 | 32 | 263168 | 110.455 | 2373.30 | 2.752 | 372.03 | 113.208 | 2324.64 |
|
||||
|
||||
|
||||
- `llama-bench`
|
||||
|
||||
| model | size | params | backend | threads | n_ubatch | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | --------------: | -------------------: |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 2713.40 ± 3.56 |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 129.97 ± 3.90 |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 2324.59 ± 3.01 |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 123.38 ± 0.17 |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 1989.82 ± 30.11 |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 117.39 ± 0.33 |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 1556.54 ± 6.22 |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 109.75 ± 0.42 |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 1122.63 ± 1.45 |
|
||||
| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 98.25 ± 0.08 |
|
||||
|
||||
build: b828e18c7 (7948)
|
||||
|
||||
## ggml-org/gpt-oss-120b-GGUF
|
||||
|
||||
Model: https://huggingface.co/ggml-org/gpt-oss-120b-GGUF
|
||||
|
||||
- `llama-batched-bench`
|
||||
|
||||
|
||||
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16
|
||||
|
||||
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|
||||
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|
||||
| 512 | 32 | 1 | 544 | 0.426 | 1200.92 | 0.361 | 88.56 | 0.788 | 690.64 |
|
||||
| 512 | 32 | 2 | 1088 | 0.683 | 1500.14 | 0.545 | 117.35 | 1.228 | 886.02 |
|
||||
| 512 | 32 | 4 | 2176 | 1.204 | 1701.56 | 0.847 | 151.19 | 2.050 | 1061.34 |
|
||||
| 512 | 32 | 8 | 4352 | 2.402 | 1705.20 | 1.455 | 176.00 | 3.857 | 1128.45 |
|
||||
| 512 | 32 | 16 | 8704 | 4.802 | 1705.90 | 2.349 | 217.93 | 7.152 | 1217.08 |
|
||||
| 512 | 32 | 32 | 17408 | 9.593 | 1707.85 | 3.665 | 279.42 | 13.258 | 1313.01 |
|
||||
| 4096 | 32 | 1 | 4128 | 2.581 | 1587.08 | 0.390 | 82.12 | 2.970 | 1389.67 |
|
||||
| 4096 | 32 | 2 | 8256 | 5.124 | 1598.79 | 0.589 | 108.62 | 5.713 | 1445.10 |
|
||||
| 4096 | 32 | 4 | 16512 | 10.231 | 1601.47 | 0.928 | 137.98 | 11.158 | 1479.80 |
|
||||
| 4096 | 32 | 8 | 33024 | 20.468 | 1600.94 | 1.606 | 159.38 | 22.074 | 1496.04 |
|
||||
| 4096 | 32 | 16 | 66048 | 40.924 | 1601.42 | 2.639 | 193.99 | 43.563 | 1516.15 |
|
||||
| 4096 | 32 | 32 | 132096 | 81.819 | 1601.98 | 4.466 | 229.29 | 86.284 | 1530.94 |
|
||||
| 8192 | 32 | 1 | 8224 | 5.517 | 1484.74 | 0.409 | 78.16 | 5.927 | 1387.58 |
|
||||
| 8192 | 32 | 2 | 16448 | 11.008 | 1488.43 | 0.622 | 102.92 | 11.629 | 1414.34 |
|
||||
| 8192 | 32 | 4 | 32896 | 22.002 | 1489.29 | 0.987 | 129.66 | 22.990 | 1430.90 |
|
||||
| 8192 | 32 | 8 | 65792 | 46.051 | 1423.11 | 1.858 | 137.79 | 47.909 | 1373.27 |
|
||||
| 8192 | 32 | 16 | 131584 | 97.680 | 1341.85 | 2.872 | 178.28 | 100.552 | 1308.62 |
|
||||
| 8192 | 32 | 32 | 263168 | 176.407 | 1486.02 | 5.048 | 202.85 | 181.455 | 1450.32 |
|
||||
|
||||
|
||||
- `llama-bench`
|
||||
|
||||
| model | size | params | backend | threads | n_ubatch | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | --------------: | -------------------: |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 1648.69 ± 1.80 |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 85.60 ± 0.52 |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 1429.86 ± 1.01 |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 82.03 ± 0.12 |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 1257.90 ± 1.81 |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 78.23 ± 0.33 |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 1013.49 ± 0.70 |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 73.20 ± 0.28 |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 721.11 ± 0.58 |
|
||||
| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 65.52 ± 0.10 |
|
||||
|
||||
build: b828e18c7 (7948)
|
||||
|
||||
## ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF
|
||||
|
||||
Model: https://huggingface.co/ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF
|
||||
|
||||
- `llama-batched-bench`
|
||||
|
||||
|
||||
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16
|
||||
|
||||
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|
||||
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|
||||
| 512 | 32 | 1 | 544 | 0.243 | 2109.23 | 0.419 | 76.34 | 0.662 | 821.84 |
|
||||
| 512 | 32 | 2 | 1088 | 0.406 | 2521.40 | 0.575 | 111.36 | 0.981 | 1109.27 |
|
||||
| 512 | 32 | 4 | 2176 | 0.744 | 2751.65 | 0.841 | 152.22 | 1.585 | 1372.71 |
|
||||
| 512 | 32 | 8 | 4352 | 1.479 | 2770.20 | 1.330 | 192.48 | 2.809 | 1549.53 |
|
||||
| 512 | 32 | 16 | 8704 | 2.951 | 2776.20 | 2.572 | 199.05 | 5.523 | 1575.93 |
|
||||
| 512 | 32 | 32 | 17408 | 5.899 | 2777.64 | 2.603 | 393.34 | 8.502 | 2047.54 |
|
||||
| 4096 | 32 | 1 | 4128 | 1.901 | 2154.15 | 0.474 | 67.58 | 2.375 | 1738.14 |
|
||||
| 4096 | 32 | 2 | 8256 | 3.788 | 2162.89 | 0.652 | 98.17 | 4.439 | 1859.69 |
|
||||
| 4096 | 32 | 4 | 16512 | 7.564 | 2166.18 | 0.990 | 129.24 | 8.554 | 1930.34 |
|
||||
| 4096 | 32 | 8 | 33024 | 15.121 | 2166.98 | 1.632 | 156.82 | 16.754 | 1971.12 |
|
||||
| 4096 | 32 | 16 | 66048 | 30.241 | 2167.09 | 3.166 | 161.72 | 33.407 | 1977.04 |
|
||||
| 4096 | 32 | 32 | 132096 | 60.474 | 2167.42 | 3.780 | 270.93 | 64.254 | 2055.86 |
|
||||
| 8192 | 32 | 1 | 8224 | 4.733 | 1730.92 | 0.483 | 66.29 | 5.215 | 1576.85 |
|
||||
| 8192 | 32 | 2 | 16448 | 9.459 | 1732.09 | 0.722 | 88.58 | 10.182 | 1615.46 |
|
||||
| 8192 | 32 | 4 | 32896 | 18.912 | 1732.65 | 1.120 | 114.26 | 20.032 | 1642.14 |
|
||||
| 8192 | 32 | 8 | 65792 | 37.797 | 1733.91 | 1.873 | 136.67 | 39.670 | 1658.49 |
|
||||
| 8192 | 32 | 16 | 131584 | 84.133 | 1557.92 | 3.718 | 137.72 | 87.850 | 1497.82 |
|
||||
| 8192 | 32 | 32 | 263168 | 157.550 | 1663.88 | 4.854 | 210.98 | 162.403 | 1620.46 |
|
||||
|
||||
|
||||
- `llama-bench`
|
||||
|
||||
| model | size | params | backend | threads | n_ubatch | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | --------------: | -------------------: |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 2453.11 ± 1.70 |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 78.97 ± 0.46 |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 1569.46 ± 1.97 |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 71.18 ± 0.37 |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 1145.51 ± 1.16 |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 65.11 ± 0.36 |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 741.04 ± 0.74 |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 56.87 ± 0.14 |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 431.31 ± 0.31 |
|
||||
| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 45.26 ± 0.11 |
|
||||
|
||||
build: b828e18c7 (7948)
|
||||
|
||||
## ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF
|
||||
|
||||
Model: https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF
|
||||
|
||||
- `llama-batched-bench`
|
||||
|
||||
|
||||
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16
|
||||
|
||||
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|
||||
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|
||||
| 512 | 32 | 1 | 544 | 0.339 | 1509.22 | 0.409 | 78.17 | 0.749 | 726.67 |
|
||||
| 512 | 32 | 2 | 1088 | 0.646 | 1584.93 | 0.483 | 132.45 | 1.129 | 963.45 |
|
||||
| 512 | 32 | 4 | 2176 | 1.258 | 1627.50 | 0.585 | 218.67 | 1.844 | 1180.21 |
|
||||
| 512 | 32 | 8 | 4352 | 2.506 | 1634.41 | 1.005 | 254.83 | 3.511 | 1239.64 |
|
||||
| 512 | 32 | 16 | 8704 | 5.007 | 1635.99 | 1.595 | 321.07 | 6.602 | 1318.38 |
|
||||
| 512 | 32 | 32 | 17408 | 10.007 | 1637.19 | 1.676 | 611.12 | 11.683 | 1490.03 |
|
||||
| 4096 | 32 | 1 | 4128 | 2.730 | 1500.46 | 0.431 | 74.31 | 3.160 | 1306.12 |
|
||||
| 4096 | 32 | 2 | 8256 | 5.446 | 1504.33 | 0.524 | 122.04 | 5.970 | 1382.91 |
|
||||
| 4096 | 32 | 4 | 16512 | 10.875 | 1506.59 | 0.662 | 193.45 | 11.537 | 1431.28 |
|
||||
| 4096 | 32 | 8 | 33024 | 21.749 | 1506.61 | 1.158 | 221.11 | 22.907 | 1441.64 |
|
||||
| 4096 | 32 | 16 | 66048 | 43.477 | 1507.36 | 1.901 | 269.32 | 45.378 | 1455.49 |
|
||||
| 4096 | 32 | 32 | 132096 | 86.954 | 1507.37 | 2.325 | 440.42 | 89.279 | 1479.59 |
|
||||
| 8192 | 32 | 1 | 8224 | 5.940 | 1379.21 | 0.449 | 71.20 | 6.389 | 1287.20 |
|
||||
| 8192 | 32 | 2 | 16448 | 11.865 | 1380.84 | 0.559 | 114.59 | 12.424 | 1323.92 |
|
||||
| 8192 | 32 | 4 | 32896 | 23.723 | 1381.25 | 0.728 | 175.80 | 24.452 | 1345.35 |
|
||||
| 8192 | 32 | 8 | 65792 | 47.434 | 1381.63 | 1.279 | 200.09 | 48.713 | 1350.60 |
|
||||
| 8192 | 32 | 16 | 131584 | 94.864 | 1381.69 | 2.198 | 232.97 | 97.061 | 1355.68 |
|
||||
| 8192 | 32 | 32 | 263168 | 189.743 | 1381.57 | 3.052 | 335.50 | 192.795 | 1365.01 |
|
||||
|
||||
|
||||
- `llama-bench`
|
||||
|
||||
| model | size | params | backend | threads | n_ubatch | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | --------------: | -------------------: |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 1565.91 ± 0.86 |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 79.68 ± 0.39 |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 1317.41 ± 1.02 |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 74.70 ± 0.04 |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 1134.65 ± 0.76 |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 71.31 ± 0.12 |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 886.46 ± 0.78 |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 65.93 ± 0.06 |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 612.21 ± 0.30 |
|
||||
| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 56.83 ± 0.02 |
|
||||
|
||||
build: b828e18c7 (7948)
|
||||
|
||||
## ggml-org/gemma-3-4b-it-qat-GGUF
|
||||
|
||||
Model: https://huggingface.co/ggml-org/gemma-3-4b-it-qat-GGUF
|
||||
|
||||
- `llama-batched-bench`
|
||||
|
||||
|
||||
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16
|
||||
|
||||
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|
||||
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|
||||
| 512 | 32 | 1 | 544 | 0.186 | 2748.06 | 0.235 | 136.28 | 0.421 | 1291.78 |
|
||||
| 512 | 32 | 2 | 1088 | 0.342 | 2990.95 | 0.312 | 204.99 | 0.655 | 1662.15 |
|
||||
| 512 | 32 | 4 | 2176 | 0.662 | 3092.69 | 0.404 | 316.97 | 1.066 | 2041.21 |
|
||||
| 512 | 32 | 8 | 4352 | 1.317 | 3110.41 | 0.579 | 441.80 | 1.896 | 2294.97 |
|
||||
| 512 | 32 | 16 | 8704 | 2.625 | 3120.23 | 1.207 | 424.08 | 3.833 | 2270.93 |
|
||||
| 512 | 32 | 32 | 17408 | 5.242 | 3125.34 | 1.299 | 788.23 | 6.541 | 2661.19 |
|
||||
| 4096 | 32 | 1 | 4128 | 1.408 | 2909.90 | 0.296 | 108.07 | 1.704 | 2422.95 |
|
||||
| 4096 | 32 | 2 | 8256 | 2.793 | 2933.40 | 0.325 | 197.00 | 3.118 | 2648.25 |
|
||||
| 4096 | 32 | 4 | 16512 | 5.567 | 2943.22 | 0.440 | 291.07 | 6.006 | 2749.05 |
|
||||
| 4096 | 32 | 8 | 33024 | 11.114 | 2948.23 | 0.640 | 400.26 | 11.754 | 2809.59 |
|
||||
| 4096 | 32 | 16 | 66048 | 22.217 | 2949.76 | 1.327 | 385.83 | 23.544 | 2805.26 |
|
||||
| 4096 | 32 | 32 | 132096 | 44.420 | 2950.77 | 1.553 | 659.30 | 45.973 | 2873.36 |
|
||||
| 8192 | 32 | 1 | 8224 | 2.860 | 2864.58 | 0.250 | 127.90 | 3.110 | 2644.42 |
|
||||
| 8192 | 32 | 2 | 16448 | 5.702 | 2873.63 | 0.335 | 191.07 | 6.036 | 2724.77 |
|
||||
| 8192 | 32 | 4 | 32896 | 11.383 | 2878.69 | 0.456 | 280.72 | 11.839 | 2778.63 |
|
||||
| 8192 | 32 | 8 | 65792 | 22.750 | 2880.75 | 0.671 | 381.48 | 23.421 | 2809.14 |
|
||||
| 8192 | 32 | 16 | 131584 | 45.484 | 2881.74 | 1.406 | 364.04 | 46.890 | 2806.22 |
|
||||
| 8192 | 32 | 32 | 263168 | 90.956 | 2882.10 | 1.793 | 570.98 | 92.749 | 2837.41 |
|
||||
|
||||
|
||||
- `llama-bench`
|
||||
|
||||
| model | size | params | backend | threads | n_ubatch | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | --------------: | -------------------: |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 2923.59 ± 3.10 |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 134.28 ± 1.29 |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 2748.21 ± 3.05 |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 133.11 ± 0.08 |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 2641.45 ± 2.31 |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 125.85 ± 0.35 |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 2446.20 ± 2.94 |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 125.00 ± 0.12 |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 2129.18 ± 7.43 |
|
||||
| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 113.14 ± 0.10 |
|
||||
|
||||
build: b828e18c7 (7948)
|
||||
|
||||
## ggml-org/GLM-4.7-Flash-GGUF
|
||||
|
||||
Model: https://huggingface.co/ggml-org/GLM-4.7-Flash-GGUF
|
||||
|
||||
- `llama-batched-bench`
|
||||
|
||||
|
||||
main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16
|
||||
|
||||
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|
||||
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|
||||
| 512 | 32 | 1 | 544 | 0.326 | 1568.69 | 0.522 | 61.28 | 0.849 | 641.09 |
|
||||
| 512 | 32 | 2 | 1088 | 0.528 | 1939.42 | 0.744 | 86.07 | 1.272 | 855.63 |
|
||||
| 512 | 32 | 4 | 2176 | 0.968 | 2114.85 | 1.105 | 115.85 | 2.073 | 1049.56 |
|
||||
| 512 | 32 | 8 | 4352 | 1.928 | 2124.62 | 1.684 | 151.99 | 3.612 | 1204.82 |
|
||||
| 512 | 32 | 16 | 8704 | 3.844 | 2131.34 | 3.141 | 162.99 | 6.985 | 1246.11 |
|
||||
| 512 | 32 | 32 | 17408 | 7.683 | 2132.38 | 3.924 | 260.95 | 11.608 | 1499.71 |
|
||||
| 4096 | 32 | 1 | 4128 | 3.280 | 1248.75 | 0.723 | 44.29 | 4.003 | 1031.33 |
|
||||
| 4096 | 32 | 2 | 8256 | 6.545 | 1251.63 | 0.930 | 68.85 | 7.475 | 1104.53 |
|
||||
| 4096 | 32 | 4 | 16512 | 13.080 | 1252.64 | 1.454 | 88.03 | 14.534 | 1136.12 |
|
||||
| 4096 | 32 | 8 | 33024 | 26.154 | 1252.90 | 2.388 | 107.20 | 28.542 | 1157.04 |
|
||||
| 4096 | 32 | 16 | 66048 | 52.297 | 1253.14 | 4.724 | 108.37 | 57.022 | 1158.30 |
|
||||
| 4096 | 32 | 32 | 132096 | 104.578 | 1253.34 | 7.266 | 140.93 | 111.844 | 1181.08 |
|
||||
| 8192 | 32 | 1 | 8224 | 9.623 | 851.31 | 0.767 | 41.72 | 10.390 | 791.54 |
|
||||
| 8192 | 32 | 2 | 16448 | 20.916 | 783.32 | 1.148 | 55.74 | 22.064 | 745.45 |
|
||||
| 8192 | 32 | 4 | 32896 | 43.509 | 753.14 | 1.833 | 69.82 | 45.342 | 725.51 |
|
||||
| 8192 | 32 | 8 | 65792 | 79.621 | 823.10 | 3.180 | 80.50 | 82.801 | 794.58 |
|
||||
| 8192 | 32 | 16 | 131584 | 153.770 | 852.39 | 6.502 | 78.74 | 160.272 | 821.00 |
|
||||
| 8192 | 32 | 32 | 263168 | 307.539 | 852.39 | 10.839 | 94.48 | 318.378 | 826.59 |
|
||||
|
||||
|
||||
- `llama-bench`
|
||||
|
||||
| model | size | params | backend | threads | n_ubatch | fa | test | t/s |
|
||||
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | --------------: | -------------------: |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 1629.33 ± 0.27 |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 59.58 ± 0.13 |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 732.67 ± 0.42 |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 47.44 ± 0.15 |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 474.33 ± 0.33 |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 40.20 ± 0.20 |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 277.46 ± 0.09 |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 31.50 ± 0.93 |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 151.44 ± 0.05 |
|
||||
| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 21.81 ± 0.01 |
|
||||
|
||||
build: b828e18c7 (7948)
|
||||
@@ -1,117 +0,0 @@
|
||||
# NVIDIA DGX Spark
|
||||
|
||||
## System info
|
||||
|
||||
```bash
|
||||
uname --all
|
||||
Linux spark-17ed 6.11.0-1016-nvidia #16-Ubuntu SMP PREEMPT_DYNAMIC Sun Sep 21 16:52:46 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux
|
||||
|
||||
g++ --version
|
||||
g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
|
||||
|
||||
nvidia-smi
|
||||
Fri Mar 6 11:39:45 2026
|
||||
+-----------------------------------------------------------------------------------------+
|
||||
| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 |
|
||||
+-----------------------------------------+------------------------+----------------------+
|
||||
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
||||
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
||||
| | | MIG M. |
|
||||
|=========================================+========================+======================|
|
||||
| 0 NVIDIA GB10 On | 0000000F:01:00.0 Off | N/A |
|
||||
| N/A 52C P0 13W / N/A | Not Supported | 0% Default |
|
||||
| | | N/A |
|
||||
+-----------------------------------------+------------------------+----------------------+
|
||||
```
|
||||
|
||||
## ggml-org/Nemotron-3-Super-120B-GGUF
|
||||
|
||||
Model: https://huggingface.co/ggml-org/Nemotron-3-Super-120B-GGUF
|
||||
|
||||
- `llama-batched-bench`
|
||||
|
||||
main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = 99, n_threads = 20, n_threads_batch = 20
|
||||
|
||||
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|
||||
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|
||||
| 512 | 32 | 1 | 544 | 1.094 | 468.05 | 1.621 | 19.74 | 2.715 | 200.37 |
|
||||
| 512 | 32 | 2 | 1088 | 1.463 | 700.16 | 2.437 | 26.26 | 3.900 | 279.01 |
|
||||
| 512 | 32 | 4 | 2176 | 2.647 | 773.76 | 4.043 | 31.66 | 6.689 | 325.29 |
|
||||
| 512 | 32 | 8 | 4352 | 5.291 | 774.14 | 6.151 | 41.62 | 11.442 | 380.37 |
|
||||
| 512 | 32 | 16 | 8704 | 10.603 | 772.62 | 10.385 | 49.30 | 20.987 | 414.72 |
|
||||
| 512 | 32 | 32 | 17408 | 21.231 | 771.69 | 18.235 | 56.16 | 39.466 | 441.09 |
|
||||
| 4096 | 32 | 1 | 4128 | 5.340 | 767.05 | 1.616 | 19.81 | 6.956 | 593.47 |
|
||||
| 4096 | 32 | 2 | 8256 | 10.673 | 767.55 | 2.454 | 26.08 | 13.127 | 628.94 |
|
||||
| 4096 | 32 | 4 | 16512 | 21.348 | 767.46 | 4.072 | 31.44 | 25.420 | 649.57 |
|
||||
| 4096 | 32 | 8 | 33024 | 42.714 | 767.15 | 6.277 | 40.78 | 48.991 | 674.08 |
|
||||
| 4096 | 32 | 16 | 66048 | 85.385 | 767.54 | 10.596 | 48.32 | 95.981 | 688.14 |
|
||||
| 4096 | 32 | 32 | 132096 | 170.819 | 767.32 | 18.619 | 55.00 | 189.437 | 697.31 |
|
||||
| 8192 | 32 | 1 | 8224 | 10.690 | 766.32 | 1.619 | 19.76 | 12.310 | 668.10 |
|
||||
| 8192 | 32 | 2 | 16448 | 21.382 | 766.24 | 2.467 | 25.94 | 23.850 | 689.65 |
|
||||
| 8192 | 32 | 4 | 32896 | 42.782 | 765.92 | 4.098 | 31.23 | 46.881 | 701.69 |
|
||||
| 8192 | 32 | 8 | 65792 | 85.582 | 765.77 | 6.368 | 40.20 | 91.951 | 715.52 |
|
||||
| 8192 | 32 | 16 | 131584 | 171.066 | 766.21 | 10.774 | 47.52 | 181.840 | 723.62 |
|
||||
| 8192 | 32 | 32 | 263168 | 342.140 | 766.19 | 18.969 | 53.98 | 361.109 | 728.78 |
|
||||
|
||||
- `llama-bench`
|
||||
|
||||
| model | size | params | backend | n_ubatch | fa | test | t/s |
|
||||
| ----------------------- | ---------: | ---------: | ---------- | -------: | -: | --------------: | -------------------: |
|
||||
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 | 768.84 ± 0.90 |
|
||||
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 | 19.94 ± 0.16 |
|
||||
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 @ d4096 | 764.51 ± 0.50 |
|
||||
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 @ d4096 | 19.95 ± 0.18 |
|
||||
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 @ d8192 | 759.53 ± 0.71 |
|
||||
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 @ d8192 | 19.83 ± 0.18 |
|
||||
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 @ d16384 | 747.98 ± 1.58 |
|
||||
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 @ d16384 | 19.84 ± 0.18 |
|
||||
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 @ d32768 | 724.40 ± 2.70 |
|
||||
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 @ d32768 | 19.45 ± 0.18 |
|
||||
|
||||
build: 04a65daab (8268)
|
||||
|
||||
## ggml-org/Nemotron-3-Nano-4B-GGUF
|
||||
|
||||
Model: https://huggingface.co/ggml-org/Nemotron-3-Nano-4B-GGUF
|
||||
|
||||
- `llama-batched-bench`
|
||||
|
||||
main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = 99, n_threads = 20, n_threads_batch = 20
|
||||
|
||||
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|
||||
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|
||||
| 512 | 32 | 1 | 544 | 0.152 | 3371.61 | 0.597 | 53.64 | 0.748 | 726.90 |
|
||||
| 512 | 32 | 2 | 1088 | 0.319 | 3208.68 | 0.857 | 74.66 | 1.176 | 924.89 |
|
||||
| 512 | 32 | 4 | 2176 | 0.720 | 2843.56 | 1.323 | 96.78 | 2.043 | 1065.18 |
|
||||
| 512 | 32 | 8 | 4352 | 1.428 | 2867.96 | 2.311 | 110.76 | 3.739 | 1163.82 |
|
||||
| 512 | 32 | 16 | 8704 | 2.857 | 2866.94 | 4.203 | 121.82 | 7.060 | 1232.82 |
|
||||
| 512 | 32 | 32 | 17408 | 5.709 | 2869.76 | 7.964 | 128.58 | 13.673 | 1273.14 |
|
||||
| 4096 | 32 | 1 | 4128 | 1.458 | 2809.76 | 0.605 | 52.92 | 2.062 | 2001.52 |
|
||||
| 4096 | 32 | 2 | 8256 | 2.905 | 2819.95 | 0.875 | 73.12 | 3.780 | 2183.95 |
|
||||
| 4096 | 32 | 4 | 16512 | 5.790 | 2829.74 | 1.361 | 94.07 | 7.151 | 2309.17 |
|
||||
| 4096 | 32 | 8 | 33024 | 11.598 | 2825.32 | 2.378 | 107.65 | 13.976 | 2362.89 |
|
||||
| 4096 | 32 | 16 | 66048 | 23.208 | 2823.88 | 4.348 | 117.76 | 27.556 | 2396.89 |
|
||||
| 4096 | 32 | 32 | 132096 | 46.515 | 2817.85 | 8.279 | 123.69 | 54.794 | 2410.79 |
|
||||
| 8192 | 32 | 1 | 8224 | 2.950 | 2776.95 | 0.617 | 51.89 | 3.567 | 2305.75 |
|
||||
| 8192 | 32 | 2 | 16448 | 5.921 | 2767.32 | 0.896 | 71.45 | 6.816 | 2413.05 |
|
||||
| 8192 | 32 | 4 | 32896 | 11.842 | 2767.21 | 1.401 | 91.34 | 13.243 | 2484.03 |
|
||||
| 8192 | 32 | 8 | 65792 | 23.726 | 2762.17 | 2.461 | 104.03 | 26.187 | 2512.38 |
|
||||
| 8192 | 32 | 16 | 131584 | 47.777 | 2743.43 | 4.577 | 111.86 | 52.354 | 2513.36 |
|
||||
| 8192 | 32 | 32 | 263168 | 96.691 | 2711.16 | 8.772 | 116.73 | 105.463 | 2495.36 |
|
||||
|
||||
- `llama-bench`
|
||||
|
||||
| model | size | params | backend | n_ubatch | fa | test | t/s |
|
||||
| ----------------------- | ---------: | ---------: | ---------- | -------: | -: | --------------: | -------------------: |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 | 2761.90 ± 19.31 |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 | 52.85 ± 0.12 |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d4096 | 2687.07 ± 21.84 |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d4096 | 52.32 ± 0.23 |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d8192 | 2564.52 ± 57.69 |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d8192 | 51.27 ± 0.34 |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d16384 | 2334.02 ± 37.83 |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d16384 | 49.71 ± 0.14 |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | pp2048 @ d32768 | 2041.46 ± 40.45 |
|
||||
| nemotron 4B Q8_0 | 3.94 GiB | 3.97 B | CUDA | 2048 | 1 | tg32 @ d32768 | 46.71 ± 0.13 |
|
||||
|
||||
build: 1bbec6a75 (8382)
|
||||
@@ -43,6 +43,11 @@ COMMON_CMAKE_ARGS=(
|
||||
-DGGML_OPENMP=${GGML_OPENMP}
|
||||
)
|
||||
|
||||
XCODE_VERSION=$(xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }')
|
||||
MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1)
|
||||
MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2)
|
||||
echo "Detected Xcode version: $XCODE_VERSION"
|
||||
|
||||
check_required_tool() {
|
||||
local tool=$1
|
||||
local install_message=$2
|
||||
@@ -55,12 +60,9 @@ check_required_tool() {
|
||||
}
|
||||
echo "Checking for required tools..."
|
||||
check_required_tool "cmake" "Please install CMake 3.28.0 or later (brew install cmake)"
|
||||
check_required_tool "xcrun" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
|
||||
|
||||
XCODE_VERSION=$(xcrun xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }')
|
||||
MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1)
|
||||
MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2)
|
||||
echo "Detected Xcode version: $XCODE_VERSION"
|
||||
check_required_tool "xcodebuild" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
|
||||
check_required_tool "libtool" "Please install libtool which should be available with Xcode Command Line Tools (CLT). Make sure Xcode CLT is installed (xcode-select --install)"
|
||||
check_required_tool "dsymutil" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
|
||||
|
||||
set -e
|
||||
|
||||
@@ -258,7 +260,7 @@ combine_static_libraries() {
|
||||
|
||||
# Since we have multiple architectures libtool will find object files that do not
|
||||
# match the target architecture. We suppress these warnings.
|
||||
xcrun libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null
|
||||
libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null
|
||||
|
||||
# Determine SDK, architectures, and install_name based on platform and simulator flag.
|
||||
local sdk=""
|
||||
@@ -331,7 +333,7 @@ combine_static_libraries() {
|
||||
|
||||
# Platform-specific post-processing for device builds
|
||||
if [[ "$is_simulator" == "false" ]]; then
|
||||
if xcrun -f vtool &>/dev/null; then
|
||||
if command -v xcrun vtool &>/dev/null; then
|
||||
case "$platform" in
|
||||
"ios")
|
||||
echo "Marking binary as a framework binary for iOS..."
|
||||
@@ -412,7 +414,7 @@ cmake -B build-ios-sim -G Xcode \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-S .
|
||||
cmake --build build-ios-sim --config Release -- -quiet
|
||||
|
||||
@@ -426,7 +428,7 @@ cmake -B build-ios-device -G Xcode \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-S .
|
||||
cmake --build build-ios-device --config Release -- -quiet
|
||||
|
||||
@@ -437,7 +439,7 @@ cmake -B build-macos -G Xcode \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-S .
|
||||
cmake --build build-macos --config Release -- -quiet
|
||||
|
||||
@@ -449,10 +451,9 @@ cmake -B build-visionos -G Xcode \
|
||||
-DCMAKE_SYSTEM_NAME=visionOS \
|
||||
-DCMAKE_OSX_SYSROOT=xros \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-S .
|
||||
cmake --build build-visionos --config Release -- -quiet
|
||||
|
||||
@@ -464,10 +465,9 @@ cmake -B build-visionos-sim -G Xcode \
|
||||
-DCMAKE_SYSTEM_NAME=visionOS \
|
||||
-DCMAKE_OSX_SYSROOT=xrsimulator \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-S .
|
||||
cmake --build build-visionos-sim --config Release -- -quiet
|
||||
|
||||
@@ -483,7 +483,7 @@ cmake -B build-tvos-sim -G Xcode \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-S .
|
||||
cmake --build build-tvos-sim --config Release -- -quiet
|
||||
|
||||
@@ -498,7 +498,7 @@ cmake -B build-tvos-device -G Xcode \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_CURL=OFF \
|
||||
-S .
|
||||
cmake --build build-tvos-device --config Release -- -quiet
|
||||
|
||||
@@ -524,13 +524,13 @@ combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false"
|
||||
|
||||
# Create XCFramework with correct debug symbols paths
|
||||
echo "Creating XCFramework..."
|
||||
xcrun xcodebuild -create-xcframework \
|
||||
xcodebuild -create-xcframework \
|
||||
-framework $(pwd)/build-ios-sim/framework/llama.framework \
|
||||
-debug-symbols $(pwd)/build-ios-sim/dSYMs/llama.dSYM \
|
||||
-framework $(pwd)/build-ios-device/framework/llama.framework \
|
||||
-debug-symbols $(pwd)/build-ios-device/dSYMs/llama.dSYM \
|
||||
-framework $(pwd)/build-macos/framework/llama.framework \
|
||||
-debug-symbols $(pwd)/build-macos/dSYMs/llama.dSYM \
|
||||
-debug-symbols $(pwd)/build-macos/dSYMS/llama.dSYM \
|
||||
-framework $(pwd)/build-visionos/framework/llama.framework \
|
||||
-debug-symbols $(pwd)/build-visionos/dSYMs/llama.dSYM \
|
||||
-framework $(pwd)/build-visionos-sim/framework/llama.framework \
|
||||
|
||||
209
ci/run.sh
209
ci/run.sh
@@ -25,15 +25,6 @@
|
||||
# # with KLEIDIAI support
|
||||
# GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
#
|
||||
# # with BLAS support
|
||||
# GG_BUILD_BLAS=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
#
|
||||
# with BLAS support (custom vendor)
|
||||
# GG_BUILD_BLAS=1 GG_BUILD_BLAS_VENDOR=Intel10_64lp bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
#
|
||||
# with OPENVINO support
|
||||
# GG_BUILD_OPENVINO=1 GG_BUILD_LOW_PERF=1 GGML_OPENVINO_DEVICE=CPU bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
#
|
||||
|
||||
if [ -z "$2" ]; then
|
||||
echo "usage: $0 <output-dir> <mnt-dir>"
|
||||
@@ -54,23 +45,14 @@ sd=`dirname $0`
|
||||
cd $sd/../
|
||||
SRC=`pwd`
|
||||
|
||||
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"
|
||||
CTEST_EXTRA=""
|
||||
|
||||
# Default to use make unless specified for compatibility
|
||||
CMAKE_GENERATOR="Unix Makefiles"
|
||||
|
||||
if [ ! -z "${GG_BUILD_NINJA}" ]; then
|
||||
CMAKE_GENERATOR="Ninja"
|
||||
fi
|
||||
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"
|
||||
|
||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||
# TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DGGML_CUDA_CUB_3DOT2=ON"
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON"
|
||||
|
||||
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||
CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')
|
||||
@@ -119,28 +101,10 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
|
||||
fi
|
||||
|
||||
# Build shared libs on Windows
|
||||
# to reduce binary size and avoid errors in library loading unit tests
|
||||
if uname -s | grep -qi nt; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DBUILD_SHARED_LIBS=ON"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_WEBGPU} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1 -DGGML_METAL=OFF -DGGML_BLAS=OFF"
|
||||
|
||||
if [ ! -z "${GG_BUILD_WEBGPU_DAWN_PREFIX}" ]; then
|
||||
if [ -z "${CMAKE_PREFIX_PATH}" ]; then
|
||||
export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}"
|
||||
else
|
||||
export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}:${CMAKE_PREFIX_PATH}"
|
||||
fi
|
||||
fi
|
||||
|
||||
# For some systems, Dawn_DIR needs to be set explicitly, e.g., the lib64 path
|
||||
if [ ! -z "${GG_BUILD_WEBGPU_DAWN_DIR}" ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DDawn_DIR=${GG_BUILD_WEBGPU_DAWN_DIR}"
|
||||
fi
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_MUSA} ]; then
|
||||
@@ -156,23 +120,30 @@ fi
|
||||
|
||||
if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
|
||||
echo ">>===== Enabling KleidiAI support"
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } -DGGML_CPU_KLEIDIAI=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_BLAS} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=${GG_BUILD_BLAS_VENDOR:-OpenBLAS}"
|
||||
fi
|
||||
CANDIDATES=("armv9-a+dotprod+i8mm" "armv8.6-a+dotprod+i8mm" "armv8.2-a+dotprod")
|
||||
CPU=""
|
||||
|
||||
if [ ! -z ${GG_BUILD_OPENVINO} ]; then
|
||||
if [ -z ${OpenVINO_DIR} ]; then
|
||||
echo "OpenVINO_DIR not found, please install OpenVINO via archives and enable it by:"
|
||||
echo "source /opt/intel/openvino/setupvars.sh"
|
||||
for cpu in "${CANDIDATES[@]}"; do
|
||||
if echo 'int main(){}' | ${CXX:-c++} -march="$cpu" -x c++ - -c -o /dev/null >/dev/null 2>&1; then
|
||||
CPU="$cpu"
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [ -z "$CPU" ]; then
|
||||
echo "ERROR: None of the required ARM baselines (armv9/armv8.6/armv8.2 + dotprod) are supported by this compiler."
|
||||
exit 1
|
||||
fi
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON"
|
||||
|
||||
# TODO: fix and re-enable the `test-llama-archs` test below
|
||||
CTEST_EXTRA="-E test-llama-archs"
|
||||
echo ">>===== Using ARM baseline: ${CPU}"
|
||||
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CPU_KLEIDIAI=ON \
|
||||
-DGGML_CPU_AARCH64=ON \
|
||||
-DGGML_CPU_ARM_ARCH=${CPU} \
|
||||
-DBUILD_SHARED_LIBS=OFF"
|
||||
fi
|
||||
|
||||
## helpers
|
||||
@@ -226,13 +197,13 @@ function gg_run_ctest_debug {
|
||||
|
||||
set -e
|
||||
|
||||
# Check required binaries are installed
|
||||
# Check cmake, make and ctest are installed
|
||||
gg_check_build_requirements
|
||||
|
||||
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time cmake --build . --config Debug -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
(time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
set +e
|
||||
}
|
||||
@@ -257,16 +228,16 @@ function gg_run_ctest_release {
|
||||
|
||||
set -e
|
||||
|
||||
# Check required binaries are installed
|
||||
# Check cmake, make and ctest are installed
|
||||
gg_check_build_requirements
|
||||
|
||||
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
(time ctest -C Release --output-on-failure -L 'main|python' ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
else
|
||||
(time ctest -C Release --output-on-failure -L main -E test-opt ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
fi
|
||||
|
||||
set +e
|
||||
@@ -307,8 +278,7 @@ function gg_sum_test_scripts {
|
||||
}
|
||||
|
||||
function gg_get_model {
|
||||
#local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
|
||||
local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-q4_0.gguf"
|
||||
local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
|
||||
if [[ -s $gguf_0 ]]; then
|
||||
echo -n "$gguf_0"
|
||||
else
|
||||
@@ -324,7 +294,7 @@ function gg_run_ctest_with_model_debug {
|
||||
cd build-ci-debug
|
||||
set -e
|
||||
|
||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest -C Debug --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
set +e
|
||||
cd ..
|
||||
@@ -337,7 +307,7 @@ function gg_run_ctest_with_model_release {
|
||||
cd build-ci-release
|
||||
set -e
|
||||
|
||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest -C Release --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
# test memory leaks
|
||||
#if [[ ! -z ${GG_BUILD_METAL} ]]; then
|
||||
@@ -391,8 +361,8 @@ function gg_run_qwen3_0_6b {
|
||||
|
||||
set -e
|
||||
|
||||
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf --outtype f16
|
||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-bf16.gguf --outtype bf16
|
||||
@@ -423,20 +393,18 @@ function gg_run_qwen3_0_6b {
|
||||
./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
|
||||
./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)
|
||||
|
||||
(time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
|
||||
|
||||
(time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
|
||||
(time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-completion -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-completion -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-completion -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-completion -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-completion -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-completion -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-completion -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-completion -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-completion -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
if [ -z ${GG_BUILD_NO_BF16} ]; then
|
||||
@@ -455,10 +423,10 @@ function gg_run_qwen3_0_6b {
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
@@ -540,8 +508,8 @@ function gg_run_embd_bge_small {
|
||||
|
||||
set -e
|
||||
|
||||
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
@@ -550,10 +518,8 @@ function gg_run_embd_bge_small {
|
||||
|
||||
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||
|
||||
(time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
|
||||
|
||||
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
|
||||
set +e
|
||||
}
|
||||
@@ -585,17 +551,15 @@ function gg_run_rerank_tiny {
|
||||
|
||||
set -e
|
||||
|
||||
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
|
||||
(time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
|
||||
|
||||
# for this model, the SEP token is "</s>"
|
||||
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
|
||||
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
|
||||
|
||||
# sample output
|
||||
# rerank score 0: 0.029
|
||||
@@ -632,36 +596,12 @@ function gg_sum_rerank_tiny {
|
||||
}
|
||||
|
||||
function gg_check_build_requirements {
|
||||
if ! command -v git &> /dev/null; then
|
||||
gg_printf 'git not found, please install'
|
||||
fi
|
||||
|
||||
if ! command -v git-lfs &> /dev/null; then
|
||||
gg_printf 'git-lfs not found, please install'
|
||||
fi
|
||||
|
||||
if ! command -v wget &> /dev/null; then
|
||||
gg_printf 'wget not found, please install'
|
||||
fi
|
||||
|
||||
if ! command -v python3 &> /dev/null; then
|
||||
gg_printf 'python3 not found, please install'
|
||||
fi
|
||||
|
||||
if ! command -v pip3 &> /dev/null; then
|
||||
gg_printf 'pip3 not found, please install'
|
||||
fi
|
||||
|
||||
if ! python3 -m ensurepip --help &> /dev/null; then
|
||||
gg_printf 'ensurepip not found, please install python3-venv package'
|
||||
fi
|
||||
|
||||
if ! command -v cmake &> /dev/null; then
|
||||
gg_printf 'cmake not found, please install'
|
||||
fi
|
||||
|
||||
if ! command -v ccache &> /dev/null; then
|
||||
gg_printf 'ccache not found, please consider installing for faster builds'
|
||||
if ! command -v make &> /dev/null; then
|
||||
gg_printf 'make not found, please install'
|
||||
fi
|
||||
|
||||
if ! command -v ctest &> /dev/null; then
|
||||
@@ -669,29 +609,6 @@ function gg_check_build_requirements {
|
||||
fi
|
||||
}
|
||||
|
||||
function gg_run_test_backend_ops_cpu {
|
||||
cd ${SRC}
|
||||
|
||||
cd build-ci-release
|
||||
|
||||
set -e
|
||||
|
||||
(time ./bin/test-backend-ops -b CPU ) 2>&1 | tee -a $OUT/${ci}-test-backend-ops-cpu.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
function gg_sum_test_backend_ops_cpu {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'Runs test-backend-ops for CPU backend\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '```\n'
|
||||
gg_printf '%s\n' "$(cat $OUT/${ci}-test-backend-ops-cpu.log)"
|
||||
gg_printf '```\n'
|
||||
gg_printf '\n'
|
||||
}
|
||||
|
||||
## main
|
||||
|
||||
export LLAMA_LOG_PREFIX=1
|
||||
@@ -720,10 +637,6 @@ ret=0
|
||||
test $ret -eq 0 && gg_run ctest_debug
|
||||
test $ret -eq 0 && gg_run ctest_release
|
||||
|
||||
if [ ! -z ${GG_BUILD_HIGH_PERF} ]; then
|
||||
test $ret -eq 0 && gg_run test_backend_ops_cpu
|
||||
fi
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
test $ret -eq 0 && gg_run embd_bge_small
|
||||
test $ret -eq 0 && gg_run rerank_tiny
|
||||
|
||||
@@ -39,10 +39,26 @@ if(Git_FOUND)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
|
||||
|
||||
if(CMAKE_VS_PLATFORM_NAME)
|
||||
set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
|
||||
if(MSVC)
|
||||
set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
|
||||
if (CMAKE_VS_PLATFORM_NAME)
|
||||
set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
|
||||
else()
|
||||
set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
|
||||
endif()
|
||||
else()
|
||||
set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_C_COMPILER} --version
|
||||
OUTPUT_VARIABLE OUT
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
)
|
||||
string(REGEX REPLACE " *\n.*" "" OUT "${OUT}")
|
||||
set(BUILD_COMPILER ${OUT})
|
||||
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_C_COMPILER} -dumpmachine
|
||||
OUTPUT_VARIABLE OUT
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
)
|
||||
set(BUILD_TARGET ${OUT})
|
||||
endif()
|
||||
|
||||
@@ -32,27 +32,4 @@ function(llama_add_compile_flags)
|
||||
set(CXX_FLAGS "" PARENT_SCOPE)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (NOT MSVC)
|
||||
if (LLAMA_SANITIZE_THREAD)
|
||||
message(STATUS "Using -fsanitize=thread")
|
||||
|
||||
add_compile_options(-fsanitize=thread)
|
||||
link_libraries (-fsanitize=thread)
|
||||
endif()
|
||||
|
||||
if (LLAMA_SANITIZE_ADDRESS)
|
||||
message(STATUS "Using -fsanitize=address")
|
||||
|
||||
add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
|
||||
link_libraries (-fsanitize=address)
|
||||
endif()
|
||||
|
||||
if (LLAMA_SANITIZE_UNDEFINED)
|
||||
message(STATUS "Using -fsanitize=undefined")
|
||||
|
||||
add_compile_options(-fsanitize=undefined)
|
||||
link_libraries (-fsanitize=undefined)
|
||||
endif()
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
get_filename_component(DEST_DIR "${DEST}" DIRECTORY)
|
||||
file(MAKE_DIRECTORY "${DEST_DIR}")
|
||||
|
||||
if(NOT EXISTS "${DEST}")
|
||||
message(STATUS "Downloading ${NAME} from ggml-org/models...")
|
||||
endif()
|
||||
|
||||
file(DOWNLOAD
|
||||
"https://huggingface.co/ggml-org/models/resolve/main/${NAME}?download=true"
|
||||
"${DEST}"
|
||||
TLS_VERIFY ON
|
||||
EXPECTED_HASH ${HASH}
|
||||
STATUS status
|
||||
)
|
||||
|
||||
list(GET status 0 code)
|
||||
|
||||
if(NOT code EQUAL 0)
|
||||
list(GET status 1 msg)
|
||||
message(FATAL_ERROR "Failed to download ${NAME}: ${msg}")
|
||||
endif()
|
||||
@@ -1,40 +0,0 @@
|
||||
define_property(GLOBAL PROPERTY LICENSE_TEXT
|
||||
BRIEF_DOCS "Embedded licenses"
|
||||
FULL_DOCS "Global string containing all aggregated licenses"
|
||||
)
|
||||
|
||||
function(license_add_file NAME FILE)
|
||||
if(NOT IS_ABSOLUTE "${FILE}")
|
||||
set(FILE "${CMAKE_CURRENT_SOURCE_DIR}/${FILE}")
|
||||
endif()
|
||||
if(EXISTS "${FILE}")
|
||||
set(TITLE "License for ${NAME}")
|
||||
string(REGEX REPLACE "." "=" UNDERLINE "${TITLE}")
|
||||
file(READ "${FILE}" TEXT)
|
||||
get_property(TMP GLOBAL PROPERTY LICENSE_TEXT)
|
||||
string(APPEND TMP "R\"=L=(${TITLE}\n${UNDERLINE}\n\n${TEXT})=L=\",\n")
|
||||
set_property(GLOBAL PROPERTY LICENSE_TEXT "${TMP}")
|
||||
else()
|
||||
message(WARNING "License file '${FILE}' not found")
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
function(license_generate TARGET_NAME)
|
||||
message(STATUS "Generating embedded license file for target: ${TARGET_NAME}")
|
||||
get_property(TEXT GLOBAL PROPERTY LICENSE_TEXT)
|
||||
|
||||
set(CPP_CONTENT "// Generated by CMake\n\n")
|
||||
string(APPEND CPP_CONTENT "const char* LICENSES[] = {\n")
|
||||
string(APPEND CPP_CONTENT "${TEXT}")
|
||||
string(APPEND CPP_CONTENT "nullptr\n")
|
||||
string(APPEND CPP_CONTENT "};\n")
|
||||
|
||||
set(CPP_FILE "${CMAKE_BINARY_DIR}/license.cpp")
|
||||
file(WRITE "${CPP_FILE}" "${CPP_CONTENT}")
|
||||
|
||||
if(TARGET ${TARGET_NAME})
|
||||
target_sources(${TARGET_NAME} PRIVATE "${CPP_FILE}")
|
||||
else()
|
||||
message(FATAL_ERROR "Target '${TARGET_NAME}' does not exist")
|
||||
endif()
|
||||
endfunction()
|
||||
@@ -5,6 +5,7 @@ find_package(Threads REQUIRED)
|
||||
llama_add_compile_flags()
|
||||
|
||||
# Build info header
|
||||
#
|
||||
|
||||
if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
|
||||
set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
|
||||
@@ -47,24 +48,14 @@ add_library(${TARGET} STATIC
|
||||
arg.cpp
|
||||
arg.h
|
||||
base64.hpp
|
||||
chat-auto-parser-generator.cpp
|
||||
chat-auto-parser-helpers.cpp
|
||||
chat-auto-parser.h
|
||||
chat-diff-analyzer.cpp
|
||||
chat-peg-parser.cpp
|
||||
chat-peg-parser.h
|
||||
chat-parser.cpp
|
||||
chat-parser.h
|
||||
chat.cpp
|
||||
chat.h
|
||||
common.cpp
|
||||
common.h
|
||||
console.cpp
|
||||
console.h
|
||||
debug.cpp
|
||||
debug.h
|
||||
download.cpp
|
||||
download.h
|
||||
hf-cache.cpp
|
||||
hf-cache.h
|
||||
http.h
|
||||
json-partial.cpp
|
||||
json-partial.h
|
||||
@@ -74,55 +65,86 @@ add_library(${TARGET} STATIC
|
||||
log.h
|
||||
ngram-cache.cpp
|
||||
ngram-cache.h
|
||||
ngram-map.cpp
|
||||
ngram-map.h
|
||||
ngram-mod.cpp
|
||||
ngram-mod.h
|
||||
peg-parser.cpp
|
||||
peg-parser.h
|
||||
preset.cpp
|
||||
preset.h
|
||||
regex-partial.cpp
|
||||
reasoning-budget.cpp
|
||||
reasoning-budget.h
|
||||
regex-partial.h
|
||||
sampling.cpp
|
||||
sampling.h
|
||||
speculative.cpp
|
||||
speculative.h
|
||||
unicode.cpp
|
||||
unicode.h
|
||||
jinja/lexer.cpp
|
||||
jinja/lexer.h
|
||||
jinja/parser.cpp
|
||||
jinja/parser.h
|
||||
jinja/runtime.cpp
|
||||
jinja/runtime.h
|
||||
jinja/value.cpp
|
||||
jinja/value.h
|
||||
jinja/string.cpp
|
||||
jinja/string.h
|
||||
jinja/caps.cpp
|
||||
jinja/caps.h
|
||||
)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC . ../vendor)
|
||||
target_compile_features (${TARGET} PUBLIC cxx_std_17)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE
|
||||
build_info
|
||||
cpp-httplib
|
||||
)
|
||||
set(LLAMA_COMMON_EXTRA_LIBS build_info)
|
||||
|
||||
# Use curl to download model url
|
||||
if (LLAMA_CURL)
|
||||
find_package(CURL)
|
||||
if (NOT CURL_FOUND)
|
||||
message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF")
|
||||
endif()
|
||||
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
|
||||
include_directories(${CURL_INCLUDE_DIRS})
|
||||
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
|
||||
endif()
|
||||
|
||||
if (LLAMA_OPENSSL)
|
||||
find_package(OpenSSL)
|
||||
if (OpenSSL_FOUND)
|
||||
include(CheckCSourceCompiles)
|
||||
set(SAVED_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
|
||||
set(CMAKE_REQUIRED_INCLUDES ${OPENSSL_INCLUDE_DIR})
|
||||
check_c_source_compiles("
|
||||
#include <openssl/opensslv.h>
|
||||
#if defined(OPENSSL_IS_BORINGSSL) || defined(LIBRESSL_VERSION_NUMBER)
|
||||
# if OPENSSL_VERSION_NUMBER < 0x1010107f
|
||||
# error bad version
|
||||
# endif
|
||||
#else
|
||||
# if OPENSSL_VERSION_NUMBER < 0x30000000L
|
||||
# error bad version
|
||||
# endif
|
||||
#endif
|
||||
int main() { return 0; }
|
||||
" OPENSSL_VERSION_SUPPORTED)
|
||||
set(CMAKE_REQUIRED_INCLUDES ${SAVED_CMAKE_REQUIRED_INCLUDES})
|
||||
if (OPENSSL_VERSION_SUPPORTED)
|
||||
message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
|
||||
target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT)
|
||||
target_link_libraries(${TARGET} PUBLIC OpenSSL::SSL OpenSSL::Crypto)
|
||||
if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
|
||||
target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
|
||||
find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED)
|
||||
find_library(SECURITY_FRAMEWORK Security REQUIRED)
|
||||
target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK})
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
message(STATUS "OpenSSL not found, SSL support disabled")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_LLGUIDANCE)
|
||||
include(ExternalProject)
|
||||
set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
|
||||
set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
|
||||
set(LLGUIDANCE_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}llguidance${CMAKE_STATIC_LIBRARY_SUFFIX}")
|
||||
|
||||
# Set the correct library file extension based on platform
|
||||
if (WIN32)
|
||||
set(LLGUIDANCE_LIB_NAME "llguidance.lib")
|
||||
# Add Windows-specific libraries
|
||||
set(LLGUIDANCE_PLATFORM_LIBS
|
||||
ws2_32 # Windows Sockets API
|
||||
userenv # For GetUserProfileDirectoryW
|
||||
ntdll # For NT functions
|
||||
bcrypt # For BCryptGenRandom
|
||||
)
|
||||
else()
|
||||
set(LLGUIDANCE_LIB_NAME "libllguidance.a")
|
||||
set(LLGUIDANCE_PLATFORM_LIBS "")
|
||||
endif()
|
||||
|
||||
ExternalProject_Add(llguidance_ext
|
||||
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
|
||||
@@ -144,10 +166,34 @@ if (LLAMA_LLGUIDANCE)
|
||||
add_dependencies(llguidance llguidance_ext)
|
||||
|
||||
target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
|
||||
target_link_libraries(${TARGET} PRIVATE llguidance)
|
||||
if (WIN32)
|
||||
target_link_libraries(${TARGET} PRIVATE ws2_32 userenv ntdll bcrypt)
|
||||
endif()
|
||||
endif()
|
||||
# Add platform libraries to the main target
|
||||
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
|
||||
endif ()
|
||||
|
||||
target_link_libraries(${TARGET} PUBLIC llama Threads::Threads)
|
||||
target_include_directories(${TARGET} PUBLIC . ../vendor)
|
||||
target_compile_features (${TARGET} PUBLIC cxx_std_17)
|
||||
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
|
||||
|
||||
|
||||
#
|
||||
# copy the license files
|
||||
#
|
||||
|
||||
# Check if running in GitHub Actions
|
||||
if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
|
||||
message(STATUS "Running inside GitHub Actions - copying license files")
|
||||
|
||||
# Copy all files from licenses/ to build/bin/
|
||||
file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
|
||||
foreach(LICENSE_FILE ${LICENSE_FILES})
|
||||
get_filename_component(FILENAME ${LICENSE_FILE} NAME)
|
||||
add_custom_command(
|
||||
POST_BUILD
|
||||
TARGET ${TARGET}
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different
|
||||
"${LICENSE_FILE}"
|
||||
"$<TARGET_FILE_DIR:llama>/${FILENAME}"
|
||||
COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
|
||||
message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
2566
common/arg.cpp
2566
common/arg.cpp
File diff suppressed because it is too large
Load Diff
67
common/arg.h
67
common/arg.h
@@ -3,14 +3,8 @@
|
||||
#include "common.h"
|
||||
|
||||
#include <set>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
|
||||
// pseudo-env variable to identify preset-only arguments
|
||||
#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
|
||||
#define COMMON_ARG_PRESET_STOP_TIMEOUT "__PRESET_STOP_TIMEOUT"
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
@@ -20,20 +14,15 @@ struct common_arg {
|
||||
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
|
||||
std::set<enum llama_example> excludes = {};
|
||||
std::vector<const char *> args;
|
||||
std::vector<const char *> args_neg; // for negated args like --no-xxx
|
||||
const char * value_hint = nullptr; // help text or example for arg value
|
||||
const char * value_hint_2 = nullptr; // for second arg value
|
||||
const char * env = nullptr;
|
||||
std::string help;
|
||||
bool is_sparam = false; // is current arg a sampling param?
|
||||
bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
|
||||
void (*handler_void) (common_params & params) = nullptr;
|
||||
void (*handler_string) (common_params & params, const std::string &) = nullptr;
|
||||
void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
|
||||
void (*handler_int) (common_params & params, int) = nullptr;
|
||||
void (*handler_bool) (common_params & params, bool) = nullptr;
|
||||
|
||||
common_arg() = default;
|
||||
|
||||
common_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
@@ -55,13 +44,6 @@ struct common_arg {
|
||||
void (*handler)(common_params & params)
|
||||
) : args(args), help(help), handler_void(handler) {}
|
||||
|
||||
common_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const std::initializer_list<const char *> & args_neg,
|
||||
const std::string & help,
|
||||
void (*handler)(common_params & params, bool)
|
||||
) : args(args), args_neg(args_neg), help(help), handler_bool(handler) {}
|
||||
|
||||
// support 2 values for arg
|
||||
common_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
@@ -75,38 +57,13 @@ struct common_arg {
|
||||
common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
|
||||
common_arg & set_env(const char * env);
|
||||
common_arg & set_sparam();
|
||||
common_arg & set_preset_only();
|
||||
bool in_example(enum llama_example ex);
|
||||
bool is_exclude(enum llama_example ex);
|
||||
bool get_value_from_env(std::string & output) const;
|
||||
bool has_value_from_env() const;
|
||||
std::string to_string() const;
|
||||
|
||||
// for using as key in std::map
|
||||
bool operator<(const common_arg& other) const {
|
||||
if (args.empty() || other.args.empty()) {
|
||||
return false;
|
||||
}
|
||||
return strcmp(args[0], other.args[0]) < 0;
|
||||
}
|
||||
bool operator==(const common_arg& other) const {
|
||||
if (args.empty() || other.args.empty()) {
|
||||
return false;
|
||||
}
|
||||
return strcmp(args[0], other.args[0]) == 0;
|
||||
}
|
||||
|
||||
// get all args and env vars (including negated args/env)
|
||||
std::vector<std::string> get_args() const;
|
||||
std::vector<std::string> get_env() const;
|
||||
bool get_value_from_env(std::string & output);
|
||||
bool has_value_from_env();
|
||||
std::string to_string();
|
||||
};
|
||||
|
||||
namespace common_arg_utils {
|
||||
bool is_truthy(const std::string & value);
|
||||
bool is_falsey(const std::string & value);
|
||||
bool is_autoy(const std::string & value);
|
||||
}
|
||||
|
||||
struct common_params_context {
|
||||
enum llama_example ex = LLAMA_EXAMPLE_COMMON;
|
||||
common_params & params;
|
||||
@@ -119,13 +76,13 @@ struct common_params_context {
|
||||
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
|
||||
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||
|
||||
// parse input arguments from CLI into a map
|
||||
bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
|
||||
|
||||
// populate preset-only arguments
|
||||
// these arguments are not treated as command line arguments
|
||||
// see: https://github.com/ggml-org/llama.cpp/issues/18163
|
||||
void common_params_add_preset_options(std::vector<common_arg> & args);
|
||||
|
||||
// initialize argument parser context - used by test-arg-parser and preset
|
||||
// function to be used by test-arg-parser
|
||||
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||
|
||||
struct common_remote_params {
|
||||
std::vector<std::string> headers;
|
||||
long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
|
||||
long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
|
||||
};
|
||||
// get remote file content, returns <http_code, raw_response_body>
|
||||
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
|
||||
|
||||
@@ -1,487 +0,0 @@
|
||||
#include "chat-auto-parser-helpers.h"
|
||||
#include "chat-auto-parser.h"
|
||||
#include "chat-peg-parser.h"
|
||||
#include "chat.h"
|
||||
#include "common.h"
|
||||
#include "json-schema-to-grammar.h"
|
||||
#include "log.h"
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "peg-parser.h"
|
||||
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
// Helper to iterate over tools/functions
|
||||
static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
|
||||
for (const auto & tool : tools) {
|
||||
if (!tool.contains("type") || tool.at("type") != "function" || !tool.contains("function")) {
|
||||
continue;
|
||||
}
|
||||
fn(tool);
|
||||
}
|
||||
}
|
||||
|
||||
namespace autoparser {
|
||||
|
||||
parser_build_context::parser_build_context(common_chat_peg_builder & p, const generation_params & inputs) :
|
||||
p(p),
|
||||
inputs(inputs),
|
||||
reasoning_parser(p.eps()) {}
|
||||
|
||||
common_chat_params peg_generator::generate_parser(const common_chat_template & tmpl,
|
||||
const struct generation_params & inputs) {
|
||||
// Run differential analysis to extract template structure
|
||||
struct autoparser autoparser;
|
||||
autoparser.analyze_template(tmpl);
|
||||
return generate_parser(tmpl, inputs, autoparser);
|
||||
}
|
||||
|
||||
common_chat_params peg_generator::generate_parser(const common_chat_template & tmpl,
|
||||
const struct generation_params & inputs,
|
||||
const autoparser & autoparser) {
|
||||
// Create the result structure
|
||||
common_chat_params data;
|
||||
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.preserved_tokens = autoparser.preserved_tokens;
|
||||
|
||||
auto parser = autoparser.build_parser(inputs);
|
||||
data.parser = parser.save();
|
||||
|
||||
// Build grammar if tools are present
|
||||
bool has_tools =
|
||||
autoparser.tools.format.mode != tool_format::NONE && inputs.tools.is_array() && !inputs.tools.empty();
|
||||
std::string trigger_marker = !autoparser.tools.format.section_start.empty() ? autoparser.tools.format.section_start :
|
||||
autoparser.tools.format.per_call_start;
|
||||
|
||||
bool has_response_format = !inputs.json_schema.empty() && inputs.json_schema.is_object();
|
||||
bool include_grammar = has_response_format || (has_tools &&
|
||||
((inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO && !trigger_marker.empty()) ||
|
||||
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));
|
||||
|
||||
if (include_grammar) {
|
||||
data.grammar_lazy = !has_response_format && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
auto schema = function.contains("parameters") ? function.at("parameters") : json::object();
|
||||
builder.resolve_refs(schema);
|
||||
});
|
||||
parser.build_grammar(builder, data.grammar_lazy);
|
||||
});
|
||||
|
||||
// Set grammar triggers based on tool section markers (fall back to per-call markers)
|
||||
if (data.grammar_lazy) {
|
||||
data.grammar_triggers = {
|
||||
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, trigger_marker }
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
common_peg_arena autoparser::build_parser(const generation_params & inputs) const {
|
||||
if (!analysis_complete) {
|
||||
throw std::invalid_argument("Cannot call build_parser on autoparser without performing analysis first, call analyze_template(...)");
|
||||
}
|
||||
return build_chat_peg_parser([&](common_chat_peg_builder & p) {
|
||||
parser_build_context ctx(p, inputs);
|
||||
bool extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
||||
|
||||
ctx.extracting_reasoning = extract_reasoning && reasoning.mode != reasoning_mode::NONE;
|
||||
ctx.content = &content;
|
||||
ctx.reasoning = &reasoning;
|
||||
|
||||
// Build reasoning parser
|
||||
ctx.reasoning_parser = reasoning.build_parser(ctx);
|
||||
|
||||
auto parser = p.eps();
|
||||
|
||||
bool has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
bool has_response_format = inputs.json_schema.is_object() && !inputs.json_schema.empty();
|
||||
bool pure_content = reasoning.mode == reasoning_mode::NONE;
|
||||
|
||||
if (has_response_format) {
|
||||
auto response_format = p.rule("response-format", p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
|
||||
parser = ctx.reasoning_parser + p.space() + p.choice({
|
||||
p.literal("```json") + p.space() + response_format + p.space() + p.literal("```"),
|
||||
response_format
|
||||
}) + p.end();
|
||||
pure_content = false;
|
||||
} else if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
|
||||
parser = tools.build_parser(ctx);
|
||||
pure_content = false;
|
||||
} else {
|
||||
parser = content.build_parser(ctx);
|
||||
}
|
||||
return pure_content ? p.prefix(inputs.generation_prompt, reasoning.start) + parser : p.prefix(inputs.generation_prompt, reasoning.start) << parser;
|
||||
});
|
||||
}
|
||||
|
||||
common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) const {
|
||||
auto & p = ctx.p;
|
||||
|
||||
if (!ctx.extracting_reasoning) {
|
||||
return p.eps();
|
||||
}
|
||||
|
||||
if (mode == reasoning_mode::TAG_BASED || mode == reasoning_mode::TOOLS_ONLY) {
|
||||
if (!end.empty()) {
|
||||
if (!start.empty()) {
|
||||
// Standard tag-based: optional(<think>reasoning</think>)
|
||||
return p.optional(start + p.reasoning(p.until(end)) + end + p.space());
|
||||
}
|
||||
// Delimiter-style (empty start)
|
||||
return p.optional(p.reasoning(p.until(end)) + end + p.space());
|
||||
}
|
||||
}
|
||||
|
||||
return p.eps();
|
||||
}
|
||||
|
||||
common_peg_parser analyze_content::build_parser(parser_build_context & ctx) const {
|
||||
auto & p = ctx.p;
|
||||
|
||||
if (is_always_wrapped()) {
|
||||
if (ctx.extracting_reasoning) {
|
||||
return ctx.reasoning_parser + start + p.content(p.until(end)) + end + p.end();
|
||||
}
|
||||
return p.content(p.until(start)) + start + p.content(p.until(end)) + end + p.end();
|
||||
}
|
||||
return ctx.reasoning_parser + p.content(p.rest()) + p.end();
|
||||
}
|
||||
|
||||
common_peg_parser analyze_content::build_optional_wrapped(parser_build_context & ctx) const {
|
||||
auto & p = ctx.p;
|
||||
|
||||
if (is_always_wrapped()) {
|
||||
return p.optional(start + p.content(p.until(end)) + end);
|
||||
}
|
||||
return p.eps();
|
||||
}
|
||||
|
||||
common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const {
|
||||
switch (format.mode) {
|
||||
case tool_format::JSON_NATIVE:
|
||||
return build_tool_parser_json_native(ctx);
|
||||
case tool_format::TAG_WITH_JSON:
|
||||
return build_tool_parser_tag_json(ctx);
|
||||
case tool_format::TAG_WITH_TAGGED:
|
||||
return build_tool_parser_tag_tagged(ctx);
|
||||
default:
|
||||
LOG_ERR("[ERROR] Template seems to support tool calls, but failed to determine tool format. Tool calling will not work properly. "
|
||||
"Check for a fixed template for your model in the models/templates directory of your llama.cpp installation or "
|
||||
"report an issue at https://github.com/ggml-org/llama.cpp/issues\n");
|
||||
return ctx.p.eps();
|
||||
}
|
||||
}
|
||||
|
||||
common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_context & ctx) const {
|
||||
auto & p = ctx.p;
|
||||
const auto & inputs = ctx.inputs;
|
||||
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
|
||||
// Build effective field names with dot notation if function_field is set
|
||||
std::string name_field = format.name_field;
|
||||
std::string args_field = format.args_field;
|
||||
|
||||
if (!format.function_field.empty() && format.function_field != "function" &&
|
||||
name_field.find('.') == std::string::npos) {
|
||||
name_field = format.function_field + "." + name_field;
|
||||
args_field = format.function_field + "." + args_field;
|
||||
}
|
||||
|
||||
auto tools_parser = p.standard_json_tools(
|
||||
format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
|
||||
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
|
||||
format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
|
||||
|
||||
// Handle content wrappers if present
|
||||
if (ctx.content && ctx.content->is_always_wrapped()) {
|
||||
auto wrapped_content = ctx.content->build_optional_wrapped(ctx);
|
||||
return ctx.reasoning_parser + wrapped_content + tools_parser + p.end();
|
||||
}
|
||||
|
||||
std::string tool_start = "{";
|
||||
if (!format.section_start.empty()) {
|
||||
tool_start = format.section_start;
|
||||
} else if (!format.per_call_start.empty()) {
|
||||
tool_start = format.per_call_start;
|
||||
}
|
||||
|
||||
return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(p.until(tool_start)))) + tools_parser +
|
||||
p.end();
|
||||
}
|
||||
|
||||
common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p, const std::string & name,
|
||||
const common_peg_parser & call_id_section, bool have_call_id,
|
||||
const common_peg_parser & args,
|
||||
std::optional<common_peg_parser> atomic_peek) const {
|
||||
auto open = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix);
|
||||
bool matched_atomic = false;
|
||||
common_peg_parser func_parser = p.eps();
|
||||
|
||||
if (!function.name_suffix.empty()) {
|
||||
func_parser = open + call_id_section + p.space() + args;
|
||||
matched_atomic = true;
|
||||
} else if (have_call_id) {
|
||||
func_parser = p.atomic(open + call_id_section) + p.space() + args;
|
||||
matched_atomic = true;
|
||||
} else if (atomic_peek.has_value()) {
|
||||
func_parser = p.atomic(open + call_id_section + p.space() + *atomic_peek) + args;
|
||||
matched_atomic = true;
|
||||
} else {
|
||||
func_parser = open + call_id_section + p.space() + args;
|
||||
}
|
||||
|
||||
if (!function.close.empty()) {
|
||||
func_parser = func_parser + p.space() + p.tool_close(p.literal(function.close));
|
||||
} else if (!format.per_call_end.empty()) {
|
||||
// When there's no func_close but there is a per_call_end marker, use peek() to ensure
|
||||
// we only emit tool_close when we can actually see the closing marker. This prevents
|
||||
// premature closing during partial parsing when we've seen e.g. "</" which could be
|
||||
// either "</tool_call>" (end) or "<arg_key>" prefix that failed to match.
|
||||
func_parser = func_parser + p.tool_close(p.peek(p.literal(format.per_call_end)));
|
||||
} else {
|
||||
func_parser = func_parser + p.tool_close(p.space()); // force this to process tool closing callbacks in mapper
|
||||
}
|
||||
if (!matched_atomic) {
|
||||
func_parser = p.atomic(func_parser);
|
||||
}
|
||||
return func_parser;
|
||||
}
|
||||
|
||||
common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context & ctx) const {
|
||||
auto & p = ctx.p;
|
||||
const auto & inputs = ctx.inputs;
|
||||
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
|
||||
common_peg_parser tool_choice = p.choice();
|
||||
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & func = tool.at("function");
|
||||
std::string name = func.at("name");
|
||||
const auto & schema = func.contains("parameters") ? func.at("parameters") : json::object();
|
||||
|
||||
// Build call_id parser based on position (if supported)
|
||||
bool have_call_id = false;
|
||||
common_peg_parser call_id_section = p.eps();
|
||||
if (call_id.pos == call_id_position::BETWEEN_FUNC_AND_ARGS && !call_id.prefix.empty() &&
|
||||
(!call_id.suffix.empty() || !arguments.start.empty())) {
|
||||
if (!call_id.suffix.empty()) {
|
||||
call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix))) + call_id.suffix;
|
||||
} else {
|
||||
call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(arguments.start)));
|
||||
}
|
||||
have_call_id = true;
|
||||
}
|
||||
auto args_parser = p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema));
|
||||
if (!arguments.start.empty()) {
|
||||
args_parser = p.literal(arguments.start) + args_parser;
|
||||
}
|
||||
if (!arguments.end.empty()) {
|
||||
args_parser = args_parser + p.literal(arguments.end);
|
||||
}
|
||||
|
||||
auto atomic_peek = !arguments.start.empty() ? std::optional(p.peek(p.literal(arguments.start))) : std::nullopt;
|
||||
auto func_parser = build_func_parser(p, name, call_id_section, have_call_id, args_parser, atomic_peek);
|
||||
tool_choice |= p.rule("tool-" + name, func_parser);
|
||||
});
|
||||
|
||||
auto require_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
|
||||
common_peg_parser tool_calls = p.eps();
|
||||
|
||||
if (!format.per_call_start.empty()) {
|
||||
auto wrapped_call = format.per_call_start + tool_choice + format.per_call_end;
|
||||
if (inputs.parallel_tool_calls) {
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
|
||||
} else {
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call);
|
||||
}
|
||||
if (!format.section_start.empty()) {
|
||||
tool_calls = p.trigger_rule("tool-calls",
|
||||
p.literal(format.section_start) + p.space() + tool_calls + p.space() +
|
||||
(format.section_end.empty() ? p.end() : p.literal(format.section_end)));
|
||||
}
|
||||
} else {
|
||||
std::string separator = ", "; // Default
|
||||
if (inputs.parallel_tool_calls) {
|
||||
tool_calls = p.trigger_rule("tool-call", format.section_start + tool_choice +
|
||||
p.zero_or_more(separator + tool_choice) + format.section_end);
|
||||
} else {
|
||||
tool_calls = p.trigger_rule("tool-call", format.section_start + tool_choice + format.section_end);
|
||||
}
|
||||
}
|
||||
|
||||
if (!require_calls) {
|
||||
tool_calls = p.optional(tool_calls);
|
||||
}
|
||||
|
||||
std::string trigger_marker = !format.section_start.empty() ? format.section_start : format.per_call_start;
|
||||
auto content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
|
||||
return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
|
||||
p.end();
|
||||
}
|
||||
|
||||
common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_context & ctx) const {
|
||||
auto & p = ctx.p;
|
||||
const auto & inputs = ctx.inputs;
|
||||
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
|
||||
common_peg_parser tool_choice = p.choice();
|
||||
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & func = tool.at("function");
|
||||
std::string name = func.at("name");
|
||||
const auto & params = func.contains("parameters") ? func.at("parameters") : json::object();
|
||||
const auto & properties = params.contains("properties") ? params.at("properties") : json::object();
|
||||
std::set<std::string> required;
|
||||
|
||||
// Build parser for each argument, separating required and optional
|
||||
std::vector<common_peg_parser> required_parsers;
|
||||
std::vector<common_peg_parser> optional_parsers;
|
||||
for (const auto & [param_name, param_schema] : properties.items()) {
|
||||
bool is_required = required.find(param_name) != required.end();
|
||||
std::string type = "object";
|
||||
if (param_schema.contains("type")) {
|
||||
const auto & type_obj = param_schema.at("type");
|
||||
if (type_obj.is_string()) {
|
||||
type_obj.get_to(type);
|
||||
} else if (type_obj.is_array()) {
|
||||
// Handle nullable types like ["string", "null"]
|
||||
for (const auto & t : type_obj) {
|
||||
if (t.is_string() && t.get<std::string>() != "null") {
|
||||
type = t.get<std::string>();
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if (type_obj.is_object()) {
|
||||
if (type_obj.contains("type") && type_obj.at("type").is_string()) {
|
||||
type_obj.at("type").get_to(type);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Infer string type from enum values when type is unspecified
|
||||
if (type == "object" && param_schema.contains("enum")) {
|
||||
const auto & enum_vals = param_schema.at("enum");
|
||||
if (enum_vals.is_array()) {
|
||||
for (const auto & v : enum_vals) {
|
||||
if (v.is_string()) {
|
||||
type = "string";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto arg =
|
||||
p.tool_arg(p.tool_arg_open(arguments.name_prefix + p.tool_arg_name(p.literal(param_name)) +
|
||||
arguments.name_suffix) +
|
||||
arguments.value_prefix +
|
||||
(type == "string" ?
|
||||
p.tool_arg_string_value(p.schema(p.until(arguments.value_suffix),
|
||||
"tool-" + name + "-arg-" + param_name + "-schema",
|
||||
param_schema, true)) :
|
||||
p.tool_arg_json_value(p.schema(
|
||||
p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
|
||||
p.space()) +
|
||||
p.tool_arg_close(p.literal(arguments.value_suffix)));
|
||||
|
||||
auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg);
|
||||
if (is_required) {
|
||||
required_parsers.push_back(named_arg);
|
||||
} else {
|
||||
optional_parsers.push_back(named_arg);
|
||||
}
|
||||
}
|
||||
|
||||
// Build required arg sequence in definition order
|
||||
common_peg_parser args_seq = p.eps();
|
||||
for (size_t i = 0; i < required_parsers.size(); i++) {
|
||||
if (i > 0) {
|
||||
args_seq = args_seq + p.space();
|
||||
}
|
||||
args_seq = args_seq + required_parsers[i];
|
||||
}
|
||||
|
||||
// Build optional args with flexible ordering
|
||||
if (!optional_parsers.empty()) {
|
||||
common_peg_parser any_opt = p.choice();
|
||||
for (const auto & opt : optional_parsers) {
|
||||
any_opt |= opt;
|
||||
}
|
||||
args_seq = args_seq + p.repeat(p.space() + any_opt, 0, (int) optional_parsers.size());
|
||||
}
|
||||
|
||||
if (!arguments.start.empty()) {
|
||||
args_seq = p.literal(arguments.start) + args_seq;
|
||||
}
|
||||
if (!arguments.end.empty()) {
|
||||
args_seq = args_seq + p.literal(arguments.end);
|
||||
}
|
||||
|
||||
// Build call_id parser based on position (if supported)
|
||||
common_peg_parser call_id_section = p.eps();
|
||||
bool have_call_id = false;
|
||||
if (call_id.pos == call_id_position::BETWEEN_FUNC_AND_ARGS && !call_id.prefix.empty() &&
|
||||
(!call_id.suffix.empty() || !arguments.start.empty())) {
|
||||
have_call_id = true;
|
||||
if (!call_id.suffix.empty()) {
|
||||
call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix)) + call_id.suffix);
|
||||
} else {
|
||||
call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(arguments.start)));
|
||||
}
|
||||
}
|
||||
|
||||
// Only peek for an arg tag when there are required args that must follow.
|
||||
// When all args are optional, the model may emit no arg tags at all (#20650).
|
||||
auto atomic_peek = (!arguments.name_prefix.empty() && !required_parsers.empty()) ?
|
||||
std::optional(p.peek(p.literal(arguments.name_prefix))) : std::nullopt;
|
||||
auto func_parser = build_func_parser(p, name, call_id_section, have_call_id, args_seq, atomic_peek);
|
||||
tool_choice |= p.rule("tool-" + name, func_parser);
|
||||
});
|
||||
|
||||
auto require_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
|
||||
common_peg_parser tool_calls = p.eps();
|
||||
|
||||
if (!format.per_call_start.empty()) {
|
||||
auto wrapped_call = format.per_call_start + p.space() + tool_choice + p.space() + format.per_call_end;
|
||||
if (inputs.parallel_tool_calls) {
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
|
||||
} else {
|
||||
tool_calls = p.trigger_rule("tool-call", wrapped_call);
|
||||
}
|
||||
if (!format.section_start.empty()) {
|
||||
tool_calls = p.trigger_rule("tool-calls",
|
||||
p.literal(format.section_start) + p.space() + tool_calls + p.space() +
|
||||
(format.section_end.empty() ? p.end() : p.literal(format.section_end)));
|
||||
}
|
||||
} else {
|
||||
std::string separator = ", "; // Default
|
||||
|
||||
if (inputs.parallel_tool_calls) {
|
||||
tool_calls = p.trigger_rule("tool-call", format.section_start + p.space() + tool_choice +
|
||||
p.zero_or_more(separator + tool_choice) + p.space() +
|
||||
format.section_end);
|
||||
} else {
|
||||
tool_calls = p.trigger_rule(
|
||||
"tool-call", format.section_start + p.space() + tool_choice + p.space() + format.section_end);
|
||||
}
|
||||
}
|
||||
|
||||
if (!require_tools) {
|
||||
tool_calls = p.optional(tool_calls);
|
||||
}
|
||||
|
||||
std::string trigger_marker = !format.section_start.empty() ? format.section_start : format.per_call_start;
|
||||
auto content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
|
||||
return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
|
||||
p.end();
|
||||
}
|
||||
|
||||
} // namespace autoparser
|
||||
@@ -1,364 +0,0 @@
|
||||
#include "chat-auto-parser-helpers.h"
|
||||
|
||||
#include "chat-auto-parser.h"
|
||||
#include "chat-peg-parser.h"
|
||||
#include "chat.h"
|
||||
#include "log.h"
|
||||
#include "nlohmann/json.hpp"
|
||||
#include "peg-parser.h"
|
||||
|
||||
#include <cctype>
|
||||
#include <numeric>
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
std::string trim_whitespace(const std::string & str) {
|
||||
size_t start = 0;
|
||||
while (start < str.length() && std::isspace(static_cast<unsigned char>(str[start]))) {
|
||||
start++;
|
||||
}
|
||||
|
||||
if (start == str.length()) {
|
||||
return "";
|
||||
}
|
||||
|
||||
size_t end = str.length() - 1;
|
||||
while (end > start && std::isspace(static_cast<unsigned char>(str[end]))) {
|
||||
end--;
|
||||
}
|
||||
|
||||
return str.substr(start, end - start + 1);
|
||||
}
|
||||
|
||||
std::string trim_leading_whitespace(const std::string & str) {
|
||||
size_t start = 0;
|
||||
while (start < str.length() && std::isspace(static_cast<unsigned char>(str[start]))) {
|
||||
start++;
|
||||
}
|
||||
|
||||
return str.substr(start);
|
||||
}
|
||||
|
||||
std::string trim_trailing_whitespace(const std::string & str) {
|
||||
if (str.empty()) {
|
||||
return "";
|
||||
}
|
||||
|
||||
size_t end = str.length() - 1;
|
||||
while (end > 0 && std::isspace(static_cast<unsigned char>(str[end]))) {
|
||||
end--;
|
||||
}
|
||||
|
||||
// If first char is also whitespace, return empty string
|
||||
if (end == 0 && std::isspace(static_cast<unsigned char>(str[0]))) {
|
||||
return "";
|
||||
}
|
||||
|
||||
return str.substr(0, end + 1);
|
||||
}
|
||||
|
||||
std::string trim_trailing_newlines(const std::string & str) {
|
||||
size_t end = str.length();
|
||||
while (end > 0 && str[end - 1] == '\n') {
|
||||
end--;
|
||||
}
|
||||
|
||||
return str.substr(0, end);
|
||||
}
|
||||
|
||||
static size_t common_prefix_len(const std::string & left, const std::string & right) {
|
||||
size_t prefix_len = 0;
|
||||
size_t min_len = std::min(left.length(), right.length());
|
||||
while (prefix_len < min_len && left[prefix_len] == right[prefix_len]) {
|
||||
prefix_len++;
|
||||
}
|
||||
return prefix_len;
|
||||
}
|
||||
|
||||
static size_t common_suffix_len(const std::string & left, const std::string & right) {
|
||||
size_t suffix_len = 0;
|
||||
size_t min_len = std::min(left.length(), right.length());
|
||||
while (suffix_len < min_len && left[left.length() - 1 - suffix_len] == right[right.length() - 1 - suffix_len]) {
|
||||
suffix_len++;
|
||||
}
|
||||
return suffix_len;
|
||||
}
|
||||
|
||||
diff_split calculate_diff_split(const std::string & left, const std::string & right) {
|
||||
diff_split result;
|
||||
|
||||
auto left_seg = segmentize_markers(left);
|
||||
auto right_seg = segmentize_markers(right);
|
||||
|
||||
if (left_seg.empty()) {
|
||||
result.right = right;
|
||||
return result;
|
||||
}
|
||||
if (right_seg.empty()) {
|
||||
result.left = left;
|
||||
return result;
|
||||
}
|
||||
|
||||
auto left_start = left_seg.begin();
|
||||
auto left_end = --left_seg.end();
|
||||
auto right_start = right_seg.begin();
|
||||
auto right_end = --right_seg.end();
|
||||
|
||||
auto test = [&] () {
|
||||
return left_start != left_end && right_start != right_end;
|
||||
};
|
||||
|
||||
bool left_fully_consumed = false;
|
||||
bool right_fully_consumed = false;
|
||||
|
||||
while (test()) {
|
||||
bool advanced = false;
|
||||
if (*left_start == *right_start) {
|
||||
result.prefix.append(left_start->value);
|
||||
left_start++;
|
||||
right_start++;
|
||||
advanced = true;
|
||||
}
|
||||
if (*left_end == *right_end) {
|
||||
result.suffix = left_end->value + result.suffix;
|
||||
if (left_start != left_end) {
|
||||
left_end--;
|
||||
} else {
|
||||
left_fully_consumed = true;
|
||||
}
|
||||
if (right_start != right_end) {
|
||||
right_end--;
|
||||
} else {
|
||||
right_fully_consumed = true;
|
||||
}
|
||||
advanced = true;
|
||||
}
|
||||
if (!advanced) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (left_start == left_end && right_start != right_end) {
|
||||
if (*left_start == *right_end) {
|
||||
result.suffix = right_end->value + result.suffix;
|
||||
right_end--;
|
||||
left_fully_consumed = true;
|
||||
} else if (*left_start == *right_start) {
|
||||
result.prefix.append(right_start->value);
|
||||
right_start++;
|
||||
left_fully_consumed = true;
|
||||
}
|
||||
} else if (right_start == right_end && left_start != left_end) {
|
||||
if (*left_end == *right_start) {
|
||||
result.suffix = left_end->value + result.suffix;
|
||||
left_end--;
|
||||
right_fully_consumed = true;
|
||||
} else if (*left_start == *right_start) {
|
||||
result.prefix.append(left_start->value);
|
||||
left_start++;
|
||||
right_fully_consumed = true;
|
||||
}
|
||||
} else if (left_start == left_end && right_start == right_end && *left_start == *right_start && left_start->type == segment_type::MARKER) {
|
||||
result.prefix.append(right_start->value);
|
||||
left_fully_consumed = true;
|
||||
right_fully_consumed = true;
|
||||
}
|
||||
|
||||
auto eat_segment = [](std::string str, const segment & seg) -> std::string { return std::move(str) + seg.value; };
|
||||
|
||||
bool can_have_text_suffix = left_end->type == segment_type::TEXT && right_end->type == segment_type::TEXT;
|
||||
bool can_have_text_prefix = right_start->type == segment_type::TEXT && left_start->type == segment_type::TEXT;
|
||||
|
||||
std::string remainder_left = std::accumulate(left_start, left_fully_consumed ? left_end : ++left_end, std::string(), eat_segment);
|
||||
std::string remainder_right = std::accumulate(right_start, right_fully_consumed ? right_end : ++right_end, std::string(), eat_segment);
|
||||
|
||||
size_t suffix_len = can_have_text_suffix ? common_suffix_len(remainder_left, remainder_right) : 0;
|
||||
// avoid overlaps between prefix and suffix
|
||||
size_t prefix_len = can_have_text_prefix ? common_prefix_len(remainder_left.substr(0, remainder_left.size() - suffix_len),
|
||||
remainder_right.substr(0, remainder_right.size() - suffix_len)) : 0;
|
||||
|
||||
result.prefix.append(remainder_left.substr(0, prefix_len));
|
||||
result.suffix = remainder_left.substr(remainder_left.length() - suffix_len, suffix_len) + result.suffix;
|
||||
result.left = remainder_left.substr(prefix_len, remainder_left.length() - prefix_len - suffix_len);
|
||||
result.right = remainder_right.substr(prefix_len, remainder_right.length() - prefix_len - suffix_len);
|
||||
|
||||
if (result.left == "" && result.right == "") {
|
||||
// degenerate case, no diff
|
||||
result.prefix = left;
|
||||
result.suffix = "";
|
||||
// pick prefix = all as representation
|
||||
}
|
||||
|
||||
// When left has no unique content (result.left is empty), left is entirely
|
||||
// shared with right. The simultaneous prefix/suffix segment matching can
|
||||
// incorrectly consume trailing segments of left as suffix when those same
|
||||
// segments also appear at the end of right (e.g. "\n" at the end of both
|
||||
// the shared content and the generation prompt). This rotates the diff.
|
||||
// Fix: if left is a prefix of right, enforce that directly.
|
||||
if (result.left.empty() && !result.right.empty() &&
|
||||
left.size() <= right.size() &&
|
||||
right.substr(0, left.size()) == left) {
|
||||
result.prefix = left;
|
||||
result.suffix = "";
|
||||
result.right = right.substr(left.size());
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns the prefix of `full` up until the first occurrence of the common prefix of `left` and `right`
|
||||
std::string until_common_prefix(const std::string & full, const std::string & left, const std::string & right) {
|
||||
// Find the common prefix of left and right
|
||||
size_t common_prefix_len = 0;
|
||||
size_t min_len = std::min(left.length(), right.length());
|
||||
while (common_prefix_len < min_len && left[common_prefix_len] == right[common_prefix_len]) {
|
||||
common_prefix_len++;
|
||||
}
|
||||
|
||||
// If there's no common prefix, return empty string
|
||||
if (common_prefix_len == 0) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// Find the common prefix in the full string
|
||||
std::string common_prefix = left.substr(0, common_prefix_len);
|
||||
size_t pos = full.find(common_prefix);
|
||||
|
||||
// If not found, return empty string
|
||||
if (pos == std::string::npos) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// Return everything before the common prefix
|
||||
return full.substr(0, pos);
|
||||
}
|
||||
|
||||
// Returns the suffix of `full` after the last occurrence of the common suffix of `left` and `right`
|
||||
std::string after_common_suffix(const std::string & full, const std::string & left, const std::string & right) {
|
||||
// Find the common suffix of left and right (compare from the end)
|
||||
size_t common_suffix_len = 0;
|
||||
size_t min_len = std::min(left.length(), right.length());
|
||||
while (common_suffix_len < min_len &&
|
||||
left[left.length() - 1 - common_suffix_len] == right[right.length() - 1 - common_suffix_len]) {
|
||||
common_suffix_len++;
|
||||
}
|
||||
|
||||
// If there's no common suffix, return empty string
|
||||
if (common_suffix_len == 0) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// Extract the common suffix
|
||||
std::string common_suffix = left.substr(left.length() - common_suffix_len);
|
||||
|
||||
// Find the last occurrence of the common suffix in the full string
|
||||
size_t pos = full.rfind(common_suffix);
|
||||
|
||||
// If not found, return empty string
|
||||
if (pos == std::string::npos) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// Return everything after the common suffix
|
||||
return full.substr(pos + common_suffix_len);
|
||||
}
|
||||
|
||||
// TODO: segmentize will treat a JSON array inside tags as a tag: <calls>[{ "fun": { ... } }]</calls> will be three markers
|
||||
// not too worried about that because it hasn't turned out as a problem anywhere, but noting here in case it will
|
||||
// Might have to put some restrictions on tag contents as well (like "no { }")
|
||||
std::vector<segment> segmentize_markers(const std::string & text) {
|
||||
std::vector<segment> retval;
|
||||
bool in_marker = false;
|
||||
char marker_opener = '\0';
|
||||
|
||||
auto is_marker_opener = [](char c) -> bool { return c == '<' || c == '['; };
|
||||
auto is_marker_closer = [](char op, char c) -> bool { return (op == '<' && c == '>') || (op == '[' && c == ']'); };
|
||||
|
||||
size_t last_border = 0;
|
||||
|
||||
for (size_t cur_pos = 0; cur_pos < text.length(); cur_pos++) {
|
||||
if (!in_marker && is_marker_opener(text[cur_pos])) {
|
||||
if (last_border < cur_pos) {
|
||||
retval.push_back(segment(segment_type::TEXT, text.substr(last_border, cur_pos - last_border)));
|
||||
}
|
||||
last_border = cur_pos;
|
||||
in_marker = true;
|
||||
marker_opener = text[cur_pos];
|
||||
} else if (in_marker && is_marker_closer(marker_opener, text[cur_pos])) {
|
||||
// no need to check because last_border will always be smaller
|
||||
retval.push_back(segment(segment_type::MARKER, text.substr(last_border, cur_pos - last_border + 1)));
|
||||
last_border = cur_pos + 1;
|
||||
in_marker = false;
|
||||
marker_opener = '\0';
|
||||
}
|
||||
}
|
||||
if (last_border < text.length()) {
|
||||
retval.push_back(segment(segment_type::TEXT, text.substr(last_border)));
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segments) {
|
||||
std::vector<segment> result;
|
||||
for (const auto & seg : segments) {
|
||||
if (!trim_whitespace(seg.value).empty()) {
|
||||
result.push_back(seg);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
namespace autoparser {
|
||||
|
||||
std::string apply_template(const common_chat_template & tmpl, const template_params & params) {
|
||||
generation_params tmpl_params;
|
||||
tmpl_params.messages = params.messages;
|
||||
tmpl_params.tools = params.tools;
|
||||
tmpl_params.add_generation_prompt = params.add_generation_prompt;
|
||||
tmpl_params.enable_thinking = params.enable_thinking;
|
||||
|
||||
if (params.extra_context) {
|
||||
tmpl_params.extra_context = *params.extra_context;
|
||||
}
|
||||
tmpl_params.extra_context["enable_thinking"] = params.enable_thinking;
|
||||
|
||||
try {
|
||||
return common_chat_template_direct_apply(tmpl, tmpl_params);
|
||||
} catch (const std::exception & e) {
|
||||
LOG_DBG("Template application failed: %s\n", e.what());
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
std::optional<compare_variants_result> compare_variants(
|
||||
const common_chat_template & tmpl,
|
||||
const template_params & params_A,
|
||||
const std::function<void(template_params &)> & params_modifier) {
|
||||
// Create variant B by copying A
|
||||
template_params params_B = params_A;
|
||||
|
||||
// Apply modifier to create variant B
|
||||
if (params_modifier) {
|
||||
params_modifier(params_B);
|
||||
}
|
||||
|
||||
// Apply template to both variants
|
||||
std::string output_A = apply_template(tmpl, params_A);
|
||||
std::string output_B = apply_template(tmpl, params_B);
|
||||
|
||||
// Check for template application failures
|
||||
if (output_A.empty() || output_B.empty()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
// Calculate diff and return result with both outputs
|
||||
compare_variants_result result;
|
||||
result.diff = calculate_diff_split(output_A, output_B);
|
||||
result.output_A = output_A;
|
||||
result.output_B = output_B;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace autoparser
|
||||
|
||||
@@ -1,74 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "chat-auto-parser.h"
|
||||
|
||||
#include <functional>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
std::string trim_whitespace(const std::string & str);
|
||||
std::string trim_leading_whitespace(const std::string & str);
|
||||
std::string trim_trailing_whitespace(const std::string & str);
|
||||
std::string trim_trailing_newlines(const std::string & str);
|
||||
|
||||
// calculate a diff split (longest common prefix, longest common suffix excluding prefix,
|
||||
// mismatched part on the left, mismatched part on the right) between two strings
|
||||
// account for markers - align prefix and suffix endings so that they end on markers
|
||||
// * eg.:
|
||||
// calculate_diff_split("<html><body><div></div></body></html>", "<html><body><p>Something</p></body><html>") ->
|
||||
// { "prefix": "<html><body>" (not: "<html><body><"), "suffix": "</body></html>", "left": "<div></div>", "right": "<p>Something</p>" }
|
||||
// calculate_diff_split("<html><body>Something</body></html>", "<html><body></body><html>") ->
|
||||
// { "prefix": "<html><body>", "suffix": "</body></html>", "left": "Something", "right": "" }
|
||||
diff_split calculate_diff_split(const std::string & left, const std::string & right);
|
||||
|
||||
// Returns the prefix of `full` up until the first occurrence of the common prefix of `left` and `right`
|
||||
// Returns empty string if there's no common prefix
|
||||
// * eg.:
|
||||
// until_common_prefix("really want a FUNCTION call", "FUNCTION alpha", "FUNCTION beta") -> "really want a "
|
||||
// until_common_prefix("<tool_call>", "<something>", "<something_else>") -> ""
|
||||
// until_common_prefix("some text", "1234", "abcd") -> ""
|
||||
// until_common_prefix("one arg two args three args four", "argument alpha", "argument beta") -> "one ""
|
||||
std::string until_common_prefix(const std::string & full, const std::string & left, const std::string & right);
|
||||
|
||||
// Returns the suffix of `full` after the last occurrence of the common suffix of `left` and `right`
|
||||
// Returns empty string if there's no common suffix
|
||||
// Mirror function of `until_common_prefix`
|
||||
// * eg.:
|
||||
// after_common_suffix("really want a FUNCTION call", "first FUNCTION", "second FUNCTION") -> " call"
|
||||
// after_common_suffix("one arg two-args three args four", "alpha-args", "beta-args") -> " three args four"
|
||||
std::string after_common_suffix(const std::string & full, const std::string & left, const std::string & right);
|
||||
|
||||
// Segmentize text into markers and non-marker fragments
|
||||
// * eg.:
|
||||
// segmentize_markers("<html><head><title>The site title</title><body><div>Here's some <b>content</b></div></body></html>" ->
|
||||
// [ (MARKER, "<html>"), (MARKER, "<head>"), (MARKER, "<title>"), (TEXT, "The site title"), (MARKER, "</title>"),
|
||||
// (MARKER, "<body>"), (MARKER, "<div>"), (TEXT, "Here's some "), (MARKER, "<b>"), (TEXT, "content"), (MARKER, "</b>"),
|
||||
// (MARKER, "</div>"), (MARKER, "</body>"), (MARKER, "</html>")
|
||||
// ]
|
||||
// segmentize_markers("<|tool_call|>[args]{ are here }[/args]<|tool_call_end|>") ->
|
||||
// [ (MARKER, "<|tool_call|>"), (MARKER, "[args]"), (TEXT, "{ are here }"), (MARKER, "[/args]"), (MARKER, "<|tool_call_end|>") ]
|
||||
std::vector<segment> segmentize_markers(const std::string & text);
|
||||
|
||||
// Prune whitespace-only segments from a vector of segments
|
||||
// * eg.:
|
||||
// segmentize_markers("<tool_call>\n<function=foo>\n<arg=bar>\n \n</arg>\n</function>\n</tool_call>") ->
|
||||
// X = [ (MARKER, "<tool_call>"), (TEXT, "\n"), (MARKER, "<function=foo>"), (TEXT, "\n"), (MARKER, "<arg=bar>"), (TEXT, "\n \n"),
|
||||
// (MARKER, "</arg>"), (TEXT, "\n"), (MARKER, "</function>"), (TEXT, "\n"), (MARKER, "</tool_call>") ]
|
||||
// prune_whitespace_segments(X) -> [ (MARKER, "<tool_call>"), (MARKER, "<function=foo>"), (MARKER, "<arg=bar>"), (MARKER, "</arg>"),
|
||||
// (MARKER, "</function>"), (MARKER, "</tool_call>") ]
|
||||
std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segments);
|
||||
|
||||
namespace autoparser {
|
||||
|
||||
// Apply a template with the given parameters, returning the rendered string (empty on failure)
|
||||
std::string apply_template(const common_chat_template & tmpl, const template_params & params);
|
||||
|
||||
// Factorized differential comparison function
|
||||
// Takes base params and a single modifier lambda to create variant B
|
||||
// Returns compare_variants_result containing diff and both outputs, or std::nullopt on failure
|
||||
std::optional<compare_variants_result> compare_variants(
|
||||
const common_chat_template & tmpl,
|
||||
const template_params & params_A,
|
||||
const std::function<void(template_params &)> & params_modifier);
|
||||
|
||||
} // namespace autoparser
|
||||
@@ -1,434 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "chat.h"
|
||||
#include "common.h"
|
||||
#include "jinja/caps.h"
|
||||
#include "peg-parser.h"
|
||||
#include "nlohmann/json.hpp"
|
||||
|
||||
#include <chrono>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
class common_chat_peg_builder;
|
||||
|
||||
// ============================================================================
|
||||
// Parameters for template application (low-level, used by diff analysis)
|
||||
// ============================================================================
|
||||
struct template_params {
|
||||
json messages;
|
||||
json tools;
|
||||
bool add_generation_prompt = false;
|
||||
bool enable_thinking = true;
|
||||
std::optional<json> extra_context = std::nullopt;
|
||||
};
|
||||
|
||||
struct diff_split {
|
||||
std::string prefix;
|
||||
std::string suffix;
|
||||
std::string left;
|
||||
std::string right;
|
||||
|
||||
bool operator==(struct diff_split & other) const {
|
||||
return prefix == other.prefix && suffix == other.suffix && left == other.left && right == other.right;
|
||||
}
|
||||
};
|
||||
|
||||
// Result of compare_variants containing diff and original outputs
|
||||
struct compare_variants_result {
|
||||
diff_split diff;
|
||||
std::string output_A;
|
||||
std::string output_B;
|
||||
};
|
||||
|
||||
namespace autoparser {
|
||||
|
||||
// ============================================================================
|
||||
// High-level params for parser generation
|
||||
// ============================================================================
|
||||
|
||||
struct generation_params {
|
||||
json messages;
|
||||
json tools;
|
||||
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
|
||||
json json_schema;
|
||||
bool parallel_tool_calls = true;
|
||||
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||
bool stream = true;
|
||||
std::string grammar;
|
||||
bool add_generation_prompt = false;
|
||||
bool enable_thinking = true;
|
||||
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
||||
std::string generation_prompt;
|
||||
json extra_context;
|
||||
bool add_bos = false;
|
||||
bool add_eos = false;
|
||||
bool is_inference = true;
|
||||
bool add_inference = false;
|
||||
bool mark_input = true; // whether to mark input strings in the jinja context
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
// Analysis Result Enums
|
||||
// ============================================================================
|
||||
|
||||
// Reasoning handling mode (derived from R1-R3 comparisons)
|
||||
enum class reasoning_mode {
|
||||
NONE, // No reasoning markers detected
|
||||
TAG_BASED, // Tag-based: <think>...</think> (start can be empty for delimiter-style)
|
||||
TOOLS_ONLY // Only reason on tool calls, not on normal content
|
||||
};
|
||||
|
||||
inline std::ostream & operator<<(std::ostream & os, const reasoning_mode & mode) {
|
||||
switch (mode) {
|
||||
case reasoning_mode::NONE:
|
||||
return os << "NONE";
|
||||
case reasoning_mode::TAG_BASED:
|
||||
return os << "TAG_BASED";
|
||||
case reasoning_mode::TOOLS_ONLY:
|
||||
return os << "TOOLS_ONLY";
|
||||
default:
|
||||
return os << "UNKNOWN";
|
||||
}
|
||||
}
|
||||
|
||||
// Content wrapping mode (derived from C1 comparison)
|
||||
enum class content_mode {
|
||||
PLAIN, // No content markers
|
||||
ALWAYS_WRAPPED, // Content always wrapped with markers
|
||||
WRAPPED_WITH_REASONING, // Content wrapped only when reasoning present
|
||||
};
|
||||
|
||||
inline std::ostream & operator<<(std::ostream & os, const content_mode & mode) {
|
||||
switch (mode) {
|
||||
case content_mode::PLAIN:
|
||||
return os << "PLAIN";
|
||||
case content_mode::ALWAYS_WRAPPED:
|
||||
return os << "ALWAYS_WRAPPED";
|
||||
case content_mode::WRAPPED_WITH_REASONING:
|
||||
return os << "WRAPPED_WITH_REASONING";
|
||||
default:
|
||||
return os << "UNKNOWN";
|
||||
}
|
||||
}
|
||||
|
||||
// Call ID position in tool calls (for non-JSON formats)
|
||||
enum class call_id_position {
|
||||
NONE, // No call ID support detected
|
||||
PRE_FUNC_NAME, // Call ID before function name: [CALL_ID]id[FUNC]name{args}
|
||||
BETWEEN_FUNC_AND_ARGS, // Call ID between function and args: [FUNC]name[CALL_ID]id{args}
|
||||
POST_ARGS, // Call ID after arguments: [FUNC]name{args}[CALL_ID]id
|
||||
};
|
||||
|
||||
inline std::ostream & operator<<(std::ostream & os, const call_id_position & pos) {
|
||||
switch (pos) {
|
||||
case call_id_position::NONE:
|
||||
return os << "NONE";
|
||||
case call_id_position::PRE_FUNC_NAME:
|
||||
return os << "PRE_FUNC_NAME";
|
||||
case call_id_position::BETWEEN_FUNC_AND_ARGS:
|
||||
return os << "BETWEEN_FUNC_AND_ARGS";
|
||||
case call_id_position::POST_ARGS:
|
||||
return os << "POST_ARGS";
|
||||
default:
|
||||
return os << "UNKNOWN";
|
||||
}
|
||||
}
|
||||
|
||||
// Tool call format classification (derived from T1-T5, A1-A3 comparisons)
|
||||
enum class tool_format {
|
||||
NONE, // No tool support detected
|
||||
JSON_NATIVE, // Pure JSON: {"name": "X", "arguments": {...}}
|
||||
TAG_WITH_JSON, // Tag-based with JSON args: <function=X>{...}</function>
|
||||
TAG_WITH_TAGGED, // Tag-based with tagged args: <param=key>value</param>
|
||||
};
|
||||
|
||||
inline std::ostream & operator<<(std::ostream & os, const tool_format & format) {
|
||||
switch (format) {
|
||||
case tool_format::NONE:
|
||||
return os << "NONE";
|
||||
case tool_format::JSON_NATIVE:
|
||||
return os << "JSON_NATIVE";
|
||||
case tool_format::TAG_WITH_JSON:
|
||||
return os << "TAG_WITH_JSON";
|
||||
case tool_format::TAG_WITH_TAGGED:
|
||||
return os << "TAG_WITH_TAGGED";
|
||||
default:
|
||||
return os << "UNKNOWN";
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Sub-structs for tool analysis
|
||||
// ============================================================================
|
||||
|
||||
struct tool_format_analysis {
|
||||
tool_format mode = tool_format::NONE;
|
||||
|
||||
std::string section_start; // e.g., "<tool_call>", "[TOOL_CALLS]", ""
|
||||
std::string section_end; // e.g., "</tool_call>", ""
|
||||
std::string per_call_start; // e.g., "<|tool_call_begin|>", "" (for multi-call templates)
|
||||
std::string per_call_end; // e.g., "<|tool_call_end|>", ""
|
||||
|
||||
bool fun_name_is_key = false; // In JSON format function name is JSON key, i.e. { "<funname>": { ... arguments ... } }
|
||||
bool tools_array_wrapped = false; // Tool calls wrapped in JSON array [...]
|
||||
|
||||
std::string function_field = "function";
|
||||
std::string name_field = "name";
|
||||
std::string args_field = "arguments";
|
||||
std::string id_field;
|
||||
std::string gen_id_field;
|
||||
std::vector<std::string> parameter_order;
|
||||
};
|
||||
|
||||
struct tool_function_analysis {
|
||||
std::string name_prefix; // e.g., "<function=", "\"name\": \"", "functions."
|
||||
std::string name_suffix; // e.g., ">", "\"", ":0"
|
||||
std::string close; // e.g., "</function>", "" (for tag-based)
|
||||
};
|
||||
|
||||
struct tool_arguments_analysis {
|
||||
std::string start; // e.g., "<|tool_call_argument_begin|>", "<args>"
|
||||
std::string end; // e.g., "<|tool_call_argument_end|>", "</args>"
|
||||
std::string name_prefix; // e.g., "<param=", "<arg_key>", "\""
|
||||
std::string name_suffix; // e.g., ">", "</arg_key>", "\":"
|
||||
std::string value_prefix; // e.g., "", "<arg_value>", ""
|
||||
std::string value_suffix; // e.g., "</param>", "</arg_value>", ""
|
||||
std::string separator; // e.g., "", "\n", ","
|
||||
};
|
||||
|
||||
struct tool_id_analysis {
|
||||
call_id_position pos = call_id_position::NONE;
|
||||
|
||||
std::string prefix; // e.g., "[CALL_ID]" (marker before call ID value)
|
||||
std::string suffix; // e.g., "" (marker after call ID value, before next section)
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
// Parser build context (shared interface for build_parser methods)
|
||||
// ============================================================================
|
||||
|
||||
struct analyze_content;
|
||||
struct analyze_reasoning;
|
||||
|
||||
struct parser_build_context {
|
||||
common_chat_peg_builder & p;
|
||||
const generation_params & inputs;
|
||||
common_peg_parser reasoning_parser;
|
||||
bool extracting_reasoning = false;
|
||||
const analyze_reasoning * reasoning = nullptr;
|
||||
const analyze_content * content = nullptr;
|
||||
|
||||
parser_build_context(common_chat_peg_builder & p, const generation_params & inputs);
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
// Base class for analyzers with parser building
|
||||
// ============================================================================
|
||||
|
||||
struct analyze_base {
|
||||
virtual ~analyze_base() = default;
|
||||
virtual common_peg_parser build_parser(parser_build_context & ctx) const = 0;
|
||||
|
||||
protected:
|
||||
const common_chat_template * tmpl = nullptr;
|
||||
|
||||
analyze_base() = default;
|
||||
explicit analyze_base(const common_chat_template & tmpl) : tmpl(&tmpl) {}
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
// Reasoning analyzer
|
||||
// ============================================================================
|
||||
|
||||
struct analyze_reasoning : analyze_base {
|
||||
reasoning_mode mode = reasoning_mode::NONE;
|
||||
|
||||
std::string start; // e.g., "<think>", "[THINK]", "<|START_THINKING|>", ""
|
||||
std::string end; // e.g., "</think>", "[BEGIN FINAL RESPONSE]", "<|END_THINKING|>"
|
||||
|
||||
analyze_reasoning() = default;
|
||||
analyze_reasoning(const common_chat_template & tmpl, bool supports_tools);
|
||||
analyze_reasoning(std::string start_, std::string end_) : start(std::move(start_)), end(std::move(end_)) {}
|
||||
|
||||
common_peg_parser build_parser(parser_build_context & ctx) const override;
|
||||
|
||||
private:
|
||||
// Look for reasoning markers in rendered content
|
||||
void compare_reasoning_presence();
|
||||
|
||||
// Compare generation prompt with enable_thinking=true vs false
|
||||
void compare_thinking_enabled();
|
||||
|
||||
// Check if reasoning is always possible or only in tool calls
|
||||
void compare_reasoning_scope();
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
// Content analyzer
|
||||
// ============================================================================
|
||||
|
||||
struct analyze_content : analyze_base {
|
||||
content_mode mode = content_mode::PLAIN;
|
||||
|
||||
std::string start; // e.g., "<response>", ">>>all\n", ""
|
||||
std::string end; // e.g., "</response>", ""
|
||||
|
||||
bool requires_nonnull_content = false;
|
||||
|
||||
analyze_content() = default;
|
||||
analyze_content(const common_chat_template & tmpl, const analyze_reasoning & reasoning);
|
||||
|
||||
common_peg_parser build_parser(parser_build_context & ctx) const override;
|
||||
|
||||
bool is_always_wrapped() const;
|
||||
common_peg_parser build_optional_wrapped(parser_build_context & ctx) const;
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
// Tool analyzer
|
||||
// ============================================================================
|
||||
|
||||
struct analyze_tools : analyze_base {
|
||||
tool_format_analysis format;
|
||||
tool_function_analysis function;
|
||||
tool_arguments_analysis arguments;
|
||||
tool_id_analysis call_id;
|
||||
|
||||
analyze_tools() = default;
|
||||
analyze_tools(const common_chat_template & tmpl,
|
||||
const jinja::caps & caps,
|
||||
const analyze_reasoning & reasoning);
|
||||
|
||||
common_peg_parser build_parser(parser_build_context & ctx) const override;
|
||||
|
||||
private:
|
||||
// Extract tool calling 'haystack' for further analysis and delegate further analysis based on format
|
||||
void analyze_tool_calls(const analyze_reasoning & reasoning);
|
||||
|
||||
// Analyze format based on position of function and argument name in needle
|
||||
void analyze_tool_call_format(const std::string & haystack,
|
||||
const std::string & fun_name_needle,
|
||||
const std::string & arg_name_needle,
|
||||
const analyze_reasoning & reasoning);
|
||||
|
||||
// Analyze specifics of JSON native format (entire tool call is a JSON object)
|
||||
void analyze_tool_call_format_json_native(const std::string & clean_haystack,
|
||||
const std::string & fun_name_needle,
|
||||
const std::string & arg_name_needle);
|
||||
|
||||
// Analyze specifics of non-JSON native format (tags for function name or for function name and arguments)
|
||||
void analyze_tool_call_format_non_json(const std::string & clean_haystack,
|
||||
const std::string & fun_name_needle);
|
||||
|
||||
// Check for and extract specific per-call markers for non-native-JSON templates with parallel call support
|
||||
void check_per_call_markers();
|
||||
|
||||
// Extract function name markers
|
||||
void extract_function_markers();
|
||||
|
||||
// Delegates to separate functions for: separator analysis, argument name analysis, argument value analysis
|
||||
void analyze_arguments();
|
||||
|
||||
// Extract argument name markers
|
||||
void extract_argument_name_markers();
|
||||
|
||||
// Extract argument value markers
|
||||
void extract_argument_value_markers();
|
||||
|
||||
// Extract argument separator, if specified (eg. <arg=foo>...</arg><sep><arg=bar>...</arg>)
|
||||
void extract_argument_separator();
|
||||
|
||||
// Extract argument wrapper markers, if present (eg. '<args><arg=foo>...</arg><arg=bar>...</arg></args>')
|
||||
void extract_args_markers();
|
||||
|
||||
// Extract call ID markers, if present
|
||||
void extract_call_id_markers();
|
||||
|
||||
// Per-format tool parser builders
|
||||
common_peg_parser build_tool_parser_json_native(parser_build_context & ctx) const;
|
||||
common_peg_parser build_tool_parser_tag_json(parser_build_context & ctx) const;
|
||||
common_peg_parser build_tool_parser_tag_tagged(parser_build_context & ctx) const;
|
||||
|
||||
// Shared helper: builds func_parser from open+call_id+args, handling atomic wrapping and close.
|
||||
// atomic_peek: if present, used as the peek expression in the third atomicity branch.
|
||||
common_peg_parser build_func_parser(common_chat_peg_builder & p, const std::string & name,
|
||||
const common_peg_parser & call_id_section, bool have_call_id,
|
||||
const common_peg_parser & args,
|
||||
std::optional<common_peg_parser> atomic_peek) const;
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
// Main autoparser class
|
||||
// ============================================================================
|
||||
|
||||
struct autoparser {
|
||||
jinja::caps jinja_caps;
|
||||
analyze_reasoning reasoning;
|
||||
analyze_content content;
|
||||
analyze_tools tools;
|
||||
bool analysis_complete = false;
|
||||
|
||||
// Preserved tokens for tokenizer (union of all non-empty markers)
|
||||
std::vector<std::string> preserved_tokens;
|
||||
|
||||
autoparser() = default;
|
||||
|
||||
// Run full differential analysis on a template
|
||||
void analyze_template(const common_chat_template & tmpl);
|
||||
|
||||
// Build the PEG parser for this template
|
||||
common_peg_arena build_parser(const generation_params & inputs) const;
|
||||
|
||||
private:
|
||||
// Collect tokens from entire analysis to preserve
|
||||
void collect_preserved_tokens();
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
// Parser generator
|
||||
// ============================================================================
|
||||
|
||||
class peg_generator {
|
||||
public:
|
||||
static common_chat_params generate_parser(const common_chat_template & tmpl,
|
||||
const struct generation_params & inputs);
|
||||
|
||||
static common_chat_params generate_parser(const common_chat_template & tmpl,
|
||||
const struct generation_params & inputs,
|
||||
const autoparser & autoparser);
|
||||
};
|
||||
|
||||
} // namespace autoparser
|
||||
|
||||
enum segment_type { TEXT, MARKER };
|
||||
|
||||
inline std::ostream & operator<<(std::ostream & os, const segment_type & type) {
|
||||
switch (type) {
|
||||
case segment_type::TEXT:
|
||||
return os << "TEXT";
|
||||
case segment_type::MARKER:
|
||||
return os << "MARKER";
|
||||
default:
|
||||
return os << "UNKNOWN";
|
||||
}
|
||||
}
|
||||
|
||||
struct segment {
|
||||
segment_type type;
|
||||
std::string value;
|
||||
|
||||
segment(segment_type type, std::string value) : type(type), value(std::move(value)) {}
|
||||
|
||||
bool operator==(const segment & other) const {
|
||||
return type == other.type && value == other.value;
|
||||
}
|
||||
|
||||
bool operator!=(const segment & other) const {
|
||||
return !(*this == other);
|
||||
}
|
||||
};
|
||||
File diff suppressed because it is too large
Load Diff
534
common/chat-parser.cpp
Normal file
534
common/chat-parser.cpp
Normal file
@@ -0,0 +1,534 @@
|
||||
#include "chat-parser.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "regex-partial.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <optional>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax)
|
||||
: input_(input), is_partial_(is_partial), syntax_(syntax)
|
||||
{
|
||||
result_.role = "assistant";
|
||||
|
||||
while (true) {
|
||||
std::string id = std::to_string(std::rand());
|
||||
if (input.find(id) == std::string::npos) {
|
||||
healing_marker_ = id;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::string common_chat_msg_parser::str(const common_string_range & rng) const {
|
||||
GGML_ASSERT(rng.begin <= rng.end);
|
||||
return input_.substr(rng.begin, rng.end - rng.begin);
|
||||
}
|
||||
|
||||
void common_chat_msg_parser::add_content(const std::string &content) {
|
||||
result_.content += content;
|
||||
}
|
||||
|
||||
void common_chat_msg_parser::add_reasoning_content(const std::string &reasoning_content) {
|
||||
result_.reasoning_content += reasoning_content;
|
||||
}
|
||||
|
||||
bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::string & id, const std::string & arguments) {
|
||||
if (name.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
common_chat_tool_call tool_call;
|
||||
tool_call.name = name;
|
||||
tool_call.arguments = arguments;
|
||||
tool_call.id = id;
|
||||
|
||||
// LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
|
||||
result_.tool_calls.emplace_back(tool_call);
|
||||
|
||||
return true;
|
||||
}
|
||||
bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
|
||||
std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
|
||||
std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
|
||||
std::string arguments = "";
|
||||
if (tool_call.contains("arguments")) {
|
||||
if (tool_call.at("arguments").is_object()) {
|
||||
arguments = tool_call.at("arguments").dump();
|
||||
} else {
|
||||
arguments = tool_call.at("arguments");
|
||||
}
|
||||
}
|
||||
|
||||
return add_tool_call(name, id, arguments);
|
||||
}
|
||||
|
||||
bool common_chat_msg_parser::add_tool_calls(const json & arr) {
|
||||
for (const auto & item : arr) {
|
||||
if (!add_tool_call(item)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool common_chat_msg_parser::add_tool_call_short_form(const json & tool_call) {
|
||||
if (!tool_call.is_object() || tool_call.size() != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get the tool name (the single key in the object)
|
||||
auto it = tool_call.begin();
|
||||
std::string name = it.key();
|
||||
|
||||
if (name.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get the arguments (the nested object)
|
||||
const json & args_json = it.value();
|
||||
std::string arguments = "";
|
||||
|
||||
if (args_json.is_object()) {
|
||||
arguments = args_json.dump();
|
||||
} else if (args_json.is_string()) {
|
||||
arguments = args_json;
|
||||
} else if (!args_json.is_null()) {
|
||||
// For other types, convert to string representation
|
||||
arguments = args_json.dump();
|
||||
}
|
||||
|
||||
return add_tool_call(name, "", arguments);
|
||||
}
|
||||
void common_chat_msg_parser::finish() {
|
||||
if (!is_partial_ && pos_ != input_.size()) {
|
||||
throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
|
||||
}
|
||||
}
|
||||
|
||||
bool common_chat_msg_parser::consume_spaces() {
|
||||
const auto length = input_.size();
|
||||
auto consumed = false;
|
||||
while (pos_ < length && std::isspace(input_[pos_])) {
|
||||
++pos_;
|
||||
consumed = true;
|
||||
}
|
||||
return consumed;
|
||||
}
|
||||
|
||||
bool common_chat_msg_parser::try_consume_literal(const std::string & literal) {
|
||||
auto pos = pos_;
|
||||
for (auto i = 0u; i < literal.size(); ++i) {
|
||||
if (pos >= input_.size()) {
|
||||
return false;
|
||||
}
|
||||
if (input_[pos] != literal[i]) {
|
||||
return false;
|
||||
}
|
||||
++pos;
|
||||
}
|
||||
pos_ = pos;
|
||||
return true;
|
||||
}
|
||||
|
||||
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_literal(const std::string & literal) {
|
||||
auto idx = input_.find(literal, pos_);
|
||||
if (idx != std::string::npos) {
|
||||
find_regex_result res;
|
||||
res.prelude = input_.substr(pos_, idx - pos_);
|
||||
auto end = idx + literal.size();
|
||||
res.groups.emplace_back(common_string_range{idx, end});
|
||||
move_to(end);
|
||||
return res;
|
||||
}
|
||||
if (is_partial_) {
|
||||
idx = string_find_partial_stop(input_, literal);
|
||||
if (idx != std::string::npos && idx >= pos_) {
|
||||
find_regex_result res;
|
||||
res.prelude = input_.substr(pos_, idx - pos_);
|
||||
auto end = input_.size();
|
||||
res.groups.emplace_back(common_string_range{idx, end});
|
||||
move_to(end);
|
||||
return res;
|
||||
}
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
void common_chat_msg_parser::consume_literal(const std::string & literal) {
|
||||
if (!try_consume_literal(literal)) {
|
||||
throw common_chat_msg_partial_exception(literal);
|
||||
}
|
||||
}
|
||||
|
||||
bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
|
||||
std::string pending_reasoning_prefix;
|
||||
|
||||
if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto set_reasoning_prefix = [&](size_t prefix_pos) {
|
||||
if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
|
||||
return;
|
||||
}
|
||||
if (prefix_pos + start_think.size() > input_.size()) {
|
||||
pending_reasoning_prefix.clear();
|
||||
return;
|
||||
}
|
||||
// Capture the exact literal that opened the reasoning section so we can
|
||||
// surface it back to callers. This ensures formats that force the
|
||||
// reasoning tag open (e.g. DeepSeek R1) retain their original prefix
|
||||
// instead of dropping it during parsing.
|
||||
pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
|
||||
};
|
||||
|
||||
auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
|
||||
auto stripped_reasoning = string_strip(reasoning);
|
||||
if (stripped_reasoning.empty()) {
|
||||
return;
|
||||
}
|
||||
if (syntax_.reasoning_in_content) {
|
||||
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "<think>" : start_think);
|
||||
add_content(stripped_reasoning);
|
||||
if (closed) {
|
||||
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
|
||||
}
|
||||
} else {
|
||||
if (!pending_reasoning_prefix.empty()) {
|
||||
add_reasoning_content(pending_reasoning_prefix);
|
||||
pending_reasoning_prefix.clear();
|
||||
}
|
||||
add_reasoning_content(stripped_reasoning);
|
||||
}
|
||||
};
|
||||
|
||||
const size_t saved_pos = pos_;
|
||||
const size_t saved_content_size = result_.content.size();
|
||||
const size_t saved_reasoning_size = result_.reasoning_content.size();
|
||||
|
||||
auto restore_state = [&]() {
|
||||
move_to(saved_pos);
|
||||
result_.content.resize(saved_content_size);
|
||||
result_.reasoning_content.resize(saved_reasoning_size);
|
||||
};
|
||||
|
||||
// Allow leading whitespace to be preserved as content when reasoning is present at the start
|
||||
size_t cursor = pos_;
|
||||
size_t whitespace_end = cursor;
|
||||
while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
|
||||
++whitespace_end;
|
||||
}
|
||||
|
||||
if (whitespace_end >= input_.size()) {
|
||||
restore_state();
|
||||
if (syntax_.thinking_forced_open) {
|
||||
auto rest = input_.substr(saved_pos);
|
||||
if (!rest.empty()) {
|
||||
handle_reasoning(rest, /* closed */ !is_partial());
|
||||
}
|
||||
move_to(input_.size());
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
cursor = whitespace_end;
|
||||
const size_t remaining = input_.size() - cursor;
|
||||
const size_t start_prefix = std::min(start_think.size(), remaining);
|
||||
const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;
|
||||
|
||||
if (has_start_tag && start_prefix < start_think.size()) {
|
||||
move_to(input_.size());
|
||||
return true;
|
||||
}
|
||||
|
||||
if (has_start_tag) {
|
||||
if (whitespace_end > pos_) {
|
||||
add_content(input_.substr(pos_, whitespace_end - pos_));
|
||||
}
|
||||
set_reasoning_prefix(cursor);
|
||||
cursor += start_think.size();
|
||||
} else if (syntax_.thinking_forced_open) {
|
||||
cursor = whitespace_end;
|
||||
} else {
|
||||
restore_state();
|
||||
return false;
|
||||
}
|
||||
while (true) {
|
||||
if (cursor >= input_.size()) {
|
||||
move_to(input_.size());
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t end_pos = input_.find(end_think, cursor);
|
||||
if (end_pos == std::string::npos) {
|
||||
std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
|
||||
size_t partial_off = string_find_partial_stop(remaining_view, end_think);
|
||||
size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
|
||||
if (reasoning_end > cursor) {
|
||||
handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
|
||||
}
|
||||
move_to(input_.size());
|
||||
return true;
|
||||
}
|
||||
|
||||
if (end_pos > cursor) {
|
||||
handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
|
||||
} else {
|
||||
handle_reasoning("", /* closed */ true);
|
||||
}
|
||||
|
||||
cursor = end_pos + end_think.size();
|
||||
|
||||
while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
|
||||
++cursor;
|
||||
}
|
||||
|
||||
const size_t next_remaining = input_.size() - cursor;
|
||||
if (next_remaining == 0) {
|
||||
move_to(cursor);
|
||||
return true;
|
||||
}
|
||||
|
||||
const size_t next_prefix = std::min(start_think.size(), next_remaining);
|
||||
if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
|
||||
if (next_prefix < start_think.size()) {
|
||||
move_to(input_.size());
|
||||
return true;
|
||||
}
|
||||
set_reasoning_prefix(cursor);
|
||||
cursor += start_think.size();
|
||||
continue;
|
||||
}
|
||||
|
||||
move_to(cursor);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
std::string common_chat_msg_parser::consume_rest() {
|
||||
auto rest = input_.substr(pos_);
|
||||
pos_ = input_.size();
|
||||
return rest;
|
||||
}
|
||||
|
||||
// Tries to find the regex, consumes it (pos right after it) and gives the prelude (right before it) and the groups to the callback.
|
||||
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from, bool add_prelude_to_content) {
|
||||
auto m = regex.search(input_, from == std::string::npos ? pos_ : from);
|
||||
if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
|
||||
return std::nullopt;
|
||||
}
|
||||
auto prelude = input_.substr(pos_, m.groups[0].begin - pos_);
|
||||
pos_ = m.groups[0].end;
|
||||
|
||||
if (add_prelude_to_content) {
|
||||
add_content(prelude);
|
||||
}
|
||||
if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
|
||||
if (is_partial()) {
|
||||
throw common_chat_msg_partial_exception(regex.str());
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
return find_regex_result{prelude, m.groups};
|
||||
}
|
||||
|
||||
common_chat_msg_parser::find_regex_result common_chat_msg_parser::consume_regex(const common_regex & regex) {
|
||||
if (auto result = try_consume_regex(regex)) {
|
||||
return *result;
|
||||
}
|
||||
throw common_chat_msg_partial_exception(regex.str());
|
||||
}
|
||||
|
||||
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_consume_regex(const common_regex & regex) {
|
||||
auto m = regex.search(input_, pos_);
|
||||
if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
|
||||
return std::nullopt;
|
||||
}
|
||||
if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
|
||||
if (is_partial()) {
|
||||
throw common_chat_msg_partial_exception(regex.str());
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
if (m.groups[0].begin != pos_) {
|
||||
// Didn't match at the current position.
|
||||
return std::nullopt;
|
||||
}
|
||||
pos_ = m.groups[0].end;
|
||||
|
||||
return find_regex_result {
|
||||
/* .prelude = */ "",
|
||||
m.groups,
|
||||
};
|
||||
}
|
||||
|
||||
std::optional<common_json> common_chat_msg_parser::try_consume_json() {
|
||||
auto it = input_.cbegin() + pos_;
|
||||
const auto end = input_.cend();
|
||||
common_json result;
|
||||
if (!common_json_parse(it, end, healing_marker_, result)) {
|
||||
return std::nullopt;
|
||||
}
|
||||
pos_ = std::distance(input_.cbegin(), it);
|
||||
if (result.healing_marker.marker.empty()) {
|
||||
// No healing marker, just return the parsed json
|
||||
return result;
|
||||
}
|
||||
if (!is_partial()) {
|
||||
throw common_chat_msg_partial_exception("JSON");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
common_json common_chat_msg_parser::consume_json() {
|
||||
if (auto result = try_consume_json()) {
|
||||
return *result;
|
||||
}
|
||||
throw common_chat_msg_partial_exception("JSON");
|
||||
}
|
||||
|
||||
common_chat_msg_parser::consume_json_result common_chat_msg_parser::consume_json_with_dumped_args(
|
||||
const std::vector<std::vector<std::string>> & args_paths,
|
||||
const std::vector<std::vector<std::string>> & content_paths
|
||||
) {
|
||||
if (auto result = try_consume_json_with_dumped_args(args_paths, content_paths)) {
|
||||
return *result;
|
||||
}
|
||||
throw common_chat_msg_partial_exception("JSON");
|
||||
}
|
||||
|
||||
std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parser::try_consume_json_with_dumped_args(
|
||||
const std::vector<std::vector<std::string>> & args_paths,
|
||||
const std::vector<std::vector<std::string>> & content_paths
|
||||
) {
|
||||
auto partial = try_consume_json();
|
||||
if (!partial) {
|
||||
return std::nullopt;
|
||||
}
|
||||
auto is_arguments_path = [&](const std::vector<std::string> & path) {
|
||||
return std::find(args_paths.begin(), args_paths.end(), path) != args_paths.end();
|
||||
};
|
||||
auto is_content_path = [&](const std::vector<std::string> & path) {
|
||||
return std::find(content_paths.begin(), content_paths.end(), path) != content_paths.end();
|
||||
};
|
||||
|
||||
if (partial->healing_marker.marker.empty()) {
|
||||
if (args_paths.empty()) {
|
||||
// No arguments to dump, and JSON was parsed fully.
|
||||
return consume_json_result {
|
||||
partial->json,
|
||||
/* .is_partial = */ false,
|
||||
};
|
||||
}
|
||||
if (is_arguments_path({})) {
|
||||
// Entire JSON is the arguments and was parsed fully.
|
||||
return consume_json_result {
|
||||
partial->json.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true),
|
||||
/* .is_partial = */ false,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
LOG_DBG("Parsed partial JSON: %s (json_healing_marker: %s)\n", partial->json.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
|
||||
|
||||
auto found_healing_marker = false;
|
||||
std::vector<std::string> path;
|
||||
std::function<json(const json &)> remove_unsupported_healings_and_dump_args = [&](const json & j) -> json {
|
||||
if (is_arguments_path(path)) {
|
||||
auto arguments = j.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true);
|
||||
if (is_partial() && !partial->healing_marker.marker.empty()) {
|
||||
auto idx = arguments.find(partial->healing_marker.json_dump_marker);
|
||||
if (idx != std::string::npos) {
|
||||
arguments.resize(idx);
|
||||
found_healing_marker = true;
|
||||
}
|
||||
if (arguments == "\"") {
|
||||
// This happens because of completing `:"$magic` after `"arguments"`
|
||||
arguments = "";
|
||||
}
|
||||
}
|
||||
return arguments;
|
||||
}
|
||||
if (is_content_path(path)) {
|
||||
if (!j.is_string()) {
|
||||
throw std::runtime_error("Content path must be a string");
|
||||
}
|
||||
std::string str = j;
|
||||
auto idx = str.find(partial->healing_marker.marker); // not using json_dump_marker as we're inside a string
|
||||
if (idx != std::string::npos) {
|
||||
str.resize(idx);
|
||||
found_healing_marker = true;
|
||||
}
|
||||
return str;
|
||||
}
|
||||
if (j.is_object()) {
|
||||
auto obj = json::object();
|
||||
for (const auto & p : j.items()) {
|
||||
const auto & key = p.key();
|
||||
const auto & value = p.value();
|
||||
const std::string key_str = key; // NOLINT
|
||||
auto idx = key_str.find(healing_marker_);
|
||||
if (idx != std::string::npos) {
|
||||
found_healing_marker = true;
|
||||
break;
|
||||
}
|
||||
path.push_back(key_str);
|
||||
if (value.is_string()) {
|
||||
const std::string value_str = value;
|
||||
if (value_str.find(healing_marker_) != std::string::npos) {
|
||||
found_healing_marker = true;
|
||||
if (is_content_path(path)) {
|
||||
if (partial->healing_marker.marker == partial->healing_marker.json_dump_marker) {
|
||||
// The healing occurred inside the string: good. Otherwise we just ditch the entire key/value pair.
|
||||
obj[key] = remove_unsupported_healings_and_dump_args(value);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
obj[key] = value;
|
||||
} else {
|
||||
obj[key] = remove_unsupported_healings_and_dump_args(value);
|
||||
}
|
||||
path.pop_back();
|
||||
}
|
||||
return obj;
|
||||
}
|
||||
if (j.is_array()) {
|
||||
auto arr = json::array();
|
||||
for (const auto & value : j) {
|
||||
if (value.is_string()) {
|
||||
std::string str = value;
|
||||
auto idx = str.find(healing_marker_);
|
||||
if (idx != std::string::npos) {
|
||||
// Don't heal array values that aren't in the arguments.
|
||||
found_healing_marker = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
arr.push_back(remove_unsupported_healings_and_dump_args(value));
|
||||
}
|
||||
return arr;
|
||||
}
|
||||
return j;
|
||||
};
|
||||
|
||||
auto cleaned = remove_unsupported_healings_and_dump_args(partial->json);
|
||||
LOG_DBG("Cleaned up JSON %s to %s (json_healing_marker : '%s')\n", partial->json.dump().c_str(), cleaned.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
|
||||
return consume_json_result {
|
||||
cleaned,
|
||||
/* .is_partial = */ found_healing_marker,
|
||||
};
|
||||
}
|
||||
|
||||
void common_chat_msg_parser::clear_tools() {
|
||||
result_.tool_calls.clear();
|
||||
}
|
||||
123
common/chat-parser.h
Normal file
123
common/chat-parser.h
Normal file
@@ -0,0 +1,123 @@
|
||||
#pragma once
|
||||
|
||||
#include "chat.h"
|
||||
#include "json-partial.h"
|
||||
#include "regex-partial.h"
|
||||
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
class common_chat_msg_partial_exception : public std::runtime_error {
|
||||
public:
|
||||
common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}
|
||||
};
|
||||
|
||||
class common_chat_msg_parser {
|
||||
std::string input_;
|
||||
bool is_partial_;
|
||||
common_chat_syntax syntax_;
|
||||
std::string healing_marker_;
|
||||
|
||||
size_t pos_ = 0;
|
||||
common_chat_msg result_;
|
||||
|
||||
public:
|
||||
common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
|
||||
const std::string & input() const { return input_; }
|
||||
size_t pos() const { return pos_; }
|
||||
const std::string & healing_marker() const { return healing_marker_; }
|
||||
const bool & is_partial() const { return is_partial_; }
|
||||
const common_chat_msg & result() const { return result_; }
|
||||
const common_chat_syntax & syntax() const { return syntax_; }
|
||||
|
||||
void move_to(size_t pos) {
|
||||
if (pos > input_.size()) {
|
||||
throw std::runtime_error("Invalid position!");
|
||||
}
|
||||
pos_ = pos;
|
||||
}
|
||||
void move_back(size_t n) {
|
||||
if (pos_ < n) {
|
||||
throw std::runtime_error("Can't move back that far!");
|
||||
}
|
||||
pos_ -= n;
|
||||
}
|
||||
|
||||
// Get the substring of the input at the given range
|
||||
std::string str(const common_string_range & rng) const;
|
||||
|
||||
// Appends to the result.content field
|
||||
void add_content(const std::string & content);
|
||||
|
||||
// Appends to the result.reasoning_content field
|
||||
void add_reasoning_content(const std::string & reasoning_content);
|
||||
|
||||
// Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.
|
||||
bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);
|
||||
|
||||
// Adds a tool call using the "name", "id" and "arguments" fields of the json object
|
||||
bool add_tool_call(const nlohmann::ordered_json & tool_call);
|
||||
|
||||
// Adds an array of tool calls using their "name", "id" and "arguments" fields.
|
||||
bool add_tool_calls(const nlohmann::ordered_json & arr);
|
||||
|
||||
// Adds a tool call using the short form: { "tool_name": { "arg1": val, "arg2": val } }
|
||||
bool add_tool_call_short_form(const nlohmann::ordered_json & tool_call);
|
||||
|
||||
void finish();
|
||||
|
||||
bool consume_spaces();
|
||||
|
||||
void consume_literal(const std::string & literal);
|
||||
|
||||
bool try_parse_reasoning(const std::string & start_think, const std::string & end_think);
|
||||
|
||||
std::string consume_rest();
|
||||
|
||||
struct find_regex_result {
|
||||
std::string prelude;
|
||||
std::vector<common_string_range> groups;
|
||||
};
|
||||
|
||||
std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true);
|
||||
|
||||
bool try_consume_literal(const std::string & literal);
|
||||
|
||||
std::optional<find_regex_result> try_find_literal(const std::string & literal);
|
||||
|
||||
find_regex_result consume_regex(const common_regex & regex);
|
||||
|
||||
std::optional<find_regex_result> try_consume_regex(const common_regex & regex);
|
||||
|
||||
std::optional<common_json> try_consume_json();
|
||||
common_json consume_json();
|
||||
|
||||
struct consume_json_result {
|
||||
nlohmann::ordered_json value;
|
||||
bool is_partial;
|
||||
};
|
||||
|
||||
/*
|
||||
Consume (possibly partial) json and converts specific subtrees to (possibly truncated) JSON strings.
|
||||
|
||||
By default, object keys can't be truncated, nor can string values (their corresponding key is removed,
|
||||
e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}`
|
||||
|
||||
But one can allow subpaths to be kept truncated, and possibly json-dumped to truncated json strings
|
||||
- with `content_paths={{"foo"}}` -> `{"foo": "b` -> {"foo": "b"}`
|
||||
- with `args_paths={{"foo"}}` -> `{"foo": {"b` -> `{"foo": "{b"}`
|
||||
*/
|
||||
consume_json_result consume_json_with_dumped_args(
|
||||
const std::vector<std::vector<std::string>> & args_paths = {},
|
||||
const std::vector<std::vector<std::string>> & content_paths = {}
|
||||
);
|
||||
std::optional<consume_json_result> try_consume_json_with_dumped_args(
|
||||
const std::vector<std::vector<std::string>> & args_paths = {},
|
||||
const std::vector<std::vector<std::string>> & content_paths = {}
|
||||
);
|
||||
|
||||
void clear_tools();
|
||||
};
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,195 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "chat.h"
|
||||
#include "peg-parser.h"
|
||||
|
||||
#include <map>
|
||||
#include <optional>
|
||||
#include <vector>
|
||||
|
||||
class common_chat_peg_mapper {
|
||||
public:
|
||||
common_chat_msg & result;
|
||||
|
||||
common_chat_peg_mapper(common_chat_msg & msg) : result(msg) {}
|
||||
|
||||
virtual ~common_chat_peg_mapper() = default;
|
||||
|
||||
virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
|
||||
virtual void map(const common_peg_ast_node & node);
|
||||
protected:
|
||||
virtual std::string normalize_container_value(const std::string & input);
|
||||
private:
|
||||
// Tool call handling state
|
||||
std::optional<common_chat_tool_call> pending_tool_call; // Tool call waiting for name
|
||||
common_chat_tool_call * current_tool = nullptr;
|
||||
int arg_count = 0;
|
||||
bool closing_quote_pending = false;
|
||||
std::string args_buffer; // Buffer to delay arguments until tool name is known
|
||||
|
||||
// Returns a reference to the active argument destination string.
|
||||
// Before tool_name is known, writes go to args_buffer; after, to current_tool->arguments.
|
||||
std::string & args_target();
|
||||
};
|
||||
|
||||
class common_chat_peg_gemma4_mapper : public common_chat_peg_mapper {
|
||||
public:
|
||||
common_chat_peg_gemma4_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
|
||||
virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
|
||||
private:
|
||||
void visit(const common_peg_ast_arena & arena, common_peg_ast_id id);
|
||||
};
|
||||
|
||||
struct content_structure;
|
||||
struct tool_call_structure;
|
||||
|
||||
class common_chat_peg_builder : public common_peg_parser_builder {
|
||||
public:
|
||||
// Tag constants (from former common_chat_peg_base_builder)
|
||||
static constexpr const char * REASONING_BLOCK = "reasoning-block";
|
||||
static constexpr const char * REASONING = "reasoning";
|
||||
static constexpr const char * CONTENT = "content";
|
||||
|
||||
// Tag constants
|
||||
static constexpr const char * TOOL = "tool";
|
||||
static constexpr const char * TOOL_OPEN = "tool-open";
|
||||
static constexpr const char * TOOL_CLOSE = "tool-close";
|
||||
static constexpr const char * TOOL_ID = "tool-id";
|
||||
static constexpr const char * TOOL_NAME = "tool-name";
|
||||
static constexpr const char * TOOL_ARGS = "tool-args";
|
||||
static constexpr const char * TOOL_ARG = "tool-arg";
|
||||
static constexpr const char * TOOL_ARG_OPEN = "tool-arg-open";
|
||||
static constexpr const char * TOOL_ARG_CLOSE = "tool-arg-close";
|
||||
static constexpr const char * TOOL_ARG_NAME = "tool-arg-name";
|
||||
static constexpr const char * TOOL_ARG_VALUE = "tool-arg-value";
|
||||
static constexpr const char * TOOL_ARG_STRING_VALUE = "tool-arg-string-value"; // For schema-declared string types
|
||||
|
||||
// Low-level tag methods (from former common_chat_peg_base_builder)
|
||||
common_peg_parser reasoning_block(const common_peg_parser & p) { return tag(REASONING_BLOCK, p); }
|
||||
|
||||
common_peg_parser reasoning(const common_peg_parser & p) { return tag(REASONING, p); }
|
||||
|
||||
common_peg_parser content(const common_peg_parser & p) { return tag(CONTENT, p); }
|
||||
|
||||
common_peg_parser tag_with_safe_content(const std::string & tag_name,
|
||||
const std::string & marker,
|
||||
const common_peg_parser & p);
|
||||
|
||||
// Low-level tag methods
|
||||
common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
|
||||
common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
|
||||
common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
|
||||
common_peg_parser tool_id(const common_peg_parser & p) { return atomic(tag(TOOL_ID, p)); }
|
||||
common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
|
||||
common_peg_parser tool_args(const common_peg_parser & p) { return tag(TOOL_ARGS, p); }
|
||||
common_peg_parser tool_arg(const common_peg_parser & p) { return tag(TOOL_ARG, p); }
|
||||
common_peg_parser tool_arg_open(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_OPEN, p)); }
|
||||
common_peg_parser tool_arg_close(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_CLOSE, p)); }
|
||||
common_peg_parser tool_arg_name(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_NAME, p)); }
|
||||
common_peg_parser tool_arg_value(const common_peg_parser & p) { return tag(TOOL_ARG_VALUE, p); }
|
||||
|
||||
// Use for schema-declared string types - won't be treated as potential JSON container
|
||||
common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
|
||||
common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_VALUE, p)); }
|
||||
|
||||
|
||||
// Return a parser that parses the prefix of a string, up to a given delimiter.
|
||||
common_peg_parser prefix(const std::string & s, const std::string & delimiter = {});
|
||||
|
||||
// Legacy-compatible helper for building standard JSON tool calls
|
||||
// Used by tests and manual parsers
|
||||
// name_key/args_key: JSON key names for function name and arguments
|
||||
// Empty or "name"/"arguments" will accept both common variations
|
||||
// Supports dot notation for nested objects (e.g., "function.name")
|
||||
// array_wrapped: if true, tool calls are wrapped in JSON array [...]
|
||||
// function_is_key: if true, function name is the JSON key (e.g., {"func_name": {...}})
|
||||
// call_id_key: JSON key for string call ID (e.g., "id")
|
||||
// gen_call_id_key: JSON key for generated integer call ID (e.g., "tool_call_id")
|
||||
// parameters_order: order in which JSON fields should be parsed
|
||||
common_peg_parser standard_json_tools(const std::string & section_start,
|
||||
const std::string & section_end,
|
||||
const nlohmann::ordered_json & tools,
|
||||
bool parallel_tool_calls,
|
||||
bool force_tool_calls,
|
||||
const std::string & name_key = "",
|
||||
const std::string & args_key = "",
|
||||
bool array_wrapped = false,
|
||||
bool function_is_key = false,
|
||||
const std::string & call_id_key = "",
|
||||
const std::string & gen_call_id_key = "",
|
||||
const std::vector<std::string> & parameters_order = {});
|
||||
|
||||
// Legacy-compatible helper for building XML/tagged style tool calls
|
||||
// Used by tests and manual parsers
|
||||
common_peg_parser standard_constructed_tools(const std::map<std::string, std::string> & markers,
|
||||
const nlohmann::ordered_json & tools,
|
||||
bool parallel_tool_calls,
|
||||
bool force_tool_calls);
|
||||
|
||||
// Helper for Python-style function call format: name(arg1="value1", arg2=123)
|
||||
// Used by LFM2 and similar templates
|
||||
common_peg_parser python_style_tool_calls(const nlohmann::ordered_json & tools,
|
||||
bool parallel_tool_calls);
|
||||
|
||||
private:
|
||||
// Implementation helpers for standard_json_tools — one per JSON tool call layout mode
|
||||
common_peg_parser build_json_tools_function_is_key(const nlohmann::ordered_json & tools,
|
||||
const std::string & args_key,
|
||||
const std::string & effective_args_key,
|
||||
const std::string & call_id_key,
|
||||
const std::string & gen_call_id_key);
|
||||
|
||||
common_peg_parser build_json_tools_nested_keys(const nlohmann::ordered_json & tools,
|
||||
const std::string & effective_name_key,
|
||||
const std::string & effective_args_key,
|
||||
const std::string & call_id_key,
|
||||
const std::string & gen_call_id_key);
|
||||
|
||||
common_peg_parser build_json_tools_flat_keys(const nlohmann::ordered_json & tools,
|
||||
const std::string & effective_name_key,
|
||||
const std::string & effective_args_key,
|
||||
const std::string & call_id_key,
|
||||
const std::string & gen_call_id_key,
|
||||
const std::vector<std::string> & parameters_order);
|
||||
};
|
||||
|
||||
inline common_peg_arena build_chat_peg_parser(
|
||||
const std::function<common_peg_parser(common_chat_peg_builder & builder)> & fn) {
|
||||
common_chat_peg_builder builder;
|
||||
builder.set_root(fn(builder));
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
class tag_based_peg_mapper {
|
||||
public:
|
||||
std::map<std::string, std::string> tags;
|
||||
|
||||
void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
|
||||
};
|
||||
|
||||
struct tagged_parse_result {
|
||||
common_peg_parse_result result;
|
||||
std::map<std::string, std::string> tags;
|
||||
};
|
||||
|
||||
struct tagged_peg_parser {
|
||||
common_peg_arena arena;
|
||||
common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_NONE;
|
||||
|
||||
tagged_peg_parser & withDebug() {
|
||||
flags |= COMMON_PEG_PARSE_FLAG_DEBUG;
|
||||
return *this;
|
||||
}
|
||||
|
||||
tagged_peg_parser & withoutDebug() {
|
||||
flags = flags & ~COMMON_PEG_PARSE_FLAG_DEBUG;
|
||||
return *this;
|
||||
}
|
||||
|
||||
tagged_parse_result parse_and_extract(const std::string & input, common_peg_parse_flags extra_flags = COMMON_PEG_PARSE_FLAG_NONE) const;
|
||||
tagged_parse_result parse_anywhere_and_extract(const std::string & input) const;
|
||||
};
|
||||
|
||||
tagged_peg_parser build_tagged_peg_parser(
|
||||
const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user