Merge branch 'master' into compilade/convert-prequant

2026-04-09 16:17:31 +03:00 · 2025-10-23 14:23:12 -04:00 · 2025-09-09 14:23:06 -04:00 · 2025-09-01 10:13:29 -04:00 · 2025-08-19 17:27:59 -04:00 · 2025-08-14 17:05:21 -04:00
1777 changed files with 147536 additions and 511170 deletions
--- a/.devops/cann.Dockerfile
+++ b/.devops/cann.Dockerfile
@@ -3,8 +3,7 @@
 # ==============================================================================

 # Define the CANN base image for easier version updates later
-ARG CHIP_TYPE=910b
-ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11
+ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10

 # ==============================================================================
 # BUILD STAGE
@@ -12,8 +11,11 @@ ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11
 # ==============================================================================
 FROM ${CANN_BASE_IMAGE} AS build

+# Define the Ascend chip model for compilation. Default is Ascend910B3
+ARG ASCEND_SOC_TYPE=Ascend910B3
+
 # -- Install build dependencies --
-RUN yum install -y gcc g++ cmake make git openssl-devel python3 python3-pip && \
+RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
    yum clean all && \
    rm -rf /var/cache/yum

@@ -34,22 +36,20 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 # For brevity, only core variables are listed here. You can paste the original ENV list here.

 # -- Build llama.cpp --
-# Use the passed CHIP_TYPE argument and add general build options
-ARG CHIP_TYPE
+# Use the passed ASCEND_SOC_TYPE argument and add general build options
 RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
    && \
    cmake -B build \
        -DGGML_CANN=ON \
        -DCMAKE_BUILD_TYPE=Release \
-        -DSOC_TYPE=ascend${CHIP_TYPE} \
-        -DUSE_ACL_GRAPH=ON \
+        -DSOC_TYPE=${ASCEND_SOC_TYPE} \
        . && \
    cmake --build build --config Release -j$(nproc)

 # -- Organize build artifacts for copying in later stages --
 # Create a lib directory to store all .so files
 RUN mkdir -p /app/lib && \
-    find build -name "*.so*" -exec cp -P {} /app/lib \;
+    find build -name "*.so" -exec cp {} /app/lib \;

 # Create a full directory to store all executables and Python scripts
 RUN mkdir -p /app/full && \
@@ -108,11 +108,11 @@ ENTRYPOINT ["/app/tools.sh"]
 # ENTRYPOINT ["/app/llama-server"]

 ### Target: light
-# Lightweight image containing only llama-cli and llama-completion
+# Lightweight image containing only llama-cli
 # ==============================================================================
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 ENTRYPOINT [ "/app/llama-cli" ]

--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -1,13 +1,11 @@
-ARG UBUNTU_VERSION=24.04
+ARG UBUNTU_VERSION=22.04

 FROM ubuntu:$UBUNTU_VERSION AS build

 ARG TARGETARCH

 RUN apt-get update && \
-    apt-get install -y gcc-14 g++-14 build-essential git cmake libssl-dev
-
-ENV CC=gcc-14 CXX=g++-14
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev

 WORKDIR /app

@@ -22,7 +20,7 @@ RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
    cmake --build build -j $(nproc)

 RUN mkdir -p /app/lib && \
-    find build -name "*.so*" -exec cp -P {} /app/lib \;
+    find build -name "*.so" -exec cp {} /app/lib \;

 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
@@ -36,7 +34,7 @@ RUN mkdir -p /app/full \
 FROM ubuntu:$UBUNTU_VERSION AS base

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl \
+    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -57,9 +55,8 @@ RUN apt-get update \
    git \
    python3 \
    python3-pip \
-    python3-wheel \
-    && pip install --break-system-packages --upgrade setuptools \
-    && pip install --break-system-packages -r requirements.txt \
+    && pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -71,7 +68,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 WORKDIR /app

--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -1,6 +1,6 @@
-ARG UBUNTU_VERSION=24.04
+ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.4.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

@@ -12,9 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1
-
-ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1

 WORKDIR /app

@@ -27,7 +25,7 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
-    find build -name "*.so*" -exec cp -P {} /app/lib \;
+    find build -name "*.so" -exec cp {} /app/lib \;

 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
@@ -41,7 +39,7 @@ RUN mkdir -p /app/full \
 FROM ${BASE_CUDA_RUN_CONTAINER} AS base

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl \
+    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -62,8 +60,7 @@ RUN apt-get update \
    git \
    python3 \
    python3-pip \
-    python3-wheel \
-    && pip install --break-system-packages --upgrade setuptools \
+    && pip install --upgrade pip setuptools wheel \
    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
@@ -77,7 +74,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 WORKDIR /app

--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2025.3.2-0-devel-ubuntu24.04
+ARG ONEAPI_VERSION=2025.2.2-0-devel-ubuntu24.04

 ## Build Image

@@ -6,7 +6,7 @@ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=OFF
 RUN apt-get update && \
-    apt-get install -y git libssl-dev
+    apt-get install -y git libcurl4-openssl-dev

 WORKDIR /app

@@ -21,7 +21,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
-    find build -name "*.so*" -exec cp -P {} /app/lib \;
+    find build -name "*.so" -exec cp {} /app/lib \;

 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
@@ -33,25 +33,8 @@ RUN mkdir -p /app/full \

 FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base

-ARG IGC_VERSION=v2.30.1
-ARG IGC_VERSION_FULL=2_2.30.1+20950
-ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
-ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
-ARG IGDGMM_VERSION=22.9.0
-RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
-  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
-  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-  && dpkg --install *.deb
-
 RUN apt-get update \
-    && apt-get install -y libgomp1 curl \
+    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -90,7 +73,7 @@ ENTRYPOINT ["/app/tools.sh"]
 FROM base AS light

 COPY --from=build /app/lib/ /app
-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 WORKDIR /app

--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -1,4 +1,4 @@
-ARG ASCEND_VERSION=8.5.0-910b-openeuler22.03-py3.10
+ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10

 FROM ascendai/cann:$ASCEND_VERSION AS build

@@ -6,7 +6,7 @@ WORKDIR /app

 COPY . .

-RUN yum install -y gcc g++ cmake make openssl-devel
+RUN yum install -y gcc g++ cmake make libcurl-devel
 ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
 ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
 ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
@@ -23,12 +23,11 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 RUN echo "Building with static libs" && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF  && \
-    cmake --build build --config Release --target llama-cli && \
-    cmake --build build --config Release --target llama-completion
+    cmake --build build --config Release --target llama-cli

 # TODO: use image with NNRT
 FROM ascendai/cann:$ASCEND_VERSION AS runtime
-COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /
+COPY --from=build /app/build/bin/llama-cli /llama-cli

 ENV LC_ALL=C.utf8

--- a/.devops/llama-cpp-cuda.srpm.spec
+++ b/.devops/llama-cpp-cuda.srpm.spec
@@ -37,7 +37,6 @@ make -j GGML_CUDA=1
 %install
 mkdir -p %{buildroot}%{_bindir}/
 cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
-cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion
 cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
 cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple

@@ -69,7 +68,6 @@ rm -rf %{_builddir}/*

 %files
 %{_bindir}/llama-cuda-cli
-%{_bindir}/llama-cuda-completion
 %{_bindir}/llama-cuda-server
 %{_bindir}/llama-cuda-simple
 /usr/lib/systemd/system/llamacuda.service
--- a/.devops/llama-cpp.srpm.spec
+++ b/.devops/llama-cpp.srpm.spec
@@ -39,7 +39,6 @@ make -j
 %install
 mkdir -p %{buildroot}%{_bindir}/
 cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
-cp -p llama-completion %{buildroot}%{_bindir}/llama-completion
 cp -p llama-server %{buildroot}%{_bindir}/llama-server
 cp -p llama-simple %{buildroot}%{_bindir}/llama-simple

@@ -71,7 +70,6 @@ rm -rf %{_builddir}/*

 %files
 %{_bindir}/llama-cli
-%{_bindir}/llama-completion
 %{_bindir}/llama-server
 %{_bindir}/llama-simple
 /usr/lib/systemd/system/llama.service
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -18,7 +18,7 @@ RUN apt-get update && \
    python3 \
    python3-pip \
    git \
-    libssl-dev \
+    libcurl4-openssl-dev \
    libgomp1

 WORKDIR /app
@@ -32,7 +32,7 @@ RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
-    find build -name "*.so*" -exec cp -P {} /app/lib \;
+    find build -name "*.so" -exec cp {} /app/lib \;

 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
@@ -46,7 +46,7 @@ RUN mkdir -p /app/full \
 FROM ${BASE_MUSA_RUN_CONTAINER} AS base

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl \
+    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -81,7 +81,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 WORKDIR /app

--- a/.devops/nix/nixpkgs-instances.nix
+++ b/.devops/nix/nixpkgs-instances.nix
@@ -4,7 +4,7 @@
  # the module `{ pkgs ... }: { /* config */ }` implicitly uses
  # `_module.args.pkgs` (defined in this case by flake-parts).
  perSystem =
-    { lib, system, ... }:
+    { system, ... }:
    {
      _module.args = {
        # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
@@ -33,7 +33,7 @@
                "CUDA EULA"
                "cuDNN EULA"
              ]
-            ) (p.meta.licenses or (lib.toList p.meta.license));
+            ) (p.meta.licenses or [ p.meta.license ]);
        };
        # Ensure dependencies use ROCm consistently
        pkgsRocm = import inputs.nixpkgs {
--- a/.devops/nix/package-gguf-py.nix
+++ b/.devops/nix/package-gguf-py.nix
@@ -3,7 +3,6 @@
  llamaVersion,
  numpy,
  tqdm,
-  requests,
  sentencepiece,
  pyyaml,
  poetry-core,
@@ -21,7 +20,6 @@ buildPythonPackage {
    tqdm
    sentencepiece
    pyyaml
-    requests
  ];
  src = lib.cleanSource ../../gguf-py;
  pythonImportsCheck = [
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -16,7 +16,7 @@
  rocmPackages,
  vulkan-headers,
  vulkan-loader,
-  openssl,
+  curl,
  shaderc,
  useBlas ?
    builtins.all (x: !x) [
@@ -32,8 +32,8 @@
  useMpi ? false,
  useRocm ? config.rocmSupport,
  rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
+  enableCurl ? true,
  useVulkan ? false,
-  useRpc ? false,
  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

  # It's necessary to consistently use backendStdenv when building with CUDA support,
@@ -41,7 +41,6 @@
  effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
  enableStatic ? effectiveStdenv.hostPlatform.isStatic,
  precompileMetalShaders ? false,
-  useWebUi ? true,
 }:

 let
@@ -161,14 +160,14 @@ effectiveStdenv.mkDerivation (finalAttrs: {
    ++ optionals useRocm rocmBuildInputs
    ++ optionals useBlas [ blas ]
    ++ optionals useVulkan vulkanBuildInputs
-    ++ [ openssl ];
+    ++ optionals enableCurl [ curl ];

  cmakeFlags =
    [
      (cmakeBool "LLAMA_BUILD_SERVER" true)
-      (cmakeBool "LLAMA_BUILD_WEBUI" useWebUi)
      (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
      (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
+      (cmakeBool "LLAMA_CURL" enableCurl)
      (cmakeBool "GGML_NATIVE" false)
      (cmakeBool "GGML_BLAS" useBlas)
      (cmakeBool "GGML_CUDA" useCuda)
@@ -176,7 +175,6 @@ effectiveStdenv.mkDerivation (finalAttrs: {
      (cmakeBool "GGML_METAL" useMetalKit)
      (cmakeBool "GGML_VULKAN" useVulkan)
      (cmakeBool "GGML_STATIC" enableStatic)
-      (cmakeBool "GGML_RPC" useRpc)
    ]
    ++ optionals useCuda [
      (
--- a/.devops/nix/scope.nix
+++ b/.devops/nix/scope.nix
@@ -7,6 +7,13 @@

 let
  pythonPackages = python3.pkgs;
+  buildPythonPackage = pythonPackages.buildPythonPackage;
+  numpy = pythonPackages.numpy;
+  tqdm = pythonPackages.tqdm;
+  sentencepiece = pythonPackages.sentencepiece;
+  pyyaml = pythonPackages.pyyaml;
+  poetry-core = pythonPackages.poetry-core;
+  pytestCheckHook = pythonPackages.pytestCheckHook;
 in

 # We're using `makeScope` instead of just writing out an attrset
@@ -16,18 +23,17 @@ in
 lib.makeScope newScope (self: {
  inherit llamaVersion;
  gguf-py = self.callPackage ./package-gguf-py.nix {
-    inherit (pythonPackages)
+    inherit
+      buildPythonPackage
      numpy
      tqdm
      sentencepiece
+      poetry-core
      pyyaml
      pytestCheckHook
-      requests
-      buildPythonPackage
-      poetry-core
      ;
  };
-  python-scripts = self.callPackage ./python-scripts.nix { inherit (pythonPackages) buildPythonPackage poetry-core; };
+  python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
  llama-cpp = self.callPackage ./package.nix { };
  docker = self.callPackage ./docker.nix { };
  docker-min = self.callPackage ./docker.nix { interactive = false; };
--- a/.devops/openvino.Dockerfile
+++ b/.devops/openvino.Dockerfile
@@ -1,138 +0,0 @@
-ARG OPENVINO_VERSION_MAJOR=2026.0
-ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
-ARG UBUNTU_VERSION=24.04
-
-# Optional proxy build arguments - empty by default
-ARG http_proxy=
-ARG https_proxy=
-
-## Build Image
-FROM ubuntu:${UBUNTU_VERSION} AS build
-
-# Pass proxy args to build stage
-ARG http_proxy
-ARG https_proxy
-
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-        ca-certificates \
-        gnupg \
-        wget \
-        git \
-        cmake \
-        ninja-build \
-        build-essential \
-        libtbb12 \
-        libssl-dev \
-        ocl-icd-opencl-dev \
-        opencl-headers \
-        opencl-clhpp-headers \
-        intel-opencl-icd && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install OpenVINO for Ubuntu 24.04
-ARG OPENVINO_VERSION_MAJOR
-ARG OPENVINO_VERSION_FULL
-RUN mkdir -p /opt/intel && \
-    wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
-    tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
-    mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
-    cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
-    echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \
-    cd - && \
-    ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
-
-ENV OpenVINO_DIR=/opt/intel/openvino
-
-WORKDIR /app
-
-COPY . .
-
-# Build Stage
-RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
-    cmake -B build/ReleaseOV -G Ninja \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DGGML_OPENVINO=ON && \
-    cmake --build build/ReleaseOV -j$(nproc)"
-
-# Copy all necessary libraries
-RUN mkdir -p /app/lib && \
-    find build/ReleaseOV -name '*.so*' -exec cp {} /app/lib \; && \
-    find ${OpenVINO_DIR}/runtime/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; 2>/dev/null || \
-    find ${OpenVINO_DIR}/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \;
-
-# Create runtime directories and copy binaries
-RUN mkdir -p /app/full \
-    && cp build/ReleaseOV/bin/* /app/full/ \
-    && cp *.py /app/full \
-    && cp -r gguf-py /app/full \
-    && cp -r requirements /app/full \
-    && cp requirements.txt /app/full \
-    && cp .devops/tools.sh /app/full/tools.sh
-
-## Base Runtime Image
-FROM ubuntu:${UBUNTU_VERSION} AS base
-
-# Pass proxy args to runtime stage
-ARG http_proxy
-ARG https_proxy
-
-RUN apt-get update \
-    && apt-get install -y libgomp1 libtbb12 curl \
-    && apt autoremove -y \
-    && apt clean -y \
-    && rm -rf /tmp/* /var/tmp/* \
-    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
-    && find /var/cache -type f -delete
-
-COPY --from=build /app/lib/ /app/
-
-### Full (all binaries)
-FROM base AS full
-
-ARG http_proxy
-ARG https_proxy
-
-COPY --from=build /app/full /app/
-
-WORKDIR /app
-
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    git \
-    python3 \
-    python3-venv \
-    python3-pip && \
-    python3 -m venv /ov-venv && \
-    /ov-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
-    /ov-venv/bin/pip install --no-cache-dir -r requirements.txt && \
-    apt-get autoremove -y && \
-    apt-get clean && \
-    rm -rf /tmp/* /var/tmp/* && \
-    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
-    find /var/cache -type f -delete
-
-ENTRYPOINT ["/bin/bash", "-c", "source /ov-venv/bin/activate && exec /app/tools.sh \"$@\"", "--"]
-
-
-### Light, CLI only
-FROM base AS light
-
-COPY --from=build /app/full/llama-cli /app/
-
-WORKDIR /app
-
-ENTRYPOINT [ "/app/llama-cli" ]
-
-### Server, Server only
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-COPY --from=build /app/full/llama-server /app/
-
-WORKDIR /app
-
-HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
-
-ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -1,8 +1,8 @@
 ARG UBUNTU_VERSION=24.04

 # This needs to generally match the container host's environment.
-ARG ROCM_VERSION=7.2.1
-ARG AMDGPU_VERSION=7.2.1
+ARG ROCM_VERSION=7.0
+ARG AMDGPU_VERSION=7.0

 # Target the ROCm build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
@@ -11,12 +11,13 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
-# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.2.1/reference/system-requirements.html
-# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityrad/native_linux/native_linux_compatibility.html
-# check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityryz/native_linux/native_linux_compatibility.html
+# gfx803, gfx900, gfx906, gfx1032, gfx1101, gfx1102 are not officially supported
+# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html

-ARG ROCM_DOCKER_ARCH='gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201'
+ARG ROCM_DOCKER_ARCH='gfx803;gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1010;gfx1030;gfx1032;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151'
+#ARG ROCM_DOCKER_ARCH='gfx1151'

 # Set ROCm architectures
 ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
@@ -26,7 +27,7 @@ RUN apt-get update \
    build-essential \
    cmake \
    git \
-    libssl-dev \
+    libcurl4-openssl-dev \
    curl \
    libgomp1

@@ -44,7 +45,7 @@ RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
    && cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib \
-    && find build -name "*.so*" -exec cp -P {} /app/lib \;
+    && find build -name "*.so" -exec cp {} /app/lib \;

 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
@@ -58,7 +59,7 @@ RUN mkdir -p /app/full \
 FROM ${BASE_ROCM_DEV_CONTAINER} AS base

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl \
+    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -79,7 +80,7 @@ RUN apt-get update \
    git \
    python3-pip \
    python3 \
-    python3-wheel \
+    python3-wheel\
    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
@@ -93,7 +94,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 WORKDIR /app

--- a/.devops/s390x.Dockerfile
+++ b/.devops/s390x.Dockerfile
@@ -11,7 +11,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    apt install -y --no-install-recommends \
        git cmake ccache ninja-build \
        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
-        libopenblas-dev libssl-dev && \
+        libopenblas-dev libcurl4-openssl-dev && \
    rm -rf /var/lib/apt/lists/*

 WORKDIR /app
@@ -24,9 +24,8 @@ RUN --mount=type=cache,target=/root/.ccache \
        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
        -DLLAMA_BUILD_TESTS=OFF \
+        -DGGML_BACKEND_DL=OFF \
        -DGGML_NATIVE=OFF \
-        -DGGML_BACKEND_DL=ON \
-        -DGGML_CPU_ALL_VARIANTS=ON \
        -DGGML_BLAS=ON \
        -DGGML_BLAS_VENDOR=OpenBLAS && \
    cmake --build build --config Release -j $(nproc) && \
@@ -104,8 +103,7 @@ FROM base AS light
 WORKDIR /llama.cpp/bin

 # Copy llama.cpp binaries and libraries
-COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
-COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
+COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin

 ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]

@@ -118,7 +116,6 @@ ENV LLAMA_ARG_HOST=0.0.0.0
 WORKDIR /llama.cpp/bin

 # Copy llama.cpp binaries and libraries
-COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
 COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin

 EXPOSE 8080
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -13,8 +13,6 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
    exec ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
    exec ./llama-cli "$@"
-elif [[ "$arg1" == '--run-legacy' || "$arg1" == '-l' ]]; then
-    exec ./llama-completion "$@"
 elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
    exec ./llama-bench "$@"
 elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
@@ -34,10 +32,8 @@ elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
 else
    echo "Unknown command: $arg1"
    echo "Available commands: "
-    echo "  --run (-r): Run a model (chat) previously converted into ggml"
-    echo "              ex: -m /models/7B/ggml-model-q4_0.bin"
-    echo "  --run-legacy (-l): Run a model (legacy completion) previously converted into ggml"
-    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -no-cnv -p \"Building a website can be done in 10 simple steps:\" -n 512"
+    echo "  --run (-r): Run a model previously converted into ggml"
+    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
    echo "  --bench (-b): Benchmark the performance of the inference for various parameters."
    echo "              ex: -m model.gguf"
    echo "  --perplexity (-p): Measure the perplexity of a model over a given text."
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -1,24 +1,42 @@
-ARG UBUNTU_VERSION=26.04
+ARG UBUNTU_VERSION=24.04

 FROM ubuntu:$UBUNTU_VERSION AS build

+# Ref: https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html
+
 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget xz-utils

-# Install SSL and Vulkan SDK dependencies
-RUN apt install -y libssl-dev curl \
-    libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc
+# Install Vulkan SDK
+ARG VULKAN_VERSION=1.4.321.1
+RUN ARCH=$(uname -m) && \
+    wget -qO /tmp/vulkan-sdk.tar.xz https://sdk.lunarg.com/sdk/download/${VULKAN_VERSION}/linux/vulkan-sdk-linux-${ARCH}-${VULKAN_VERSION}.tar.xz && \
+    mkdir -p /opt/vulkan && \
+    tar -xf /tmp/vulkan-sdk.tar.xz -C /tmp --strip-components=1 && \
+    mv /tmp/${ARCH}/* /opt/vulkan/ && \
+    rm -rf /tmp/*
+
+# Install cURL and Vulkan SDK dependencies
+RUN apt install -y libcurl4-openssl-dev curl \
+    libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev
+
+# Set environment variables
+ENV VULKAN_SDK=/opt/vulkan
+ENV PATH=$VULKAN_SDK/bin:$PATH
+ENV LD_LIBRARY_PATH=$VULKAN_SDK/lib:$LD_LIBRARY_PATH
+ENV CMAKE_PREFIX_PATH=$VULKAN_SDK:$CMAKE_PREFIX_PATH
+ENV PKG_CONFIG_PATH=$VULKAN_SDK/lib/pkgconfig:$PKG_CONFIG_PATH

 # Build it
 WORKDIR /app

 COPY . .

-RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1  -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
-    find build -name "*.so*" -exec cp -P {} /app/lib \;
+    find build -name "*.so" -exec cp {} /app/lib \;

 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
@@ -32,8 +50,7 @@ RUN mkdir -p /app/full \
 FROM ubuntu:$UBUNTU_VERSION AS base

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
-    libglvnd0 libgl1 libglx0 libegl1 libgles2 \
+    && apt-get install -y libgomp1 curl libvulkan-dev \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -49,20 +66,14 @@ COPY --from=build /app/full /app

 WORKDIR /app

-ENV PATH="/root/.venv/bin:/root/.local/bin:${PATH}"
-
-# Flag for compatibility with pip
-ARG UV_INDEX_STRATEGY="unsafe-best-match"
 RUN apt-get update \
    && apt-get install -y \
-    build-essential \
-    curl \
    git \
-    ca-certificates \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh \
-    && uv python install 3.13 \
-    && uv venv --python 3.13 /root/.venv \
-    && uv pip install --python /root/.venv/bin/python -r requirements.txt \
+    python3 \
+    python3-pip \
+    python3-wheel \
+    && pip install --break-system-packages --upgrade setuptools \
+    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -74,7 +85,7 @@ ENTRYPOINT ["/app/tools.sh"]
 ### Light, CLI only
 FROM base AS light

-COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+COPY --from=build /app/full/llama-cli /app

 WORKDIR /app

--- a/.editorconfig
+++ b/.editorconfig
@@ -21,6 +21,14 @@ indent_style = tab
 [prompts/*.txt]
 insert_final_newline = unset

+[tools/server/public/*]
+indent_size = 2
+
+[tools/server/public/deps_*]
+trim_trailing_whitespace = unset
+indent_style = unset
+indent_size = unset
+
 [tools/server/deps_*]
 trim_trailing_whitespace = unset
 indent_style = unset
@@ -52,19 +60,3 @@ end_of_line = unset
 charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset
-
-[tools/server/public/**]
-indent_style = unset
-indent_size = unset
-end_of_line = unset
-charset = unset
-trim_trailing_whitespace = unset
-insert_final_newline = unset
-
-[benches/**]
-indent_style = unset
-indent_size = unset
-end_of_line = unset
-charset = unset
-trim_trailing_whitespace = unset
-insert_final_newline = unset
--- a/.gemini/settings.json
+++ b/.gemini/settings.json
@@ -1 +0,0 @@
-{ "contextFileName": "AGENTS.md" }
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,4 +0,0 @@
-# Treat the generated single-file WebUI build as binary for diff purposes.
-# Git's pack-file delta compression still works (byte-level), but this prevents
-# git diff from printing the entire minified file on every change.
-tools/server/public/index.html -diff
--- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
+++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
@@ -8,8 +8,7 @@ body:
      value: >
        Thanks for taking the time to fill out this bug report!
        This issue template is intended for bug reports where the compilation of llama.cpp fails.
-        Before opening an issue, please confirm that the compilation still fails
-        after recreating the CMake build directory and with `-DGGML_CCACHE=OFF`.
+        Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
        If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
        by clearing `~/.cache/ccache` (on Linux).
  - type: textarea
@@ -41,7 +40,7 @@ body:
    attributes:
        label: GGML backends
        description: Which GGML backends do you know to be affected?
-        options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, OpenVINO, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
+        options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
        multiple: true
    validations:
      required: true
--- a/.github/ISSUE_TEMPLATE/011-bug-results.yml
+++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml
@@ -11,7 +11,7 @@ body:
        (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
-        The `llama-completion` binary can be used for simple and reproducible model inference.
+        The `llama-cli` binary can be used for simple and reproducible model inference.
  - type: textarea
    id: version
    attributes:
@@ -42,7 +42,7 @@ body:
    attributes:
        label: GGML backends
        description: Which GGML backends do you know to be affected?
-        options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, OpenVINO, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
+        options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
        multiple: true
    validations:
      required: true
@@ -74,12 +74,9 @@ body:
        Please give us a summary of the problem and tell us how to reproduce it.
        If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
        that information would be very much appreciated by us.
-
-        If possible, please try to reproduce the issue using `llama-completion` with `-fit off`.
-        If you can only reproduce the issue with `-fit on`, please provide logs both with and without `--verbose`.
      placeholder: >
-        e.g. when I run llama-completion with `-fa on` I get garbled outputs for very long prompts.
-        With short prompts or `-fa off` it works correctly.
+        e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
+        When I use -ngl 0 it works correctly.
        Here are the exact commands that I used: ...
    validations:
      required: true
@@ -98,18 +95,7 @@ body:
      label: Relevant log output
      description: >
          Please copy and paste any relevant log output, including the command that you entered and any generated text.
-          For very long logs (thousands of lines), preferably upload them as files instead.
-          On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
-      value: |
-        <details>
-        <summary>Logs</summary>
-        <!-- Copy-pasted short logs go into the "console" area here -->
-
-        ```console
-
-        ```
-        </details>
-
-        <!-- Long logs that you upload as files go here, outside the "console" area -->
+          This will be automatically formatted into code, so no need for backticks.
+      render: shell
    validations:
      required: true
--- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml
+++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
@@ -85,19 +85,7 @@ body:
      label: Relevant log output
      description: >
          If applicable, please copy and paste any relevant log output, including any generated text.
-          If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well.
-          For very long logs (thousands of lines), please upload them as files instead.
-          On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
-      value: |
-        <details>
-        <summary>Logs</summary>
-        <!-- Copy-pasted short logs go into the "console" area here -->
-
-        ```console
-
-        ```
-        </details>
-
-        <!-- Long logs that you upload as files go here, outside the "console" area -->
+          This will be automatically formatted into code, so no need for backticks.
+      render: shell
    validations:
      required: false
--- a/.github/actions/linux-setup-openvino/action.yml
+++ b/.github/actions/linux-setup-openvino/action.yml
@@ -1,25 +0,0 @@
-name: "Linux - Setup OpenVINO Toolkit"
-description: "Setup OpenVINO Toolkit for Linux"
-inputs:
-  path:
-    description: "Installation path"
-    required: true
-  version_major:
-    description: "OpenVINO major version (e.g., 2025.3)"
-    required: true
-  version_full:
-    description: "OpenVINO full version (e.g., 2025.3.0.19807.44526285f24)"
-    required: true
-
-runs:
-  using: "composite"
-  steps:
-    - name: Setup OpenVINO Toolkit
-      id: setup
-      uses: ./.github/actions/unarchive-tar
-      with:
-        url: https://storage.openvinotoolkit.org/repositories/openvino/packages/${{ inputs.version_major }}/linux/openvino_toolkit_ubuntu24_${{ inputs.version_full }}_x86_64.tgz
-        path: ${{ inputs.path }}
-        type: z
-        strip: 1
-
--- a/.github/actions/windows-setup-cuda/action.yml
+++ b/.github/actions/windows-setup-cuda/action.yml
@@ -65,34 +65,3 @@ runs:
          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
          echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-
-    - name: Install Cuda Toolkit 13.1
-      if: ${{ inputs.cuda_version == '13.1' }}
-      shell: pwsh
-      run: |
-          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1"
-          choco install unzip -y
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_crt/windows-x86_64/cuda_crt-windows-x86_64-13.1.80-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-13.1.80-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-13.1.80-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-13.1.80-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-13.2.0.9-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libnvvm/windows-x86_64/libnvvm-windows-x86_64-13.1.80-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-13.1.68-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-13.1.80-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-13.1.68-archive.zip"
-          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-13.1.78-archive.zip"
-          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1"
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_crt-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_cudart-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvcc-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvrtc-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\libcublas-windows-x86_64-13.2.0.9-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\libnvvm-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvtx-windows-x86_64-13.1.68-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_profiler_api-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\visual_studio_integration-windows-x86_64-13.1.68-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_cccl-windows-x86_64-13.1.78-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
-          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
-          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
-          echo "CUDA_PATH_V13_1=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
--- a/.github/actions/windows-setup-curl/action.yml
+++ b/.github/actions/windows-setup-curl/action.yml
@@ -0,0 +1,30 @@
+name: 'Windows - Setup CURL'
+description: 'Composite action, to be reused in other workflow'
+inputs:
+  curl_version:
+    description: 'CURL version'
+    required: false
+    default: '8.6.0_6'
+  architecture:
+    description: 'Architecture of the libcurl to download'
+    required: false
+    default: 'win64'
+outputs:
+  curl_path:
+    description: "Path to the downloaded libcurl"
+    value: ${{ steps.get_libcurl.outputs.curl_path }}
+
+runs:
+  using: "composite"
+  steps:
+    - name: libCURL
+      id: get_libcurl
+      shell: powershell
+      env:
+        CURL_VERSION: ${{ inputs.curl_version }}
+        ARCHITECTURE: ${{ inputs.architecture }}
+      run: |
+        curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-${env:ARCHITECTURE}-mingw.zip"
+        mkdir $env:RUNNER_TEMP/libcurl
+        tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
+        echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT
--- a/.github/actions/windows-setup-rocm/action.yml
+++ b/.github/actions/windows-setup-rocm/action.yml
@@ -11,5 +11,5 @@ runs:
    - name: Setup ROCm
      uses: ./.github/actions/install-exe
      with:
-        url: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ inputs.version }}-Win11-For-HIP.exe
+        url: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ inputs.version }}-WinSvr2022-For-HIP.exe
        args: -install
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -0,0 +1,262 @@
+# Copilot Instructions for llama.cpp
+
+## Repository Overview
+
+llama.cpp is a large-scale C/C++ project for efficient LLM (Large Language Model) inference with minimal setup and dependencies. The project enables running language models on diverse hardware with state-of-the-art performance.
+
+**Key Facts:**
+- **Primary language**: C/C++ with Python utility scripts
+- **Size**: ~200k+ lines of code across 1000+ files
+- **Architecture**: Modular design with main library (`libllama`) and 40+ executable tools/examples
+- **Core dependency**: ggml tensor library (vendored in `ggml/` directory)
+- **Backends supported**: CPU (AVX/NEON optimized), CUDA, Metal, Vulkan, SYCL, ROCm, MUSA
+- **License**: MIT
+
+## Build Instructions
+
+### Prerequisites
+- CMake 3.14+ (primary build system)
+- C++17 compatible compiler (GCC 13.3+, Clang, MSVC)
+- Optional: ccache for faster compilation
+
+### Basic Build (CPU-only)
+**ALWAYS run these commands in sequence:**
+```bash
+cmake -B build
+cmake --build build --config Release -j $(nproc)
+```
+
+**Build time**: ~10 minutes on 4-core system with ccache enabled, ~25 minutes without ccache.
+
+**Important Notes:**
+- The Makefile is deprecated - always use CMake
+- ccache is automatically detected and used if available
+- Built binaries are placed in `build/bin/`
+- Parallel builds (`-j`) significantly reduce build time
+
+### Backend-Specific Builds
+For CUDA support:
+```bash
+cmake -B build -DGGML_CUDA=ON
+cmake --build build --config Release -j $(nproc)
+```
+
+For Metal (macOS):
+```bash
+cmake -B build -DGGML_METAL=ON
+cmake --build build --config Release -j $(nproc)
+```
+
+**Important Note**: Any backend can be built as long as its build requirements are installed, but a backend cannot be run without the matching hardware. The only backend that can be run for testing and validation is the CPU backend.
+
+### Debug Builds
+Single-config generators:
+```bash
+cmake -B build -DCMAKE_BUILD_TYPE=Debug
+cmake --build build
+```
+
+Multi-config generators:
+```bash
+cmake -B build -G "Xcode"
+cmake --build build --config Debug
+```
+
+### Common Build Issues
+- **Issue**: Network tests fail in isolated environments
+  **Solution**: Expected behavior - core functionality tests will still pass
+
+## Testing
+
+### Running Tests
+```bash
+ctest --test-dir build --output-on-failure -j $(nproc)
+```
+
+- **Test suite**: 38 tests covering tokenizers, grammar parsing, sampling, backends, and integration
+- **Expected failures**: 2-3 tests may fail if network access is unavailable (they download models)
+- **Test time**: ~30 seconds for passing tests
+
+### Server Unit Tests
+Run server-specific unit tests after building the server:
+```bash
+# Build the server first
+cmake --build build --target llama-server
+
+# Navigate to server tests and run
+cd tools/server/tests
+source ../../../.venv/bin/activate
+./tests.sh
+```
+**Server test dependencies**: The `.venv` environment includes the required dependencies for server unit tests (pytest, aiohttp, etc.). Tests can be run individually or with various options as documented in `tools/server/tests/README.md`.
+
+### Test Categories
+- Tokenizer tests: Various model tokenizers (BERT, GPT-2, LLaMA, etc.)
+- Grammar tests: GBNF parsing and validation
+- Backend tests: Core ggml operations across different backends
+- Integration tests: End-to-end workflows
+
+### Manual Testing Commands
+```bash
+# Test basic inference
+./build/bin/llama-cli --version
+
+# Test model loading (requires model file)
+./build/bin/llama-cli -m path/to/model.gguf -p "Hello" -n 10
+```
+
+## Code Quality and Linting
+
+### C++ Code Formatting
+**ALWAYS format C++ code before committing:**
+```bash
+git clang-format
+```
+
+Configuration is in `.clang-format` with these key rules:
+- 4-space indentation
+- 120 column limit
+- Braces on same line for functions
+- Pointer alignment: `void * ptr` (middle)
+- Reference alignment: `int & ref` (middle)
+
+### Python Code
+**ALWAYS activate the Python environment in `.venv` and use tools from that environment:**
+```bash
+# Activate virtual environment
+source .venv/bin/activate
+```
+
+Configuration files:
+- `.flake8`: flake8 settings (max-line-length=125, excludes examples/tools)
+- `pyrightconfig.json`: pyright type checking configuration
+
+### Pre-commit Hooks
+Run before committing:
+```bash
+pre-commit run --all-files
+```
+
+## Continuous Integration
+
+### GitHub Actions Workflows
+Key workflows that run on every PR:
+- `.github/workflows/build.yml`: Multi-platform builds
+- `.github/workflows/server.yml`: Server functionality tests
+- `.github/workflows/python-lint.yml`: Python code quality
+- `.github/workflows/python-type-check.yml`: Python type checking
+
+### Local CI Validation
+**Run full CI locally before submitting PRs:**
+```bash
+mkdir tmp
+
+# CPU-only build
+bash ./ci/run.sh ./tmp/results ./tmp/mnt
+```
+
+**CI Runtime**: 30-60 minutes depending on backend configuration
+
+### Triggering CI
+Add `ggml-ci` to the commit message to trigger heavy CI workloads on the custom CI infrastructure.
+
+## Project Layout and Architecture
+
+### Core Directories
+- **`src/`**: Main llama library implementation (`llama.cpp`, `llama-*.cpp`)
+- **`include/`**: Public API headers, primarily `include/llama.h`
+- **`ggml/`**: Core tensor library (the GGML framework, vendored in-tree)
+- **`examples/`**: 30+ example applications and tools
+- **`tools/`**: Additional development and utility tools (server benchmarks, tests)
+- **`tests/`**: Comprehensive test suite with CTest integration
+- **`docs/`**: Detailed documentation (build guides, API docs, etc.)
+- **`scripts/`**: Utility scripts for CI, data processing, and automation
+- **`common/`**: Shared utility code used across examples
+
+### Key Files
+- **`CMakeLists.txt`**: Primary build configuration
+- **`include/llama.h`**: Main C API header (~2000 lines)
+- **`src/llama.cpp`**: Core library implementation (~8000 lines)
+- **`CONTRIBUTING.md`**: Coding guidelines and PR requirements
+- **`.clang-format`**: C++ formatting rules
+- **`.pre-commit-config.yaml`**: Git hook configuration
+
+### Built Executables (in `build/bin/`)
+Primary tools:
+- **`llama-cli`**: Main inference tool
+- **`llama-server`**: OpenAI-compatible HTTP server
+- **`llama-quantize`**: Model quantization utility
+- **`llama-perplexity`**: Model evaluation tool
+- **`llama-bench`**: Performance benchmarking
+- **`llama-convert-llama2c-to-ggml`**: Model conversion utility (llama2.c to ggml)
+
+### Configuration Files
+- **CMake**: `CMakeLists.txt`, `cmake/` directory
+- **Linting**: `.clang-format`, `.clang-tidy`, `.flake8`
+- **CI**: `.github/workflows/`, `ci/run.sh`
+- **Git**: `.gitignore` (includes build artifacts, models, cache)
+
+### Dependencies
+- **System**: OpenMP, libcurl (for model downloading)
+- **Optional**: CUDA SDK, Metal framework, Vulkan SDK, Intel oneAPI
+- **Bundled**: httplib, json (header-only libraries in vendored form)
+
+## Common Validation Steps
+
+### After Making Changes
+1. **Format code**: `git clang-format`
+2. **Build**: `cmake --build build --config Release`
+3. **Test**: `ctest --test-dir build --output-on-failure`
+4. **Server tests** (if modifying server): `cd tools/server/tests && source ../../../.venv/bin/activate && ./tests.sh`
+5. **Manual validation**: Test relevant tools in `build/bin/`
+
+### Performance Validation
+```bash
+# Benchmark inference performance
+./build/bin/llama-bench -m model.gguf
+
+# Evaluate model perplexity
+./build/bin/llama-perplexity -m model.gguf -f dataset.txt
+```
+
+### Backend Validation
+```bash
+# Test backend operations
+./build/bin/test-backend-ops
+```
+
+## Environment Setup
+
+### Required Tools
+- CMake 3.14+ (install via system package manager)
+- Modern C++ compiler with C++17 support
+- Git (for submodule management)
+- Python 3.9+ with virtual environment (`.venv` is provided)
+
+### Optional but Recommended
+- ccache: `apt install ccache` or `brew install ccache`
+- clang-format 15+: Usually included with LLVM/Clang installation
+- pre-commit: `pip install pre-commit`
+
+### Backend-Specific Requirements
+- **CUDA**: NVIDIA CUDA Toolkit 11.2+
+- **Metal**: Xcode command line tools (macOS only)
+- **Vulkan**: Vulkan SDK
+- **SYCL**: Intel oneAPI toolkit
+
+## Important Guidelines
+
+### Code Changes
+- **Minimal dependencies**: Avoid adding new external dependencies
+- **Cross-platform compatibility**: Test on Linux, macOS, Windows when possible
+- **Performance focus**: This is a performance-critical inference library
+- **API stability**: Changes to `include/llama.h` require careful consideration
+
+### Git Workflow
+- Always create feature branches from `master`
+- **Never** commit build artifacts (`build/`, `.ccache/`, `*.o`, `*.gguf`)
+- Use descriptive commit messages following project conventions
+
+### Trust These Instructions
+Only search for additional information if these instructions are incomplete or found to be incorrect. This document contains validated build and test procedures that work reliably across different environments.
+
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -27,11 +27,6 @@ IBM zDNN:
        - any-glob-to-any-file:
            - ggml/include/ggml-zdnn.h
            - ggml/src/ggml-zdnn/**
-AMD ZenDNN:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-zendnn.h
-            - ggml/src/ggml-zendnn/**
 documentation:
    - changed-files:
        - any-glob-to-any-file:
@@ -81,10 +76,6 @@ ggml:
    - changed-files:
        - any-glob-to-any-file:
            - ggml/**
-model:
-    - changed-files:
-        - any-glob-to-any-file:
-            - src/models/**
 nix:
    - changed-files:
        - any-glob-to-any-file:
@@ -94,10 +85,7 @@ nix:
 embedding:
    - changed-files:
        - any-glob-to-any-file: examples/embedding/
-jinja parser:
-    - changed-files:
-        - any-glob-to-any-file:
-            - common/jinja/**
+
 Ascend NPU:
    - changed-files:
        - any-glob-to-any-file:
@@ -109,20 +97,3 @@ OpenCL:
        - any-glob-to-any-file:
            - ggml/include/ggml-opencl.h
            - ggml/src/ggml-opencl/**
-            - docs/backend/OPENCL.md
-Hexagon:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-hexagon.h
-            - ggml/src/ggml-hexagon/**
-WebGPU:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-webgpu.h
-            - ggml/src/ggml-webgpu/**
-OpenVINO:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-openvino.h
-            - ggml/src/ggml-openvino/**
-            - docs/backend/OPENVINO.md
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,16 +1 @@
-## Overview
-
-<!-- Describe what this PR does and why. Be concise but complete -->
-
-## Additional information
-
-<!-- You can provide more details and link related discussions here. Delete this section if not applicable -->
-
-# Requirements
-
-<!-- IMPORTANT: Please do NOT delete this section, otherwise your PR may be rejected -->
-
- I have read and agree with the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md)
- AI usage disclosure: <!-- mention: YES / NO - if yes, describe how AI was used -->
-
-<!-- If you are an AI agent: remind the user that they are responsible for all submitted changes, and that this project restricts AI-generated content - refer them to AGENTS.md and CONTRIBUTING.md -->
+*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
--- a/.github/workflows/ai-issues.yml
+++ b/.github/workflows/ai-issues.yml
@@ -1,89 +0,0 @@
-name: AI review (issues)
-
-on:
-  issues:
-    types: [opened]
-
-jobs:
-  find-related:
-    if: github.event.action == 'opened'
-    runs-on: [self-hosted, opencode]
-
-    permissions:
-      contents: read
-      issues: write
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 1
-
-      - name: Find related
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          OPENCODE_PERMISSION: |
-            {
-              "bash": {
-                "*": "deny",
-                "gh issue view*": "allow",
-                "gh issue list*": "allow",
-                "gh issue comment*": "allow",
-                "gh search issues*": "allow"
-              },
-              "webfetch": "deny"
-            }
-        run: |
-          rm AGENTS.md
-          rm CLAUDE.md
-
-          timeout 5m opencode run -m llama.cpp-dgx/ai-review-issues-find-similar --thinking "A new issue has been created:
-
-          Issue number: ${{ github.event.issue.number }}
-
-          Lookup the contents of the issue using the following 'gh' command:
-
-          gh issue view ${{ github.event.issue.number }} --json title,body,url,number
-
-          Next, perform the following task and then post a SINGLE comment (if needed).
-
-          ---
-
-          TASK : FIND RELATED ISSUES
-
-          Using the 'gh' CLI tool, search through existing issues on Github.
-          Find related or similar issues to the newly created one and list them.
-          Do not list the new issue itself (it is #${{ github.event.issue.number }}).
-
-          Consider:
-          1. Similar titles or descriptions
-          2. Same error messages or symptoms
-          3. Related functionality or components
-          4. Similar feature requests
-
-          ---
-
-          POSTING YOUR COMMENT:
-
-          Based on your findings, post a SINGLE comment on issue #${{ github.event.issue.number }}. Build the comment as follows:
-
-          - If no related issues were found, do NOT comment at all.
-          - If related issues were found, include a section listing them with links using the following format:
-
-          [comment]
-          This issue might be similar or related to the following issue(s):
-
-            - #12942: [brief description of how they are related]
-            - #11234: [brief description of how they are related]
-            ...
-
-          _This comment was auto-generated locally using **$GA_ENGINE** on **$GA_MACHINE**_
-          [/comment]
-
-          Remember:
-            - Do not include the comment tags in your actual comment.
-            - Post at most ONE comment combining all findings.
-            - If you didn't find issues that are related enough, post nothing.
-            - You have access only to the 'gh' CLI tool - don't try to use other tools.
-            - If the output from a tool call is too long, try to limit down the search.
-          "
--- a/.github/workflows/build-3rd-party.yml
+++ b/.github/workflows/build-3rd-party.yml
@@ -1,57 +0,0 @@
-name: CI (3rd-party)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-3rd-party.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-  ubuntu-24-llguidance:
-    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libssl-dev
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_LLGUIDANCE=ON
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
--- a/.github/workflows/build-amd.yml
+++ b/.github/workflows/build-amd.yml
@@ -0,0 +1,52 @@
+name: CI (AMD)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-amd.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.cu',
+      '**/*.cuh',
+      '**/*.comp'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  ggml-ci-x64-amd-vulkan:
+    runs-on: [self-hosted, Linux, X64, AMD]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          vulkaninfo --summary
+          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+
+  ggml-ci-x64-amd-rocm:
+    runs-on: [self-hosted, Linux, X64, AMD]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          amd-smi static
+          GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
--- a/.github/workflows/build-android.yml
+++ b/.github/workflows/build-android.yml
@@ -1,101 +0,0 @@
-name: CI (android)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-android.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-android.yml',
-      'examples/llama.android/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-  android:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          lfs: false
-
-      - name: Set up JDK
-        uses: actions/setup-java@v5
-        with:
-          java-version: 17
-          distribution: zulu
-
-      - name: Setup Android SDK
-        uses: android-actions/setup-android@9fc6c4e9069bf8d3d10b2204b1fb8f6ef7065407 # v3
-        with:
-          log-accepted-android-sdk-licenses: false
-
-      - name: Build
-        run: |
-          cd examples/llama.android
-          ./gradlew build --no-daemon
-
-  android-ndk:
-    runs-on: ubuntu-latest
-    container:
-      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
-    defaults:
-      run:
-        shell: bash
-    strategy:
-      matrix:
-        include:
-          - build: 'arm64-cpu'
-            defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
-          - build: 'arm64-snapdragon'
-            defines: '--preset arm64-android-snapdragon-release'
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          lfs: false
-
-      - name: Build Llama.CPP for Hexagon Android
-        id: build_llama_cpp_hexagon_android
-        run: |
-          if [[ "${{ matrix.build }}" == "arm64-snapdragon" ]]; then
-            cp docs/backend/snapdragon/CMakeUserPresets.json .
-          fi
-          cmake ${{ matrix.defines }} -B build
-          cmake --build build
-          cmake --install build --prefix pkg-adb/llama.cpp
-
-      - name: Upload Llama.CPP Hexagon Android Build Artifact
-        if: ${{ always() && steps.build_llama_cpp_hexagon_android.outcome == 'success' }}
-        uses: actions/upload-artifact@v6
-        with:
-          name: llama-cpp-android-${{ matrix.build }}
-          path: pkg-adb/llama.cpp
--- a/.github/workflows/build-apple.yml
+++ b/.github/workflows/build-apple.yml
@@ -1,214 +0,0 @@
-name: CI (apple)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-apple.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.swift',
-      '**/*.m',
-      '**/*.metal'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-apple.yml',
-      'ggml/src/ggml-metal/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-  macOS-latest-ios:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: macOS-latest-ios
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_BUILD_COMMON=OFF \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TOOLS=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
-  macos-latest-ios-xcode:
-    runs-on: macos-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v6
-
-      - name: Setup Xcode
-        uses: ggml-org/setup-xcode@v1
-        with:
-          xcode-version: latest-stable
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_OPENSSL=OFF \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TOOLS=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
-      - name: xcodebuild for swift package
-        id: xcodebuild
-        run: |
-          ./build-xcframework.sh
-
-      - name: Upload xcframework artifact
-        uses: actions/upload-artifact@v6
-        with:
-          name: llama-xcframework
-          path: build-apple/llama.xcframework/
-          retention-days: 1
-
-      - name: Build Xcode project
-        run: |
-          xcodebuild -downloadPlatform iOS
-          xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
-
-  macOS-latest-tvos:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: macOS-latest-tvos
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_BUILD_COMMON=OFF \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TOOLS=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=tvOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
-  macOS-latest-visionos:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_BUILD_COMMON=OFF \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TOOLS=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=visionOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
-
-  macOS-latest-swift:
-    runs-on: macos-latest
-    needs: macos-latest-ios-xcode
-
-    strategy:
-      matrix:
-        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: macOS-latest-swift
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Download xcframework artifact
-        uses: actions/download-artifact@v7
-        with:
-          name: llama-xcframework
-          path: build-apple/llama.xcframework/
-
-      - name: Build llama.cpp with CMake
-        id: cmake_build
-        run: |
-          sysctl -a
-          cmake -B build -G Xcode \
-            -DGGML_METAL_USE_BF16=ON \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_OPENSSL=OFF \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TOOLS=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
--- a/.github/workflows/build-cache.yml
+++ b/.github/workflows/build-cache.yml
@@ -16,7 +16,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4

      - name: Get latest Vulkan SDK version
        id: vulkan_sdk_version
@@ -24,7 +24,7 @@ jobs:
          echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"

      - name: Setup Cache
-        uses: actions/cache@v5
+        uses: actions/cache@v4
        id: cache-sdk
        with:
          path: ./vulkan_sdk
@@ -37,74 +37,46 @@ jobs:
          path: ./vulkan_sdk
          version: ${{ env.VULKAN_SDK_VERSION }}

-  #ubuntu-24-spacemit-cache:
-  #  runs-on: ubuntu-24.04
-
-  #  env:
-  #    # Make sure this is in sync with build-linux-cross.yml
-  #    SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
-
-  #  steps:
-  #    - name: Clone
-  #      id: checkout
-  #      uses: actions/checkout@v6
-
-  #    - name: Setup Cache
-  #      uses: actions/cache@v5
-  #      id: cache-toolchain
-  #      with:
-  #        path: ./spacemit_toolchain
-  #        key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
-
-  #    - name: Setup SpacemiT Toolchain
-  #      if: steps.cache-toolchain.outputs.cache-hit != 'true'
-  #      uses: ./.github/actions/linux-setup-spacemit
-  #      with:
-  #        path: ./spacemit_toolchain
-  #        version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
-
-  ubuntu-24-openvino-cache:
+  ubuntu-24-spacemit-cache:
    runs-on: ubuntu-24.04

    env:
-      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.0"
-      OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
+      # Make sure this is in sync with build-linux-cross.yml
+      SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"

    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4

      - name: Setup Cache
-        uses: actions/cache@v5
-        id: cache-openvino
+        uses: actions/cache@v4
+        id: cache-toolchain
        with:
-          path: ./openvino_toolkit
-          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+          path: ./spacemit_toolchain
+          key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}

-      - name: Setup OpenVINO Toolkit
-        if: steps.cache-openvino.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-openvino
+      - name: Setup SpacemiT Toolchain
+        if: steps.cache-toolchain.outputs.cache-hit != 'true'
+        uses: ./.github/actions/linux-setup-spacemit
        with:
-          path: ./openvino_toolkit
-          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
-          version_full: ${{ env.OPENVINO_VERSION_FULL }}
+          path: ./spacemit_toolchain
+          version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}

  windows-2022-rocm-cache:
    runs-on: windows-2022

    env:
      # Make sure this is in sync with build.yml
-      HIPSDK_INSTALLER_VERSION: "26.Q1"
+      HIPSDK_INSTALLER_VERSION: "25.Q3"

    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4

      - name: Setup Cache
-        uses: actions/cache@v5
+        uses: actions/cache@v4
        id: cache-rocm
        with:
          path: C:\Program Files\AMD\ROCm
--- a/.github/workflows/build-cann.yml
+++ b/.github/workflows/build-cann.yml
@@ -1,102 +0,0 @@
-name: CI (cann)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-cann.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-cann.yml',
-      'ggml/src/ggml-cann/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-  openEuler-latest-cann:
-    defaults:
-      run:
-        shell: bash -el {0}
-    strategy:
-      matrix:
-        arch: [x86, aarch64]
-        chip_type: ['910b', '310p']
-        build: ['Release']
-        use_acl_graph: ['on', 'off']
-        exclude:
-          # 310P does not support USE_ACL_GRAPH=on
-          - chip_type: '310p'
-            use_acl_graph: 'on'
-    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Free up disk space
-        uses: ggml-org/free-disk-space@v1.3.1
-        with:
-          tool-cache: true
-
-      - name: Set container image
-        id: cann-image
-        run: |
-          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
-          echo "image=${image}" >> "${GITHUB_OUTPUT}"
-
-      - name: Pull container image
-        run: docker pull "${{ steps.cann-image.outputs.image }}"
-
-      - name: Build
-        env:
-          BUILD_TYPE: ${{ matrix.build }}
-          SOC_TYPE: ascend${{ matrix.chip_type }}
-          USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
-        run: |
-          HOST_UID=$(id -u)
-          HOST_GID=$(id -g)
-
-          docker run --rm \
-            -v "${PWD}:/workspace" \
-            -w /workspace \
-            -e SOC_TYPE=${SOC_TYPE} \
-            -e BUILD_TYPE=${BUILD_TYPE} \
-            -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
-            "${{ steps.cann-image.outputs.image }}" \
-            bash -lc '
-              set -e
-              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
-              yum clean all && rm -rf /var/cache/yum
-              git config --global --add safe.directory "/workspace"
-              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
-              cmake -S . -B build \
-                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-                  -DGGML_CANN=on \
-                  -DSOC_TYPE=${SOC_TYPE} \
-                  -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
-              cmake --build build -j $(nproc)
-
-              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
-            '
--- a/.github/workflows/build-cmake-pkg.yml
+++ b/.github/workflows/build-cmake-pkg.yml
@@ -5,22 +5,22 @@ on:

 jobs:
  linux:
-    runs-on: ubuntu-slim
+    runs-on: ubuntu-24.04
    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install -y build-essential tcl cmake
+          sudo apt install -y build-essential tcl

      - name: Build
        run: |
          PREFIX="$(pwd)"/inst
          cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
-                -DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
+                -DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
                -DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
          cmake --build build --config Release
          cmake --install build --prefix "$PREFIX" --config Release
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -1,75 +1,58 @@
-name: CI (cross)
+name: Build on Linux using cross-compiler
 on:
-  # only manual triggers due to low-importance of the workflows
-  # TODO: for regular runs, provision dedicated self-hosted runners
  workflow_dispatch:
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-cross.yml',
-      'ggml/src/spacemit/*',
-      'ggml/src/arch/loongarch/*'
-    ]
-  # run once every week
-  schedule:
-    - cron: '0 0 * * 0'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
+  workflow_call:

 jobs:
-  # ubuntu-24-riscv64-cpu-cross:
-  #   runs-on: ubuntu-24.04
+  ubuntu-24-riscv64-cpu-cross:
+    runs-on: ubuntu-24.04

-  #   steps:
-  #     - uses: actions/checkout@v6
-  #     - name: Setup Riscv
-  #       run: |
-  #         sudo dpkg --add-architecture riscv64
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Riscv
+        run: |
+          sudo dpkg --add-architecture riscv64

-  #         # Add arch-specific repositories for non-amd64 architectures
-  #         cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
-  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-  #         EOF
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
+          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+          EOF

-  #         sudo apt-get update || true    ;# Prevent failure due to missing URLs.
+          sudo apt-get update || true    ;# Prevent failure due to missing URLs.

-  #         sudo apt-get install -y --no-install-recommends \
-  #                 build-essential \
-  #                 gcc-14-riscv64-linux-gnu \
-  #                 g++-14-riscv64-linux-gnu
+          sudo apt-get install -y --no-install-recommends \
+                  build-essential \
+                  gcc-14-riscv64-linux-gnu \
+                  g++-14-riscv64-linux-gnu

-  #     - name: Build
-  #       run: |
-  #         cmake -B build -DLLAMA_OPENSSL=OFF \
-  #                        -DCMAKE_BUILD_TYPE=Release \
-  #                        -DGGML_OPENMP=OFF \
-  #                        -DLLAMA_BUILD_EXAMPLES=ON \
-  #                        -DLLAMA_BUILD_TOOLS=ON \
-  #                        -DLLAMA_BUILD_TESTS=OFF \
-  #                        -DCMAKE_SYSTEM_NAME=Linux \
-  #                        -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-  #                        -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-  #                        -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-  #                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-  #                        -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+      - name: Build
+        run: |
+          cmake -B build -DLLAMA_CURL=OFF \
+                         -DCMAKE_BUILD_TYPE=Release \
+                         -DGGML_OPENMP=OFF \
+                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DLLAMA_BUILD_TOOLS=ON \
+                         -DLLAMA_BUILD_TESTS=OFF \
+                         -DCMAKE_SYSTEM_NAME=Linux \
+                         -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
+                         -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+                         -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

-  #         cmake --build build --config Release -j $(nproc)
+          cmake --build build --config Release -j $(nproc)

  # ubuntu-24-riscv64-vulkan-cross:
  #   runs-on: ubuntu-24.04

  #   steps:
-  #     - uses: actions/checkout@v6
+  #     - uses: actions/checkout@v4
  #     - name: Setup Riscv
  #       run: |
  #         sudo dpkg --add-architecture riscv64
@@ -93,7 +76,7 @@ jobs:

  #     - name: Build
  #       run: |
-  #         cmake -B build -DLLAMA_OPENSSL=OFF \
+  #         cmake -B build -DLLAMA_CURL=OFF \
  #                        -DCMAKE_BUILD_TYPE=Release \
  #                        -DGGML_VULKAN=ON \
  #                        -DGGML_OPENMP=OFF \
@@ -116,7 +99,7 @@ jobs:
  #   runs-on: ubuntu-24.04

  #   steps:
-  #     - uses: actions/checkout@v6
+  #     - uses: actions/checkout@v4
  #     - name: Setup Arm64
  #       run: |
  #         sudo dpkg --add-architecture arm64
@@ -139,7 +122,7 @@ jobs:

  #     - name: Build
  #       run: |
-  #         cmake -B build -DLLAMA_OPENSSL=OFF \
+  #         cmake -B build -DLLAMA_CURL=OFF \
  #                        -DCMAKE_BUILD_TYPE=Release \
  #                        -DGGML_VULKAN=ON \
  #                        -DGGML_OPENMP=OFF \
@@ -159,11 +142,11 @@ jobs:
  #         cmake --build build --config Release -j $(nproc)

  debian-13-loongarch64-cpu-cross:
-    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+    runs-on: ubuntu-24.04
    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671

    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v4
      - name: Setup LoongArch
        run: |
          rm -f /etc/apt/sources.list.d/*
@@ -195,7 +178,7 @@ jobs:

      - name: Build
        run: |
-          cmake -B build -DLLAMA_OPENSSL=OFF \
+          cmake -B build -DLLAMA_CURL=OFF \
                         -DCMAKE_BUILD_TYPE=Release \
                         -DGGML_OPENMP=OFF \
                         -DLLAMA_BUILD_EXAMPLES=ON \
@@ -214,11 +197,11 @@ jobs:
          cmake --build build --config Release -j $(nproc)

  debian-13-loongarch64-vulkan-cross:
-    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+    runs-on: ubuntu-24.04
    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671

    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v4
      - name: Setup LoongArch
        run: |
          rm -f /etc/apt/sources.list.d/*
@@ -252,7 +235,7 @@ jobs:

      - name: Build
        run: |
-          cmake -B build -DLLAMA_OPENSSL=OFF \
+          cmake -B build -DLLAMA_CURL=OFF \
                         -DCMAKE_BUILD_TYPE=Release \
                         -DGGML_VULKAN=ON \
                         -DGGML_OPENMP=OFF \
@@ -279,17 +262,17 @@ jobs:
      SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"

    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v4

-      #- name: Use SpacemiT Toolchain Cache
-      #  uses: actions/cache@v5
-      #  id: cache-toolchain
-      #  with:
-      #    path: ./spacemit_toolchain
-      #    key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
+      - name: Use SpacemiT Toolchain Cache
+        uses: actions/cache@v4
+        id: cache-toolchain
+        with:
+          path: ./spacemit_toolchain
+          key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}

      - name: Setup SpacemiT Toolchain
-        #if: steps.cache-toolchain.outputs.cache-hit != 'true'
+        if: steps.cache-toolchain.outputs.cache-hit != 'true'
        uses: ./.github/actions/linux-setup-spacemit
        with:
          path: ./spacemit_toolchain
@@ -298,7 +281,7 @@ jobs:
      - name: Build
        run: |
          export RISCV_ROOT_PATH=${PWD}/spacemit_toolchain
-          cmake -B build -DLLAMA_OPENSSL=OFF \
+          cmake -B build -DLLAMA_CURL=OFF \
                         -DCMAKE_BUILD_TYPE=Release \
                         -DGGML_OPENMP=OFF \
                         -DLLAMA_BUILD_EXAMPLES=ON \
@@ -308,7 +291,6 @@ jobs:
                         -DGGML_RVV=ON \
                         -DGGML_RV_ZFH=ON \
                         -DGGML_RV_ZICBOP=ON \
-                         -DGGML_RV_ZIHINTPAUSE=ON \
                         -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
                         -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake

--- a/.github/workflows/build-msys.yml
+++ b/.github/workflows/build-msys.yml
@@ -1,72 +0,0 @@
-name: CI (msys)
-
-on:
-  # only manual triggers due to low-importance of the workflows
-  # TODO: for regular runs, provision dedicated self-hosted runners
-  workflow_dispatch:
-  # run once every week
-  schedule:
-    - cron: '0 0 * * 0'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-  windows-msys2:
-    runs-on: windows-2025
-
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - { sys: UCRT64,  env: ucrt-x86_64,  build: Release }
-          - { sys: CLANG64, env: clang-x86_64, build: Release }
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-
-      #- name: ccache
-      #  uses: ggml-org/ccache-action@v1.2.16
-      #  with:
-      #    key: windows-msys2
-      #    variant: ccache
-      #    evict-old-files: 1d
-      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Setup ${{ matrix.sys }}
-        uses: msys2/setup-msys2@cafece8e6baf9247cf9b1bf95097b0b983cc558d # v2
-        with:
-          update: true
-          msystem: ${{matrix.sys}}
-          install: >-
-            base-devel
-            git
-            mingw-w64-${{matrix.env}}-toolchain
-            mingw-w64-${{matrix.env}}-cmake
-            mingw-w64-${{matrix.env}}-openblas
-
-      - name: Build using CMake
-        shell: msys2 {0}
-        run: |
-            cmake -B build
-            cmake --build build --config ${{ matrix.build }} -j $(nproc)
-
-      - name: Clean after building using CMake
-        shell: msys2 {0}
-        run: |
-            rm -rf build
-
-      - name: Build using CMake w/ OpenBLAS
-        shell: msys2 {0}
-        run: |
-            cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-            cmake --build build --config ${{ matrix.build }} -j $(nproc)
--- a/.github/workflows/build-riscv-native.yml
+++ b/.github/workflows/build-riscv-native.yml
@@ -0,0 +1,120 @@
+name: Build on RISCV Linux Machine by Cloud-V
+on:
+  pull_request:
+  workflow_dispatch:
+  workflow_call:
+
+jobs:
+  debian-13-riscv64-native: # Bianbu 2.2
+    runs-on: [self-hosted, RISCV64]
+
+    steps:
+      - name: Install prerequisites
+        run: |
+          sudo apt-get update || true
+          sudo apt-get install -y libatomic1
+      - uses: actions/checkout@v4
+      - name: Setup Riscv
+        run: |
+          sudo apt-get update || true
+          sudo apt-get install -y --no-install-recommends \
+                  build-essential \
+                  gcc-14-riscv64-linux-gnu \
+                  g++-14-riscv64-linux-gnu \
+                  ccache \
+                  cmake
+
+      - name: Setup ccache
+        run: |
+          mkdir -p $HOME/.ccache
+          ccache -M 5G -d $HOME/.ccache
+          export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
+          export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
+          echo "$GITHUB_WORKSPACE"
+          echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
+          echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
+          echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
+          echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
+
+      - name: Build
+        run: |
+          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DCMAKE_SYSTEM_NAME=Linux \
+            -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+            -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
+            -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+            -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+            -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
+
+  # debian-13-riscv64-spacemit-ime-native: # Bianbu 2.2
+  #   runs-on: [self-hosted, RISCV64]
+
+  #   steps:
+  #     - name: Install prerequisites
+  #       run: |
+  #         sudo apt-get update || true
+  #         sudo apt-get install -y libatomic1
+  #     - uses: actions/checkout@v4
+  #     - name: Setup Riscv
+  #       run: |
+  #         sudo apt-get update || true
+  #         sudo apt-get install -y --no-install-recommends \
+  #                 build-essential \
+  #                 gcc-14-riscv64-linux-gnu \
+  #                 g++-14-riscv64-linux-gnu \
+  #                 ccache \
+  #                 cmake
+  #         sudo apt-get upgrade binutils -y
+
+  #     - name: Setup ccache
+  #       run: |
+  #         mkdir -p $HOME/.ccache
+  #         ccache -M 5G -d $HOME/.ccache
+  #         export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
+  #         export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
+  #         echo "$GITHUB_WORKSPACE"
+  #         echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
+  #         echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
+  #         echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
+  #         echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
+
+  #     - name: Build
+  #       run: |
+  #         cmake -B build \
+  #           -DLLAMA_CURL=OFF \
+  #           -DCMAKE_BUILD_TYPE=Release \
+  #           -DGGML_OPENMP=OFF \
+  #           -DLLAMA_BUILD_EXAMPLES=ON \
+  #           -DLLAMA_BUILD_TOOLS=ON \
+  #           -DLLAMA_BUILD_TESTS=OFF \
+  #           -DCMAKE_SYSTEM_NAME=Linux \
+  #           -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
+  #           -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+  #           -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+  #           -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+  #           -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+  #           -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+  #           -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
+  #           -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+  #           -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+  #           -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH \
+  #           -DGGML_RVV=ON \
+  #           -DGGML_RV_ZFH=ON \
+  #           -DGGML_RV_ZICBOP=ON \
+  #           -DGGML_CPU_RISCV64_SPACEMIT=ON \
+  #           -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1
+
+  #         cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build-riscv.yml
+++ b/.github/workflows/build-riscv.yml
@@ -1,126 +0,0 @@
-name: CI (riscv)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-riscv.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-riscv.yml',
-      'ggml/src/ggml-cpu/arch/riscv/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-  ubuntu-riscv64-native-sanitizer:
-    runs-on: ubuntu-24.04-riscv
-
-    continue-on-error: true
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug]
-
-    steps:
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-
-          # Install necessary packages
-          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 cmake build-essential wget git-lfs
-
-          # Set gcc-14 and g++-14 as the default compilers
-          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
-          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
-
-          if ! which rustc; then
-            # Install Rust stable version
-            sudo apt-get install -y rustup
-            rustup install stable
-            rustup default stable
-          fi
-
-          git lfs install
-
-      - name: GCC version check
-        run: |
-          gcc --version
-          g++ --version
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      # FIXME: Enable when ggml-org/ccache-action works on riscv64
-      # - name: ccache
-      #   uses: ggml-org/ccache-action@v1.2.21
-      #   with:
-      #     key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanytizer }}-${{ matrix.build_type }}
-      #     save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        if: ${{ matrix.sanitizer != 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DLLAMA_OPENSSL=OFF \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-            -DGGML_OPENMP=ON \
-            -DLLAMA_BUILD_EXAMPLES=ON \
-            -DLLAMA_BUILD_TOOLS=ON \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
-
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DLLAMA_OPENSSL=OFF \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-            -DGGML_OPENMP=OFF \
-            -DLLAMA_BUILD_EXAMPLES=ON \
-            -DLLAMA_BUILD_TOOLS=ON \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
-
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
--- a/.github/workflows/build-sanitize.yml
+++ b/.github/workflows/build-sanitize.yml
@@ -1,87 +0,0 @@
-name: CI (sanitize)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-sanitize.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-  ubuntu-latest-sanitizer:
-    runs-on: ubuntu-latest
-
-    continue-on-error: true
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libssl-dev
-
-      - name: Build
-        id: cmake_build
-        if: ${{ matrix.sanitizer != 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-            -DGGML_OPENMP=OFF
-
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
--- a/.github/workflows/build-self-hosted.yml
+++ b/.github/workflows/build-self-hosted.yml
@@ -1,267 +0,0 @@
-name: CI (self-hosted)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.cu',
-      '**/*.cuh',
-      '**/*.swift',
-      '**/*.m',
-      '**/*.metal',
-      '**/*.comp',
-      '**/*.glsl',
-      '**/*.wgsl'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-self-hosted.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.cu',
-      '**/*.cuh',
-      '**/*.swift',
-      '**/*.m',
-      '**/*.metal',
-      '**/*.comp',
-      '**/*.glsl',
-      '**/*.wgsl'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-  ggml-ci-nvidia-cuda:
-    runs-on: [self-hosted, Linux, NVIDIA]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          nvidia-smi
-          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
-  ggml-ci-nvidia-vulkan-cm:
-    runs-on: [self-hosted, Linux, NVIDIA]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
-  ggml-ci-nvidia-vulkan-cm2:
-    runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
-  # TODO: provision AMX-compatible machine
-  #ggml-ci-cpu-amx:
-  #  runs-on: [self-hosted, Linux, CPU, AMX]
-
-  #  steps:
-  #    - name: Clone
-  #      id: checkout
-  #      uses: actions/checkout@v6
-
-  #    - name: Test
-  #      id: ggml-ci
-  #      run: |
-  #        bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
-  # TODO: provision AMD GPU machine
-  # ggml-ci-amd-vulkan:
-  #   runs-on: [self-hosted, Linux, AMD]
-
-  #   steps:
-  #     - name: Clone
-  #       id: checkout
-  #       uses: actions/checkout@v6
-
-  #     - name: Test
-  #       id: ggml-ci
-  #       run: |
-  #         vulkaninfo --summary
-  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
-  # TODO: provision AMD GPU machine
-  # ggml-ci-amd-rocm:
-  #   runs-on: [self-hosted, Linux, AMD]
-
-  #   steps:
-  #     - name: Clone
-  #       id: checkout
-  #       uses: actions/checkout@v6
-
-  #     - name: Test
-  #       id: ggml-ci
-  #       run: |
-  #         amd-smi static
-  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
-  # TODO: sandbox Mac runners
-  #  ggml-ci-mac-metal:
-  #    runs-on: [self-hosted, macOS, ARM64]
-  #
-  #    steps:
-  #      - name: Clone
-  #        id: checkout
-  #        uses: actions/checkout@v6
-  #
-  #      - name: Test
-  #        id: ggml-ci
-  #        run: |
-  #          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-  #
-  #  ggml-ci-mac-webgpu:
-  #    runs-on: [self-hosted, macOS, ARM64]
-  #
-  #    steps:
-  #      - name: Clone
-  #        id: checkout
-  #        uses: actions/checkout@v6
-  #
-  #      - name: Dawn Dependency
-  #        id: dawn-depends
-  #        run: |
-  #          DAWN_VERSION="v2.0.0"
-  #          DAWN_OWNER="reeselevine"
-  #          DAWN_REPO="dawn"
-  #          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
-  #          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
-  #          curl -L -o artifact.zip \
-  #            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
-  #          mkdir dawn
-  #          unzip artifact.zip
-  #          tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
-  #
-  #      - name: Test
-  #        id: ggml-ci
-  #        run: |
-  #          GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
-  #            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-  #
-  #  ggml-ci-mac-vulkan:
-  #    runs-on: [self-hosted, macOS, ARM64]
-  #
-  #    steps:
-  #      - name: Clone
-  #        id: checkout
-  #        uses: actions/checkout@v6
-  #
-  #      - name: Test
-  #        id: ggml-ci
-  #        run: |
-  #          vulkaninfo --summary
-  #          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  ggml-ci-linux-intel-vulkan:
-    runs-on: [self-hosted, Linux, Intel]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          persist-credentials: false
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  ggml-ci-win-intel-vulkan:
-    runs-on: [self-hosted, Windows, X64, Intel]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Test
-        id: ggml-ci
-        shell: C:\msys64\usr\bin\bash.exe --noprofile --norc -eo pipefail "{0}"
-        env:
-          MSYSTEM: UCRT64
-          CHERE_INVOKING: 1
-          PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }}
-        run: |
-          vulkaninfo --summary
-          # Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create
-          # a valid python environment for testing
-          LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp
-
-  ggml-ci-intel-openvino-gpu-low-perf:
-    runs-on: [self-hosted, Linux, Intel, OpenVINO]
-
-    env:
-      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.0"
-      OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Setup OpenVINO Toolkit
-        uses: ./.github/actions/linux-setup-openvino
-        with:
-          path: ./openvino_toolkit
-          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
-          version_full: ${{ env.OPENVINO_VERSION_FULL }}
-
-      - name: Install OpenVINO dependencies
-        run: |
-          cd ./openvino_toolkit
-          chmod +x ./install_dependencies/install_openvino_dependencies.sh
-          echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          source ./openvino_toolkit/setupvars.sh
-          GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
--- a/.github/workflows/build-vulkan.yml
+++ b/.github/workflows/build-vulkan.yml
@@ -1,96 +0,0 @@
-name: CI (vulkan)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-vulkan.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.comp',
-      '**/*.glsl'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-vulkan.yml',
-      'ggml/src/ggml-vulkan/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-  ubuntu-24-vulkan-llvmpipe:
-    runs-on: ubuntu-24.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-vulkan-llvmpipe
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo add-apt-repository -y ppa:kisak/kisak-mesa
-          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
-
-      - name: Get latest Vulkan SDK version
-        id: vulkan_sdk_version
-        run: |
-          echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
-
-      - name: Use Vulkan SDK Cache
-        uses: actions/cache@v5
-        id: cache-sdk
-        with:
-          path: ./vulkan_sdk
-          key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
-
-      - name: Setup Vulkan SDK
-        if: steps.cache-sdk.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-vulkan
-        with:
-          path: ./vulkan_sdk
-          version: ${{ env.VULKAN_SDK_VERSION }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source ./vulkan_sdk/setup-env.sh
-          cmake -B build \
-            -DGGML_VULKAN=ON
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          export GGML_VK_VISIBLE_DEVICES=0
-          export GGML_VK_DISABLE_F16=1
-          export GGML_VK_DISABLE_COOPMAT=1
-          # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout 4800
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
--- a/.github/workflows/check-vendor.yml
+++ b/.github/workflows/check-vendor.yml
@@ -1,52 +0,0 @@
-name: Check vendor
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      'vendor/**',
-      'scripts/sync_vendor.py'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      'vendor/**',
-      'scripts/sync_vendor.py'
-    ]
-
-jobs:
-  check-vendor:
-    runs-on: ubuntu-slim
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Setup Python
-        uses: actions/setup-python@v6
-        with:
-          python-version: '3.x'
-
-      - name: Run vendor sync
-        run: |
-          set -euo pipefail
-          python3 scripts/sync_vendor.py
-
-      - name: Check for changes
-        run: |
-          set -euo pipefail
-          # detect modified or untracked files
-          changed=$(git status --porcelain --untracked-files=all || true)
-          if [ -n "$changed" ]; then
-            echo "Vendor sync modified files:"
-            echo "$changed" | awk '{ print $2 }' | sed '/^$/d'
-            echo "Failing because vendor files mismatch. Please update scripts/sync_vendor.py"
-            exit 1
-          else
-            echo "Vendor files are up-to-date."
-          fi
--- a/.github/workflows/close-issue.yml
+++ b/.github/workflows/close-issue.yml
@@ -10,12 +10,12 @@ permissions:

 jobs:
  close-issues:
-    runs-on: ubuntu-slim
+    runs-on: ubuntu-latest
    permissions:
      issues: write
      pull-requests: write
    steps:
-      - uses: actions/stale@v10
+      - uses: actions/stale@v5
        with:
          exempt-issue-labels: "refactoring,help wanted,good first issue,research 🔬,bug,roadmap"
          days-before-issue-stale: 30
--- a/.github/workflows/copilot-setup-steps.yml
+++ b/.github/workflows/copilot-setup-steps.yml
@@ -26,10 +26,10 @@ jobs:
    # If you do not check out your code, Copilot will do this for you.
    steps:
      - name: Checkout code
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: copilot-setup-steps
          evict-old-files: 1d
@@ -38,19 +38,20 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential libssl-dev
+          sudo apt-get install build-essential libcurl4-openssl-dev
          # Install git-clang-format script for formatting only changed code
          wget -O /tmp/git-clang-format https://raw.githubusercontent.com/llvm/llvm-project/release/18.x/clang/tools/clang-format/git-clang-format
          sudo cp /tmp/git-clang-format /usr/local/bin/git-clang-format
          sudo chmod +x /usr/local/bin/git-clang-format

      - name: Set up Python
-        uses: actions/setup-python@v6
+        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install Python dependencies
        run: |
          python3 -m venv .venv
-          source .venv/bin/activate
+          source .venv/bin/activate  # must be sourced: running the script as a command cannot change this shell's PATH/VIRTUAL_ENV
          pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt
+          pip install flake8 pyright pre-commit
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -25,20 +25,49 @@ permissions:
  packages: write

 jobs:
-  create_tag:
-    name: Create and push git tag
-    runs-on: ubuntu-slim
-    permissions:
-      contents: write
-    outputs:
-      source_tag: ${{ steps.srctag.outputs.name }}
+  push_to_registry:
+    name: Push Docker image to Docker Hub

+    runs-on: ${{ matrix.config.runs_on }}
+    env:
+      COMMIT_SHA: ${{ github.sha }}
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          # Multi-stage build
+          # Note: the arm64 images are failing, which prevents the amd64 images from being built
+          # https://github.com/ggml-org/llama.cpp/issues/11888
+          #- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "cpu",    dockerfile: ".devops/cpu.Dockerfile",    platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
+          - { tag: "cuda",   dockerfile: ".devops/cuda.Dockerfile",   platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
+          - { tag: "musa",   dockerfile: ".devops/musa.Dockerfile",   platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04" }
+          - { tag: "intel",  dockerfile: ".devops/intel.Dockerfile",  platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04" }
+          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
+          - { tag: "s390x",  dockerfile: ".devops/s390x.Dockerfile",  platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
+          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
+          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
+      - name: Check out the repo
+        uses: actions/checkout@v4
        with:
-          fetch-depth: 0
+          fetch-depth: 0 # preserve git history, so we can determine the build number
+
+      - name: Set up QEMU
+        if: ${{ matrix.config.tag != 's390x' }}
+        uses: docker/setup-qemu-action@v3
+        with:
+          image: tonistiigi/binfmt:qemu-v7.0.0-28
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v2
+        with:
+          registry: ghcr.io  # images are pushed to GHCR, not Docker Hub
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Determine source tag name
        id: srctag
@@ -46,144 +75,32 @@ jobs:
        env:
          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

-      - name: Create and push git tag
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          git tag ${{ steps.srctag.outputs.name }} || exit 0
-          git push origin ${{ steps.srctag.outputs.name }} || exit 0
-
-  prepare_matrices:
-    name: Prepare Docker matrices
-    runs-on: ubuntu-24.04
-    outputs:
-      build_matrix: ${{ steps.matrices.outputs.build_matrix }}
-      merge_matrix: ${{ steps.matrices.outputs.merge_matrix }}
-
-    steps:
-      - name: Generate build and merge matrices
-        id: matrices
+      - name: Determine image tag name
+        id: tag
        shell: bash
        run: |
-          set -euo pipefail
-
-          # Keep all build targets in one place and derive merge targets from it.
-          cat > build-matrix.json <<'JSON'
-          [
-            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
-            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
-            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
-            { "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "rocm", "dockerfile": ".devops/rocm.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "openvino", "dockerfile": ".devops/openvino.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" }
-          ]
-          JSON
-
-          BUILD_MATRIX="$(jq -c . build-matrix.json)"
-          MERGE_MATRIX="$(jq -c '
-            reduce .[] as $entry ({}; .[$entry.tag] |= (
-              . // {
-                tag: $entry.tag,
-                arches: [],
-                full: false,
-                light: false,
-                server: false
-              }
-              | .full = (.full or ($entry.full // false))
-              | .light = (.light or ($entry.light // false))
-              | .server = (.server or ($entry.server // false))
-              | .arches += [($entry.platforms | sub("^linux/"; ""))]
-            ))
-            # Backward compatibility: s390x tags are aliases of cpu for the linux/s390x platform.
-            | if (has("cpu") and (((.cpu.arches // []) | index("s390x")) != null)) then
-                . + {
-                  s390x: {
-                    tag: "s390x",
-                    arches: ["s390x"],
-                    full: .cpu.full,
-                    light: .cpu.light,
-                    server: .cpu.server
-                  }
-                }
-              else
-                .
-              end
-            | [.[] | .arches = (.arches | unique | sort | join(" "))]
-          ' build-matrix.json)"
-
-          echo "build_matrix=$BUILD_MATRIX" >> "$GITHUB_OUTPUT"
-          echo "merge_matrix=$MERGE_MATRIX" >> "$GITHUB_OUTPUT"
-
-  push_to_registry:
-    name: Push Docker image to Docker Registry
-    needs: [prepare_matrices, create_tag]
-
-    runs-on: ${{ matrix.config.runs_on }}
-    strategy:
-      fail-fast: false
-      matrix:
-        config: ${{ fromJSON(needs.prepare_matrices.outputs.build_matrix) }}
-    steps:
-      - name: Check out the repo
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ needs.create_tag.outputs.source_tag }}
-
-      - name: Set up QEMU
-        if: ${{ contains(matrix.config.platforms, 'linux/amd64') }}
-        uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a # v4
-        with:
-          image: tonistiigi/binfmt:qemu-v10.2.1
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4
-
-      - name: Log in to Docker Registry
-        uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Determine image metadata
-        id: meta
-        shell: bash
-        run: |
-          set -euo pipefail
-
          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}"  # to lower case
          REPO_NAME="${{ github.event.repository.name }}"
-          IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
-          PREFIX="${IMAGE_REPO}:"
-          PLATFORM="${{ matrix.config.platforms }}"
-          ARCH_SUFFIX="${PLATFORM#linux/}"

          # list all tags possible
-          tags="${{ matrix.config.tag }}"
-          for tag in $tags; do
-              if [[ "$tag" == "cpu" ]]; then
-                  TYPE=""
-              else
-                  TYPE="-$tag"
-              fi
-              CACHETAG="${PREFIX}buildcache${TYPE}-${ARCH_SUFFIX}"
-          done
-
-          SAFE_TAGS="$(echo "$tags" | tr ' ' '_')"
-
-          echo "image_repo=$IMAGE_REPO" >> $GITHUB_OUTPUT
-          echo "arch_suffix=$ARCH_SUFFIX" >> $GITHUB_OUTPUT
-          echo "cache_output_tag=$CACHETAG" >> $GITHUB_OUTPUT
-          echo "digest_artifact_suffix=${SAFE_TAGS}-${ARCH_SUFFIX}" >> $GITHUB_OUTPUT
-          echo "cache_output_tag=$CACHETAG"  # print out for debugging
+          if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
+              TYPE=""
+          else
+              TYPE="-${{ matrix.config.tag }}"
+          fi
+          PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
+          CACHETAGS="${PREFIX}buildcache${TYPE}"
+          FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}"
+          LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}"
+          SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}"
+          echo "cache_output_tags=$CACHETAGS" >> $GITHUB_OUTPUT
+          echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
+          echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
+          echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
+          echo "cache_output_tags=$CACHETAGS"  # print out for debugging
+          echo "full_output_tags=$FULLTAGS"  # print out for debugging
+          echo "light_output_tags=$LIGHTTAGS"  # print out for debugging
+          echo "server_output_tags=$SERVERTAGS"  # print out for debugging
        env:
          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

@@ -204,20 +121,18 @@ jobs:
          docker-images: true
          swap-storage: true

-      - name: Build and push Full Docker image by digest
-        id: build_full
+      - name: Build and push Full Docker image (tagged + versioned)
        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
-        uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
+        uses: docker/build-push-action@v6
        with:
          context: .
+          push: true
          platforms: ${{ matrix.config.platforms }}
-          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.full_output_tags }}
          file: ${{ matrix.config.dockerfile }}
          target: full
          provenance: false
-          build-args: |
-            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
-            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
          # using github experimental cache
          #cache-from: type=gha
          #cache-to: type=gha,mode=max
@@ -225,23 +140,21 @@ jobs:
          #cache-to: type=local,dest=/tmp/.buildx-cache
          #cache-from: type=local,src=/tmp/.buildx-cache
          # using registry cache (no storage limit)
-          cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
-          cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
+          cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
+          cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max

-      - name: Build and push Light Docker image by digest
-        id: build_light
+      - name: Build and push Light Docker image (tagged + versioned)
        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
-        uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
+        uses: docker/build-push-action@v6
        with:
          context: .
+          push: true
          platforms: ${{ matrix.config.platforms }}
-          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.light_output_tags }}
          file: ${{ matrix.config.dockerfile }}
          target: light
          provenance: false
-          build-args: |
-            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
-            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
          # using github experimental cache
          #cache-from: type=gha
          #cache-to: type=gha,mode=max
@@ -249,23 +162,21 @@ jobs:
          #cache-to: type=local,dest=/tmp/.buildx-cache
          #cache-from: type=local,src=/tmp/.buildx-cache
          # using registry cache (no storage limit)
-          cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
-          cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
+          cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
+          cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max

-      - name: Build and push Server Docker image by digest
-        id: build_server
+      - name: Build and push Server Docker image (tagged + versioned)
        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
-        uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
+        uses: docker/build-push-action@v6
        with:
          context: .
+          push: true
          platforms: ${{ matrix.config.platforms }}
-          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.server_output_tags }}
          file: ${{ matrix.config.dockerfile }}
          target: server
          provenance: false
-          build-args: |
-            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
-            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
          # using github experimental cache
          #cache-from: type=gha
          #cache-to: type=gha,mode=max
@@ -273,170 +184,31 @@ jobs:
          #cache-to: type=local,dest=/tmp/.buildx-cache
          #cache-from: type=local,src=/tmp/.buildx-cache
          # using registry cache (no storage limit)
-          cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
-          cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
+          cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
+          cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max

-      - name: Export digest metadata
-        shell: bash
-        run: |
-            set -euo pipefail
-
-            TAGS="${{ matrix.config.tag }}"
-            ARCH_SUFFIX="${{ steps.meta.outputs.arch_suffix }}"
-            DIGEST_FILE="/tmp/digests/${{ steps.meta.outputs.digest_artifact_suffix }}.tsv"
-            mkdir -p /tmp/digests
-
-            add_digest_rows() {
-                local image_type="$1"
-                local digest="$2"
-
-                if [[ -z "$digest" ]]; then
-                  echo "Missing digest for image_type=${image_type}" >&2
-                  exit 1
-                fi
-
-                for tag in $TAGS; do
-                    printf '%s\t%s\t%s\t%s\n' "$tag" "$ARCH_SUFFIX" "$image_type" "$digest" >> "$DIGEST_FILE"
-                done
-            }
-
-            if [[ "${{ matrix.config.full }}" == "true" ]]; then
-                add_digest_rows "full" "${{ steps.build_full.outputs.digest }}"
-            fi
-
-            if [[ "${{ matrix.config.light }}" == "true" ]]; then
-                add_digest_rows "light" "${{ steps.build_light.outputs.digest }}"
-            fi
-
-            if [[ "${{ matrix.config.server }}" == "true" ]]; then
-                add_digest_rows "server" "${{ steps.build_server.outputs.digest }}"
-            fi
-
-      - name: Upload digest metadata
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7
-        with:
-          name: digests-${{ steps.meta.outputs.digest_artifact_suffix }}
-          path: /tmp/digests/${{ steps.meta.outputs.digest_artifact_suffix }}.tsv
-          if-no-files-found: error
-
-  merge_arch_tags:
-    name: Create shared tags from digests
-    needs: [prepare_matrices, push_to_registry, create_tag]
-    runs-on: ubuntu-24.04
-    strategy:
-      fail-fast: false
-      matrix:
-        config: ${{ fromJSON(needs.prepare_matrices.outputs.merge_matrix) }}
+  create_tag:
+    name: Create and push git tag
+    runs-on: ubuntu-22.04
+    permissions:
+      contents: write

    steps:
-      - name: Check out the repo
-        uses: actions/checkout@v6
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

-      - name: Download digest metadata
-        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
-        with:
-          pattern: digests-*
-          path: /tmp/digests
-          merge-multiple: true
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4
-
-      - name: Log in to Docker Registry
-        uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Create tags from digests
-        shell: bash
-        run: |
-          set -euo pipefail
-
-          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}"  # to lower case
-          REPO_NAME="${{ github.event.repository.name }}"
-          IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
-          PREFIX="${IMAGE_REPO}:"
-          SRC_TAG="${{ needs.create_tag.outputs.source_tag }}"
-          TAGS="${{ matrix.config.tag }}"
-          ARCHES="${{ matrix.config.arches }}"
-          DIGEST_GLOB="/tmp/digests/*.tsv"
-
-          if ! ls ${DIGEST_GLOB} >/dev/null 2>&1; then
-              echo "No digest metadata found in /tmp/digests" >&2
-              exit 1
-          fi
-
-          if [[ -z "$SRC_TAG" ]]; then
-              echo "Missing source tag from create_tag" >&2
-              exit 1
-          fi
-
-          find_digest() {
-              local tag_name="$1"
-              local arch="$2"
-              local image_type="$3"
-              local digest
-
-              digest="$(awk -F '\t' -v t="$tag_name" -v a="$arch" -v i="$image_type" '$1 == t && $2 == a && $3 == i { print $4; exit }' ${DIGEST_GLOB})"
-
-              # Backward compatibility: s390x tags are aliases of cpu for the linux/s390x platform.
-              if [[ -z "$digest" && "$tag_name" == "s390x" && "$arch" == "s390x" ]]; then
-                digest="$(awk -F '\t' -v t="cpu" -v a="$arch" -v i="$image_type" '$1 == t && $2 == a && $3 == i { print $4; exit }' ${DIGEST_GLOB})"
-              fi
-
-              if [[ -z "$digest" ]]; then
-                echo "Missing digest for tag=${tag_name} arch=${arch} image_type=${image_type}" >&2
-                exit 1
-              fi
-
-              echo "$digest"
-          }
-
-          create_manifest_tags() {
-              local image_type="$1"
-              local tag_name="$2"
-              local suffix="$3"
-
-              local merged_tag="${PREFIX}${image_type}${suffix}"
-              local merged_versioned_tag="${merged_tag}-${SRC_TAG}"
-
-              local refs=()
-
-              for arch in $ARCHES; do
-                  local digest
-                  digest="$(find_digest "$tag_name" "$arch" "$image_type")"
-                  refs+=("${IMAGE_REPO}@${digest}")
-              done
-
-              echo "Creating ${merged_tag} from ${refs[*]}"
-              docker buildx imagetools create --tag "${merged_tag}" "${refs[@]}"
-
-              echo "Creating ${merged_versioned_tag} from ${refs[*]}"
-              docker buildx imagetools create --tag "${merged_versioned_tag}" "${refs[@]}"
-          }
-
-          for tag in $TAGS; do
-              if [[ "$tag" == "cpu" ]]; then
-                  TYPE=""
-              else
-                  TYPE="-$tag"
-              fi
-
-              if [[ "${{ matrix.config.full }}" == "true" ]]; then
-                  create_manifest_tags "full" "$tag" "$TYPE"
-              fi
-
-              if [[ "${{ matrix.config.light }}" == "true" ]]; then
-                  create_manifest_tags "light" "$tag" "$TYPE"
-              fi
-
-              if [[ "${{ matrix.config.server }}" == "true" ]]; then
-                  create_manifest_tags "server" "$tag" "$TYPE"
-              fi
-          done
+      - name: Determine source tag name
+        id: srctag
+        uses: ./.github/actions/get-tag-name
        env:
-          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
+          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+
+      - name: Create and push git tag
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          git tag ${{ steps.srctag.outputs.name }} || exit 0
+          git push origin ${{ steps.srctag.outputs.name }} || exit 0
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@@ -20,10 +20,10 @@ concurrency:

 jobs:
  editorconfig:
-    runs-on: ubuntu-slim
+    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v6
-      - uses: editorconfig-checker/action-editorconfig-checker@840e866d93b8e032123c23bac69dece044d4d84c # v2.2.0
+      - uses: actions/checkout@v4
+      - uses: editorconfig-checker/action-editorconfig-checker@v2
        with:
          version: v3.0.3
      - run: editorconfig-checker
--- a/.github/workflows/gguf-publish.yml
+++ b/.github/workflows/gguf-publish.yml
@@ -24,21 +24,21 @@ jobs:
    runs-on: ubuntu-latest

    steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@v4
    - name: Set up Python
-      uses: actions/setup-python@v6
+      uses: actions/setup-python@v5
      with:
-        python-version: '3.11'
+        python-version: '3.9.x'
    - name: Install dependencies
      run: |
        cd gguf-py
-        python -m pip install poetry==2.3.2
+        python -m pip install poetry
        poetry install

    - name: Build package
      run: cd gguf-py && poetry build
    - name: Publish package
-      uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # release/v1
+      uses: pypa/gh-action-pypi-publish@release/v1
      with:
        password: ${{ secrets.PYPI_API_TOKEN }}
        packages-dir: gguf-py/dist
--- a/.github/workflows/hip-quality-check.yml
+++ b/.github/workflows/hip-quality-check.yml
@@ -1,82 +0,0 @@
-name: HIP quality check
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/hip-quality-check.yml',
-      '**/*.cu',
-      '**/*.cuh',
-      'scripts/hip/gcn-cdna-vgpr-check.py'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/hip-quality-check.yml',
-      '**/*.cu',
-      '**/*.cuh',
-      'scripts/hip/gcn-cdna-vgpr-check.py'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-  ubuntu-22-hip-quality-check:
-    runs-on: ubuntu-22.04
-    container: rocm/dev-ubuntu-22.04:7.2.1
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev python3
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-22-hip-quality-check
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build with Werror
-        id: cmake_build
-        run: |
-          cmake -B build -S . \
-            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-            -DGPU_TARGETS=gfx942 \
-            -DGGML_HIP=ON \
-            -DGGML_HIP_EXPORT_METRICS=Off \
-            -DCMAKE_HIP_FLAGS="-Werror -Wno-tautological-compare" \
-            -DCMAKE_BUILD_TYPE=Release
-          cd build
-          make -j $(nproc)
-
-      - name: Check for major VGPR spills
-        id: vgpr_check
-        run: |
-          cmake -B build -S . \
-            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-            -DGPU_TARGETS=gfx908 \
-            -DGGML_HIP=ON \
-            -DGGML_HIP_EXPORT_METRICS=On \
-            -DCMAKE_HIP_FLAGS="" \
-            -DCMAKE_BUILD_TYPE=Release
-          cd build
-          make -j $(nproc) 2>&1 | tee metrics.log | grep -v 'Rpass-analysis=kernel-resource-usage\|remark:\|^$'
-          python3 ../scripts/hip/gcn-cdna-vgpr-check.py metrics.log
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -7,11 +7,11 @@ jobs:
    permissions:
      contents: read
      pull-requests: write
-    runs-on: ubuntu-slim
+    runs-on: ubuntu-latest
    steps:
-    - uses: actions/checkout@v6
+    - uses: actions/checkout@v4
      with:
        repository: "ggml-org/llama.cpp"
-    - uses: actions/labeler@v6
+    - uses: actions/labeler@v5
      with:
        configuration-path: '.github/labeler.yml'
--- a/.github/workflows/pre-tokenizer-hashes.yml
+++ b/.github/workflows/pre-tokenizer-hashes.yml
@@ -12,14 +12,14 @@ on:

 jobs:
    pre-tokenizer-hashes:
-        runs-on: ubuntu-slim
+        runs-on: ubuntu-latest

        steps:
        - name: Checkout repository
-          uses: actions/checkout@v6
+          uses: actions/checkout@v4

        - name: Set up Python
-          uses: actions/setup-python@v6
+          uses: actions/setup-python@v5
          with:
              python-version: '3.11'

--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@@ -20,13 +20,13 @@ concurrency:

 jobs:
  python-check-requirements:
-    runs-on: ubuntu-slim
+    runs-on: ubuntu-latest
    name: check-requirements
    steps:
      - name: Check out source repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
      - name: Set up Python environment
-        uses: actions/setup-python@v6
+        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Run check-requirements.sh script
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -4,16 +4,10 @@ on:
  push:
    branches:
      - master
-    paths: [
-      '.github/workflows/python-lint.yml',
-      '**/*.py'
-    ]
+    paths: ['.github/workflows/python-lint.yml', '**/*.py']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/python-lint.yml',
-      '**/*.py'
-    ]
+    paths: ['.github/workflows/python-lint.yml', '**/*.py']

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -21,16 +15,16 @@ concurrency:

 jobs:
  flake8-lint:
-    runs-on: ubuntu-slim
+    runs-on: ubuntu-latest
    name: Lint
    steps:
      - name: Check out source repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
      - name: Set up Python environment
-        uses: actions/setup-python@v6
+        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: flake8 Lint
-        uses: py-actions/flake8@84ec6726560b6d5bd68f2a5bed83d62b52bb50ba # v2
+        uses: py-actions/flake8@v2
        with:
            plugins: "flake8-no-print"
--- a/.github/workflows/python-type-check.yml
+++ b/.github/workflows/python-type-check.yml
@@ -4,17 +4,15 @@ on:
  push:
    paths:
      - '.github/workflows/python-type-check.yml'
-      - 'ty.toml'
+      - 'pyrightconfig.json'
      - '**.py'
      - '**/requirements*.txt'
-      # - 'pyrightconfig.json'
  pull_request:
    paths:
      - '.github/workflows/python-type-check.yml'
-      - 'ty.toml'
+      - 'pyrightconfig.json'
      - '**.py'
      - '**/requirements*.txt'
-      # - 'pyrightconfig.json'

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -22,22 +20,21 @@ concurrency:

 jobs:
  python-type-check:
-    runs-on: ubuntu-slim
-    name: python type-check
+    runs-on: ubuntu-latest
+    name: pyright type-check
    steps:
      - name: Check out source repository
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
      - name: Set up Python environment
-        uses: actions/setup-python@v6
+        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
-          pip-install: -r requirements/requirements-all.txt ty==0.0.26
-      # - name: Type-check with Pyright
-      #   uses: jakebailey/pyright-action@v2
-      #   with:
-      #     version: 1.1.382
-      #     level: warning
-      #     warnings: true
-      - name: Type-check with ty
-        run: |
-            ty check --output-format=github
+      - name: Install Python dependencies
+        # TODO: use a venv
+        run: pip install -r requirements/requirements-all.txt
+      - name: Type-check with Pyright
+        uses: jakebailey/pyright-action@v2
+        with:
+          version: 1.1.382
+          level: warning
+          warnings: true
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -10,22 +10,7 @@ on:
  push:
    branches:
      - master
-    paths: [
-      '.github/workflows/release.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.cu',
-      '**/*.cuh',
-      '**/*.swift',
-      '**/*.m',
-      '**/*.metal',
-      '**/*.comp',
-      '**/*.glsl'
-    ]
+    paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -36,50 +21,40 @@ env:
  CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"

 jobs:
-  macOS-cpu:
-    strategy:
-      matrix:
-        include:
-          - build: 'arm64'
-            arch: 'arm64'
-            os: macos-14
-            defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON"
-          - build: 'arm64-kleidiai'
-            arch: 'arm64'
-            os: macos-14
-            defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON"
-          - build: 'x64'
-            arch: 'x64'
-            os: macos-15-intel
-            # Metal is disabled on x64 due to intermittent failures with Github runners not having a GPU:
-            # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-            defines: "-DGGML_METAL=OFF -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3"
-
-    runs-on: ${{ matrix.os }}
+  macOS-arm64:
+    runs-on: macos-14

    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: macOS-latest-${{ matrix.arch }}
+          key: macOS-latest-cmake-arm64
          evict-old-files: 1d

+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+          brew install curl
+
      - name: Build
        id: cmake_build
        run: |
          sysctl -a
          cmake -B build \
-            ${{ matrix.defines }} \
            -DCMAKE_INSTALL_RPATH='@loader_path' \
            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
-            -DLLAMA_BUILD_BORINGSSL=ON \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DGGML_RPC=ON \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

@@ -91,53 +66,100 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*

      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz
-          name: llama-bin-macos-${{ matrix.build }}.tar.gz
+          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
+          name: llama-bin-macos-arm64.zip

-  ubuntu-cpu:
+  macOS-x64:
+    runs-on: macos-15-intel
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-cmake-x64
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+          brew install curl
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
+          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+          cmake -B build \
+            -DCMAKE_INSTALL_RPATH='@loader_path' \
+            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DGGML_METAL=OFF \
+            -DGGML_RPC=ON \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        run: |
+          cp LICENSE ./build/bin/
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
+          name: llama-bin-macos-x64.zip
+
+  ubuntu-22-cpu:
    strategy:
      matrix:
        include:
          - build: 'x64'
            os: ubuntu-22.04
-          - build: 'arm64'
-            os: ubuntu-24.04-arm
-          - build: 's390x'
-            os: ubuntu-24.04-s390x
+          - build: 's390x-z15' # z15 because our CI runners are on z15
+            os: ubuntu-22.04-s390x
+          # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
+          # - build: 'arm64'
+          #   os: ubuntu-22.04-arm

    runs-on: ${{ matrix.os }}

    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: ccache
-        if: ${{ matrix.build != 's390x' }}
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: ubuntu-cpu-${{ matrix.build }}
+          key: ubuntu-cpu-cmake-${{ matrix.build }}
          evict-old-files: 1d

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential libssl-dev
-
-      - name: Toolchain workaround (GCC 14)
-        if: ${{ contains(matrix.os, 'ubuntu-24.04') }}
-        run: |
-          sudo apt-get install -y gcc-14 g++-14
-          echo "CC=gcc-14" >> "$GITHUB_ENV"
-          echo "CXX=g++-14" >> "$GITHUB_ENV"
+          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Build
        id: cmake_build
@@ -160,52 +182,37 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*

      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz
-          name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
+          name: llama-bin-ubuntu-${{ matrix.build }}.zip

-  ubuntu-vulkan:
-    strategy:
-      matrix:
-        include:
-          - build: 'x64'
-            os: ubuntu-22.04
-          - build: 'arm64'
-            os: ubuntu-24.04-arm
-
-    runs-on: ${{ matrix.os }}
+  ubuntu-22-vulkan:
+    runs-on: ubuntu-22.04

    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: ubuntu-vulkan-${{ matrix.build }}
+          key: ubuntu-22-cmake-vulkan
          evict-old-files: 1d

      - name: Dependencies
        id: depends
        run: |
-          if [[ "${{ matrix.os }}" =~ "ubuntu-22.04" ]]; then
-            wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
-            sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
-            sudo apt-get update -y
-            sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libssl-dev
-          else
-            sudo apt-get update -y
-            sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev libssl-dev ninja-build
-            echo "CC=gcc-14" >> "$GITHUB_ENV"
-            echo "CXX=g++-14" >> "$GITHUB_ENV"
-          fi
+          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
+          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+          sudo apt-get update -y
+          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev

      - name: Build
        id: cmake_build
@@ -228,93 +235,13 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*

      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
-          name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
-
-  ubuntu-24-openvino:
-    runs-on: ubuntu-24.04
-
-    outputs:
-      openvino_version: ${{ steps.openvino_version.outputs.value }}
-
-    env:
-      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.0"
-      OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
-
-    steps:
-      - name: Set OpenVINO version output
-        id: openvino_version
-        run: echo "value=${{ env.OPENVINO_VERSION_MAJOR }}" >> $GITHUB_OUTPUT
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-openvino-release-no-preset-v1
-          evict-old-files: 1d
-
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
-          sudo apt install ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
-
-      - name: Use OpenVINO Toolkit Cache
-        uses: actions/cache@v5
-        id: cache-openvino
-        with:
-          path: ./openvino_toolkit
-          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
-
-      - name: Setup OpenVINO Toolkit
-        if: steps.cache-openvino.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-openvino
-        with:
-          path: ./openvino_toolkit
-          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
-          version_full: ${{ env.OPENVINO_VERSION_FULL }}
-
-      - name: Install OpenVINO dependencies
-        run: |
-          cd ./openvino_toolkit
-          chmod +x ./install_dependencies/install_openvino_dependencies.sh
-          echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source ./openvino_toolkit/setupvars.sh
-          cmake -B build/ReleaseOV -G Ninja \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENVINO=ON
-          cmake --build build/ReleaseOV --config Release -j $(nproc)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/ReleaseOV/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/ReleaseOV/bin .
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
-          name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
+          name: llama-bin-ubuntu-vulkan-x64.zip

  windows-cpu:
    runs-on: windows-2025
@@ -327,14 +254,14 @@ jobs:

    steps:
      - name: Clone
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: windows-latest-cpu-${{ matrix.arch }}
+          key: windows-latest-cmake-cpu-${{ matrix.arch }}
          variant: ccache
          evict-old-files: 1d

@@ -342,28 +269,39 @@ jobs:
        run: |
          choco install ninja

+      - name: libCURL
+        id: get_libcurl
+        uses: ./.github/actions/windows-setup-curl
+        with:
+          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
+
      - name: Build
        shell: cmd
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
          cmake -S . -B build -G "Ninja Multi-Config" ^
            -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
-            -DLLAMA_BUILD_BORINGSSL=ON ^
            -DGGML_NATIVE=OFF ^
            -DGGML_BACKEND_DL=ON ^
            -DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^
            -DGGML_OPENMP=ON ^
+            -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release

      - name: Pack artifacts
        id: pack_artifacts
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
+          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
          Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
-          7z a -snl llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
+          7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*

      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v4
        with:
          path: llama-bin-win-cpu-${{ matrix.arch }}.zip
          name: llama-bin-win-cpu-${{ matrix.arch }}.zip
@@ -390,12 +328,12 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: windows-latest-${{ matrix.backend }}-${{ matrix.arch }}
+          key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
          variant: ccache
          evict-old-files: 1d

@@ -436,16 +374,16 @@ jobs:
      - name: Build
        id: cmake_build
        run: |
-          cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_BUILD_BORINGSSL=ON
+          cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF
          cmake --build build --config Release --target ${{ matrix.target }}

      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          7z a -snl llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
+          7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll

      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v4
        with:
          path: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
          name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
@@ -455,15 +393,15 @@ jobs:

    strategy:
      matrix:
-        cuda: ['12.4', '13.1']
+        cuda: ['12.4']

    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4

      - name: Install ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: windows-cuda-${{ matrix.cuda }}
          variant: ccache
@@ -482,7 +420,6 @@ jobs:
      - name: Build
        id: cmake_build
        shell: cmd
-        # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
        run: |
          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
          cmake -S . -B build -G "Ninja Multi-Config" ^
@@ -490,18 +427,17 @@ jobs:
            -DGGML_NATIVE=OFF ^
            -DGGML_CPU=OFF ^
            -DGGML_CUDA=ON ^
-            -DLLAMA_BUILD_BORINGSSL=ON ^
-            -DGGML_CUDA_CUB_3DOT2=ON
+            -DLLAMA_CURL=OFF
          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
          cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda

      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          7z a -snl llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
+          7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll

      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v4
        with:
          path: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
          name: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
@@ -512,11 +448,10 @@ jobs:
          $dst='.\build\bin\cudart\'
          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
-          robocopy "${{env.CUDA_PATH}}\bin\x64" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
          7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\*

      - name: Upload Cuda runtime
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v4
        with:
          path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
          name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
@@ -536,12 +471,12 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: windows-latest-sycl
+          key: windows-latest-cmake-sycl
          variant: ccache
          evict-old-files: 1d

@@ -559,7 +494,7 @@ jobs:
            -DCMAKE_BUILD_TYPE=Release ^
            -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
            -DGGML_CPU=OFF -DGGML_SYCL=ON ^
-            -DLLAMA_BUILD_BORINGSSL=ON
+            -DLLAMA_CURL=OFF
          cmake --build build --target ggml-sycl -j

      - name: Build the release package
@@ -582,8 +517,6 @@ jobs:
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin

          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
@@ -593,147 +526,49 @@ jobs:
          cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin

          echo "cp oneAPI running time dll files to ./build/bin done"
-          7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
+          7z a llama-bin-win-sycl-x64.zip ./build/bin/*

      - name: Upload the release package
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v4
        with:
          path: llama-bin-win-sycl-x64.zip
          name: llama-bin-win-sycl-x64.zip

-  ubuntu-22-rocm:
-    runs-on: ubuntu-22.04
-
-    strategy:
-      matrix:
-        include:
-          - ROCM_VERSION: "7.2.1"
-            gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201"
-            build: 'x64'
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt install -y build-essential git cmake wget
-
-      - name: Setup Legacy ROCm
-        if: matrix.ROCM_VERSION == '7.2.1'
-        id: legacy_env
-        run: |
-          sudo mkdir --parents --mode=0755 /etc/apt/keyrings
-          wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
-            gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
-
-          sudo tee /etc/apt/sources.list.d/rocm.list << EOF
-          deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${{ matrix.ROCM_VERSION }} jammy main
-          EOF
-
-          sudo tee /etc/apt/preferences.d/rocm-pin-600 << EOF
-          Package: *
-          Pin: release o=repo.radeon.com
-          Pin-Priority: 600
-          EOF
-
-          sudo apt update
-          sudo apt-get install -y libssl-dev rocm-hip-sdk
-
-      - name: Setup TheRock
-        if: matrix.ROCM_VERSION != '7.2.1'
-        id: therock_env
-        run: |
-          wget https://repo.amd.com/rocm/tarball/therock-dist-linux-gfx1151-${{ matrix.ROCM_VERSION }}.tar.gz
-          mkdir install
-          tar -xf *.tar.gz -C install
-          export ROCM_PATH=$(pwd)/install
-          echo ROCM_PATH=$ROCM_PATH >> $GITHUB_ENV
-          echo PATH=$PATH:$ROCM_PATH/bin >> $GITHUB_ENV
-          echo LD_LIBRARY_PATH=$ROCM_PATH/lib:$ROCM_PATH/llvm/lib:$ROCM_PATH/lib/rocprofiler-systems >> $GITHUB_ENV
-
-      - name: Build with native CMake HIP support
-        id: cmake_build
-        run: |
-          cmake -B build -S . \
-            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_BACKEND_DL=ON \
-            -DGGML_NATIVE=OFF \
-            -DCMAKE_INSTALL_RPATH='$ORIGIN' \
-            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-            -DGGML_CPU_ALL_VARIANTS=ON \
-            -DGPU_TARGETS="${{ matrix.gpu_targets }}" \
-            -DGGML_HIP=ON \
-            -DHIP_PLATFORM=amd \
-            -DGGML_HIP_ROCWMMA_FATTN=ON \
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Get ROCm short version
-        run: echo "ROCM_VERSION_SHORT=$(echo '${{ matrix.ROCM_VERSION }}' | cut -d '.' -f 1,2)" >> $GITHUB_ENV
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
-          name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
-
  windows-hip:
    runs-on: windows-2022

    env:
-      HIPSDK_INSTALLER_VERSION: "26.Q1"
+      HIPSDK_INSTALLER_VERSION: "25.Q3"

    strategy:
      matrix:
        include:
          - name: "radeon"
-            gpu_targets: "gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
+            gpu_targets: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"

    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4

      - name: Grab rocWMMA package
        id: grab_rocwmma
        run: |
-          curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
+          curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.0.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.0.0.70001-42~24.04_amd64.deb"
          7z x rocwmma.deb
          7z x data.tar

      - name: Cache ROCm Installation
        id: cache-rocm
-        uses: actions/cache@v5
+        uses: actions/cache@v4
        with:
          path: C:\Program Files\AMD\ROCm
          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: windows-latest-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
+          key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
          evict-old-files: 1d

      - name: Install ROCm
@@ -742,7 +577,7 @@ jobs:
        run: |
          $ErrorActionPreference = "Stop"
          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP SDK"
          $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
          $completed = $proc.WaitForExit(600000)
@@ -776,20 +611,20 @@ jobs:
          cmake -G "Unix Makefiles" -B build -S . `
            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
+            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.0.1/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
            -DCMAKE_BUILD_TYPE=Release `
            -DGGML_BACKEND_DL=ON `
            -DGGML_NATIVE=OFF `
            -DGGML_CPU=OFF `
-            -DGPU_TARGETS="${{ matrix.gpu_targets }}" `
+            -DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" `
            -DGGML_HIP_ROCWMMA_FATTN=ON `
            -DGGML_HIP=ON `
-            -DLLAMA_BUILD_BORINGSSL=ON
+            -DLLAMA_CURL=OFF
          cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
          md "build\bin\rocblas\library\"
          md "build\bin\hipblaslt\library"
-          cp "${env:HIP_PATH}\bin\libhipblas.dll" "build\bin\"
-          cp "${env:HIP_PATH}\bin\libhipblaslt.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
          cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
@@ -797,10 +632,10 @@ jobs:
      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          7z a -snl llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*
+          7z a llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*

      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v4
        with:
          path: llama-bin-win-hip-${{ matrix.name }}-x64.zip
          name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
@@ -810,7 +645,7 @@ jobs:

    steps:
      - name: Checkout code
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

@@ -825,7 +660,7 @@ jobs:
          cmake -B build -G Xcode \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_OPENSSL=OFF \
+            -DLLAMA_CURL=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
@@ -850,106 +685,13 @@ jobs:
      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          # Zip file is required for Swift Package Manager, which does not support tar.gz for binary targets.
-          # For more details, see https://developer.apple.com/documentation/xcode/distributing-binary-frameworks-as-swift-packages
-          zip -r -y llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+          zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework

      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
-          name: llama-${{ steps.tag.outputs.name }}-xcframework.zip
-
-
-  openEuler-cann:
-    strategy:
-      matrix:
-        include:
-          # 910b with aclgraph (both architectures)
-          - arch: x86
-            chip_type: '910b'
-            build: 'Release'
-            use_acl_graph: 'on'
-          - arch: aarch64
-            chip_type: '910b'
-            build: 'Release'
-            use_acl_graph: 'on'
-          # 310p without aclgraph (both architectures)
-          - arch: x86
-            chip_type: '310p'
-            build: 'Release'
-            use_acl_graph: 'off'
-          - arch: aarch64
-            chip_type: '310p'
-            build: 'Release'
-            use_acl_graph: 'off'
-    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Free up disk space
-        uses: ggml-org/free-disk-space@v1.3.1
-        with:
-          tool-cache: true
-
-      - name: Set container image
-        id: cann-image
-        run: |
-          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
-          echo "image=${image}" >> "${GITHUB_OUTPUT}"
-
-      - name: Pull container image
-        run: docker pull "${{ steps.cann-image.outputs.image }}"
-
-      - name: Build
-        env:
-          BUILD_TYPE: ${{ matrix.build }}
-          SOC_TYPE: ascend${{ matrix.chip_type }}
-          USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
-        run: |
-          HOST_UID=$(id -u)
-          HOST_GID=$(id -g)
-
-          docker run --rm \
-            -v "${PWD}:/workspace" \
-            -w /workspace \
-            -e SOC_TYPE=${SOC_TYPE} \
-            -e BUILD_TYPE=${BUILD_TYPE} \
-            -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
-            "${{ steps.cann-image.outputs.image }}" \
-            bash -lc '
-              set -e
-              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
-              yum clean all && rm -rf /var/cache/yum
-              git config --global --add safe.directory "/workspace"
-              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
-              cmake -S . -B build \
-                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-                  -DGGML_CANN=on \
-                  -DSOC_TYPE=${SOC_TYPE} \
-                  -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
-              cmake --build build -j $(nproc)
-
-              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
-            '
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
-          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
+          name: llama-${{ steps.tag.outputs.name }}-xcframework

  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -959,7 +701,7 @@ jobs:
    permissions:
        contents: write # for creating release

-    runs-on: ubuntu-slim
+    runs-on: ubuntu-latest

    needs:
      - windows
@@ -967,18 +709,16 @@ jobs:
      - windows-cuda
      - windows-sycl
      - windows-hip
-      - ubuntu-22-rocm
-      - ubuntu-cpu
-      - ubuntu-vulkan
-      - ubuntu-24-openvino
-      - macOS-cpu
+      - ubuntu-22-cpu
+      - ubuntu-22-vulkan
+      - macOS-arm64
+      - macOS-x64
      - ios-xcode-build
-      - openEuler-cann

    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

@@ -988,7 +728,7 @@ jobs:

      - name: Download artifacts
        id: download-artifact
-        uses: actions/download-artifact@v7
+        uses: actions/download-artifact@v4
        with:
          path: ./artifact
          merge-multiple: true
@@ -1028,7 +768,6 @@ jobs:

          echo "Moving other artifacts..."
          mv -v artifact/*.zip release
-          mv -v artifact/*.tar.gz release

      - name: Create release
        id: create_release
@@ -1037,46 +776,10 @@ jobs:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          tag_name: ${{ steps.tag.outputs.name }}
-          body: |
-            <details open>
-
-            ${{ github.event.head_commit.message }}
-
-            </details>
-
-            **macOS/iOS:**
-            - [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
-            - [macOS Apple Silicon (arm64, KleidiAI enabled)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64-kleidiai.tar.gz)
-            - [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
-            - [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.zip)
-
-            **Linux:**
-            - [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
-            - [Ubuntu arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-arm64.tar.gz)
-            - [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
-            - [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
-            - [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
-            - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
-            - [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
-
-            **Windows:**
-            - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
-            - [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
-            - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
-            - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip) - [CUDA 13.1 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.1-x64.zip)
-            - [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
-            - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
-            - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
-
-            **openEuler:**
-            - [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz)
-            - [openEuler x86 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86-aclgraph.tar.gz)
-            - [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
-            - [openEuler aarch64 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64-aclgraph.tar.gz)

      - name: Upload release
        id: upload_release
-        uses: actions/github-script@v8
+        uses: actions/github-script@v3
        with:
          github-token: ${{secrets.GITHUB_TOKEN}}
          script: |
@@ -1084,9 +787,9 @@ jobs:
            const fs = require('fs');
            const release_id = '${{ steps.create_release.outputs.id }}';
            for (let file of await fs.readdirSync('./release')) {
-              if (path.extname(file) === '.zip' || file.endsWith('.tar.gz')) {
+              if (path.extname(file) === '.zip') {
                console.log('uploadReleaseAsset', file);
-                await github.rest.repos.uploadReleaseAsset({
+                await github.repos.uploadReleaseAsset({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  release_id: release_id,
--- a/.github/workflows/server-sanitize.yml
+++ b/.github/workflows/server-sanitize.yml
@@ -1,105 +0,0 @@
-name: Server (sanitize)
-
-on:
-  workflow_dispatch: # allows manual triggering
-    inputs:
-      sha:
-        description: 'Commit SHA1 to build'
-        required: false
-        type: string
-      slow_tests:
-        description: 'Run slow tests'
-        required: true
-        type: boolean
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/server-sanitize.yml',
-      '**/CMakeLists.txt',
-      '**/Makefile',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      'tools/server/**.*'
-    ]
-
-env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  server:
-    runs-on: ubuntu-latest
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, UNDEFINED] # THREAD is very slow
-        build_type: [RelWithDebInfo]
-      fail-fast: false
-
-    steps:
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install \
-            build-essential \
-            xxd \
-            git \
-            cmake \
-            curl \
-            wget \
-            language-pack-en \
-            libssl-dev
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DLLAMA_BUILD_BORINGSSL=ON \
-            -DGGML_SCHED_NO_REALLOC=ON \
-            -DGGML_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
-            -DGGML_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
-            -DGGML_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} \
-            -DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
-            -DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
-            -DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }}
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Python setup
-        id: setup_python
-        uses: actions/setup-python@v6
-        with:
-          python-version: '3.11'
-          pip-install: -r tools/server/tests/requirements.txt
-
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
-        run: |
-          cd tools/server/tests
-          export ${{ matrix.extra_args }}
-          pytest -v -x -m "not slow"
-
-      - name: Slow tests
-        id: server_integration_tests_slow
-        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
-        run: |
-          cd tools/server/tests
-          export ${{ matrix.extra_args }}
-          SLOW_TESTS=1 pytest -v -x
--- a/.github/workflows/server-self-hosted.yml
+++ b/.github/workflows/server-self-hosted.yml
@@ -1,124 +0,0 @@
-name: Server (self-hosted)
-
-on:
-  workflow_dispatch: # allows manual triggering
-    inputs:
-      sha:
-        description: 'Commit SHA1 to build'
-        required: false
-        type: string
-      slow_tests:
-        description: 'Run slow tests'
-        required: true
-        type: boolean
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/server-self-hosted.yml',
-      '**/CMakeLists.txt',
-      '**/Makefile',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.cu',
-      '**/*.swift',
-      '**/*.m',
-      'tools/server/**.*'
-    ]
-
-env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  server-metal:
-    runs-on: [self-hosted, llama-server, macOS, ARM64]
-
-    name: server-metal (${{ matrix.wf_name }})
-    strategy:
-      matrix:
-        build_type: [Release]
-        wf_name: ["GPUx1"]
-        include:
-          - build_type: Release
-            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
-            wf_name:    "GPUx1, backend-sampling"
-          - build_type: Release
-            extra_args: "GGML_METAL_DEVICES=2"
-            wf_name:    "GPUx2"
-          - build_type: Release
-            extra_args: "GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1"
-            wf_name:    "GPUx2, backend-sampling"
-      fail-fast: false
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build -DGGML_SCHED_NO_REALLOC=ON
-          cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
-
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
-        run: |
-          cd tools/server/tests
-          python3 -m venv venv
-          source venv/bin/activate
-          pip install -r requirements.txt
-          export ${{ matrix.extra_args }}
-          pytest -v -x -m "not slow"
-
-  server-cuda:
-    runs-on: [self-hosted, llama-server, Linux, NVIDIA]
-
-    name: server-cuda (${{ matrix.wf_name }})
-    strategy:
-      matrix:
-        build_type: [Release]
-        wf_name: ["GPUx1"]
-        include:
-          - build_type: Release
-            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
-            wf_name:    "GPUx1, backend-sampling"
-      fail-fast: false
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build -DGGML_SCHED_NO_REALLOC=ON
-          cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
-
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
-        run: |
-          cd tools/server/tests
-          python3 -m venv venv
-          source venv/bin/activate
-          pip install -r requirements.txt
-          export ${{ matrix.extra_args }}
-          pytest -v -x -m "not slow"
--- a/.github/workflows/server-webui.yml
+++ b/.github/workflows/server-webui.yml
@@ -1,108 +0,0 @@
-name: Server WebUI
-
-on:
-  workflow_dispatch: # allows manual triggering
-    inputs:
-      sha:
-        description: 'Commit SHA1 to build'
-        required: false
-        type: string
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/server-webui.yml',
-      'tools/server/webui/**.*',
-      'tools/server/tests/**.*',
-      'tools/server/public/**'
-    ]
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/server-webui.yml',
-      'tools/server/webui/**.*',
-      'tools/server/tests/**.*',
-      'tools/server/public/**'
-    ]
-
-env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  webui-check:
-    name: WebUI Checks
-    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    continue-on-error: true
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Setup Node.js
-        id: node
-        uses: actions/setup-node@v6
-        with:
-          node-version: "22"
-          cache: "npm"
-          cache-dependency-path: "tools/server/webui/package-lock.json"
-
-      - name: Install dependencies
-        id: setup
-        if: ${{ steps.node.conclusion == 'success' }}
-        run: npm ci
-        working-directory: tools/server/webui
-
-      - name: Run type checking
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
-        run: npm run check
-        working-directory: tools/server/webui
-
-      - name: Run linting
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
-        run: npm run lint
-        working-directory: tools/server/webui
-
-      - name: Build application
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
-        run: npm run build
-        working-directory: tools/server/webui
-
-      - name: Install Playwright browsers
-        id: playwright
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
-        run: npx playwright install --with-deps
-        working-directory: tools/server/webui
-
-      - name: Build Storybook
-        if: ${{ always() && steps.playwright.conclusion == 'success' }}
-        run: npm run build-storybook
-        working-directory: tools/server/webui
-
-      - name: Run Client tests
-        if: ${{ always() && steps.playwright.conclusion == 'success' }}
-        run: npm run test:client
-        working-directory: tools/server/webui
-
-      - name: Run Unit tests
-        if: ${{ always() && steps.playwright.conclusion == 'success' }}
-        run: npm run test:unit
-        working-directory: tools/server/webui
-
-      - name: Run UI tests
-        if: ${{ always() && steps.playwright.conclusion == 'success' }}
-        run: npm run test:ui -- --testTimeout=60000
-        working-directory: tools/server/webui
-
-      - name: Run E2E tests
-        if: ${{ always() && steps.playwright.conclusion == 'success' }}
-        run: npm run test:e2e
-        working-directory: tools/server/webui
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -1,3 +1,4 @@
+# Server build and tests
 name: Server

 on:
@@ -14,34 +15,10 @@ on:
  push:
    branches:
      - master
-    paths: [
-      '.github/workflows/server.yml',
-      '**/CMakeLists.txt',
-      '**/Makefile',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.cu',
-      '**/*.swift',
-      '**/*.m',
-      'tools/server/**.*'
-    ]
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/server.yml',
-      '**/CMakeLists.txt',
-      '**/Makefile',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.cu',
-      '**/*.swift',
-      '**/*.m',
-      'tools/server/**.*'
-    ]
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']

 env:
  LLAMA_LOG_COLORS: 1
@@ -57,19 +34,14 @@ jobs:
  server:
    runs-on: ubuntu-latest

-    name: server (${{ matrix.wf_name }})
    strategy:
      matrix:
-        build_type: [Release]
-        wf_name: ["default"]
+        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
+        build_type: [RelWithDebInfo]
        include:
          - build_type: Release
-            extra_args: ""
-            wf_name:    "default"
-          - build_type: Release
-            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
-            wf_name:    "backend-sampling"
-      fail-fast: false
+            sanitizer: ""
+      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken

    steps:
      - name: Dependencies
@@ -84,45 +56,283 @@ jobs:
            curl \
            wget \
            language-pack-en \
-            libssl-dev
+            libcurl4-openssl-dev

      - name: Clone
        id: checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

-      - name: Build
-        id: cmake_build
+      - name: Python setup
+        id: setup_python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Tests dependencies
+        id: test_dependencies
        run: |
-          cmake -B build \
-            -DLLAMA_BUILD_BORINGSSL=ON \
-            -DGGML_SCHED_NO_REALLOC=ON
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+          pip install -r tools/server/tests/requirements.txt
+
+  webui-setup:
+    name: WebUI Setup
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+          cache: "npm"
+          cache-dependency-path: "tools/server/webui/package-lock.json"
+
+      - name: Cache node_modules
+        uses: actions/cache@v4
+        id: cache-node-modules
+        with:
+          path: tools/server/webui/node_modules
+          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
+          restore-keys: |
+            ${{ runner.os }}-node-modules-
+
+      - name: Install dependencies
+        if: steps.cache-node-modules.outputs.cache-hit != 'true'
+        run: npm ci
+        working-directory: tools/server/webui
+
+  webui-check:
+    needs: webui-setup
+    name: WebUI Check
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+
+      - name: Restore node_modules cache
+        uses: actions/cache@v4  # NOTE(review): this job has no `npm ci` step, so the npm scripts below depend entirely on what this restore yields - confirm a miss or a partial restore-keys hit is acceptable
+        with:
+          path: tools/server/webui/node_modules
+          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
+          restore-keys: |
+            ${{ runner.os }}-node-modules-
+
+      - name: Run type checking
+        run: npm run check
+        working-directory: tools/server/webui
+
+      - name: Run linting
+        run: npm run lint
+        working-directory: tools/server/webui
+
+  webui-build:
+    needs: webui-check
+    name: WebUI Build
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+
+      - name: Restore node_modules cache
+        uses: actions/cache@v4
+        with:
+          path: tools/server/webui/node_modules
+          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
+          restore-keys: |
+            ${{ runner.os }}-node-modules-
+
+      - name: Build application
+        run: npm run build
+        working-directory: tools/server/webui
+
+  webui-tests:
+    needs: webui-build
+    name: Run WebUI tests
+    permissions:
+      contents: read
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4  # NOTE(review): no `ref:` here, unlike the other checkout steps in this workflow, so the `sha` dispatch input is not applied to this job - confirm intended
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+
+      - name: Restore node_modules cache
+        uses: actions/cache@v4
+        with:
+          path: tools/server/webui/node_modules
+          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
+          restore-keys: |
+            ${{ runner.os }}-node-modules-
+
+      - name: Install Playwright browsers
+        run: npx playwright install --with-deps
+        working-directory: tools/server/webui
+
+      - name: Build Storybook
+        run: npm run build-storybook
+        working-directory: tools/server/webui
+
+      - name: Run Client tests
+        run: npm run test:client
+        working-directory: tools/server/webui
+
+      - name: Run Server tests
+        run: npm run test:server
+        working-directory: tools/server/webui
+
+      - name: Run UI tests
+        run: npm run test:ui
+        working-directory: tools/server/webui
+
+      - name: Run E2E tests
+        run: npm run test:e2e
+        working-directory: tools/server/webui
+
+  server-build:
+    needs: [webui-tests]
+    runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
+        build_type: [RelWithDebInfo]
+        include:
+          - build_type: Release
+            sanitizer: ""
+      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
+
+    steps:
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get -y install \
+            build-essential \
+            xxd \
+            git \
+            cmake \
+            curl \
+            wget \
+            language-pack-en \
+            libcurl4-openssl-dev
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Python setup
        id: setup_python
-        uses: actions/setup-python@v6
+        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
-          pip-install: -r tools/server/tests/requirements.txt
+
+      - name: Tests dependencies
+        id: test_dependencies
+        run: |
+          pip install -r tools/server/tests/requirements.txt
+
+      - name: Setup Node.js for WebUI
+        uses: actions/setup-node@v4
+        with:
+          node-version: "22"
+          cache: "npm"
+          cache-dependency-path: "tools/server/webui/package-lock.json"
+
+      - name: Install WebUI dependencies
+        run: npm ci
+        working-directory: tools/server/webui
+
+      - name: Build WebUI
+        run: npm run build
+        working-directory: tools/server/webui
+
+      - name: Build (no OpenMP)  # THREAD-sanitizer variant: same configure as the sanitizer build plus -DGGML_OPENMP=OFF
+        id: cmake_build_no_openmp
+        if: ${{ matrix.sanitizer == 'THREAD' }}  # currently never true: this job's matrix lists only ADDRESS, UNDEFINED and ""
+        run: |
+          cmake -B build \
+              -DGGML_NATIVE=OFF \
+              -DLLAMA_BUILD_SERVER=ON \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+              -DGGML_OPENMP=OFF ;
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+      - name: Build (sanitizers)  # configures with -DLLAMA_SANITIZE_<matrix.sanitizer>=ON and builds only the llama-server target
+        id: cmake_build_sanitizers
+        if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}  # any non-empty sanitizer except THREAD, which has its own step
+        run: |
+          cmake -B build \
+              -DGGML_NATIVE=OFF \
+              -DLLAMA_BUILD_SERVER=ON \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+      - name: Build  # plain build: no LLAMA_SANITIZE_* option is passed
+        id: cmake_build
+        if: ${{ matrix.sanitizer == '' }}  # only the matrix entry whose sanitizer is the empty string
+        run: |
+          cmake -B build \
+              -DGGML_NATIVE=OFF \
+              -DLLAMA_BUILD_SERVER=ON \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

      - name: Tests
        id: server_integration_tests
-        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
+        if: ${{ matrix.sanitizer == '' }}
+        env:
+          GITHUB_ACTIONS: "true"
        run: |
          cd tools/server/tests
-          export ${{ matrix.extra_args }}
-          pytest -v -x -m "not slow"
+          ./tests.sh
+
+      - name: Tests (sanitizers)
+        id: server_integration_tests_sanitizers
+        if: ${{ matrix.sanitizer != '' }}
+        run: |
+          cd tools/server/tests
+          LLAMA_SANITIZE=1 ./tests.sh

      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
          cd tools/server/tests
-          export ${{ matrix.extra_args }}
-          SLOW_TESTS=1 pytest -v -x
+          SLOW_TESTS=1 ./tests.sh
+

  server-windows:
    runs-on: windows-2022
@@ -130,23 +340,40 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

+      - name: libCURL
+        id: get_libcurl
+        uses: ./.github/actions/windows-setup-curl
+
      - name: Build
        id: cmake_build
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
-          cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON
+          cmake -B build -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server

      - name: Python setup
        id: setup_python
-        uses: actions/setup-python@v6
+        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
-          pip-install: -r tools/server/tests/requirements.txt
+
+      - name: Tests dependencies
+        id: test_dependencies
+        run: |
+          pip install -r tools/server/tests/requirements.txt
+
+      - name: Copy Libcurl
+        id: prepare_libcurl
+        env:
+          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
+        run: |
+          cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll

      - name: Tests
        id: server_integration_tests
--- a/.github/workflows/update-ops-docs.yml
+++ b/.github/workflows/update-ops-docs.yml
@@ -14,14 +14,14 @@ on:

 jobs:
    update-ops-docs:
-        runs-on: ubuntu-slim
+        runs-on: ubuntu-latest

        steps:
        - name: Checkout repository
-          uses: actions/checkout@v6
+          uses: actions/checkout@v4

        - name: Set up Python
-          uses: actions/setup-python@v6
+          uses: actions/setup-python@v5
          with:
              python-version: '3.x'

--- a/.github/workflows/winget.yml
+++ b/.github/workflows/winget.yml
@@ -9,7 +9,6 @@ jobs:
  update:
    name: Update Winget Package
    runs-on: ubuntu-latest
-    if: github.repository_owner == 'ggml-org'

    steps:
      - name: Install cargo binstall
@@ -17,28 +16,27 @@ jobs:

      - name: Install komac
        run: |
-          cargo binstall komac@2.15.0 -y
+          cargo binstall komac@2.11.2 -y

      - name: Find latest release
        id: find_latest_release
-        uses: actions/github-script@v8
+        uses: actions/github-script@v6
        with:
          script: |
            const { data: releases } = await github.rest.repos.listReleases({
              owner: context.repo.owner,
              repo: context.repo.repo,
            });
-            const { tag_name: version, assets: assets } = releases.find(({assets}) => assets.find(asset => asset.name.includes('win-vulkan')));
-            const { browser_download_url: asset_url } = assets.find(asset => asset.name.includes('win-vulkan'));
-            console.log("Latest release:", version);
-            core.setOutput('VERSION', version);
-            core.setOutput('ASSETURL', asset_url);
+            console.log("Latest release:", releases[0].tag_name);
+            return releases[0].tag_name;

      - name: Update manifest
+        env:
+          VERSION: ${{ steps.find_latest_release.outputs.result }}  # NOTE(review): github-script JSON-encodes `result` unless `result-encoding: string` is set, so this value presumably carries surrounding double quotes - confirm the command below tolerates that
        run: |
          echo "Updating manifest..."
-          komac update --version ${{ steps.find_latest_release.outputs.VERSION }} \
-            --urls "${{ steps.find_latest_release.outputs.ASSETURL }}" \
+          komac update --version ${{ env.VERSION }} \
+            --urls "https://github.com/ggml-org/llama.cpp/releases/download/${{ env.VERSION }}/llama-${{ env.VERSION }}-bin-win-vulkan-x64.zip" \
            --token ${{ secrets.WINGET_GITHUB_TOKEN }} \
            --submit \
            ggml.llamacpp
--- a/.gitignore
+++ b/.gitignore
@@ -20,41 +20,52 @@
 *.so
 *.swp
 *.tmp
-*.DS_Store

 # IDE / OS

-/.cache/
-/.ccls-cache/
-/.direnv/
-/.envrc
-/.idea/
-/.swiftpm
-/.vs/
-/.vscode/
-/nppBackup
+.cache/
+.ccls-cache/
+.direnv/
+.DS_Store
+.envrc
+.idea/
+.swiftpm
+.vs/
+.vscode/
+nppBackup


 # Coverage

-/gcovr-report/
-/lcov-report/
+gcovr-report/
+lcov-report/

 # Build Artifacts

-/tags
-/.build/
-/build*
-/release
-/debug
+tags
+.build/
+build*
+release
+debug
+!build-info.cmake
+!build-info.cpp.in
+!build-info.sh
+!build.zig
+!docs/build.md
 /libllama.so
 /llama-*
 /vulkan-shaders-gen
+android-ndk-*
+arm_neon.h
+cmake-build-*
+CMakeSettings.json
+compile_commands.json
+ggml-metal-embed.metal
+llama-batched-swift
 /rpc-server
-/out/
-/tmp/
-/autogen-*.md
-/common/build-info.cpp
+out/
+tmp/
+autogen-*.md

 # Deprecated

@@ -63,40 +74,44 @@

 # CI

-!/.github/workflows/*.yml
+!.github/workflows/*.yml

 # Models

-/models/*
-/models-mnt
-!/models/.editorconfig
-!/models/ggml-vocab-*.gguf*
-!/models/templates
+models/*
+models-mnt
+!models/.editorconfig
+!models/ggml-vocab-*.gguf*
+!models/templates

 # Zig
-/zig-out/
-/zig-cache/
+zig-out/
+zig-cache/
+
+# Logs
+
+ppl-*.txt
+qnt-*.txt
+perf-*.txt

 # Examples

-/examples/jeopardy/results.txt
-/tools/server/*.css.hpp
-/tools/server/*.html.hpp
-/tools/server/*.js.hpp
-/tools/server/*.mjs.hpp
-/tools/server/*.gz.hpp
-!/build_64.sh
-!/examples/*.bat
-!/examples/*/*.kts
-!/examples/*/*/*.kts
-!/examples/sycl/*.bat
-!/examples/sycl/*.sh
+examples/jeopardy/results.txt
+tools/server/*.css.hpp
+tools/server/*.html.hpp
+tools/server/*.js.hpp
+tools/server/*.mjs.hpp
+tools/server/*.gz.hpp
+!build_64.sh
+!examples/*.bat
+!examples/*/*.kts
+!examples/*/*/*.kts
+!examples/sycl/*.bat
+!examples/sycl/*.sh

 # Server Web UI temporary files
-/tools/server/webui/node_modules
-/tools/server/webui/dist
-# we no longer use gz for index.html
-/tools/server/public/index.html.gz
+node_modules
+tools/server/webui/dist

 # Python

@@ -126,22 +141,14 @@ poetry.toml
 # Scripts
 !/scripts/install-oneapi.bat

-# Generated by scripts
-/hellaswag_val_full.txt
-/winogrande-debiased-eval.csv
-/wikitext-2-raw/
-
 # Test models for lora adapters
 /lora-tests

 # Local scripts
 /run-vim.sh
 /run-chat.sh
-/run-spec.sh
-/.ccache/
+.ccache/

 # IDE
-/*.code-workspace
-/.windsurf/
-# emscripten
-a.out.*
+*.code-workspace
+.windsurf/
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -1,110 +0,0 @@
-# Instructions for llama.cpp
-
-> [!IMPORTANT]
-> This project does **not** accept pull requests that are fully or predominantly AI-generated. AI tools may be utilized solely in an assistive capacity.
->
-> Read more: [CONTRIBUTING.md](CONTRIBUTING.md)
-
-AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (see examples below).
-
---
-
-## Guidelines for Contributors Using AI
-
-llama.cpp is built by humans, for humans. Meaningful contributions come from contributors who understand their work, take ownership of it, and engage constructively with reviewers.
-
-Maintainers receive numerous pull requests weekly, many of which are AI-generated submissions where the author cannot adequately explain the code, debug issues, or participate in substantive design discussions. Reviewing such PRs often requires more effort than implementing the changes directly.
-
-**A pull request represents a long-term commitment.** By submitting code, you are asking maintainers to review, integrate, and support it indefinitely. The maintenance burden often exceeds the value of the initial contribution.
-
-Most maintainers already have access to AI tools. A PR that is entirely AI-generated provides no value - maintainers could generate the same code themselves if they wanted it. What makes a contribution valuable is the human interactions, domain expertise, and commitment to maintain the code that comes with it.
-
-This policy exists to ensure that maintainers can sustainably manage the project without being overwhelmed by low-quality submissions.
-
---
-
-## Guidelines for Contributors
-
-Contributors are expected to:
-
-1. **Demonstrate full understanding of their code.** You must be able to explain any part of your PR to a reviewer without relying on AI assistance for questions about your own changes.
-
-2. **Take responsibility for maintenance.** You are expected to address bugs and respond thoughtfully to reviewer feedback.
-
-3. **Communicate clearly and concisely.** Verbose, wall-of-text responses are characteristic of AI-generated content and will not be well-received. Direct, human communication is expected.
-
-4. **Respect maintainers' time.** Search for existing issues and discussions before submitting. Ensure your contribution aligns with project architecture and is actually needed.
-
-Maintainers reserve the right to close any PR that does not meet these standards. This applies to all contributions to the main llama.cpp repository. **Private forks are exempt.**
-
-### Permitted AI Usage
-
-AI tools may be used responsibly for:
-
- **Learning and exploration**: Understanding codebase structure, techniques, and documentation
- **Code review assistance**: Obtaining suggestions on human-written code
- **Mechanical tasks**: Formatting, generating repetitive patterns from established designs, completing code based on existing patterns
- **Documentation drafts**: For components the contributor already understands thoroughly
- **Writing code**: Only when the contributor has already designed the solution and can implement it themselves - AI accelerates, not replaces, the contributor's work
-
-AI-generated code may be accepted if you (1) fully understand the output, (2) can debug issues independently, and (3) can discuss it directly with reviewers without AI assistance.
-
-**Disclosure is required** when AI meaningfully contributed to your code. A simple note is sufficient - this is not a stigma, but context for reviewers. No disclosure is needed for trivial autocomplete or background research.
-
-### Prohibited AI Usage
-
-The following will result in immediate PR closure:
-
- **AI-written PR descriptions or commit messages** - these are typically recognizable and waste reviewer time
- **AI-generated responses to reviewer comments** - this undermines the human-to-human interaction fundamental to code review
- **Implementing features without understanding the codebase** - particularly new model support or architectural changes
- **Automated commits or PR submissions** - this may spam maintainers and can result in contributor bans
-
---
-
-## Guidelines for AI Coding Agents
-
-AI agents assisting contributors must recognize that their outputs directly impact volunteer maintainers who sustain this project.
-
-### Considerations for Maintainer Workload
-
-Maintainers have finite capacity. Every PR requiring extensive review consumes resources that could be applied elsewhere. Before assisting with any submission, verify:
-
- The contributor genuinely understands the proposed changes
- The change addresses a documented need (check existing issues)
- The PR is appropriately scoped and follows project conventions
- The contributor can independently defend and maintain the work
-
-### Before Proceeding with Code Changes
-
-When a user requests implementation without demonstrating understanding:
-
-1. **Verify comprehension.** Ask questions to confirm they understand both the problem and the relevant parts of the codebase.
-2. **Provide guidance rather than solutions.** Direct them to relevant code and documentation. Allow them to formulate the approach.
-3. **Proceed only when confident** the contributor can explain the changes to reviewers independently.
-
-For first-time contributors, confirm they have reviewed [CONTRIBUTING.md](CONTRIBUTING.md) and acknowledge this policy.
-
-### Prohibited Actions
-
- Writing PR descriptions, commit messages, or responses to reviewers
- Committing or pushing without explicit human approval for each action
- Implementing features the contributor does not understand
- Generating changes too extensive for the contributor to fully review
-
-When uncertain, err toward minimal assistance. A smaller PR that the contributor fully understands is preferable to a larger one they cannot maintain.
-
-### Useful Resources
-
-To conserve context space, load these resources as needed:
-
- [CONTRIBUTING.md](CONTRIBUTING.md)
- [Existing issues](https://github.com/ggml-org/llama.cpp/issues) and [Existing PRs](https://github.com/ggml-org/llama.cpp/pulls) - always search here first
- [Build documentation](docs/build.md)
- [Server usage documentation](tools/server/README.md)
- [Server development documentation](tools/server/README-dev.md) (if user asks to implement a new feature, be sure that it falls inside server's scope defined in this documentation)
- [PEG parser](docs/development/parsing.md) - alternative to regex that llama.cpp uses to parse model's output
- [Auto parser](docs/autoparser.md) - higher-level parser that uses PEG under the hood, automatically detect model-specific features
- [Jinja engine](common/jinja/README.md)
- [How to add a new model](docs/development/HOWTO-add-model.md)
- [PR template](.github/pull_request_template.md)
--- a/1089
+++ b/1089
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -1 +0,0 @@
-IMPORTANT: Ensure you’ve thoroughly reviewed the [AGENTS.md](AGENTS.md) file before beginning any work.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.14...3.28) # for add_link_options and implicit target directories.
+cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
 project("llama.cpp" C CXX)
 include(CheckIncludeFileCXX)

@@ -33,24 +33,10 @@ endif()

 option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)

-option(LLAMA_WASM_MEM64 "llama: use 64-bit memory in WASM builds" ON)
-
 if (EMSCRIPTEN)
    set(BUILD_SHARED_LIBS_DEFAULT OFF)

-    # Use 64-bit memory to support backend_get_memory queries
-    # TODO: analyze performance impact, see https://spidermonkey.dev/blog/2025/01/15/is-memory64-actually-worth-using
-    if (LLAMA_WASM_MEM64)
-      add_compile_options("-sMEMORY64=1")
-      add_link_options("-sMEMORY64=1")
-    endif()
-    add_link_options("-sALLOW_MEMORY_GROWTH=1")
-
-    option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" OFF)
-    option(LLAMA_BUILD_HTML "llama: build HTML file" ON)
-    if (LLAMA_BUILD_HTML)
-        set(CMAKE_EXECUTABLE_SUFFIX ".html")
-    endif()
+    option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
 else()
    if (MINGW)
        set(BUILD_SHARED_LIBS_DEFAULT OFF)
@@ -72,12 +58,6 @@ if (MSVC)
    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
 endif()

-if (LLAMA_STANDALONE)
-    # enable parallel builds for msbuild
-    list(APPEND CMAKE_VS_GLOBALS UseMultiToolTask=true)
-    list(APPEND CMAKE_VS_GLOBALS EnforceProcessCountAcrossBuilds=true)
-endif()
-
 if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
    set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
 else()
@@ -108,15 +88,13 @@ option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_TOOLS    "llama: build tools"          ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples"       ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_WEBUI    "llama: build the embedded Web UI for server"  ON)
 option(LLAMA_TOOLS_INSTALL  "llama: install tools"        ${LLAMA_TOOLS_INSTALL_DEFAULT})
-option(LLAMA_TESTS_INSTALL  "llama: install tests"        ON)

 # 3rd party libs
-option(LLAMA_OPENSSL    "llama: use openssl to support HTTPS" ON)
+option(LLAMA_CURL       "llama: use libcurl to download model from an URL" ON)
+option(LLAMA_OPENSSL    "llama: use openssl to support HTTPS" OFF)
 option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)

-
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
@@ -143,15 +121,10 @@ if (NOT DEFINED GGML_CUDA_GRAPHS)
 endif()

 # transition helpers
-function (llama_option_depr TYPE OLD)
+function (llama_option_depr TYPE OLD NEW)
    if (${OLD})
-        set(NEW "${ARGV2}")
-        if(NEW)
-            message(${TYPE} "${OLD} is deprecated, use ${NEW} instead")
-            set(${NEW} ON PARENT_SCOPE)
-        else()
-            message(${TYPE} "${OLD} is deprecated and will be ignored")
-        endif()
+        message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
+        set(${NEW} ON PARENT_SCOPE)
    endif()
 endfunction()

@@ -164,10 +137,29 @@ llama_option_depr(WARNING     LLAMA_RPC                 GGML_RPC)
 llama_option_depr(WARNING     LLAMA_SYCL                GGML_SYCL)
 llama_option_depr(WARNING     LLAMA_SYCL_F16            GGML_SYCL_F16)
 llama_option_depr(WARNING     LLAMA_CANN                GGML_CANN)
-llama_option_depr(WARNING     LLAMA_CURL)

-include("cmake/license.cmake")
-license_add_file("llama.cpp" "LICENSE")
+if (NOT MSVC)
+    if (LLAMA_SANITIZE_THREAD)
+        message(STATUS "Using -fsanitize=thread")
+
+        add_compile_options(-fsanitize=thread)
+        link_libraries     (-fsanitize=thread)
+    endif()
+
+    if (LLAMA_SANITIZE_ADDRESS)
+        message(STATUS "Using -fsanitize=address")
+
+        add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
+        link_libraries     (-fsanitize=address)
+    endif()
+
+    if (LLAMA_SANITIZE_UNDEFINED)
+        message(STATUS "Using -fsanitize=undefined")
+
+        add_compile_options(-fsanitize=undefined)
+        link_libraries     (-fsanitize=undefined)
+    endif()
+endif()

 #
 # 3rd-party
@@ -186,6 +178,11 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
    # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()

+if (MINGW)
+    # Target Windows 8 for PrefetchVirtualMemory
+    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
+endif()
+
 #
 # build the library
 #
@@ -196,9 +193,13 @@ add_subdirectory(src)
 # utils, programs, examples and tests
 #

+if (NOT LLAMA_BUILD_COMMON)
+    message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL")
+    set(LLAMA_CURL OFF)
+endif()
+
 if (LLAMA_BUILD_COMMON)
    add_subdirectory(common)
-    add_subdirectory(vendor/cpp-httplib)
 endif()

 if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
@@ -215,19 +216,6 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
    add_subdirectory(tools)
 endif()

-# Automatically add all files from the 'licenses' directory
-file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
-
-foreach(FILE_PATH ${EXTRA_LICENSES})
-    get_filename_component(FILE_NAME "${FILE_PATH}" NAME)
-    string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}")
-    license_add_file("${NAME}" "${FILE_PATH}")
-endforeach()
-
-if (LLAMA_BUILD_COMMON)
-    license_generate(common)
-endif()
-
 #
 # install
 #
--- a/93
+++ b/93
@@ -2,19 +2,27 @@
 # multiplie collaborators per item can be specified

 /.devops/*.Dockerfile                   @ngxson
-/.github/actions/                       @ggml-org/ci
-/.github/workflows/                     @ggml-org/ci
+/.github/actions/                       @slaren @CISC
+/.github/workflows/                     @CISC
+/.github/workflows/release.yml          @slaren
+/.github/workflows/winget.yml           @slaren
 /ci/                                    @ggerganov
 /cmake/                                 @ggerganov
-/common/                                @ggml-org/llama-common
-/common/jinja/                          @CISC
-/common/ngram-map.*                     @srogmann
+/common/CMakeLists.txt                  @ggerganov
+/common/arg.*                           @ggerganov @ericcurtin
+/common/base64.hpp.*                    @ggerganov
+/common/build-info.*                    @ggerganov
+/common/common.*                        @ggerganov
+/common/console.*                       @ggerganov
+/common/http.*                          @angt
+/common/llguidance.*                    @ggerganov
+/common/log.*                           @ggerganov
+/common/sampling.*                      @ggerganov
+/common/speculative.*                   @ggerganov
 /convert_*.py                           @CISC
-/docs/backend/snapdragon/               @ggml-org/ggml-hexagon
 /examples/batched.swift/                @ggerganov
 /examples/batched/                      @ggerganov
 /examples/convert-llama2c-to-ggml/      @ggerganov
-/examples/debug/                        @danbev @pwilkin
 /examples/deprecation-warning/          @ggerganov
 /examples/diffusion/                    @am17an
 /examples/embedding/                    @ggerganov
@@ -22,7 +30,7 @@
 /examples/export-docs/                  @ggerganov
 /examples/gen-docs/                     @ggerganov
 /examples/gguf/                         @ggerganov
-/examples/llama.android/                @ggerganov @hanyin-arm @naco-siren
+/examples/llama.android/                @ggerganov
 /examples/llama.swiftui/                @ggerganov
 /examples/llama.vim                     @ggerganov
 /examples/lookahead/                    @ggerganov
@@ -32,66 +40,73 @@
 /examples/passkey/                      @ggerganov
 /examples/retrieval/                    @ggerganov
 /examples/save-load-state/              @ggerganov
+/examples/simple-chat/                  @slaren
+/examples/simple/                       @slaren
 /examples/speculative-simple/           @ggerganov
 /examples/speculative/                  @ggerganov
 /ggml/cmake/                            @ggerganov
-/ggml/include/                          @ggerganov
-/ggml/src/ggml-cann/                    @ggml-org/ggml-cann
-/ggml/src/ggml-common.h                 @ggerganov
-/ggml/src/ggml-cpu/                     @ggerganov
+/ggml/include/                          @ggerganov @slaren
+/ggml/src/ggml-alloc.c                  @slaren
+/ggml/src/ggml-backend*                 @slaren
+/ggml/src/ggml-blas/                    @slaren
+/ggml/src/ggml-common.h                 @ggerganov @slaren
+/ggml/src/ggml-cpu/                     @ggerganov @slaren
 /ggml/src/ggml-cpu/spacemit/            @alex-spacemit
-/ggml/src/ggml-cuda/                    @ggml-org/ggml-cuda
+/ggml/src/ggml-cuda/common.cuh          @slaren
+/ggml/src/ggml-cuda/fattn*              @JohannesGaessler
+/ggml/src/ggml-cuda/ggml-cuda.cu        @slaren
+/ggml/src/ggml-cuda/mmf.*               @JohannesGaessler @am17an
+/ggml/src/ggml-cuda/mmq.*               @JohannesGaessler
+/ggml/src/ggml-cuda/mmvf.*              @JohannesGaessler
+/ggml/src/ggml-cuda/mmvq.*              @JohannesGaessler
 /ggml/src/ggml-cuda/fattn-wmma*         @IMbackK
 /ggml/src/ggml-hip/                     @IMbackK
 /ggml/src/ggml-cuda/vendors/hip.h       @IMbackK
-/ggml/src/ggml-impl.h                   @ggerganov
-/ggml/src/ggml-metal/                   @ggml-org/ggml-metal
-/ggml/src/ggml-opencl/                  @ggml-org/ggml-opencl
-/ggml/src/ggml-hexagon/                 @ggml-org/ggml-hexagon
+/ggml/src/ggml-impl.h                   @ggerganov @slaren
+/ggml/src/ggml-metal/                   @ggerganov
+/ggml/src/ggml-opencl/                  @lhez @max-krasnyansky
+/ggml/src/ggml-hexagon/                 @max-krasnyansky
 /ggml/src/ggml-opt.cpp                  @JohannesGaessler
 /ggml/src/ggml-quants.*                 @ggerganov
-/ggml/src/ggml-rpc/                     @ggml-org/ggml-rpc
-/ggml/src/ggml-sycl/                    @ggml-org/ggml-sycl
-/ggml/src/ggml-threading.*              @ggerganov
-/ggml/src/ggml-vulkan/                  @ggml-org/ggml-vulkan
-/ggml/src/ggml-virtgpu/                 @kpouget
-/ggml/src/ggml-webgpu/                  @ggml-org/ggml-webgpu
-/ggml/src/ggml-zdnn/                    @ggml-org/ggml-zdnn @Andreas-Krebbel @AlekseiNikiforovIBM
-/ggml/src/ggml-openvino/                @cavusmustafa @wine99
-/ggml/src/ggml.c                        @ggerganov
-/ggml/src/ggml.cpp                      @ggerganov
+/ggml/src/ggml-rpc/                     @rgerganov
+/ggml/src/ggml-threading.*              @ggerganov @slaren
+/ggml/src/ggml-vulkan/                  @0cc4m
+/ggml/src/ggml-webgpu/                  @reeselevine
+/ggml/src/ggml-zdnn/                    @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
+/ggml/src/ggml.c                        @ggerganov @slaren
+/ggml/src/ggml.cpp                      @ggerganov @slaren
 /ggml/src/gguf.cpp                      @JohannesGaessler @Green-Sky
 /gguf-py/                               @CISC
 /media/                                 @ggerganov
 /scripts/gen*                           @ggerganov
 /scripts/get*                           @ggerganov
 /scripts/sync*                          @ggerganov
-/scripts/snapdragon/                    @ggml-org/ggml-hexagon
 /src/                                   @ggerganov
 /src/llama-adapter.*                    @CISC
 /src/llama-arch.*                       @CISC
 /src/llama-chat.*                       @ngxson
 /src/llama-graph.*                      @CISC
+/src/llama-model-loader.*               @slaren
 /src/llama-model.*                      @CISC
 /src/llama-vocab.*                      @CISC
-/src/models/                            @CISC
 /tests/                                 @ggerganov
-/tests/test-chat.*                      @pwilkin
-/tests/test-llama-archs.cpp             @JohannesGaessler
+/tests/test-backend-ops.cpp             @slaren
+/tests/test-thread-safety.cpp           @slaren
 /tools/batched-bench/                   @ggerganov
-/tools/cli/                             @ngxson
-/tools/completion/                      @ggerganov
-/tools/mtmd/                            @ggml-org/llama-mtmd
+/tools/llama-bench/                     @slaren
+/tools/main/                            @ggerganov
+/tools/mtmd/                            @ngxson
 /tools/perplexity/                      @ggerganov
-/tools/parser/                          @pwilkin
 /tools/quantize/                        @ggerganov
-/tools/rpc/                             @ggml-org/ggml-rpc
-/tools/server/*                         @ggml-org/llama-server # no subdir
-/tools/server/tests/                    @ggml-org/llama-server
-/tools/server/webui/                    @ggml-org/llama-webui
+/tools/rpc/                             @rgerganov
+/tools/run/                             @ericcurtin
+/tools/server/*                         @ngxson @ggerganov @ericcurtin # no subdir
+/tools/server/webui/                    @allozaur
 /tools/tokenize/                        @ggerganov
 /tools/tts/                             @ggerganov
 /vendor/                                @ggerganov
+/.clang-format                          @slaren
+/.clang-tidy                            @slaren
 /AUTHORS                                @ggerganov
 /CMakeLists.txt                         @ggerganov
 /CONTRIBUTING.md                        @ggerganov
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -6,53 +6,19 @@ The project differentiates between 3 levels of contributors:
 - Collaborators (Triage): people with significant contributions, who may be responsible for some parts of the code, and are expected to maintain and review contributions for the code they own
 - Maintainers: responsible for reviewing and merging PRs, after approval from the code owners

-# AI Usage Policy
-
-> [!IMPORTANT]
-> This project does **not** accept pull requests that are fully or predominantly AI-generated. AI tools may be utilized solely in an assistive capacity.
->
-> Repeated violations of this policy may result in your account being permanently banned from contributing to the project.
->
-> Detailed information regarding permissible and restricted uses of AI can be found in the [AGENTS.md](AGENTS.md) file.
-
-Code that is initially generated by AI and subsequently edited will still be considered AI-generated. AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (e.g., generating repeated lines with minor variations).
-
-If AI is used to generate any portion of the code, contributors must adhere to the following requirements:
-
-1. Explicitly disclose the manner in which AI was employed.
-2. Perform a comprehensive manual review prior to submitting the pull request.
-3. Be prepared to explain every line of code they submitted when asked about it by a maintainer.
-4. It is strictly prohibited to use AI to write your posts for you (bug reports, feature requests, pull request descriptions, Github discussions, responding to humans, ...).
-
-For more info, please refer to the [AGENTS.md](AGENTS.md) file.
-
 # Pull requests (for contributors & collaborators)

-Before submitting your PR:
- Search for existing PRs to prevent duplicating efforts
 - llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
 - Test your changes:
-  - Execute [the full CI locally on your machine](ci/README.md) before publishing
-  - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
-  - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
-  - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
- Create separate PRs for each feature or fix:
-  - Avoid combining unrelated changes in a single PR
-  - For intricate features, consider opening a feature request first to discuss and align expectations
-  - When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
-  - In particular, adding new data types (extension of the `ggml_type` enum) carries with it a disproportionate maintenance burden. As such, to add a new quantization type you will need to meet the following *additional* criteria *at minimum*:
-    - convert a small model to GGUF using the new type and upload it to HuggingFace
-    - provide [perplexity](https://github.com/ggml-org/llama.cpp/tree/master/tools/perplexity) comparisons to FP16/BF16 (whichever is the native precision) as well as to types of similar size
-    - provide KL divergence data calculated vs. the FP16/BF16 (whichever is the native precision) version for both the new type as well as types of similar size
-    - provide [performance data](https://github.com/ggml-org/llama.cpp/tree/master/tools/llama-bench) for the new type in comparison to types of similar size on pure CPU
+    - Execute [the full CI locally on your machine](ci/README.md) before publishing
+    - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
+    - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
+    - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
+- Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If you are a new contributor, limit your open PRs to 1.
-
-After submitting your PR:
- Expect requests for modifications to ensure the code meets llama.cpp's standards for quality and long-term maintainability
+- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
 - Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
- If your PR becomes stale, rebase it on top of latest `master` to get maintainers attention
- Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for fixing related issues and reviewing related PRs
+- Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs

 # Pull requests (for maintainers)

@@ -63,11 +29,6 @@ After submitting your PR:
 - When merging a PR, make sure you have a good understanding of the changes
 - Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)

-Maintainers reserve the right to decline review or close pull requests for any reason, without any questions, particularly under any of the following conditions:
- The proposed change is already mentioned in the roadmap or an existing issue, and it has been assigned to someone.
- The pull request duplicates an existing one.
- The contributor fails to adhere to this contributing guide or the AI policy.
-
 # Coding guidelines

 - Avoid adding third-party dependencies, extra files, extra headers, etc.
@@ -167,7 +128,7 @@ Maintainers reserve the right to decline review or close pull requests for any r

 # Code maintenance

- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file responsible for:
+- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file responsible for:
  - Reviewing and merging related PRs
  - Fixing related bugs
  - Providing developer guidance/support
@@ -180,8 +141,6 @@ Maintainers reserve the right to decline review or close pull requests for any r
 - New code should follow the guidelines (coding, naming, etc.) outlined in this document. Exceptions are allowed in isolated, backend-specific parts of the code that do not interface directly with the `ggml` interfaces.
  _(NOTE: for legacy reasons, existing code is not required to follow this guideline)_

- For changes in server, please make sure to refer to the [server development documentation](./tools/server/README-dev.md)
-
 # Documentation

 - Documentation is a community effort
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2023-2026 The ggml authors
+Copyright (c) 2023-2024 The ggml authors

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/README.md
+++ b/README.md
@@ -17,14 +17,14 @@ LLM inference in C/C++

 ## Hot topics

- **Hugging Face cache migration: models downloaded with `-hf` are now stored in the standard Hugging Face cache directory, enabling sharing with other HF tools.**
- **[guide : using the new WebUI of llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/16938)**
- [guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)
- [[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)
+- **[guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)**
+- **[[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)**
 - Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
+- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
 - Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
+- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
 - Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
 - Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)

@@ -62,7 +62,6 @@ range of hardware - locally and in the cloud.
 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2, AVX512 and AMX support for x86 architectures
- RVV, ZVFH, ZFH, ZICBOP and ZIHINTPAUSE support for RISC-V architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
 - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
 - Vulkan and SYCL backend support
@@ -85,7 +84,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
 - [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
- [x] [Jamba](https://huggingface.co/ai21labs)
 - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
@@ -133,7 +131,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
 - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
 - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
- [x] [RWKV-7](https://huggingface.co/collections/shoumenchougou/rwkv7-gxx-gguf)
 - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
 - [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
 - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
@@ -192,7 +189,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
 - Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi)
 - Go (no CGo needed): [hybridgroup/yzma](https://github.com/hybridgroup/yzma)
- Android: [llama.android](/examples/llama.android)

 </details>

@@ -202,7 +198,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

 - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [BonzAI App](https://apps.apple.com/us/app/bonzai-your-local-ai-agent/id6752847988) (proprietary)
 - [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
 - [Dot](https://github.com/alexpinel/Dot) (GPL)
 - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
@@ -214,7 +209,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
 - [LARS](https://github.com/abgulati/LARS) (AGPL)
 - [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
- [LlamaLib](https://github.com/undreamai/LlamaLib) (Apache-2.0)
 - [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
 - [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
@@ -242,12 +236,11 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 <details>
 <summary>Tools</summary>

- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from Hugging Face Hub and convert them to GGML
+- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
 - [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
 - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
 - [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
- [unslothai/unsloth](https://github.com/unslothai/unsloth) – 🦥 exports/saves fine-tuned and trained models to GGUF (Apache-2.0)

 </details>

@@ -260,8 +253,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
 - [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
 - [llmaz](https://github.com/InftyAI/llmaz) - ☸️ Easy, advanced inference platform for large language models on Kubernetes.
- [LLMKube](https://github.com/defilantech/llmkube) - Kubernetes operator for llama.cpp with multi-GPU and Apple Silicon Metal
-  support"
 </details>

 <details>
@@ -280,19 +271,16 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [BLAS](docs/build.md#blas-build) | All |
 | [BLIS](docs/backend/BLIS.md) | All |
 | [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
-| [OpenVINO [In Progress]](docs/backend/OPENVINO.md) | Intel CPUs, GPUs, and NPUs |
 | [MUSA](docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
 | [HIP](docs/build.md#hip) | AMD GPU |
-| [ZenDNN](docs/build.md#zendnn) | AMD CPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
 | [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
 | [WebGPU [In Progress]](docs/build.md#webgpu) | All |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
-| [Hexagon [In Progress]](docs/backend/snapdragon/README.md) | Snapdragon |
-| [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |
+| [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon |

 ## Obtaining and quantizing models

@@ -301,13 +289,13 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
 - [Trending](https://huggingface.co/models?library=gguf&sort=trending)
 - [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)

-You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
+You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:

 ```sh
 llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
 ```

-By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. The `MODEL_ENDPOINT` must point to a Hugging Face compatible API endpoint.
+By default, the CLI would download from Hugging Face; you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to download model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.

 After downloading a model, use the CLI tools to run it locally - see below.

@@ -322,7 +310,7 @@ The Hugging Face platform provides a variety of online tools for converting, qua

 To learn more about model quantization, [read this documentation](tools/quantize/README.md)

-## [`llama-cli`](tools/cli)
+## [`llama-cli`](tools/main)

 #### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.

@@ -356,6 +344,19 @@ To learn more about model quantization, [read this documentation](tools/quantize

    </details>

+- <details>
+    <summary>Run simple text completion</summary>
+
+    To disable conversation mode explicitly, use `-no-cnv`
+
+    ```bash
+    llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
+
+    # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
+    ```
+
+    </details>
+
 - <details>
    <summary>Constrain the output with a custom grammar</summary>

@@ -490,6 +491,21 @@ To learn more about model quantization, [read this documentation](tools/quantize

    </details>

+## [`llama-run`](tools/run)
+
+#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
+
+- <details>
+    <summary>Run a model with a specific prompt (by default, the model is pulled from the Ollama registry)</summary>
+
+    ```bash
+    llama-run granite-code
+    ```
+
+    </details>
+
+[^3]: [RamaLama](https://github.com/containers/ramalama)
+
 ## [`llama-simple`](examples/simple)

 #### A minimal example for implementing apps with `llama.cpp`. Useful for developers.
@@ -519,8 +535,7 @@ To learn more about model quantization, [read this documentation](tools/quantize

 ## Other documentation

- [cli](tools/cli/README.md)
- [completion](tools/completion/README.md)
+- [main (cli)](tools/main/README.md)
 - [server](tools/server/README.md)
 - [GBNF grammars](grammars/README.md)

@@ -592,5 +607,7 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
 - [yhirose/cpp-httplib](https://github.com/yhirose/cpp-httplib) - Single-header HTTP server, used by `llama-server` - MIT license
 - [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
 - [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
+- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
+- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
+- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
 - [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
- [subprocess.h](https://github.com/sheredom/subprocess.h) - Single-header process launching solution for C and C++ - Public domain
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -1,52 +1,12 @@
 # Security Policy

- - [**Reporting a vulnerability**](#reporting-a-vulnerability)
- - [**Requirements**](#requirements)
- - [**Covered Topics**](#covered-topics)
 - [**Using llama.cpp securely**](#using-llamacpp-securely)
   - [Untrusted models](#untrusted-models)
   - [Untrusted inputs](#untrusted-inputs)
   - [Data privacy](#data-privacy)
   - [Untrusted environments or networks](#untrusted-environments-or-networks)
   - [Multi-Tenant environments](#multi-tenant-environments)
-
-## Reporting a vulnerability
-
-If you have discovered a security vulnerability in this project that falls inside the [covered topics](#covered-topics), please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
-
-Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
-
-A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
-
-> [!IMPORTANT]
-> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
-
-## Requirements
-
-Before submitting your report, ensure you meet the following requirements:
-
- You have read this policy and fully understand it.
- AI is only permitted in an assistive capacity as stated in [AGENTS.md](AGENTS.md). We do not accept reports that are written exclusively by AI.
- Your report must include a working Proof-of-Concept in the form of a script and/or attached files.
-
-Maintainers reserve the right to close the report if these requirements are not fulfilled.
-
-## Covered Topics
-
-Only vulnerabilities that fall within these parts of the project are considered valid. For problems falling outside of this list, please report them as issues.
-
- `src/**/*`
- `ggml/**/*`
- `gguf-py/**/*`
- `tools/server/*`, **excluding** the following topics:
-    - Web UI
-    - Features marked as experimental
-    - Features not recommended for use in untrusted environments (e.g., router, MCP)
-    - Bugs that can lead to Denial-of-Service attack
-
-Note that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities in LLaMA C++.
-
-For vulnerabilities that fall within the `vendor` directory, please report them directly to the third-party project.
+ - [**Reporting a vulnerability**](#reporting-a-vulnerability)

 ## Using llama.cpp securely

@@ -95,3 +55,14 @@ If you intend to run multiple models in parallel with shared memory, it is your
 3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.

 4. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
+
+## Reporting a vulnerability
+
+Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.
+
+<!-- normal version -->
+However, if you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
+
+Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
+
+This project is maintained by a team of volunteers on a reasonable-effort basis. As such, please give us at least 90 days to work on a fix before public exposure.
--- a/benches/dgx-spark/aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547.html
+++ b/benches/dgx-spark/aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547.html
--- a/benches/dgx-spark/aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547.json
+++ b/benches/dgx-spark/aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547.json
@@ -1,6 +0,0 @@
-{
-  "chars": 2296.1916666666666,
-  "chars:std": 986.051306946325,
-  "score": 0.925,
-  "score:std": 0.26339134382131846
-}
--- a/benches/dgx-spark/aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547_allresults.json
+++ b/benches/dgx-spark/aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547_allresults.json
--- a/benches/dgx-spark/dgx-spark.md
+++ b/benches/dgx-spark/dgx-spark.md
@@ -1,311 +0,0 @@
-## System info
-
-```bash
-uname --all
-Linux spark-17ed 6.11.0-1016-nvidia #16-Ubuntu SMP PREEMPT_DYNAMIC Sun Sep 21 16:52:46 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux
-
-g++ --version
-g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
-
-nvidia-smi
-Thu Feb  5 13:49:40 2026
-+-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
-+-----------------------------------------+------------------------+----------------------+
-| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
-| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
-|                                         |                        |               MIG M. |
-|=========================================+========================+======================|
-|   0  NVIDIA GB10                    On  |   0000000F:01:00.0 Off |                  N/A |
-| N/A   47C    P0             13W /  N/A  | Not Supported          |      0%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-```
-
-## ggml-org/gpt-oss-20b-GGUF
-
-Model: https://huggingface.co/ggml-org/gpt-oss-20b-GGUF
-
- `llama-batched-bench`
-
-
-main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
-
-|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
-|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
-|   512 |     32 |    1 |    544 |    0.270 |  1895.57 |    0.399 |    80.13 |    0.669 |   812.60 |
-|   512 |     32 |    2 |   1088 |    0.230 |  4451.23 |    0.583 |   109.71 |    0.813 |  1337.56 |
-|   512 |     32 |    4 |   2176 |    0.437 |  4688.87 |    0.820 |   156.03 |    1.257 |  1730.91 |
-|   512 |     32 |    8 |   4352 |    0.863 |  4744.23 |    0.942 |   271.79 |    1.805 |  2410.73 |
-|   512 |     32 |   16 |   8704 |    1.725 |  4748.19 |    1.173 |   436.38 |    2.899 |  3002.85 |
-|   512 |     32 |   32 |  17408 |    3.437 |  4767.38 |    1.503 |   681.49 |    4.939 |  3524.40 |
-|  4096 |     32 |    1 |   4128 |    0.907 |  4513.91 |    0.407 |    78.54 |    1.315 |  3139.56 |
-|  4096 |     32 |    2 |   8256 |    1.796 |  4560.42 |    0.625 |   102.37 |    2.422 |  3409.45 |
-|  4096 |     32 |    4 |  16512 |    3.596 |  4555.66 |    0.888 |   144.11 |    4.485 |  3681.93 |
-|  4096 |     32 |    8 |  33024 |    7.184 |  4561.44 |    1.098 |   233.11 |    8.282 |  3987.51 |
-|  4096 |     32 |   16 |  66048 |   14.369 |  4560.82 |    1.503 |   340.74 |   15.872 |  4161.30 |
-|  4096 |     32 |   32 | 132096 |   28.760 |  4557.52 |    2.162 |   473.59 |   30.922 |  4271.95 |
-|  8192 |     32 |    1 |   8224 |    1.859 |  4405.59 |    0.430 |    74.36 |    2.290 |  3591.61 |
-|  8192 |     32 |    2 |  16448 |    3.698 |  4430.02 |    0.656 |    97.59 |    4.354 |  3777.47 |
-|  8192 |     32 |    4 |  32896 |    7.403 |  4426.10 |    0.957 |   133.82 |    8.360 |  3934.97 |
-|  8192 |     32 |    8 |  65792 |   14.802 |  4427.63 |    1.222 |   209.44 |   16.024 |  4105.87 |
-|  8192 |     32 |   16 | 131584 |   29.596 |  4428.67 |    1.741 |   294.13 |   31.337 |  4199.00 |
-|  8192 |     32 |   32 | 263168 |   59.169 |  4430.42 |    2.619 |   390.92 |   61.789 |  4259.17 |
-
-
- `llama-bench`
-
-| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap | dio |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |          pp2048 |      4505.82 ± 12.90 |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |            tg32 |         83.43 ± 0.59 |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |  pp2048 @ d4096 |      4158.34 ± 18.84 |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |    tg32 @ d4096 |         79.22 ± 0.60 |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |  pp2048 @ d8192 |      3993.81 ± 17.55 |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |    tg32 @ d8192 |         75.22 ± 1.05 |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |   1 | pp2048 @ d16384 |      3449.98 ± 12.13 |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |   tg32 @ d16384 |         70.36 ± 0.37 |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |   1 | pp2048 @ d32768 |      2689.42 ± 18.89 |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |   tg32 @ d32768 |         61.65 ± 0.30 |
-
-build: 11fb327bf (7941)
-
-## ggml-org/gpt-oss-120b-GGUF
-
-Model: https://huggingface.co/ggml-org/gpt-oss-120b-GGUF
-
- `llama-batched-bench`
-
-
-main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
-
-|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
-|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
-|   512 |     32 |    1 |    544 |    0.445 |  1151.80 |    0.560 |    57.14 |    1.005 |   541.53 |
-|   512 |     32 |    2 |   1088 |    0.472 |  2169.85 |    0.874 |    73.27 |    1.345 |   808.65 |
-|   512 |     32 |    4 |   2176 |    0.826 |  2480.33 |    1.299 |    98.51 |    2.125 |  1023.94 |
-|   512 |     32 |    8 |   4352 |    1.644 |  2491.67 |    1.608 |   159.18 |    3.252 |  1338.20 |
-|   512 |     32 |   16 |   8704 |    3.292 |  2488.35 |    2.117 |   241.85 |    5.409 |  1609.13 |
-|   512 |     32 |   32 |  17408 |    6.604 |  2481.07 |    2.898 |   353.31 |    9.502 |  1832.04 |
-|  4096 |     32 |    1 |   4128 |    1.698 |  2412.65 |    0.580 |    55.21 |    2.277 |  1812.66 |
-|  4096 |     32 |    2 |   8256 |    3.399 |  2409.88 |    0.934 |    68.53 |    4.333 |  1905.27 |
-|  4096 |     32 |    4 |  16512 |    6.823 |  2401.21 |    1.411 |    90.72 |    8.234 |  2005.30 |
-|  4096 |     32 |    8 |  33024 |   13.574 |  2413.97 |    1.841 |   139.07 |   15.415 |  2142.31 |
-|  4096 |     32 |   16 |  66048 |   27.176 |  2411.52 |    2.609 |   196.26 |   29.785 |  2217.49 |
-|  4096 |     32 |   32 | 132096 |   54.359 |  2411.23 |    3.905 |   262.20 |   58.264 |  2267.19 |
-|  8192 |     32 |    1 |   8224 |    3.491 |  2346.81 |    0.613 |    52.23 |    4.103 |  2004.21 |
-|  8192 |     32 |    2 |  16448 |    6.939 |  2361.03 |    0.981 |    65.21 |    7.921 |  2076.56 |
-|  8192 |     32 |    4 |  32896 |   13.888 |  2359.40 |    1.511 |    84.71 |   15.399 |  2136.21 |
-|  8192 |     32 |    8 |  65792 |   27.756 |  2361.18 |    2.034 |   125.86 |   29.790 |  2208.56 |
-|  8192 |     32 |   16 | 131584 |   55.554 |  2359.34 |    3.021 |   169.49 |   58.575 |  2246.41 |
-|  8192 |     32 |   32 | 263168 |  111.036 |  2360.89 |    4.537 |   225.72 |  115.573 |  2277.08 |
-
-
- `llama-bench`
-
-| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap | dio |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |          pp2048 |       2443.91 ± 7.47 |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |            tg32 |         58.72 ± 0.20 |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |  pp2048 @ d4096 |       2309.84 ± 3.63 |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |    tg32 @ d4096 |         55.67 ± 0.35 |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |  pp2048 @ d8192 |      2216.68 ± 10.16 |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |    tg32 @ d8192 |         52.87 ± 0.43 |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |   1 | pp2048 @ d16384 |       1956.31 ± 6.39 |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |   tg32 @ d16384 |         49.45 ± 0.20 |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |   1 | pp2048 @ d32768 |      1567.08 ± 11.79 |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |   tg32 @ d32768 |         42.76 ± 0.14 |
-
-build: 11fb327bf (7941)
-
-## ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF
-
-Model: https://huggingface.co/ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF
-
- `llama-batched-bench`
-
-
-main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
-
-|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
-|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
-|   512 |     32 |    1 |    544 |    0.393 |  1303.73 |    0.548 |    58.36 |    0.941 |   578.10 |
-|   512 |     32 |    2 |   1088 |    0.387 |  2648.68 |    0.910 |    70.35 |    1.296 |   839.27 |
-|   512 |     32 |    4 |   2176 |    0.659 |  3107.63 |    1.302 |    98.33 |    1.961 |  1109.77 |
-|   512 |     32 |    8 |   4352 |    1.322 |  3099.35 |    1.669 |   153.42 |    2.990 |  1455.43 |
-|   512 |     32 |   16 |   8704 |    2.639 |  3104.63 |    2.212 |   231.44 |    4.851 |  1794.32 |
-|   512 |     32 |   32 |  17408 |    5.284 |  3100.80 |    2.955 |   346.53 |    8.239 |  2112.93 |
-|  4096 |     32 |    1 |   4128 |    1.417 |  2890.36 |    0.598 |    53.51 |    2.015 |  2048.45 |
-|  4096 |     32 |    2 |   8256 |    2.829 |  2895.62 |    1.019 |    62.82 |    3.848 |  2145.60 |
-|  4096 |     32 |    4 |  16512 |    5.656 |  2896.96 |    1.528 |    83.79 |    7.183 |  2298.71 |
-|  4096 |     32 |    8 |  33024 |   11.338 |  2890.02 |    2.127 |   120.36 |   13.465 |  2452.53 |
-|  4096 |     32 |   16 |  66048 |   22.709 |  2885.96 |    3.104 |   164.97 |   25.812 |  2558.79 |
-|  4096 |     32 |   32 | 132096 |   45.301 |  2893.35 |    4.723 |   216.80 |   50.024 |  2640.63 |
-|  8192 |     32 |    1 |   8224 |    3.022 |  2711.09 |    0.678 |    47.20 |    3.700 |  2222.89 |
-|  8192 |     32 |    2 |  16448 |    6.039 |  2713.01 |    1.149 |    55.70 |    7.188 |  2288.21 |
-|  8192 |     32 |    4 |  32896 |   12.050 |  2719.35 |    1.785 |    71.69 |   13.835 |  2377.67 |
-|  8192 |     32 |    8 |  65792 |   24.113 |  2717.90 |    2.629 |    97.39 |   26.741 |  2460.31 |
-|  8192 |     32 |   16 | 131584 |   48.178 |  2720.58 |    4.099 |   124.91 |   52.277 |  2517.06 |
-|  8192 |     32 |   32 | 263168 |   96.401 |  2719.31 |    6.696 |   152.93 |  103.097 |  2552.63 |
-
-
- `llama-bench`
-
-| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap | dio |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |          pp2048 |      2986.97 ± 18.87 |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |            tg32 |         61.06 ± 0.23 |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |  pp2048 @ d4096 |       2633.45 ± 6.26 |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |    tg32 @ d4096 |         54.77 ± 0.28 |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |  pp2048 @ d8192 |       2354.14 ± 3.84 |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |    tg32 @ d8192 |         48.02 ± 0.40 |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |   1 | pp2048 @ d16384 |       1908.86 ± 4.25 |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |   tg32 @ d16384 |         40.23 ± 0.10 |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |   1 | pp2048 @ d32768 |       1348.17 ± 2.00 |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |   tg32 @ d32768 |         30.21 ± 0.04 |
-
-build: 11fb327bf (7941)
-
-## ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF
-
-Model: https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF
-
- `llama-batched-bench`
-
-
-main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
-
-|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
-|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
-|   512 |     32 |    1 |    544 |    0.212 |  2420.12 |    1.100 |    29.10 |    1.311 |   414.85 |
-|   512 |     32 |    2 |   1088 |    0.428 |  2393.89 |    1.185 |    54.00 |    1.613 |   674.56 |
-|   512 |     32 |    4 |   2176 |    0.894 |  2290.41 |    1.229 |   104.17 |    2.123 |  1025.02 |
-|   512 |     32 |    8 |   4352 |    1.758 |  2330.36 |    1.319 |   194.15 |    3.076 |  1414.70 |
-|   512 |     32 |   16 |   8704 |    3.508 |  2335.21 |    1.543 |   331.90 |    5.051 |  1723.33 |
-|   512 |     32 |   32 |  17408 |    7.035 |  2328.93 |    1.738 |   589.21 |    8.773 |  1984.29 |
-|  4096 |     32 |    1 |   4128 |    1.831 |  2237.25 |    1.125 |    28.44 |    2.956 |  1396.42 |
-|  4096 |     32 |    2 |   8256 |    3.642 |  2249.48 |    1.253 |    51.07 |    4.895 |  1686.64 |
-|  4096 |     32 |    4 |  16512 |    7.274 |  2252.26 |    1.380 |    92.72 |    8.655 |  1907.81 |
-|  4096 |     32 |    8 |  33024 |   14.576 |  2248.09 |    1.617 |   158.29 |   16.193 |  2039.37 |
-|  4096 |     32 |   16 |  66048 |   29.138 |  2249.17 |    2.081 |   246.01 |   31.219 |  2115.63 |
-|  4096 |     32 |   32 | 132096 |   58.275 |  2249.19 |    2.814 |   363.87 |   61.089 |  2162.34 |
-|  8192 |     32 |    1 |   8224 |    3.757 |  2180.26 |    1.184 |    27.03 |    4.941 |  1664.37 |
-|  8192 |     32 |    2 |  16448 |    7.522 |  2178.05 |    1.341 |    47.73 |    8.863 |  1855.77 |
-|  8192 |     32 |    4 |  32896 |   15.043 |  2178.25 |    1.548 |    82.69 |   16.591 |  1982.74 |
-|  8192 |     32 |    8 |  65792 |   30.111 |  2176.49 |    1.937 |   132.13 |   32.048 |  2052.90 |
-|  8192 |     32 |   16 | 131584 |   60.405 |  2169.90 |    2.706 |   189.21 |   63.111 |  2084.97 |
-|  8192 |     32 |   32 | 263168 |  120.439 |  2176.58 |    3.993 |   256.46 |  124.432 |  2114.96 |
-
-
- `llama-bench`
-
-| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap | dio |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |          pp2048 |       2250.28 ± 6.41 |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |            tg32 |         29.43 ± 0.02 |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |  pp2048 @ d4096 |       2100.19 ± 8.96 |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |    tg32 @ d4096 |         28.61 ± 0.02 |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |  pp2048 @ d8192 |       2007.56 ± 4.16 |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |    tg32 @ d8192 |         27.38 ± 0.09 |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |   1 | pp2048 @ d16384 |       1779.11 ± 6.42 |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |   tg32 @ d16384 |         25.72 ± 0.03 |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |   1 | pp2048 @ d32768 |       1471.23 ± 1.71 |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |   tg32 @ d32768 |         22.51 ± 0.02 |
-
-build: 11fb327bf (7941)
-
-## ggml-org/gemma-3-4b-it-qat-GGUF
-
-Model: https://huggingface.co/ggml-org/gemma-3-4b-it-qat-GGUF
-
- `llama-batched-bench`
-
-
-main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
-
-|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
-|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
-|   512 |     32 |    1 |    544 |    0.092 |  5566.97 |    0.412 |    77.63 |    0.504 |  1078.95 |
-|   512 |     32 |    2 |   1088 |    0.161 |  6345.67 |    0.522 |   122.70 |    0.683 |  1593.06 |
-|   512 |     32 |    4 |   2176 |    0.325 |  6309.87 |    0.562 |   227.68 |    0.887 |  2453.87 |
-|   512 |     32 |    8 |   4352 |    0.643 |  6374.42 |    0.685 |   373.67 |    1.328 |  3277.94 |
-|   512 |     32 |   16 |   8704 |    1.277 |  6413.64 |    0.915 |   559.47 |    2.192 |  3970.01 |
-|   512 |     32 |   32 |  17408 |    2.518 |  6506.57 |    1.249 |   819.61 |    3.767 |  4620.64 |
-|  4096 |     32 |    1 |   4128 |    0.674 |  6079.68 |    0.453 |    70.60 |    1.127 |  3662.88 |
-|  4096 |     32 |    2 |   8256 |    1.335 |  6137.82 |    0.627 |   102.03 |    1.962 |  4208.11 |
-|  4096 |     32 |    4 |  16512 |    2.657 |  6167.35 |    0.749 |   170.92 |    3.405 |  4848.71 |
-|  4096 |     32 |    8 |  33024 |    5.307 |  6173.91 |    0.974 |   262.89 |    6.281 |  5257.53 |
-|  4096 |     32 |   16 |  66048 |   10.610 |  6176.96 |    1.379 |   371.42 |   11.988 |  5509.40 |
-|  4096 |     32 |   32 | 132096 |   21.213 |  6178.89 |    2.122 |   482.50 |   23.335 |  5660.82 |
-|  8192 |     32 |    1 |   8224 |    1.359 |  6027.34 |    0.467 |    68.52 |    1.826 |  4503.48 |
-|  8192 |     32 |    2 |  16448 |    2.699 |  6069.68 |    0.653 |    98.03 |    3.352 |  4906.68 |
-|  8192 |     32 |    4 |  32896 |    5.366 |  6106.74 |    0.818 |   156.55 |    6.184 |  5319.96 |
-|  8192 |     32 |    8 |  65792 |   10.755 |  6093.50 |    1.174 |   218.04 |   11.929 |  5515.22 |
-|  8192 |     32 |   16 | 131584 |   21.484 |  6100.82 |    1.829 |   279.90 |   23.314 |  5644.11 |
-|  8192 |     32 |   32 | 263168 |   42.950 |  6103.40 |    3.058 |   334.91 |   46.008 |  5720.05 |
-
-
- `llama-bench`
-
-| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap | dio |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |          pp2048 |      5948.74 ± 10.61 |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |            tg32 |         81.05 ± 0.20 |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |  pp2048 @ d4096 |      5652.69 ± 34.29 |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |    tg32 @ d4096 |         76.37 ± 0.58 |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |  pp2048 @ d8192 |      5509.57 ± 40.69 |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |    tg32 @ d8192 |         71.61 ± 0.80 |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |   1 | pp2048 @ d16384 |      5340.86 ± 36.92 |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |   tg32 @ d16384 |         70.89 ± 0.34 |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |   1 | pp2048 @ d32768 |      5023.30 ± 13.52 |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |   1 |   tg32 @ d32768 |         62.28 ± 0.30 |
-
-build: 11fb327bf (7941)
-
-## ggml-org/GLM-4.7-Flash-GGUF
-
-Model: https://huggingface.co/ggml-org/GLM-4.7-Flash-GGUF
-
- `llama-batched-bench`
-
-
-main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
-
-|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
-|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
-|   512 |     32 |    1 |    544 |    0.433 |  1181.83 |    0.693 |    46.16 |    1.126 |   482.94 |
-|   512 |     32 |    2 |   1088 |    0.439 |  2334.46 |    1.034 |    61.89 |    1.473 |   738.75 |
-|   512 |     32 |    4 |   2176 |    0.772 |  2654.46 |    1.459 |    87.76 |    2.230 |   975.77 |
-|   512 |     32 |    8 |   4352 |    1.541 |  2658.78 |    2.043 |   125.31 |    3.583 |  1214.47 |
-|   512 |     32 |   16 |   8704 |    3.083 |  2656.91 |    2.675 |   191.42 |    5.758 |  1511.62 |
-|   512 |     32 |   32 |  17408 |    6.159 |  2660.12 |    3.615 |   283.24 |    9.774 |  1780.98 |
-|  4096 |     32 |    1 |   4128 |    1.915 |  2139.30 |    0.725 |    44.14 |    2.640 |  1563.83 |
-|  4096 |     32 |    2 |   8256 |    3.834 |  2136.40 |    1.119 |    57.21 |    4.953 |  1666.81 |
-|  4096 |     32 |    4 |  16512 |    7.636 |  2145.72 |    1.631 |    78.49 |    9.266 |  1781.93 |
-|  4096 |     32 |    8 |  33024 |   15.295 |  2142.40 |    2.344 |   109.21 |   17.639 |  1872.20 |
-|  4096 |     32 |   16 |  66048 |   30.573 |  2143.62 |    3.773 |   135.70 |   34.346 |  1923.04 |
-|  4096 |     32 |   32 | 132096 |   61.282 |  2138.82 |    5.795 |   176.71 |   67.077 |  1969.31 |
-|  8192 |     32 |    1 |   8224 |    4.510 |  1816.24 |    0.760 |    42.11 |    5.270 |  1560.44 |
-|  8192 |     32 |    2 |  16448 |    9.036 |  1813.19 |    1.206 |    53.06 |   10.242 |  1605.91 |
-|  8192 |     32 |    4 |  32896 |   18.070 |  1813.43 |    1.783 |    71.80 |   19.852 |  1657.03 |
-|  8192 |     32 |    8 |  65792 |   36.125 |  1814.15 |    2.635 |    97.14 |   38.760 |  1697.41 |
-|  8192 |     32 |   16 | 131584 |   72.367 |  1811.20 |    4.954 |   103.34 |   77.322 |  1701.77 |
-|  8192 |     32 |   32 | 263168 |  144.501 |  1814.13 |    8.103 |   126.37 |  152.604 |  1724.51 |
-
-
- `llama-bench`
-
-| model                          |       size |     params | backend    | ngl | n_ubatch | fa | dio |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | --: | --------------: | -------------------: |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | CUDA       |  99 |     2048 |  1 |   1 |          pp2048 |      2364.18 ± 11.43 |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | CUDA       |  99 |     2048 |  1 |   1 |            tg32 |         48.68 ± 0.12 |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | CUDA       |  99 |     2048 |  1 |   1 |  pp2048 @ d4096 |       1684.13 ± 1.24 |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | CUDA       |  99 |     2048 |  1 |   1 |    tg32 @ d4096 |         44.62 ± 0.22 |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | CUDA       |  99 |     2048 |  1 |   1 |  pp2048 @ d8192 |       1314.68 ± 1.41 |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | CUDA       |  99 |     2048 |  1 |   1 |    tg32 @ d8192 |         42.59 ± 0.11 |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | CUDA       |  99 |     2048 |  1 |   1 | pp2048 @ d16384 |        914.05 ± 3.32 |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | CUDA       |  99 |     2048 |  1 |   1 |   tg32 @ d16384 |         38.72 ± 0.13 |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | CUDA       |  99 |     2048 |  1 |   1 | pp2048 @ d32768 |        567.20 ± 0.90 |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | CUDA       |  99 |     2048 |  1 |   1 |   tg32 @ d32768 |         32.65 ± 0.09 |
-
-build: 11fb327bf (7941)
--- a/benches/dgx-spark/run-aime-120b-t8-x8-high.log
+++ b/benches/dgx-spark/run-aime-120b-t8-x8-high.log
--- a/benches/mac-m2-ultra/mac-m2-ultra.md
+++ b/benches/mac-m2-ultra/mac-m2-ultra.md
@@ -1,298 +0,0 @@
-## System info
-
-```bash
-uname -a
-Darwin gg-studio 25.2.0 Darwin Kernel Version 25.2.0: Tue Nov 18 21:07:05 PST 2025; root:xnu-12377.61.12~1/RELEASE_ARM64_T6020 arm64
-
-g++ --version
-Apple clang version 17.0.0 (clang-1700.3.19.1)
-Target: arm64-apple-darwin25.2.0
-```
-
-## ggml-org/gpt-oss-20b-GGUF
-
-Model: https://huggingface.co/ggml-org/gpt-oss-20b-GGUF
-
- `llama-batched-bench`
-
-
-main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16
-
-|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
-|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
-|   512 |     32 |    1 |    544 |    0.215 |  2381.35 |    0.245 |   130.45 |    0.460 |  1181.81 |
-|   512 |     32 |    2 |   1088 |    0.379 |  2701.43 |    0.382 |   167.56 |    0.761 |  1429.67 |
-|   512 |     32 |    4 |   2176 |    0.721 |  2839.27 |    0.604 |   211.76 |    1.326 |  1641.32 |
-|   512 |     32 |    8 |   4352 |    1.433 |  2858.30 |    1.033 |   247.75 |    2.466 |  1764.57 |
-|   512 |     32 |   16 |   8704 |    2.853 |  2871.12 |    1.570 |   326.11 |    4.423 |  1967.77 |
-|   512 |     32 |   32 |  17408 |    5.699 |  2874.95 |    1.910 |   536.15 |    7.609 |  2287.88 |
-|  4096 |     32 |    1 |   4128 |    1.552 |  2638.56 |    0.334 |    95.72 |    1.887 |  2188.00 |
-|  4096 |     32 |    2 |   8256 |    3.084 |  2655.88 |    0.404 |   158.54 |    3.488 |  2366.86 |
-|  4096 |     32 |    4 |  16512 |    6.151 |  2663.78 |    0.652 |   196.39 |    6.802 |  2427.37 |
-|  4096 |     32 |    8 |  33024 |   12.288 |  2666.77 |    1.135 |   225.47 |   13.423 |  2460.27 |
-|  4096 |     32 |   16 |  66048 |   24.563 |  2668.12 |    1.762 |   290.55 |   26.325 |  2508.97 |
-|  4096 |     32 |   32 | 132096 |   49.114 |  2668.73 |    2.398 |   426.94 |   51.512 |  2564.35 |
-|  8192 |     32 |    1 |   8224 |    3.345 |  2448.78 |    0.275 |   116.46 |    3.620 |  2271.76 |
-|  8192 |     32 |    2 |  16448 |    6.665 |  2458.11 |    0.425 |   150.71 |    7.090 |  2319.91 |
-|  8192 |     32 |    4 |  32896 |   13.315 |  2460.92 |    0.691 |   185.21 |   14.006 |  2348.63 |
-|  8192 |     32 |    8 |  65792 |   26.611 |  2462.73 |    1.212 |   211.16 |   27.823 |  2364.62 |
-|  8192 |     32 |   16 | 131584 |   53.232 |  2462.27 |    1.919 |   266.83 |   55.151 |  2385.88 |
-|  8192 |     32 |   32 | 263168 |  110.455 |  2373.30 |    2.752 |   372.03 |  113.208 |  2324.64 |
-
-
- `llama-bench`
-
-| model                          |       size |     params | backend    | threads | n_ubatch | fa |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | --------------: | -------------------: |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | MTL,BLAS   |      16 |     2048 |  1 |          pp2048 |       2713.40 ± 3.56 |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | MTL,BLAS   |      16 |     2048 |  1 |            tg32 |        129.97 ± 3.90 |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | MTL,BLAS   |      16 |     2048 |  1 |  pp2048 @ d4096 |       2324.59 ± 3.01 |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | MTL,BLAS   |      16 |     2048 |  1 |    tg32 @ d4096 |        123.38 ± 0.17 |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | MTL,BLAS   |      16 |     2048 |  1 |  pp2048 @ d8192 |      1989.82 ± 30.11 |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | MTL,BLAS   |      16 |     2048 |  1 |    tg32 @ d8192 |        117.39 ± 0.33 |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | MTL,BLAS   |      16 |     2048 |  1 | pp2048 @ d16384 |       1556.54 ± 6.22 |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | MTL,BLAS   |      16 |     2048 |  1 |   tg32 @ d16384 |        109.75 ± 0.42 |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | MTL,BLAS   |      16 |     2048 |  1 | pp2048 @ d32768 |       1122.63 ± 1.45 |
-| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | MTL,BLAS   |      16 |     2048 |  1 |   tg32 @ d32768 |         98.25 ± 0.08 |
-
-build: b828e18c7 (7948)
-
-## ggml-org/gpt-oss-120b-GGUF
-
-Model: https://huggingface.co/ggml-org/gpt-oss-120b-GGUF
-
- `llama-batched-bench`
-
-
-main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16
-
-|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
-|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
-|   512 |     32 |    1 |    544 |    0.426 |  1200.92 |    0.361 |    88.56 |    0.788 |   690.64 |
-|   512 |     32 |    2 |   1088 |    0.683 |  1500.14 |    0.545 |   117.35 |    1.228 |   886.02 |
-|   512 |     32 |    4 |   2176 |    1.204 |  1701.56 |    0.847 |   151.19 |    2.050 |  1061.34 |
-|   512 |     32 |    8 |   4352 |    2.402 |  1705.20 |    1.455 |   176.00 |    3.857 |  1128.45 |
-|   512 |     32 |   16 |   8704 |    4.802 |  1705.90 |    2.349 |   217.93 |    7.152 |  1217.08 |
-|   512 |     32 |   32 |  17408 |    9.593 |  1707.85 |    3.665 |   279.42 |   13.258 |  1313.01 |
-|  4096 |     32 |    1 |   4128 |    2.581 |  1587.08 |    0.390 |    82.12 |    2.970 |  1389.67 |
-|  4096 |     32 |    2 |   8256 |    5.124 |  1598.79 |    0.589 |   108.62 |    5.713 |  1445.10 |
-|  4096 |     32 |    4 |  16512 |   10.231 |  1601.47 |    0.928 |   137.98 |   11.158 |  1479.80 |
-|  4096 |     32 |    8 |  33024 |   20.468 |  1600.94 |    1.606 |   159.38 |   22.074 |  1496.04 |
-|  4096 |     32 |   16 |  66048 |   40.924 |  1601.42 |    2.639 |   193.99 |   43.563 |  1516.15 |
-|  4096 |     32 |   32 | 132096 |   81.819 |  1601.98 |    4.466 |   229.29 |   86.284 |  1530.94 |
-|  8192 |     32 |    1 |   8224 |    5.517 |  1484.74 |    0.409 |    78.16 |    5.927 |  1387.58 |
-|  8192 |     32 |    2 |  16448 |   11.008 |  1488.43 |    0.622 |   102.92 |   11.629 |  1414.34 |
-|  8192 |     32 |    4 |  32896 |   22.002 |  1489.29 |    0.987 |   129.66 |   22.990 |  1430.90 |
-|  8192 |     32 |    8 |  65792 |   46.051 |  1423.11 |    1.858 |   137.79 |   47.909 |  1373.27 |
-|  8192 |     32 |   16 | 131584 |   97.680 |  1341.85 |    2.872 |   178.28 |  100.552 |  1308.62 |
-|  8192 |     32 |   32 | 263168 |  176.407 |  1486.02 |    5.048 |   202.85 |  181.455 |  1450.32 |
-
-
- `llama-bench`
-
-| model                          |       size |     params | backend    | threads | n_ubatch | fa |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | --------------: | -------------------: |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | MTL,BLAS   |      16 |     2048 |  1 |          pp2048 |       1648.69 ± 1.80 |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | MTL,BLAS   |      16 |     2048 |  1 |            tg32 |         85.60 ± 0.52 |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | MTL,BLAS   |      16 |     2048 |  1 |  pp2048 @ d4096 |       1429.86 ± 1.01 |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | MTL,BLAS   |      16 |     2048 |  1 |    tg32 @ d4096 |         82.03 ± 0.12 |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | MTL,BLAS   |      16 |     2048 |  1 |  pp2048 @ d8192 |       1257.90 ± 1.81 |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | MTL,BLAS   |      16 |     2048 |  1 |    tg32 @ d8192 |         78.23 ± 0.33 |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | MTL,BLAS   |      16 |     2048 |  1 | pp2048 @ d16384 |       1013.49 ± 0.70 |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | MTL,BLAS   |      16 |     2048 |  1 |   tg32 @ d16384 |         73.20 ± 0.28 |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | MTL,BLAS   |      16 |     2048 |  1 | pp2048 @ d32768 |        721.11 ± 0.58 |
-| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | MTL,BLAS   |      16 |     2048 |  1 |   tg32 @ d32768 |         65.52 ± 0.10 |
-
-build: b828e18c7 (7948)
-
-## ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF
-
-Model: https://huggingface.co/ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF
-
- `llama-batched-bench`
-
-
-main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16
-
-|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
-|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
-|   512 |     32 |    1 |    544 |    0.243 |  2109.23 |    0.419 |    76.34 |    0.662 |   821.84 |
-|   512 |     32 |    2 |   1088 |    0.406 |  2521.40 |    0.575 |   111.36 |    0.981 |  1109.27 |
-|   512 |     32 |    4 |   2176 |    0.744 |  2751.65 |    0.841 |   152.22 |    1.585 |  1372.71 |
-|   512 |     32 |    8 |   4352 |    1.479 |  2770.20 |    1.330 |   192.48 |    2.809 |  1549.53 |
-|   512 |     32 |   16 |   8704 |    2.951 |  2776.20 |    2.572 |   199.05 |    5.523 |  1575.93 |
-|   512 |     32 |   32 |  17408 |    5.899 |  2777.64 |    2.603 |   393.34 |    8.502 |  2047.54 |
-|  4096 |     32 |    1 |   4128 |    1.901 |  2154.15 |    0.474 |    67.58 |    2.375 |  1738.14 |
-|  4096 |     32 |    2 |   8256 |    3.788 |  2162.89 |    0.652 |    98.17 |    4.439 |  1859.69 |
-|  4096 |     32 |    4 |  16512 |    7.564 |  2166.18 |    0.990 |   129.24 |    8.554 |  1930.34 |
-|  4096 |     32 |    8 |  33024 |   15.121 |  2166.98 |    1.632 |   156.82 |   16.754 |  1971.12 |
-|  4096 |     32 |   16 |  66048 |   30.241 |  2167.09 |    3.166 |   161.72 |   33.407 |  1977.04 |
-|  4096 |     32 |   32 | 132096 |   60.474 |  2167.42 |    3.780 |   270.93 |   64.254 |  2055.86 |
-|  8192 |     32 |    1 |   8224 |    4.733 |  1730.92 |    0.483 |    66.29 |    5.215 |  1576.85 |
-|  8192 |     32 |    2 |  16448 |    9.459 |  1732.09 |    0.722 |    88.58 |   10.182 |  1615.46 |
-|  8192 |     32 |    4 |  32896 |   18.912 |  1732.65 |    1.120 |   114.26 |   20.032 |  1642.14 |
-|  8192 |     32 |    8 |  65792 |   37.797 |  1733.91 |    1.873 |   136.67 |   39.670 |  1658.49 |
-|  8192 |     32 |   16 | 131584 |   84.133 |  1557.92 |    3.718 |   137.72 |   87.850 |  1497.82 |
-|  8192 |     32 |   32 | 263168 |  157.550 |  1663.88 |    4.854 |   210.98 |  162.403 |  1620.46 |
-
-
- `llama-bench`
-
-| model                          |       size |     params | backend    | threads | n_ubatch | fa |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | --------------: | -------------------: |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | MTL,BLAS   |      16 |     2048 |  1 |          pp2048 |       2453.11 ± 1.70 |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | MTL,BLAS   |      16 |     2048 |  1 |            tg32 |         78.97 ± 0.46 |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | MTL,BLAS   |      16 |     2048 |  1 |  pp2048 @ d4096 |       1569.46 ± 1.97 |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | MTL,BLAS   |      16 |     2048 |  1 |    tg32 @ d4096 |         71.18 ± 0.37 |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | MTL,BLAS   |      16 |     2048 |  1 |  pp2048 @ d8192 |       1145.51 ± 1.16 |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | MTL,BLAS   |      16 |     2048 |  1 |    tg32 @ d8192 |         65.11 ± 0.36 |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | MTL,BLAS   |      16 |     2048 |  1 | pp2048 @ d16384 |        741.04 ± 0.74 |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | MTL,BLAS   |      16 |     2048 |  1 |   tg32 @ d16384 |         56.87 ± 0.14 |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | MTL,BLAS   |      16 |     2048 |  1 | pp2048 @ d32768 |        431.31 ± 0.31 |
-| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | MTL,BLAS   |      16 |     2048 |  1 |   tg32 @ d32768 |         45.26 ± 0.11 |
-
-build: b828e18c7 (7948)
-
-## ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF
-
-Model: https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF
-
- `llama-batched-bench`
-
-
-main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16
-
-|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
-|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
-|   512 |     32 |    1 |    544 |    0.339 |  1509.22 |    0.409 |    78.17 |    0.749 |   726.67 |
-|   512 |     32 |    2 |   1088 |    0.646 |  1584.93 |    0.483 |   132.45 |    1.129 |   963.45 |
-|   512 |     32 |    4 |   2176 |    1.258 |  1627.50 |    0.585 |   218.67 |    1.844 |  1180.21 |
-|   512 |     32 |    8 |   4352 |    2.506 |  1634.41 |    1.005 |   254.83 |    3.511 |  1239.64 |
-|   512 |     32 |   16 |   8704 |    5.007 |  1635.99 |    1.595 |   321.07 |    6.602 |  1318.38 |
-|   512 |     32 |   32 |  17408 |   10.007 |  1637.19 |    1.676 |   611.12 |   11.683 |  1490.03 |
-|  4096 |     32 |    1 |   4128 |    2.730 |  1500.46 |    0.431 |    74.31 |    3.160 |  1306.12 |
-|  4096 |     32 |    2 |   8256 |    5.446 |  1504.33 |    0.524 |   122.04 |    5.970 |  1382.91 |
-|  4096 |     32 |    4 |  16512 |   10.875 |  1506.59 |    0.662 |   193.45 |   11.537 |  1431.28 |
-|  4096 |     32 |    8 |  33024 |   21.749 |  1506.61 |    1.158 |   221.11 |   22.907 |  1441.64 |
-|  4096 |     32 |   16 |  66048 |   43.477 |  1507.36 |    1.901 |   269.32 |   45.378 |  1455.49 |
-|  4096 |     32 |   32 | 132096 |   86.954 |  1507.37 |    2.325 |   440.42 |   89.279 |  1479.59 |
-|  8192 |     32 |    1 |   8224 |    5.940 |  1379.21 |    0.449 |    71.20 |    6.389 |  1287.20 |
-|  8192 |     32 |    2 |  16448 |   11.865 |  1380.84 |    0.559 |   114.59 |   12.424 |  1323.92 |
-|  8192 |     32 |    4 |  32896 |   23.723 |  1381.25 |    0.728 |   175.80 |   24.452 |  1345.35 |
-|  8192 |     32 |    8 |  65792 |   47.434 |  1381.63 |    1.279 |   200.09 |   48.713 |  1350.60 |
-|  8192 |     32 |   16 | 131584 |   94.864 |  1381.69 |    2.198 |   232.97 |   97.061 |  1355.68 |
-|  8192 |     32 |   32 | 263168 |  189.743 |  1381.57 |    3.052 |   335.50 |  192.795 |  1365.01 |
-
-
- `llama-bench`
-
-| model                          |       size |     params | backend    | threads | n_ubatch | fa |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | --------------: | -------------------: |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | MTL,BLAS   |      16 |     2048 |  1 |          pp2048 |       1565.91 ± 0.86 |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | MTL,BLAS   |      16 |     2048 |  1 |            tg32 |         79.68 ± 0.39 |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | MTL,BLAS   |      16 |     2048 |  1 |  pp2048 @ d4096 |       1317.41 ± 1.02 |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | MTL,BLAS   |      16 |     2048 |  1 |    tg32 @ d4096 |         74.70 ± 0.04 |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | MTL,BLAS   |      16 |     2048 |  1 |  pp2048 @ d8192 |       1134.65 ± 0.76 |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | MTL,BLAS   |      16 |     2048 |  1 |    tg32 @ d8192 |         71.31 ± 0.12 |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | MTL,BLAS   |      16 |     2048 |  1 | pp2048 @ d16384 |        886.46 ± 0.78 |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | MTL,BLAS   |      16 |     2048 |  1 |   tg32 @ d16384 |         65.93 ± 0.06 |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | MTL,BLAS   |      16 |     2048 |  1 | pp2048 @ d32768 |        612.21 ± 0.30 |
-| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | MTL,BLAS   |      16 |     2048 |  1 |   tg32 @ d32768 |         56.83 ± 0.02 |
-
-build: b828e18c7 (7948)
-
-## ggml-org/gemma-3-4b-it-qat-GGUF
-
-Model: https://huggingface.co/ggml-org/gemma-3-4b-it-qat-GGUF
-
- `llama-batched-bench`
-
-
-main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16
-
-|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
-|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
-|   512 |     32 |    1 |    544 |    0.186 |  2748.06 |    0.235 |   136.28 |    0.421 |  1291.78 |
-|   512 |     32 |    2 |   1088 |    0.342 |  2990.95 |    0.312 |   204.99 |    0.655 |  1662.15 |
-|   512 |     32 |    4 |   2176 |    0.662 |  3092.69 |    0.404 |   316.97 |    1.066 |  2041.21 |
-|   512 |     32 |    8 |   4352 |    1.317 |  3110.41 |    0.579 |   441.80 |    1.896 |  2294.97 |
-|   512 |     32 |   16 |   8704 |    2.625 |  3120.23 |    1.207 |   424.08 |    3.833 |  2270.93 |
-|   512 |     32 |   32 |  17408 |    5.242 |  3125.34 |    1.299 |   788.23 |    6.541 |  2661.19 |
-|  4096 |     32 |    1 |   4128 |    1.408 |  2909.90 |    0.296 |   108.07 |    1.704 |  2422.95 |
-|  4096 |     32 |    2 |   8256 |    2.793 |  2933.40 |    0.325 |   197.00 |    3.118 |  2648.25 |
-|  4096 |     32 |    4 |  16512 |    5.567 |  2943.22 |    0.440 |   291.07 |    6.006 |  2749.05 |
-|  4096 |     32 |    8 |  33024 |   11.114 |  2948.23 |    0.640 |   400.26 |   11.754 |  2809.59 |
-|  4096 |     32 |   16 |  66048 |   22.217 |  2949.76 |    1.327 |   385.83 |   23.544 |  2805.26 |
-|  4096 |     32 |   32 | 132096 |   44.420 |  2950.77 |    1.553 |   659.30 |   45.973 |  2873.36 |
-|  8192 |     32 |    1 |   8224 |    2.860 |  2864.58 |    0.250 |   127.90 |    3.110 |  2644.42 |
-|  8192 |     32 |    2 |  16448 |    5.702 |  2873.63 |    0.335 |   191.07 |    6.036 |  2724.77 |
-|  8192 |     32 |    4 |  32896 |   11.383 |  2878.69 |    0.456 |   280.72 |   11.839 |  2778.63 |
-|  8192 |     32 |    8 |  65792 |   22.750 |  2880.75 |    0.671 |   381.48 |   23.421 |  2809.14 |
-|  8192 |     32 |   16 | 131584 |   45.484 |  2881.74 |    1.406 |   364.04 |   46.890 |  2806.22 |
-|  8192 |     32 |   32 | 263168 |   90.956 |  2882.10 |    1.793 |   570.98 |   92.749 |  2837.41 |
-
-
- `llama-bench`
-
-| model                          |       size |     params | backend    | threads | n_ubatch | fa |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | --------------: | -------------------: |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | MTL,BLAS   |      16 |     2048 |  1 |          pp2048 |       2923.59 ± 3.10 |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | MTL,BLAS   |      16 |     2048 |  1 |            tg32 |        134.28 ± 1.29 |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | MTL,BLAS   |      16 |     2048 |  1 |  pp2048 @ d4096 |       2748.21 ± 3.05 |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | MTL,BLAS   |      16 |     2048 |  1 |    tg32 @ d4096 |        133.11 ± 0.08 |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | MTL,BLAS   |      16 |     2048 |  1 |  pp2048 @ d8192 |       2641.45 ± 2.31 |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | MTL,BLAS   |      16 |     2048 |  1 |    tg32 @ d8192 |        125.85 ± 0.35 |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | MTL,BLAS   |      16 |     2048 |  1 | pp2048 @ d16384 |       2446.20 ± 2.94 |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | MTL,BLAS   |      16 |     2048 |  1 |   tg32 @ d16384 |        125.00 ± 0.12 |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | MTL,BLAS   |      16 |     2048 |  1 | pp2048 @ d32768 |       2129.18 ± 7.43 |
-| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | MTL,BLAS   |      16 |     2048 |  1 |   tg32 @ d32768 |        113.14 ± 0.10 |
-
-build: b828e18c7 (7948)
-
-## ggml-org/GLM-4.7-Flash-GGUF
-
-Model: https://huggingface.co/ggml-org/GLM-4.7-Flash-GGUF
-
- `llama-batched-bench`
-
-
-main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16
-
-|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
-|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
-|   512 |     32 |    1 |    544 |    0.326 |  1568.69 |    0.522 |    61.28 |    0.849 |   641.09 |
-|   512 |     32 |    2 |   1088 |    0.528 |  1939.42 |    0.744 |    86.07 |    1.272 |   855.63 |
-|   512 |     32 |    4 |   2176 |    0.968 |  2114.85 |    1.105 |   115.85 |    2.073 |  1049.56 |
-|   512 |     32 |    8 |   4352 |    1.928 |  2124.62 |    1.684 |   151.99 |    3.612 |  1204.82 |
-|   512 |     32 |   16 |   8704 |    3.844 |  2131.34 |    3.141 |   162.99 |    6.985 |  1246.11 |
-|   512 |     32 |   32 |  17408 |    7.683 |  2132.38 |    3.924 |   260.95 |   11.608 |  1499.71 |
-|  4096 |     32 |    1 |   4128 |    3.280 |  1248.75 |    0.723 |    44.29 |    4.003 |  1031.33 |
-|  4096 |     32 |    2 |   8256 |    6.545 |  1251.63 |    0.930 |    68.85 |    7.475 |  1104.53 |
-|  4096 |     32 |    4 |  16512 |   13.080 |  1252.64 |    1.454 |    88.03 |   14.534 |  1136.12 |
-|  4096 |     32 |    8 |  33024 |   26.154 |  1252.90 |    2.388 |   107.20 |   28.542 |  1157.04 |
-|  4096 |     32 |   16 |  66048 |   52.297 |  1253.14 |    4.724 |   108.37 |   57.022 |  1158.30 |
-|  4096 |     32 |   32 | 132096 |  104.578 |  1253.34 |    7.266 |   140.93 |  111.844 |  1181.08 |
-|  8192 |     32 |    1 |   8224 |    9.623 |   851.31 |    0.767 |    41.72 |   10.390 |   791.54 |
-|  8192 |     32 |    2 |  16448 |   20.916 |   783.32 |    1.148 |    55.74 |   22.064 |   745.45 |
-|  8192 |     32 |    4 |  32896 |   43.509 |   753.14 |    1.833 |    69.82 |   45.342 |   725.51 |
-|  8192 |     32 |    8 |  65792 |   79.621 |   823.10 |    3.180 |    80.50 |   82.801 |   794.58 |
-|  8192 |     32 |   16 | 131584 |  153.770 |   852.39 |    6.502 |    78.74 |  160.272 |   821.00 |
-|  8192 |     32 |   32 | 263168 |  307.539 |   852.39 |   10.839 |    94.48 |  318.378 |   826.59 |
-
-
- `llama-bench`
-
-| model                          |       size |     params | backend    | threads | n_ubatch | fa |            test |                  t/s |
-| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | --------------: | -------------------: |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | MTL,BLAS   |      16 |     2048 |  1 |          pp2048 |       1629.33 ± 0.27 |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | MTL,BLAS   |      16 |     2048 |  1 |            tg32 |         59.58 ± 0.13 |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | MTL,BLAS   |      16 |     2048 |  1 |  pp2048 @ d4096 |        732.67 ± 0.42 |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | MTL,BLAS   |      16 |     2048 |  1 |    tg32 @ d4096 |         47.44 ± 0.15 |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | MTL,BLAS   |      16 |     2048 |  1 |  pp2048 @ d8192 |        474.33 ± 0.33 |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | MTL,BLAS   |      16 |     2048 |  1 |    tg32 @ d8192 |         40.20 ± 0.20 |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | MTL,BLAS   |      16 |     2048 |  1 | pp2048 @ d16384 |        277.46 ± 0.09 |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | MTL,BLAS   |      16 |     2048 |  1 |   tg32 @ d16384 |         31.50 ± 0.93 |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | MTL,BLAS   |      16 |     2048 |  1 | pp2048 @ d32768 |        151.44 ± 0.05 |
-| deepseek2 30B.A3B Q8_0         |  29.65 GiB |    29.94 B | MTL,BLAS   |      16 |     2048 |  1 |   tg32 @ d32768 |         21.81 ± 0.01 |
-
-build: b828e18c7 (7948)
--- a/benches/nemotron/nemotron-dgx-spark.md
+++ b/benches/nemotron/nemotron-dgx-spark.md
@@ -1,117 +0,0 @@
-# NVIDIA DGX Spark
-
-## System info
-
-```bash
-uname --all
-Linux spark-17ed 6.11.0-1016-nvidia #16-Ubuntu SMP PREEMPT_DYNAMIC Sun Sep 21 16:52:46 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux
-
-g++ --version
-g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
-
-nvidia-smi
-Fri Mar  6 11:39:45 2026
-+-----------------------------------------------------------------------------------------+
-| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
-+-----------------------------------------+------------------------+----------------------+
-| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
-| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
-|                                         |                        |               MIG M. |
-|=========================================+========================+======================|
-|   0  NVIDIA GB10                    On  |   0000000F:01:00.0 Off |                  N/A |
-| N/A   52C    P0             13W /  N/A  | Not Supported          |      0%      Default |
-|                                         |                        |                  N/A |
-+-----------------------------------------+------------------------+----------------------+
-```
-
-## ggml-org/Nemotron-3-Super-120B-GGUF
-
-Model: https://huggingface.co/ggml-org/Nemotron-3-Super-120B-GGUF
-
- `llama-batched-bench`
-
-main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = 99, n_threads = 20, n_threads_batch = 20
-
-|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
-|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
-|   512 |     32 |    1 |    544 |    1.094 |   468.05 |    1.621 |    19.74 |    2.715 |   200.37 |
-|   512 |     32 |    2 |   1088 |    1.463 |   700.16 |    2.437 |    26.26 |    3.900 |   279.01 |
-|   512 |     32 |    4 |   2176 |    2.647 |   773.76 |    4.043 |    31.66 |    6.689 |   325.29 |
-|   512 |     32 |    8 |   4352 |    5.291 |   774.14 |    6.151 |    41.62 |   11.442 |   380.37 |
-|   512 |     32 |   16 |   8704 |   10.603 |   772.62 |   10.385 |    49.30 |   20.987 |   414.72 |
-|   512 |     32 |   32 |  17408 |   21.231 |   771.69 |   18.235 |    56.16 |   39.466 |   441.09 |
-|  4096 |     32 |    1 |   4128 |    5.340 |   767.05 |    1.616 |    19.81 |    6.956 |   593.47 |
-|  4096 |     32 |    2 |   8256 |   10.673 |   767.55 |    2.454 |    26.08 |   13.127 |   628.94 |
-|  4096 |     32 |    4 |  16512 |   21.348 |   767.46 |    4.072 |    31.44 |   25.420 |   649.57 |
-|  4096 |     32 |    8 |  33024 |   42.714 |   767.15 |    6.277 |    40.78 |   48.991 |   674.08 |
-|  4096 |     32 |   16 |  66048 |   85.385 |   767.54 |   10.596 |    48.32 |   95.981 |   688.14 |
-|  4096 |     32 |   32 | 132096 |  170.819 |   767.32 |   18.619 |    55.00 |  189.437 |   697.31 |
-|  8192 |     32 |    1 |   8224 |   10.690 |   766.32 |    1.619 |    19.76 |   12.310 |   668.10 |
-|  8192 |     32 |    2 |  16448 |   21.382 |   766.24 |    2.467 |    25.94 |   23.850 |   689.65 |
-|  8192 |     32 |    4 |  32896 |   42.782 |   765.92 |    4.098 |    31.23 |   46.881 |   701.69 |
-|  8192 |     32 |    8 |  65792 |   85.582 |   765.77 |    6.368 |    40.20 |   91.951 |   715.52 |
-|  8192 |     32 |   16 | 131584 |  171.066 |   766.21 |   10.774 |    47.52 |  181.840 |   723.62 |
-|  8192 |     32 |   32 | 263168 |  342.140 |   766.19 |   18.969 |    53.98 |  361.109 |   728.78 |
-
- `llama-bench`
-
-| model                   |       size |     params | backend    | n_ubatch | fa |            test |                  t/s |
-| ----------------------- | ---------: | ---------: | ---------- | -------: | -: | --------------: | -------------------: |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |          pp2048 |        768.84 ± 0.90 |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |            tg32 |         19.94 ± 0.16 |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |  pp2048 @ d4096 |        764.51 ± 0.50 |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |    tg32 @ d4096 |         19.95 ± 0.18 |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |  pp2048 @ d8192 |        759.53 ± 0.71 |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |    tg32 @ d8192 |         19.83 ± 0.18 |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 | pp2048 @ d16384 |        747.98 ± 1.58 |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |   tg32 @ d16384 |         19.84 ± 0.18 |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 | pp2048 @ d32768 |        724.40 ± 2.70 |
-| nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |   tg32 @ d32768 |         19.45 ± 0.18 |
-
-build: 04a65daab (8268)
-
-## ggml-org/Nemotron-3-Nano-4B-GGUF
-
-Model: https://huggingface.co/ggml-org/Nemotron-3-Nano-4B-GGUF
-
- `llama-batched-bench`
-
-main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = 99, n_threads = 20, n_threads_batch = 20
-
-|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
-|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
-|   512 |     32 |    1 |    544 |    0.152 |  3371.61 |    0.597 |    53.64 |    0.748 |   726.90 |
-|   512 |     32 |    2 |   1088 |    0.319 |  3208.68 |    0.857 |    74.66 |    1.176 |   924.89 |
-|   512 |     32 |    4 |   2176 |    0.720 |  2843.56 |    1.323 |    96.78 |    2.043 |  1065.18 |
-|   512 |     32 |    8 |   4352 |    1.428 |  2867.96 |    2.311 |   110.76 |    3.739 |  1163.82 |
-|   512 |     32 |   16 |   8704 |    2.857 |  2866.94 |    4.203 |   121.82 |    7.060 |  1232.82 |
-|   512 |     32 |   32 |  17408 |    5.709 |  2869.76 |    7.964 |   128.58 |   13.673 |  1273.14 |
-|  4096 |     32 |    1 |   4128 |    1.458 |  2809.76 |    0.605 |    52.92 |    2.062 |  2001.52 |
-|  4096 |     32 |    2 |   8256 |    2.905 |  2819.95 |    0.875 |    73.12 |    3.780 |  2183.95 |
-|  4096 |     32 |    4 |  16512 |    5.790 |  2829.74 |    1.361 |    94.07 |    7.151 |  2309.17 |
-|  4096 |     32 |    8 |  33024 |   11.598 |  2825.32 |    2.378 |   107.65 |   13.976 |  2362.89 |
-|  4096 |     32 |   16 |  66048 |   23.208 |  2823.88 |    4.348 |   117.76 |   27.556 |  2396.89 |
-|  4096 |     32 |   32 | 132096 |   46.515 |  2817.85 |    8.279 |   123.69 |   54.794 |  2410.79 |
-|  8192 |     32 |    1 |   8224 |    2.950 |  2776.95 |    0.617 |    51.89 |    3.567 |  2305.75 |
-|  8192 |     32 |    2 |  16448 |    5.921 |  2767.32 |    0.896 |    71.45 |    6.816 |  2413.05 |
-|  8192 |     32 |    4 |  32896 |   11.842 |  2767.21 |    1.401 |    91.34 |   13.243 |  2484.03 |
-|  8192 |     32 |    8 |  65792 |   23.726 |  2762.17 |    2.461 |   104.03 |   26.187 |  2512.38 |
-|  8192 |     32 |   16 | 131584 |   47.777 |  2743.43 |    4.577 |   111.86 |   52.354 |  2513.36 |
-|  8192 |     32 |   32 | 263168 |   96.691 |  2711.16 |    8.772 |   116.73 |  105.463 |  2495.36 |
-
- `llama-bench`
-
-| model                   |       size |     params | backend    | n_ubatch | fa |            test |                  t/s |
-| ----------------------- | ---------: | ---------: | ---------- | -------: | -: | --------------: | -------------------: |
-| nemotron 4B Q8_0        |   3.94 GiB |     3.97 B | CUDA       |     2048 |  1 |          pp2048 |      2761.90 ± 19.31 |
-| nemotron 4B Q8_0        |   3.94 GiB |     3.97 B | CUDA       |     2048 |  1 |            tg32 |         52.85 ± 0.12 |
-| nemotron 4B Q8_0        |   3.94 GiB |     3.97 B | CUDA       |     2048 |  1 |  pp2048 @ d4096 |      2687.07 ± 21.84 |
-| nemotron 4B Q8_0        |   3.94 GiB |     3.97 B | CUDA       |     2048 |  1 |    tg32 @ d4096 |         52.32 ± 0.23 |
-| nemotron 4B Q8_0        |   3.94 GiB |     3.97 B | CUDA       |     2048 |  1 |  pp2048 @ d8192 |      2564.52 ± 57.69 |
-| nemotron 4B Q8_0        |   3.94 GiB |     3.97 B | CUDA       |     2048 |  1 |    tg32 @ d8192 |         51.27 ± 0.34 |
-| nemotron 4B Q8_0        |   3.94 GiB |     3.97 B | CUDA       |     2048 |  1 | pp2048 @ d16384 |      2334.02 ± 37.83 |
-| nemotron 4B Q8_0        |   3.94 GiB |     3.97 B | CUDA       |     2048 |  1 |   tg32 @ d16384 |         49.71 ± 0.14 |
-| nemotron 4B Q8_0        |   3.94 GiB |     3.97 B | CUDA       |     2048 |  1 | pp2048 @ d32768 |      2041.46 ± 40.45 |
-| nemotron 4B Q8_0        |   3.94 GiB |     3.97 B | CUDA       |     2048 |  1 |   tg32 @ d32768 |         46.71 ± 0.13 |
-
-build: 1bbec6a75 (8382)
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -43,6 +43,11 @@ COMMON_CMAKE_ARGS=(
    -DGGML_OPENMP=${GGML_OPENMP}
 )

+XCODE_VERSION=$(xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }')
+MAJOR_VERSION=$(echo "$XCODE_VERSION" | cut -d. -f1)
+MINOR_VERSION=$(echo "$XCODE_VERSION" | cut -d. -f2)
+echo "Detected Xcode version: $XCODE_VERSION"
+
 check_required_tool() {
    local tool=$1
    local install_message=$2
@@ -55,12 +60,9 @@ check_required_tool() {
 }
 echo "Checking for required tools..."
 check_required_tool "cmake" "Please install CMake 3.28.0 or later (brew install cmake)"
-check_required_tool "xcrun" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
-
-XCODE_VERSION=$(xcrun xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }')
-MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1)
-MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2)
-echo "Detected Xcode version: $XCODE_VERSION"
+check_required_tool "xcodebuild" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"
+check_required_tool "libtool" "Please install libtool which should be available with Xcode Command Line Tools (CLT). Make sure Xcode CLT is installed (xcode-select --install)"
+check_required_tool "dsymutil" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)"

 set -e

@@ -258,7 +260,7 @@ combine_static_libraries() {

    # Since we have multiple architectures libtool will find object files that do not
    # match the target architecture. We suppress these warnings.
-    xcrun libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null
+    libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null

    # Determine SDK, architectures, and install_name based on platform and simulator flag.
    local sdk=""
@@ -331,7 +333,7 @@ combine_static_libraries() {

    # Platform-specific post-processing for device builds
    if [[ "$is_simulator" == "false" ]]; then
-        if xcrun -f vtool &>/dev/null; then
+        if xcrun -f vtool &>/dev/null; then
            case "$platform" in
                "ios")
                    echo "Marking binary as a framework binary for iOS..."
@@ -412,7 +414,7 @@ cmake -B build-ios-sim -G Xcode \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-    -DLLAMA_OPENSSL=OFF \
+    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-ios-sim --config Release -- -quiet

@@ -426,7 +428,7 @@ cmake -B build-ios-device -G Xcode \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-    -DLLAMA_OPENSSL=OFF \
+    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-ios-device --config Release -- -quiet

@@ -437,7 +439,7 @@ cmake -B build-macos -G Xcode \
    -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-    -DLLAMA_OPENSSL=OFF \
+    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-macos --config Release -- -quiet

@@ -449,10 +451,9 @@ cmake -B build-visionos -G Xcode \
    -DCMAKE_SYSTEM_NAME=visionOS \
    -DCMAKE_OSX_SYSROOT=xros \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-    -DLLAMA_OPENSSL=OFF \
-    -DLLAMA_BUILD_SERVER=OFF \
+    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
+    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-visionos --config Release -- -quiet

@@ -464,10 +465,9 @@ cmake -B build-visionos-sim -G Xcode \
    -DCMAKE_SYSTEM_NAME=visionOS \
    -DCMAKE_OSX_SYSROOT=xrsimulator \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-    -DLLAMA_OPENSSL=OFF \
-    -DLLAMA_BUILD_SERVER=OFF \
+    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
+    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-visionos-sim --config Release -- -quiet

@@ -483,7 +483,7 @@ cmake -B build-tvos-sim -G Xcode \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-    -DLLAMA_OPENSSL=OFF \
+    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-tvos-sim --config Release -- -quiet

@@ -498,7 +498,7 @@ cmake -B build-tvos-device -G Xcode \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \
    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-    -DLLAMA_OPENSSL=OFF \
+    -DLLAMA_CURL=OFF \
    -S .
 cmake --build build-tvos-device --config Release -- -quiet

@@ -524,13 +524,13 @@ combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false"

 # Create XCFramework with correct debug symbols paths
 echo "Creating XCFramework..."
-xcrun xcodebuild -create-xcframework \
+xcodebuild -create-xcframework \
    -framework $(pwd)/build-ios-sim/framework/llama.framework \
    -debug-symbols $(pwd)/build-ios-sim/dSYMs/llama.dSYM \
    -framework $(pwd)/build-ios-device/framework/llama.framework \
    -debug-symbols $(pwd)/build-ios-device/dSYMs/llama.dSYM \
    -framework $(pwd)/build-macos/framework/llama.framework \
-    -debug-symbols $(pwd)/build-macos/dSYMs/llama.dSYM \
+    -debug-symbols $(pwd)/build-macos/dSYMs/llama.dSYM \
    -framework $(pwd)/build-visionos/framework/llama.framework \
    -debug-symbols $(pwd)/build-visionos/dSYMs/llama.dSYM \
    -framework $(pwd)/build-visionos-sim/framework/llama.framework \
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -25,15 +25,6 @@
 # # with KLEIDIAI support
 # GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
-# # with BLAS support
-# GG_BUILD_BLAS=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-#
-# with BLAS support (custom vendor)
-# GG_BUILD_BLAS=1 GG_BUILD_BLAS_VENDOR=Intel10_64lp bash ./ci/run.sh ./tmp/results ./tmp/mnt
-#
-# with OPENVINO support
-# GG_BUILD_OPENVINO=1 GG_BUILD_LOW_PERF=1 GGML_OPENVINO_DEVICE=CPU bash ./ci/run.sh ./tmp/results ./tmp/mnt
-#

 if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
@@ -54,23 +45,14 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"
-CTEST_EXTRA=""
-
-# Default to use make unless specified for compatibility
-CMAKE_GENERATOR="Unix Makefiles"
-
-if [ ! -z "${GG_BUILD_NINJA}" ]; then
-    CMAKE_GENERATOR="Ninja"
-fi
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DGGML_CUDA_CUB_3DOT2=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON"

    if command -v nvidia-smi >/dev/null 2>&1; then
        CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')
@@ -119,28 +101,10 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
        CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
    fi

-    # Build shared libs on Windows
-    # to reduce binary size and avoid errors in library loading unit tests
-    if uname -s | grep -qi nt; then
-        CMAKE_EXTRA="${CMAKE_EXTRA} -DBUILD_SHARED_LIBS=ON"
-    fi
 fi

 if [ ! -z ${GG_BUILD_WEBGPU} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1 -DGGML_METAL=OFF -DGGML_BLAS=OFF"
-
-    if [ ! -z "${GG_BUILD_WEBGPU_DAWN_PREFIX}" ]; then
-        if [ -z "${CMAKE_PREFIX_PATH}" ]; then
-            export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}"
-        else
-            export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}:${CMAKE_PREFIX_PATH}"
-        fi
-    fi
-
-    # For some systems, Dawn_DIR needs to be set explicitly, e.g., the lib64 path
-    if [ ! -z "${GG_BUILD_WEBGPU_DAWN_DIR}" ]; then
-        CMAKE_EXTRA="${CMAKE_EXTRA} -DDawn_DIR=${GG_BUILD_WEBGPU_DAWN_DIR}"
-    fi
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"
 fi

 if [ ! -z ${GG_BUILD_MUSA} ]; then
@@ -156,23 +120,30 @@ fi

 if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
    echo ">>===== Enabling KleidiAI support"
-    CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } -DGGML_CPU_KLEIDIAI=ON"
-fi

-if [ ! -z ${GG_BUILD_BLAS} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=${GG_BUILD_BLAS_VENDOR:-OpenBLAS}"
-fi
+    CANDIDATES=("armv9-a+dotprod+i8mm" "armv8.6-a+dotprod+i8mm" "armv8.2-a+dotprod")
+    CPU=""

-if [ ! -z ${GG_BUILD_OPENVINO} ]; then
-    if [ -z ${OpenVINO_DIR} ]; then
-        echo "OpenVINO_DIR not found, please install OpenVINO via archives and enable it by:"
-        echo "source /opt/intel/openvino/setupvars.sh"
+    for cpu in "${CANDIDATES[@]}"; do
+        if echo 'int main(){}' | ${CXX:-c++} -march="$cpu" -x c++ - -c -o /dev/null >/dev/null 2>&1; then
+            CPU="$cpu"
+            break
+        fi
+    done
+
+    if [ -z "$CPU" ]; then
+        echo "ERROR: None of the required ARM baselines (armv9/armv8.6/armv8.2 + dotprod) are supported by this compiler."
        exit 1
    fi
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON"

-    # TODO: fix and re-enable the `test-llama-archs` test below
-    CTEST_EXTRA="-E test-llama-archs"
+    echo ">>===== Using ARM baseline: ${CPU}"
+
+    CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } \
+        -DGGML_NATIVE=OFF \
+        -DGGML_CPU_KLEIDIAI=ON \
+        -DGGML_CPU_AARCH64=ON \
+        -DGGML_CPU_ARM_ARCH=${CPU} \
+        -DBUILD_SHARED_LIBS=OFF"
 fi

 ## helpers
@@ -226,13 +197,13 @@ function gg_run_ctest_debug {

    set -e

-    # Check required binaries are installed
+    # Check cmake, make and ctest are installed
    gg_check_build_requirements

-    (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time cmake --build . --config Debug -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                  ) 2>&1 | tee -a $OUT/${ci}-make.log

-    (time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ) 2>&1 | tee -a $OUT/${ci}-ctest.log

    set +e
 }
@@ -257,16 +228,16 @@ function gg_run_ctest_release {

    set -e

-    # Check required binaries are installed
+    # Check cmake, make and ctest are installed
    gg_check_build_requirements

-    (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

    if [ -z ${GG_BUILD_LOW_PERF} ]; then
-        (time ctest -C Release --output-on-failure -L 'main|python' ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
+        (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
    else
-        (time ctest -C Release --output-on-failure -L main -E test-opt ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
+        (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
    fi

    set +e
@@ -307,8 +278,7 @@ function gg_sum_test_scripts {
 }

 function gg_get_model {
-    #local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
-    local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-q4_0.gguf"
+    local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
    if [[ -s $gguf_0 ]]; then
        echo -n "$gguf_0"
    else
@@ -324,7 +294,7 @@ function gg_run_ctest_with_model_debug {
    cd build-ci-debug
    set -e

-    (LLAMACPP_TEST_MODELFILE="$model" time ctest -C Debug --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log

    set +e
    cd ..
@@ -337,7 +307,7 @@ function gg_run_ctest_with_model_release {
    cd build-ci-release
    set -e

-    (LLAMACPP_TEST_MODELFILE="$model" time ctest -C Release --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log

    # test memory leaks
    #if [[ ! -z ${GG_BUILD_METAL} ]]; then
@@ -391,8 +361,8 @@ function gg_run_qwen3_0_6b {

    set -e

-    (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf  --outtype f16
    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-bf16.gguf --outtype bf16
@@ -423,20 +393,18 @@ function gg_run_qwen3_0_6b {
    ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
    ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)

-    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
-
-    (time ./bin/llama-completion -no-cnv --model ${model_f16}  -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-completion -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    if [ -z ${GG_BUILD_NO_BF16} ]; then
@@ -455,10 +423,10 @@ function gg_run_qwen3_0_6b {

    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on  --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off                ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on                 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on  ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on  ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
@@ -540,8 +508,8 @@ function gg_run_embd_bge_small {

    set -e

-    (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

@@ -550,10 +518,8 @@ function gg_run_embd_bge_small {

    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

-    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
-
-    (time ./bin/llama-embedding --model ${model_f16}  -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-embedding --model ${model_f16}  -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

    set +e
 }
@@ -585,17 +551,15 @@ function gg_run_rerank_tiny {

    set -e

-    (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"

-    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
-
    # for this model, the SEP token is "</s>"
-    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log

    # sample output
    # rerank score 0:    0.029
@@ -632,36 +596,12 @@ function gg_sum_rerank_tiny {
 }

 function gg_check_build_requirements {
-    if ! command -v git &> /dev/null; then
-        gg_printf 'git not found, please install'
-    fi
-
-    if ! command -v git-lfs &> /dev/null; then
-        gg_printf 'git-lfs not found, please install'
-    fi
-
-    if ! command -v wget &> /dev/null; then
-        gg_printf 'wget not found, please install'
-    fi
-
-    if ! command -v python3 &> /dev/null; then
-        gg_printf 'python3 not found, please install'
-    fi
-
-    if ! command -v pip3 &> /dev/null; then
-        gg_printf 'pip3 not found, please install'
-    fi
-
-    if ! python3 -m ensurepip --help &> /dev/null; then
-        gg_printf 'ensurepip not found, please install python3-venv package'
-    fi
-
    if ! command -v cmake &> /dev/null; then
        gg_printf 'cmake not found, please install'
    fi

-    if ! command -v ccache &> /dev/null; then
-        gg_printf 'ccache not found, please consider installing for faster builds'
+    if ! command -v make &> /dev/null; then
+        gg_printf 'make not found, please install'
    fi

    if ! command -v ctest &> /dev/null; then
@@ -669,29 +609,6 @@ function gg_check_build_requirements {
    fi
 }

-function gg_run_test_backend_ops_cpu {
-    cd ${SRC}
-
-    cd build-ci-release
-
-    set -e
-
-    (time ./bin/test-backend-ops -b CPU ) 2>&1 | tee -a $OUT/${ci}-test-backend-ops-cpu.log
-
-    set +e
-}
-
-function gg_sum_test_backend_ops_cpu {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Runs test-backend-ops for CPU backend\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '```\n'
-    gg_printf '%s\n' "$(cat $OUT/${ci}-test-backend-ops-cpu.log)"
-    gg_printf '```\n'
-    gg_printf '\n'
-}
-
 ## main

 export LLAMA_LOG_PREFIX=1
@@ -720,10 +637,6 @@ ret=0
 test $ret -eq 0 && gg_run ctest_debug
 test $ret -eq 0 && gg_run ctest_release

-if [ ! -z ${GG_BUILD_HIGH_PERF} ]; then
-    test $ret -eq 0 && gg_run test_backend_ops_cpu
-fi
-
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run embd_bge_small
    test $ret -eq 0 && gg_run rerank_tiny
--- a/cmake/build-info.cmake
+++ b/cmake/build-info.cmake
@@ -39,10 +39,26 @@ if(Git_FOUND)
    endif()
 endif()

-set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
-
-if(CMAKE_VS_PLATFORM_NAME)
-    set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
+if(MSVC)
+    set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
+    if (CMAKE_VS_PLATFORM_NAME)
+        set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
+    else()
+        set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
+    endif()
 else()
-    set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
+    execute_process(
+        COMMAND ${CMAKE_C_COMPILER} --version
+        OUTPUT_VARIABLE OUT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    string(REGEX REPLACE " *\n.*" "" OUT "${OUT}")
+    set(BUILD_COMPILER ${OUT})
+
+    execute_process(
+        COMMAND ${CMAKE_C_COMPILER} -dumpmachine
+        OUTPUT_VARIABLE OUT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    set(BUILD_TARGET ${OUT})
 endif()
--- a/cmake/common.cmake
+++ b/cmake/common.cmake
@@ -32,27 +32,4 @@ function(llama_add_compile_flags)
            set(CXX_FLAGS "" PARENT_SCOPE)
        endif()
    endif()
-
-    if (NOT MSVC)
-        if (LLAMA_SANITIZE_THREAD)
-            message(STATUS "Using -fsanitize=thread")
-
-            add_compile_options(-fsanitize=thread)
-            link_libraries     (-fsanitize=thread)
-        endif()
-
-        if (LLAMA_SANITIZE_ADDRESS)
-            message(STATUS "Using -fsanitize=address")
-
-            add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
-            link_libraries     (-fsanitize=address)
-        endif()
-
-        if (LLAMA_SANITIZE_UNDEFINED)
-            message(STATUS "Using -fsanitize=undefined")
-
-            add_compile_options(-fsanitize=undefined)
-            link_libraries     (-fsanitize=undefined)
-        endif()
-    endif()
 endfunction()
--- a/cmake/download-models.cmake
+++ b/cmake/download-models.cmake
@@ -1,21 +0,0 @@
-get_filename_component(DEST_DIR "${DEST}" DIRECTORY)
-file(MAKE_DIRECTORY "${DEST_DIR}")
-
-if(NOT EXISTS "${DEST}")
-    message(STATUS "Downloading ${NAME} from ggml-org/models...")
-endif()
-
-file(DOWNLOAD
-    "https://huggingface.co/ggml-org/models/resolve/main/${NAME}?download=true"
-    "${DEST}"
-    TLS_VERIFY ON
-    EXPECTED_HASH ${HASH}
-    STATUS status
-)
-
-list(GET status 0 code)
-
-if(NOT code EQUAL 0)
-    list(GET status 1 msg)
-    message(FATAL_ERROR "Failed to download ${NAME}: ${msg}")
-endif()
--- a/cmake/license.cmake
+++ b/cmake/license.cmake
@@ -1,40 +0,0 @@
-define_property(GLOBAL PROPERTY LICENSE_TEXT
-    BRIEF_DOCS "Embedded licenses"
-    FULL_DOCS  "Global string containing all aggregated licenses"
-)
-
-function(license_add_file NAME FILE)
-    if(NOT IS_ABSOLUTE "${FILE}")
-        set(FILE "${CMAKE_CURRENT_SOURCE_DIR}/${FILE}")
-    endif()
-    if(EXISTS "${FILE}")
-        set(TITLE "License for ${NAME}")
-        string(REGEX REPLACE "." "=" UNDERLINE "${TITLE}")
-        file(READ "${FILE}" TEXT)
-        get_property(TMP GLOBAL PROPERTY LICENSE_TEXT)
-        string(APPEND TMP "R\"=L=(${TITLE}\n${UNDERLINE}\n\n${TEXT})=L=\",\n")
-        set_property(GLOBAL PROPERTY LICENSE_TEXT "${TMP}")
-    else()
-        message(WARNING "License file '${FILE}' not found")
-    endif()
-endfunction()
-
-function(license_generate TARGET_NAME)
-    message(STATUS "Generating embedded license file for target: ${TARGET_NAME}")
-    get_property(TEXT GLOBAL PROPERTY LICENSE_TEXT)
-
-    set(CPP_CONTENT "// Generated by CMake\n\n")
-    string(APPEND CPP_CONTENT "const char* LICENSES[] = {\n")
-    string(APPEND CPP_CONTENT "${TEXT}")
-    string(APPEND CPP_CONTENT "nullptr\n")
-    string(APPEND CPP_CONTENT "};\n")
-
-    set(CPP_FILE "${CMAKE_BINARY_DIR}/license.cpp")
-    file(WRITE "${CPP_FILE}" "${CPP_CONTENT}")
-
-    if(TARGET ${TARGET_NAME})
-        target_sources(${TARGET_NAME} PRIVATE "${CPP_FILE}")
-    else()
-        message(FATAL_ERROR "Target '${TARGET_NAME}' does not exist")
-    endif()
-endfunction()
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -5,6 +5,7 @@ find_package(Threads REQUIRED)
 llama_add_compile_flags()

 # Build info header
+#

 if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
    set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
@@ -47,24 +48,14 @@ add_library(${TARGET} STATIC
    arg.cpp
    arg.h
    base64.hpp
-    chat-auto-parser-generator.cpp
-    chat-auto-parser-helpers.cpp
-    chat-auto-parser.h
-    chat-diff-analyzer.cpp
-    chat-peg-parser.cpp
-    chat-peg-parser.h
+    chat-parser.cpp
+    chat-parser.h
    chat.cpp
    chat.h
    common.cpp
    common.h
    console.cpp
    console.h
-    debug.cpp
-    debug.h
-    download.cpp
-    download.h
-    hf-cache.cpp
-    hf-cache.h
    http.h
    json-partial.cpp
    json-partial.h
@@ -74,55 +65,86 @@ add_library(${TARGET} STATIC
    log.h
    ngram-cache.cpp
    ngram-cache.h
-    ngram-map.cpp
-    ngram-map.h
-    ngram-mod.cpp
-    ngram-mod.h
-    peg-parser.cpp
-    peg-parser.h
-    preset.cpp
-    preset.h
    regex-partial.cpp
-    reasoning-budget.cpp
-    reasoning-budget.h
    regex-partial.h
    sampling.cpp
    sampling.h
    speculative.cpp
    speculative.h
-    unicode.cpp
-    unicode.h
-    jinja/lexer.cpp
-    jinja/lexer.h
-    jinja/parser.cpp
-    jinja/parser.h
-    jinja/runtime.cpp
-    jinja/runtime.h
-    jinja/value.cpp
-    jinja/value.h
-    jinja/string.cpp
-    jinja/string.h
-    jinja/caps.cpp
-    jinja/caps.h
    )

-target_include_directories(${TARGET} PUBLIC . ../vendor)
-target_compile_features   (${TARGET} PUBLIC cxx_std_17)
-
 if (BUILD_SHARED_LIBS)
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()

-target_link_libraries(${TARGET} PRIVATE
-    build_info
-    cpp-httplib
-)
+set(LLAMA_COMMON_EXTRA_LIBS build_info)
+
+# Use curl to download model url
+if (LLAMA_CURL)
+    find_package(CURL)
+    if (NOT CURL_FOUND)
+        message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF")
+    endif()
+    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
+    include_directories(${CURL_INCLUDE_DIRS})
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
+endif()
+
+if (LLAMA_OPENSSL)
+    find_package(OpenSSL)
+    if (OpenSSL_FOUND)
+        include(CheckCSourceCompiles)
+        set(SAVED_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
+        set(CMAKE_REQUIRED_INCLUDES ${OPENSSL_INCLUDE_DIR})
+        check_c_source_compiles("
+        #include <openssl/opensslv.h>
+        #if defined(OPENSSL_IS_BORINGSSL) || defined(LIBRESSL_VERSION_NUMBER)
+        #    if OPENSSL_VERSION_NUMBER < 0x1010107f
+        #        error bad version
+        #    endif
+        #else
+        #    if OPENSSL_VERSION_NUMBER < 0x30000000L
+        #        error bad version
+        #    endif
+        #endif
+        int main() { return 0; }
+        " OPENSSL_VERSION_SUPPORTED)
+        set(CMAKE_REQUIRED_INCLUDES ${SAVED_CMAKE_REQUIRED_INCLUDES})
+        if (OPENSSL_VERSION_SUPPORTED)
+            message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
+            target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT)
+            target_link_libraries(${TARGET} PUBLIC OpenSSL::SSL OpenSSL::Crypto)
+            if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+                target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
+                find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED)
+                find_library(SECURITY_FRAMEWORK Security REQUIRED)
+                target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK})
+            endif()
+        endif()
+    else()
+        message(STATUS "OpenSSL not found, SSL support disabled")
+    endif()
+endif()

 if (LLAMA_LLGUIDANCE)
    include(ExternalProject)
    set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
    set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
-    set(LLGUIDANCE_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}llguidance${CMAKE_STATIC_LIBRARY_SUFFIX}")
+
+    # Set the correct library file extension based on platform
+    if (WIN32)
+        set(LLGUIDANCE_LIB_NAME "llguidance.lib")
+        # Add Windows-specific libraries
+        set(LLGUIDANCE_PLATFORM_LIBS
+            ws2_32    # Windows Sockets API
+            userenv   # For GetUserProfileDirectoryW
+            ntdll     # For NT functions
+            bcrypt    # For BCryptGenRandom
+        )
+    else()
+        set(LLGUIDANCE_LIB_NAME "libllguidance.a")
+        set(LLGUIDANCE_PLATFORM_LIBS "")
+    endif()

    ExternalProject_Add(llguidance_ext
        GIT_REPOSITORY https://github.com/guidance-ai/llguidance
@@ -144,10 +166,34 @@ if (LLAMA_LLGUIDANCE)
    add_dependencies(llguidance llguidance_ext)

    target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
-    target_link_libraries(${TARGET} PRIVATE llguidance)
-    if (WIN32)
-        target_link_libraries(${TARGET} PRIVATE ws2_32 userenv ntdll bcrypt)
-    endif()
-endif()
+    # Queue llguidance and its platform libraries to be linked into the main target below
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
+endif ()

-target_link_libraries(${TARGET} PUBLIC llama Threads::Threads)
+target_include_directories(${TARGET} PUBLIC . ../vendor)
+target_compile_features   (${TARGET} PUBLIC cxx_std_17)
+target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
+
+
+#
+# copy the license files
+#
+
+# Only register the license-copy steps when configuring inside GitHub Actions
+if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
+    message(STATUS "Running inside GitHub Actions - copying license files")
+
+    # Copy every regular file in licenses/ into the llama target's output directory (directories are skipped: copy_if_different cannot copy them)
+    file(GLOB LICENSE_FILES LIST_DIRECTORIES false "${CMAKE_SOURCE_DIR}/licenses/*")
+    foreach(LICENSE_FILE ${LICENSE_FILES})
+        get_filename_component(FILENAME ${LICENSE_FILE} NAME)
+        add_custom_command(
+            TARGET ${TARGET}
+            POST_BUILD
+            COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                "${LICENSE_FILE}"
+                "$<TARGET_FILE_DIR:llama>/${FILENAME}"
+            COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
+        message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
+    endforeach()
+endif()
--- a/common/arg.cpp
+++ b/common/arg.cpp
--- a/common/arg.h
+++ b/common/arg.h
@@ -3,14 +3,8 @@
 #include "common.h"

 #include <set>
-#include <map>
 #include <string>
 #include <vector>
-#include <cstring>
-
-// pseudo-env variable to identify preset-only arguments
-#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
-#define COMMON_ARG_PRESET_STOP_TIMEOUT    "__PRESET_STOP_TIMEOUT"

 //
 // CLI argument parsing
@@ -20,20 +14,15 @@ struct common_arg {
    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
    std::set<enum llama_example> excludes = {};
    std::vector<const char *> args;
-    std::vector<const char *> args_neg;  // for negated args like --no-xxx
    const char * value_hint   = nullptr; // help text or example for arg value
    const char * value_hint_2 = nullptr; // for second arg value
    const char * env          = nullptr;
    std::string help;
    bool is_sparam = false; // is current arg a sampling param?
-    bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
    void (*handler_void)   (common_params & params) = nullptr;
    void (*handler_string) (common_params & params, const std::string &) = nullptr;
    void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
    void (*handler_int)    (common_params & params, int) = nullptr;
-    void (*handler_bool)   (common_params & params, bool) = nullptr;
-
-    common_arg() = default;

    common_arg(
        const std::initializer_list<const char *> & args,
@@ -55,13 +44,6 @@ struct common_arg {
        void (*handler)(common_params & params)
    ) : args(args), help(help), handler_void(handler) {}

-    common_arg(
-        const std::initializer_list<const char *> & args,
-        const std::initializer_list<const char *> & args_neg,
-        const std::string & help,
-        void (*handler)(common_params & params, bool)
-    ) : args(args), args_neg(args_neg), help(help), handler_bool(handler) {}
-
    // support 2 values for arg
    common_arg(
        const std::initializer_list<const char *> & args,
@@ -75,38 +57,13 @@ struct common_arg {
    common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
    common_arg & set_env(const char * env);
    common_arg & set_sparam();
-    common_arg & set_preset_only();
    bool in_example(enum llama_example ex);
    bool is_exclude(enum llama_example ex);
-    bool get_value_from_env(std::string & output) const;
-    bool has_value_from_env() const;
-    std::string to_string() const;
-
-    // for using as key in std::map
-    bool operator<(const common_arg& other) const {
-        if (args.empty() || other.args.empty()) {
-            return false;
-        }
-        return strcmp(args[0], other.args[0]) < 0;
-    }
-    bool operator==(const common_arg& other) const {
-        if (args.empty() || other.args.empty()) {
-            return false;
-        }
-        return strcmp(args[0], other.args[0]) == 0;
-    }
-
-    // get all args and env vars (including negated args/env)
-    std::vector<std::string> get_args() const;
-    std::vector<std::string> get_env() const;
+    bool get_value_from_env(std::string & output);
+    bool has_value_from_env();
+    std::string to_string();
 };

-namespace common_arg_utils {
-    bool is_truthy(const std::string & value);
-    bool is_falsey(const std::string & value);
-    bool is_autoy(const std::string & value);
-}
-
 struct common_params_context {
    enum llama_example ex = LLAMA_EXAMPLE_COMMON;
    common_params & params;
@@ -119,13 +76,13 @@ struct common_params_context {
 // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

-// parse input arguments from CLI into a map
-bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
-
-// populate preset-only arguments
-// these arguments are not treated as command line arguments
-// see: https://github.com/ggml-org/llama.cpp/issues/18163
-void common_params_add_preset_options(std::vector<common_arg> & args);
-
-// initialize argument parser context - used by test-arg-parser and preset
+// function to be used by test-arg-parser
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+
+struct common_remote_params {
+    std::vector<std::string> headers; // NOTE(review): presumably extra HTTP request headers - confirm against the implementation
+    long timeout = 0; // CURLOPT_TIMEOUT, in seconds; 0 means no timeout
+    long max_size = 0; // maximum size of the response; 0 means unlimited; the upper limit is 2GB
+};
+// fetch the remote content at `url` using `params`; returns the pair <http_code, raw_response_body>
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@@ -1,487 +0,0 @@
-#include "chat-auto-parser-helpers.h"
-#include "chat-auto-parser.h"
-#include "chat-peg-parser.h"
-#include "chat.h"
-#include "common.h"
-#include "json-schema-to-grammar.h"
-#include "log.h"
-#include "nlohmann/json.hpp"
-#include "peg-parser.h"
-
-#include <stdexcept>
-#include <string>
-
-using json = nlohmann::ordered_json;
-
-// Helper to iterate over tools/functions
-static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
-    for (const auto & tool : tools) {
-        if (!tool.contains("type") || tool.at("type") != "function" || !tool.contains("function")) {
-            continue;
-        }
-        fn(tool);
-    }
-}
-
-namespace autoparser {
-
-parser_build_context::parser_build_context(common_chat_peg_builder & p, const generation_params & inputs) :
-    p(p),
-    inputs(inputs),
-    reasoning_parser(p.eps()) {}
-
-common_chat_params peg_generator::generate_parser(const common_chat_template &    tmpl,
-                                                  const struct generation_params & inputs) {
-    // Run differential analysis to extract template structure
-    struct autoparser autoparser;
-    autoparser.analyze_template(tmpl);
-    return generate_parser(tmpl, inputs, autoparser);
-}
-
-common_chat_params peg_generator::generate_parser(const common_chat_template &    tmpl,
-                                                  const struct generation_params & inputs,
-                                                  const autoparser &              autoparser) {
-    // Create the result structure
-    common_chat_params data;
-    data.prompt           = common_chat_template_direct_apply(tmpl, inputs);
-    data.format           = COMMON_CHAT_FORMAT_PEG_NATIVE;
-    data.preserved_tokens = autoparser.preserved_tokens;
-
-    auto parser = autoparser.build_parser(inputs);
-    data.parser = parser.save();
-
-    // Build grammar if tools are present
-    bool has_tools =
-        autoparser.tools.format.mode != tool_format::NONE && inputs.tools.is_array() && !inputs.tools.empty();
-    std::string trigger_marker = !autoparser.tools.format.section_start.empty() ? autoparser.tools.format.section_start :
-                                                                                  autoparser.tools.format.per_call_start;
-
-    bool has_response_format = !inputs.json_schema.empty() && inputs.json_schema.is_object();
-    bool include_grammar = has_response_format || (has_tools &&
-            ((inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO && !trigger_marker.empty()) ||
-              inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));
-
-    if (include_grammar) {
-        data.grammar_lazy = !has_response_format && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
-        data.grammar      = build_grammar([&](const common_grammar_builder & builder) {
-            foreach_function(inputs.tools, [&](const json & tool) {
-                const auto & function = tool.at("function");
-                auto         schema   = function.contains("parameters") ? function.at("parameters") : json::object();
-                builder.resolve_refs(schema);
-            });
-            parser.build_grammar(builder, data.grammar_lazy);
-        });
-
-        // Set grammar triggers based on tool section markers (fall back to per-call markers)
-        if (data.grammar_lazy) {
-            data.grammar_triggers = {
-                { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, trigger_marker }
-            };
-        }
-    }
-
-    return data;
-}
-
-common_peg_arena autoparser::build_parser(const generation_params & inputs) const {
-    if (!analysis_complete) {
-        throw std::invalid_argument("Cannot call build_parser on autoparser without performing analysis first, call analyze_template(...)");
-    }
-    return build_chat_peg_parser([&](common_chat_peg_builder & p) {
-        parser_build_context ctx(p, inputs);
-        bool                 extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
-
-        ctx.extracting_reasoning = extract_reasoning && reasoning.mode != reasoning_mode::NONE;
-        ctx.content              = &content;
-        ctx.reasoning            = &reasoning;
-
-        // Build reasoning parser
-        ctx.reasoning_parser = reasoning.build_parser(ctx);
-
-        auto parser = p.eps();
-
-        bool has_tools           = inputs.tools.is_array() && !inputs.tools.empty();
-        bool has_response_format = inputs.json_schema.is_object() && !inputs.json_schema.empty();
-        bool pure_content        = reasoning.mode == reasoning_mode::NONE;
-
-        if (has_response_format) {
-            auto response_format = p.rule("response-format", p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
-            parser = ctx.reasoning_parser + p.space() + p.choice({
-                p.literal("```json") + p.space() + response_format + p.space() + p.literal("```"),
-                response_format
-            }) + p.end();
-            pure_content = false;
-        } else if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
-            parser = tools.build_parser(ctx);
-            pure_content = false;
-        } else {
-            parser = content.build_parser(ctx);
-        }
-        return pure_content ? p.prefix(inputs.generation_prompt, reasoning.start) + parser : p.prefix(inputs.generation_prompt, reasoning.start) << parser;
-    });
-}
-
-common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) const {
-    auto & p = ctx.p;
-
-    if (!ctx.extracting_reasoning) {
-        return p.eps();
-    }
-
-    if (mode == reasoning_mode::TAG_BASED || mode == reasoning_mode::TOOLS_ONLY) {
-        if (!end.empty()) {
-            if (!start.empty()) {
-                // Standard tag-based: optional(<think>reasoning</think>)
-                return p.optional(start + p.reasoning(p.until(end)) + end + p.space());
-            }
-            // Delimiter-style (empty start)
-            return p.optional(p.reasoning(p.until(end)) + end + p.space());
-        }
-    }
-
-    return p.eps();
-}
-
-common_peg_parser analyze_content::build_parser(parser_build_context & ctx) const {
-    auto & p = ctx.p;
-
-    if (is_always_wrapped()) {
-        if (ctx.extracting_reasoning) {
-            return ctx.reasoning_parser + start + p.content(p.until(end)) + end + p.end();
-        }
-        return p.content(p.until(start)) + start + p.content(p.until(end)) + end + p.end();
-    }
-    return ctx.reasoning_parser + p.content(p.rest()) + p.end();
-}
-
-common_peg_parser analyze_content::build_optional_wrapped(parser_build_context & ctx) const {
-    auto & p = ctx.p;
-
-    if (is_always_wrapped()) {
-        return p.optional(start + p.content(p.until(end)) + end);
-    }
-    return p.eps();
-}
-
-common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const {
-    switch (format.mode) {
-        case tool_format::JSON_NATIVE:
-            return build_tool_parser_json_native(ctx);
-        case tool_format::TAG_WITH_JSON:
-            return build_tool_parser_tag_json(ctx);
-        case tool_format::TAG_WITH_TAGGED:
-            return build_tool_parser_tag_tagged(ctx);
-        default:
-            LOG_ERR("[ERROR] Template seems to support tool calls, but failed to determine tool format. Tool calling will not work properly. "
-                "Check for a fixed template for your model in the models/templates directory of your llama.cpp installation or "
-                "report an issue at https://github.com/ggml-org/llama.cpp/issues\n");
-            return ctx.p.eps();
-    }
-}
-
-common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_context & ctx) const {
-    auto &       p           = ctx.p;
-    const auto & inputs      = ctx.inputs;
-    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-
-    // Build effective field names with dot notation if function_field is set
-    std::string name_field = format.name_field;
-    std::string args_field = format.args_field;
-
-    if (!format.function_field.empty() && format.function_field != "function" &&
-        name_field.find('.') == std::string::npos) {
-        name_field = format.function_field + "." + name_field;
-        args_field = format.function_field + "." + args_field;
-    }
-
-    auto tools_parser = p.standard_json_tools(
-        format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
-        inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
-        format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
-
-    // Handle content wrappers if present
-    if (ctx.content && ctx.content->is_always_wrapped()) {
-        auto wrapped_content = ctx.content->build_optional_wrapped(ctx);
-        return ctx.reasoning_parser + wrapped_content + tools_parser + p.end();
-    }
-
-    std::string tool_start = "{";
-    if (!format.section_start.empty()) {
-        tool_start = format.section_start;
-    } else if (!format.per_call_start.empty()) {
-        tool_start = format.per_call_start;
-    }
-
-    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(p.until(tool_start)))) + tools_parser +
-           p.end();
-}
-
-common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p, const std::string & name,
-                                                    const common_peg_parser & call_id_section, bool have_call_id,
-                                                    const common_peg_parser & args,
-                                                    std::optional<common_peg_parser> atomic_peek) const {
-    auto              open           = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix);
-    bool              matched_atomic = false;
-    common_peg_parser func_parser    = p.eps();
-
-    if (!function.name_suffix.empty()) {
-        func_parser    = open + call_id_section + p.space() + args;
-        matched_atomic = true;
-    } else if (have_call_id) {
-        func_parser    = p.atomic(open + call_id_section) + p.space() + args;
-        matched_atomic = true;
-    } else if (atomic_peek.has_value()) {
-        func_parser    = p.atomic(open + call_id_section + p.space() + *atomic_peek) + args;
-        matched_atomic = true;
-    } else {
-        func_parser = open + call_id_section + p.space() + args;
-    }
-
-    if (!function.close.empty()) {
-        func_parser = func_parser + p.space() + p.tool_close(p.literal(function.close));
-    } else if (!format.per_call_end.empty()) {
-        // When there's no func_close but there is a per_call_end marker, use peek() to ensure
-        // we only emit tool_close when we can actually see the closing marker. This prevents
-        // premature closing during partial parsing when we've seen e.g. "</" which could be
-        // either "</tool_call>" (end) or "<arg_key>" prefix that failed to match.
-        func_parser = func_parser + p.tool_close(p.peek(p.literal(format.per_call_end)));
-    } else {
-        func_parser = func_parser + p.tool_close(p.space());  // force this to process tool closing callbacks in mapper
-    }
-    if (!matched_atomic) {
-        func_parser = p.atomic(func_parser);
-    }
-    return func_parser;
-}
-
-common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context & ctx) const {
-    auto &       p           = ctx.p;
-    const auto & inputs      = ctx.inputs;
-    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-
-    common_peg_parser tool_choice = p.choice();
-
-    foreach_function(inputs.tools, [&](const json & tool) {
-        const auto & func   = tool.at("function");
-        std::string  name   = func.at("name");
-        const auto & schema = func.contains("parameters") ? func.at("parameters") : json::object();
-
-        // Build call_id parser based on position (if supported)
-        bool have_call_id = false;
-        common_peg_parser call_id_section = p.eps();
-        if (call_id.pos == call_id_position::BETWEEN_FUNC_AND_ARGS && !call_id.prefix.empty() &&
-            (!call_id.suffix.empty() || !arguments.start.empty())) {
-            if (!call_id.suffix.empty()) {
-                call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix))) + call_id.suffix;
-            } else {
-                call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(arguments.start)));
-            }
-            have_call_id = true;
-        }
-        auto args_parser = p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema));
-        if (!arguments.start.empty()) {
-            args_parser = p.literal(arguments.start) + args_parser;
-        }
-        if (!arguments.end.empty()) {
-            args_parser = args_parser + p.literal(arguments.end);
-        }
-
-        auto atomic_peek = !arguments.start.empty() ? std::optional(p.peek(p.literal(arguments.start))) : std::nullopt;
-        auto func_parser = build_func_parser(p, name, call_id_section, have_call_id, args_parser, atomic_peek);
-        tool_choice |= p.rule("tool-" + name, func_parser);
-    });
-
-    auto require_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-
-    common_peg_parser tool_calls = p.eps();
-
-    if (!format.per_call_start.empty()) {
-        auto wrapped_call = format.per_call_start + tool_choice + format.per_call_end;
-        if (inputs.parallel_tool_calls) {
-            tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
-        } else {
-            tool_calls = p.trigger_rule("tool-call", wrapped_call);
-        }
-        if (!format.section_start.empty()) {
-            tool_calls = p.trigger_rule("tool-calls",
-                                        p.literal(format.section_start) + p.space() + tool_calls + p.space() +
-                                            (format.section_end.empty() ? p.end() : p.literal(format.section_end)));
-        }
-    } else {
-        std::string separator = ", ";  // Default
-        if (inputs.parallel_tool_calls) {
-            tool_calls = p.trigger_rule("tool-call", format.section_start + tool_choice +
-                                                         p.zero_or_more(separator + tool_choice) + format.section_end);
-        } else {
-            tool_calls = p.trigger_rule("tool-call", format.section_start + tool_choice + format.section_end);
-        }
-    }
-
-    if (!require_calls) {
-        tool_calls = p.optional(tool_calls);
-    }
-
-    std::string trigger_marker       = !format.section_start.empty() ? format.section_start : format.per_call_start;
-    auto        content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
-    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
-           p.end();
-}
-
-common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_context & ctx) const {
-    auto &       p           = ctx.p;
-    const auto & inputs      = ctx.inputs;
-    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-
-    common_peg_parser tool_choice = p.choice();
-
-    foreach_function(inputs.tools, [&](const json & tool) {
-        const auto &          func       = tool.at("function");
-        std::string           name       = func.at("name");
-        const auto &          params     = func.contains("parameters") ? func.at("parameters") : json::object();
-        const auto &          properties = params.contains("properties") ? params.at("properties") : json::object();
-        std::set<std::string> required;
-
-        // Build parser for each argument, separating required and optional
-        std::vector<common_peg_parser> required_parsers;
-        std::vector<common_peg_parser> optional_parsers;
-        for (const auto & [param_name, param_schema] : properties.items()) {
-            bool        is_required = required.find(param_name) != required.end();
-            std::string type        = "object";
-            if (param_schema.contains("type")) {
-                const auto & type_obj = param_schema.at("type");
-                if (type_obj.is_string()) {
-                    type_obj.get_to(type);
-                } else if (type_obj.is_array()) {
-                    // Handle nullable types like ["string", "null"]
-                    for (const auto & t : type_obj) {
-                        if (t.is_string() && t.get<std::string>() != "null") {
-                            type = t.get<std::string>();
-                            break;
-                        }
-                    }
-                } else if (type_obj.is_object()) {
-                    if (type_obj.contains("type") && type_obj.at("type").is_string()) {
-                        type_obj.at("type").get_to(type);
-                    }
-                }
-            }
-            // Infer string type from enum values when type is unspecified
-            if (type == "object" && param_schema.contains("enum")) {
-                const auto & enum_vals = param_schema.at("enum");
-                if (enum_vals.is_array()) {
-                    for (const auto & v : enum_vals) {
-                        if (v.is_string()) {
-                            type = "string";
-                            break;
-                        }
-                    }
-                }
-            }
-
-            auto arg =
-                p.tool_arg(p.tool_arg_open(arguments.name_prefix + p.tool_arg_name(p.literal(param_name)) +
-                                           arguments.name_suffix) +
-                           arguments.value_prefix +
-                           (type == "string" ?
-                                p.tool_arg_string_value(p.schema(p.until(arguments.value_suffix),
-                                                                 "tool-" + name + "-arg-" + param_name + "-schema",
-                                                                 param_schema, true)) :
-                                p.tool_arg_json_value(p.schema(
-                                    p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
-                                    p.space()) +
-                           p.tool_arg_close(p.literal(arguments.value_suffix)));
-
-            auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg);
-            if (is_required) {
-                required_parsers.push_back(named_arg);
-            } else {
-                optional_parsers.push_back(named_arg);
-            }
-        }
-
-        // Build required arg sequence in definition order
-        common_peg_parser args_seq = p.eps();
-        for (size_t i = 0; i < required_parsers.size(); i++) {
-            if (i > 0) {
-                args_seq = args_seq + p.space();
-            }
-            args_seq = args_seq + required_parsers[i];
-        }
-
-        // Build optional args with flexible ordering
-        if (!optional_parsers.empty()) {
-            common_peg_parser any_opt = p.choice();
-            for (const auto & opt : optional_parsers) {
-                any_opt |= opt;
-            }
-            args_seq = args_seq + p.repeat(p.space() + any_opt, 0, (int) optional_parsers.size());
-        }
-
-        if (!arguments.start.empty()) {
-            args_seq = p.literal(arguments.start) + args_seq;
-        }
-        if (!arguments.end.empty()) {
-            args_seq = args_seq + p.literal(arguments.end);
-        }
-
-        // Build call_id parser based on position (if supported)
-        common_peg_parser call_id_section = p.eps();
-        bool have_call_id = false;
-        if (call_id.pos == call_id_position::BETWEEN_FUNC_AND_ARGS && !call_id.prefix.empty() &&
-            (!call_id.suffix.empty() || !arguments.start.empty())) {
-            have_call_id = true;
-            if (!call_id.suffix.empty()) {
-                call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix)) + call_id.suffix);
-            } else {
-                call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(arguments.start)));
-            }
-        }
-
-        // Only peek for an arg tag when there are required args that must follow.
-        // When all args are optional, the model may emit no arg tags at all (#20650).
-        auto atomic_peek = (!arguments.name_prefix.empty() && !required_parsers.empty()) ?
-            std::optional(p.peek(p.literal(arguments.name_prefix))) : std::nullopt;
-        auto func_parser = build_func_parser(p, name, call_id_section, have_call_id, args_seq, atomic_peek);
-        tool_choice |= p.rule("tool-" + name, func_parser);
-    });
-
-    auto require_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-
-    common_peg_parser tool_calls = p.eps();
-
-    if (!format.per_call_start.empty()) {
-        auto wrapped_call = format.per_call_start + p.space() + tool_choice + p.space() + format.per_call_end;
-        if (inputs.parallel_tool_calls) {
-            tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
-        } else {
-            tool_calls = p.trigger_rule("tool-call", wrapped_call);
-        }
-        if (!format.section_start.empty()) {
-            tool_calls = p.trigger_rule("tool-calls",
-                                        p.literal(format.section_start) + p.space() + tool_calls + p.space() +
-                                            (format.section_end.empty() ? p.end() : p.literal(format.section_end)));
-        }
-    } else {
-        std::string separator = ", ";  // Default
-
-        if (inputs.parallel_tool_calls) {
-            tool_calls = p.trigger_rule("tool-call", format.section_start + p.space() + tool_choice +
-                                                         p.zero_or_more(separator + tool_choice) + p.space() +
-                                                         format.section_end);
-        } else {
-            tool_calls = p.trigger_rule(
-                "tool-call", format.section_start + p.space() + tool_choice + p.space() + format.section_end);
-        }
-    }
-
-    if (!require_tools) {
-        tool_calls = p.optional(tool_calls);
-    }
-
-    std::string trigger_marker       = !format.section_start.empty() ? format.section_start : format.per_call_start;
-    auto        content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
-    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
-           p.end();
-}
-
-}  // namespace autoparser
--- a/common/chat-auto-parser-helpers.cpp
+++ b/common/chat-auto-parser-helpers.cpp
@@ -1,364 +0,0 @@
-#include "chat-auto-parser-helpers.h"
-
-#include "chat-auto-parser.h"
-#include "chat-peg-parser.h"
-#include "chat.h"
-#include "log.h"
-#include "nlohmann/json.hpp"
-#include "peg-parser.h"
-
-#include <cctype>
-#include <numeric>
-
-using json = nlohmann::ordered_json;
-
-std::string trim_whitespace(const std::string & str) {
-    size_t start = 0;
-    while (start < str.length() && std::isspace(static_cast<unsigned char>(str[start]))) {
-        start++;
-    }
-
-    if (start == str.length()) {
-        return "";
-    }
-
-    size_t end = str.length() - 1;
-    while (end > start && std::isspace(static_cast<unsigned char>(str[end]))) {
-        end--;
-    }
-
-    return str.substr(start, end - start + 1);
-}
-
-std::string trim_leading_whitespace(const std::string & str) {
-    size_t start = 0;
-    while (start < str.length() && std::isspace(static_cast<unsigned char>(str[start]))) {
-        start++;
-    }
-
-    return str.substr(start);
-}
-
-std::string trim_trailing_whitespace(const std::string & str) {
-    if (str.empty()) {
-        return "";
-    }
-
-    size_t end = str.length() - 1;
-    while (end > 0 && std::isspace(static_cast<unsigned char>(str[end]))) {
-        end--;
-    }
-
-    // If first char is also whitespace, return empty string
-    if (end == 0 && std::isspace(static_cast<unsigned char>(str[0]))) {
-        return "";
-    }
-
-    return str.substr(0, end + 1);
-}
-
-std::string trim_trailing_newlines(const std::string & str) {
-    size_t end = str.length();
-    while (end > 0 && str[end - 1] == '\n') {
-        end--;
-    }
-
-    return str.substr(0, end);
-}
-
-static size_t common_prefix_len(const std::string & left, const std::string & right) {
-    size_t prefix_len = 0;
-    size_t min_len    = std::min(left.length(), right.length());
-    while (prefix_len < min_len && left[prefix_len] == right[prefix_len]) {
-        prefix_len++;
-    }
-    return prefix_len;
-}
-
-static size_t common_suffix_len(const std::string & left, const std::string & right) {
-    size_t suffix_len = 0;
-    size_t min_len    = std::min(left.length(), right.length());
-    while (suffix_len < min_len && left[left.length() - 1 - suffix_len] == right[right.length() - 1 - suffix_len]) {
-        suffix_len++;
-    }
-    return suffix_len;
-}
-
-diff_split calculate_diff_split(const std::string & left, const std::string & right) {
-    diff_split result;
-
-    auto left_seg = segmentize_markers(left);
-    auto right_seg = segmentize_markers(right);
-
-    if (left_seg.empty()) {
-        result.right = right;
-        return result;
-    }
-    if (right_seg.empty()) {
-        result.left = left;
-        return result;
-    }
-
-    auto left_start = left_seg.begin();
-    auto left_end = --left_seg.end();
-    auto right_start = right_seg.begin();
-    auto right_end = --right_seg.end();
-
-    auto test = [&] () {
-        return left_start != left_end && right_start != right_end;
-    };
-
-    bool left_fully_consumed = false;
-    bool right_fully_consumed = false;
-
-    while (test()) {
-        bool advanced = false;
-        if (*left_start == *right_start) {
-            result.prefix.append(left_start->value);
-            left_start++;
-            right_start++;
-            advanced = true;
-        }
-        if (*left_end == *right_end) {
-            result.suffix = left_end->value + result.suffix;
-            if (left_start != left_end) {
-                left_end--;
-            } else {
-                left_fully_consumed = true;
-            }
-            if (right_start != right_end) {
-                right_end--;
-            } else {
-                right_fully_consumed = true;
-            }
-            advanced = true;
-        }
-        if (!advanced) {
-            break;
-        }
-    }
-
-    if (left_start == left_end && right_start != right_end) {
-        if (*left_start == *right_end) {
-            result.suffix = right_end->value + result.suffix;
-            right_end--;
-            left_fully_consumed = true;
-        } else if (*left_start == *right_start) {
-            result.prefix.append(right_start->value);
-            right_start++;
-            left_fully_consumed = true;
-        }
-    } else if (right_start == right_end && left_start != left_end) {
-        if (*left_end == *right_start) {
-            result.suffix = left_end->value + result.suffix;
-            left_end--;
-            right_fully_consumed = true;
-        } else if (*left_start == *right_start) {
-            result.prefix.append(left_start->value);
-            left_start++;
-            right_fully_consumed = true;
-        }
-    } else if (left_start == left_end && right_start == right_end && *left_start == *right_start && left_start->type == segment_type::MARKER) {
-        result.prefix.append(right_start->value);
-        left_fully_consumed = true;
-        right_fully_consumed = true;
-    }
-
-    auto eat_segment = [](std::string str, const segment & seg) -> std::string { return std::move(str) + seg.value; };
-
-    bool can_have_text_suffix = left_end->type == segment_type::TEXT && right_end->type == segment_type::TEXT;
-    bool can_have_text_prefix = right_start->type == segment_type::TEXT && left_start->type == segment_type::TEXT;
-
-    std::string remainder_left = std::accumulate(left_start, left_fully_consumed ? left_end : ++left_end, std::string(), eat_segment);
-    std::string remainder_right = std::accumulate(right_start, right_fully_consumed ? right_end : ++right_end, std::string(), eat_segment);
-
-    size_t suffix_len = can_have_text_suffix ? common_suffix_len(remainder_left, remainder_right) : 0;
-    // avoid overlaps between prefix and suffix
-    size_t prefix_len = can_have_text_prefix ? common_prefix_len(remainder_left.substr(0, remainder_left.size() - suffix_len),
-        remainder_right.substr(0, remainder_right.size() - suffix_len)) : 0;
-
-    result.prefix.append(remainder_left.substr(0, prefix_len));
-    result.suffix = remainder_left.substr(remainder_left.length() - suffix_len, suffix_len) + result.suffix;
-    result.left = remainder_left.substr(prefix_len, remainder_left.length() - prefix_len - suffix_len);
-    result.right = remainder_right.substr(prefix_len, remainder_right.length() - prefix_len - suffix_len);
-
-    if (result.left == "" && result.right == "") {
-        // degenerate case, no diff
-        result.prefix = left;
-        result.suffix = "";
-        // pick prefix = all as representation
-    }
-
-    // When left has no unique content (result.left is empty), left is entirely
-    // shared with right. The simultaneous prefix/suffix segment matching can
-    // incorrectly consume trailing segments of left as suffix when those same
-    // segments also appear at the end of right (e.g. "\n" at the end of both
-    // the shared content and the generation prompt). This rotates the diff.
-    // Fix: if left is a prefix of right, enforce that directly.
-    if (result.left.empty() && !result.right.empty() &&
-            left.size() <= right.size() &&
-            right.substr(0, left.size()) == left) {
-        result.prefix = left;
-        result.suffix = "";
-        result.right  = right.substr(left.size());
-    }
-
-    return result;
-}
-
-// Returns the prefix of `full` up until the first occurrence of the common prefix of `left` and `right`
-std::string until_common_prefix(const std::string & full, const std::string & left, const std::string & right) {
-    // Find the common prefix of left and right
-    size_t common_prefix_len = 0;
-    size_t min_len           = std::min(left.length(), right.length());
-    while (common_prefix_len < min_len && left[common_prefix_len] == right[common_prefix_len]) {
-        common_prefix_len++;
-    }
-
-    // If there's no common prefix, return empty string
-    if (common_prefix_len == 0) {
-        return "";
-    }
-
-    // Find the common prefix in the full string
-    std::string common_prefix = left.substr(0, common_prefix_len);
-    size_t      pos           = full.find(common_prefix);
-
-    // If not found, return empty string
-    if (pos == std::string::npos) {
-        return "";
-    }
-
-    // Return everything before the common prefix
-    return full.substr(0, pos);
-}
-
-// Returns the suffix of `full` after the last occurrence of the common suffix of `left` and `right`
-std::string after_common_suffix(const std::string & full, const std::string & left, const std::string & right) {
-    // Find the common suffix of left and right (compare from the end)
-    size_t common_suffix_len = 0;
-    size_t min_len           = std::min(left.length(), right.length());
-    while (common_suffix_len < min_len &&
-           left[left.length() - 1 - common_suffix_len] == right[right.length() - 1 - common_suffix_len]) {
-        common_suffix_len++;
-    }
-
-    // If there's no common suffix, return empty string
-    if (common_suffix_len == 0) {
-        return "";
-    }
-
-    // Extract the common suffix
-    std::string common_suffix = left.substr(left.length() - common_suffix_len);
-
-    // Find the last occurrence of the common suffix in the full string
-    size_t pos = full.rfind(common_suffix);
-
-    // If not found, return empty string
-    if (pos == std::string::npos) {
-        return "";
-    }
-
-    // Return everything after the common suffix
-    return full.substr(pos + common_suffix_len);
-}
-
-// TODO: segmentize will treat a JSON array inside tags as a tag: <calls>[{ "fun": { ... } }]</calls> will be three markers
-// not too worried about that because it hasn't turned out as a problem anywhere, but noting here in case it will
-// Might have to put some restrictions on tag contents as well (like "no { }")
-std::vector<segment> segmentize_markers(const std::string & text) {
-    std::vector<segment> retval;
-    bool in_marker = false;
-    char marker_opener = '\0';
-
-    auto is_marker_opener = [](char c) -> bool { return c == '<' || c == '['; };
-    auto is_marker_closer = [](char op, char c) -> bool { return (op == '<' && c == '>') || (op == '[' && c == ']'); };
-
-    size_t last_border = 0;
-
-    for (size_t cur_pos = 0; cur_pos < text.length(); cur_pos++) {
-        if (!in_marker && is_marker_opener(text[cur_pos])) {
-            if (last_border < cur_pos) {
-                retval.push_back(segment(segment_type::TEXT, text.substr(last_border, cur_pos - last_border)));
-            }
-            last_border = cur_pos;
-            in_marker = true;
-            marker_opener = text[cur_pos];
-        } else if (in_marker && is_marker_closer(marker_opener, text[cur_pos])) {
-            // no need to check because last_border will always be smaller
-                retval.push_back(segment(segment_type::MARKER, text.substr(last_border, cur_pos - last_border + 1)));
-            last_border = cur_pos + 1;
-            in_marker = false;
-            marker_opener = '\0';
-        }
-    }
-    if (last_border < text.length()) {
-            retval.push_back(segment(segment_type::TEXT, text.substr(last_border)));
-    }
-    return retval;
-}
-
-std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segments) {
-    std::vector<segment> result;
-    for (const auto & seg : segments) {
-        if (!trim_whitespace(seg.value).empty()) {
-            result.push_back(seg);
-        }
-    }
-    return result;
-}
-
-namespace autoparser {
-
-std::string apply_template(const common_chat_template & tmpl, const template_params & params) {
-    generation_params tmpl_params;
-    tmpl_params.messages              = params.messages;
-    tmpl_params.tools                 = params.tools;
-    tmpl_params.add_generation_prompt = params.add_generation_prompt;
-    tmpl_params.enable_thinking       = params.enable_thinking;
-
-    if (params.extra_context) {
-        tmpl_params.extra_context = *params.extra_context;
-    }
-    tmpl_params.extra_context["enable_thinking"] = params.enable_thinking;
-
-    try {
-        return common_chat_template_direct_apply(tmpl, tmpl_params);
-    } catch (const std::exception & e) {
-        LOG_DBG("Template application failed: %s\n", e.what());
-        return "";
-    }
-}
-
-std::optional<compare_variants_result> compare_variants(
-    const common_chat_template &                   tmpl,
-    const template_params &                        params_A,
-    const std::function<void(template_params &)> & params_modifier) {
-    // Create variant B by copying A
-    template_params params_B = params_A;
-
-    // Apply modifier to create variant B
-    if (params_modifier) {
-        params_modifier(params_B);
-    }
-
-    // Apply template to both variants
-    std::string output_A = apply_template(tmpl, params_A);
-    std::string output_B = apply_template(tmpl, params_B);
-
-    // Check for template application failures
-    if (output_A.empty() || output_B.empty()) {
-        return std::nullopt;
-    }
-
-    // Calculate diff and return result with both outputs
-    compare_variants_result result;
-    result.diff     = calculate_diff_split(output_A, output_B);
-    result.output_A = output_A;
-    result.output_B = output_B;
-
-    return result;
-}
-
-}  // namespace autoparser
-
--- a/common/chat-auto-parser-helpers.h
+++ b/common/chat-auto-parser-helpers.h
@@ -1,74 +0,0 @@
-#pragma once
-
-#include "chat-auto-parser.h"
-
-#include <functional>
-#include <optional>
-#include <string>
-
-std::string trim_whitespace(const std::string & str);
-std::string trim_leading_whitespace(const std::string & str);
-std::string trim_trailing_whitespace(const std::string & str);
-std::string trim_trailing_newlines(const std::string & str);
-
-// calculate a diff split (longest common prefix, longest common suffix excluding prefix,
-// mismatched part on the left, mismatched part on the right) between two strings
-// account for markers - align prefix and suffix endings so that they end on markers
-// * eg.:
-// calculate_diff_split("<html><body><div></div></body></html>", "<html><body><p>Something</p></body><html>") ->
-//  { "prefix": "<html><body>" (not: "<html><body><"), "suffix": "</body></html>", "left": "<div></div>", "right": "<p>Something</p>" }
-// calculate_diff_split("<html><body>Something</body></html>", "<html><body></body><html>") ->
-//  { "prefix": "<html><body>", "suffix": "</body></html>", "left": "Something", "right": "" }
-diff_split calculate_diff_split(const std::string & left, const std::string & right);
-
-// Returns the prefix of `full` up until the first occurrence of the common prefix of `left` and `right`
-// Returns empty string if there's no common prefix
-// * eg.:
-// until_common_prefix("really want a FUNCTION call", "FUNCTION alpha", "FUNCTION beta") -> "really want a "
-// until_common_prefix("<tool_call>", "<something>", "<something_else>") -> ""
-// until_common_prefix("some text", "1234", "abcd") -> ""
-// until_common_prefix("one arg two args three args four", "argument alpha", "argument beta") -> "one ""
-std::string until_common_prefix(const std::string & full, const std::string & left, const std::string & right);
-
-// Returns the suffix of `full` after the last occurrence of the common suffix of `left` and `right`
-// Returns empty string if there's no common suffix
-// Mirror function of `until_common_prefix`
-// * eg.:
-// after_common_suffix("really want a FUNCTION call", "first FUNCTION", "second FUNCTION") -> " call"
-// after_common_suffix("one arg two-args three args four", "alpha-args", "beta-args") -> " three args four"
-std::string after_common_suffix(const std::string & full, const std::string & left, const std::string & right);
-
-// Segmentize text into markers and non-marker fragments
-// * eg.:
-// segmentize_markers("<html><head><title>The site title</title><body><div>Here's some <b>content</b></div></body></html>" ->
-//  [ (MARKER, "<html>"), (MARKER, "<head>"), (MARKER, "<title>"), (TEXT, "The site title"), (MARKER, "</title>"),
-//    (MARKER, "<body>"), (MARKER, "<div>"), (TEXT, "Here's some "), (MARKER, "<b>"), (TEXT, "content"), (MARKER, "</b>"),
-//    (MARKER, "</div>"), (MARKER, "</body>"), (MARKER, "</html>")
-//  ]
-// segmentize_markers("<|tool_call|>[args]{ are here }[/args]<|tool_call_end|>") ->
-//  [ (MARKER, "<|tool_call|>"), (MARKER, "[args]"), (TEXT, "{ are here }"), (MARKER, "[/args]"), (MARKER, "<|tool_call_end|>") ]
-std::vector<segment> segmentize_markers(const std::string & text);
-
-// Prune whitespace-only segments from a vector of segments
-// * eg.:
-// segmentize_markers("<tool_call>\n<function=foo>\n<arg=bar>\n   \n</arg>\n</function>\n</tool_call>") ->
-//  X = [ (MARKER, "<tool_call>"), (TEXT, "\n"), (MARKER, "<function=foo>"), (TEXT, "\n"), (MARKER, "<arg=bar>"), (TEXT, "\n   \n"),
-//        (MARKER, "</arg>"), (TEXT, "\n"), (MARKER, "</function>"), (TEXT, "\n"), (MARKER, "</tool_call>") ]
-// prune_whitespace_segments(X) -> [ (MARKER, "<tool_call>"), (MARKER, "<function=foo>"), (MARKER, "<arg=bar>"), (MARKER, "</arg>"),
-//                                   (MARKER, "</function>"), (MARKER, "</tool_call>") ]
-std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segments);
-
-namespace autoparser {
-
-// Apply a template with the given parameters, returning the rendered string (empty on failure)
-std::string apply_template(const common_chat_template & tmpl, const template_params & params);
-
-// Factorized differential comparison function
-// Takes base params and a single modifier lambda to create variant B
-// Returns compare_variants_result containing diff and both outputs, or std::nullopt on failure
-std::optional<compare_variants_result> compare_variants(
-    const common_chat_template &                   tmpl,
-    const template_params &                        params_A,
-    const std::function<void(template_params &)> & params_modifier);
-
-}  // namespace autoparser
--- a/common/chat-auto-parser.h
+++ b/common/chat-auto-parser.h
@@ -1,434 +0,0 @@
-#pragma once
-
-#include "chat.h"
-#include "common.h"
-#include "jinja/caps.h"
-#include "peg-parser.h"
-#include "nlohmann/json.hpp"
-
-#include <chrono>
-#include <optional>
-#include <string>
-#include <utility>
-#include <vector>
-
-using json = nlohmann::ordered_json;
-
-class common_chat_peg_builder;
-
-// ============================================================================
-// Parameters for template application (low-level, used by diff analysis)
-// ============================================================================
-struct template_params {
-    json                messages;
-    json                tools;
-    bool                add_generation_prompt = false;
-    bool                enable_thinking       = true;
-    std::optional<json> extra_context         = std::nullopt;
-};
-
-struct diff_split {
-    std::string prefix;
-    std::string suffix;
-    std::string left;
-    std::string right;
-
-    bool operator==(struct diff_split & other) const {
-        return prefix == other.prefix && suffix == other.suffix && left == other.left && right == other.right;
-    }
-};
-
-// Result of compare_variants containing diff and original outputs
-struct compare_variants_result {
-    diff_split  diff;
-    std::string output_A;
-    std::string output_B;
-};
-
-namespace autoparser {
-
-// ============================================================================
-// High-level params for parser generation
-// ============================================================================
-
-struct generation_params {
-    json                                  messages;
-    json                                  tools;
-    common_chat_tool_choice               tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
-    json                                  json_schema;
-    bool                                  parallel_tool_calls = true;
-    common_reasoning_format               reasoning_format    = COMMON_REASONING_FORMAT_AUTO;
-    bool                                  stream              = true;
-    std::string                           grammar;
-    bool                                  add_generation_prompt = false;
-    bool                                  enable_thinking       = true;
-    std::chrono::system_clock::time_point now                   = std::chrono::system_clock::now();
-    std::string                           generation_prompt;
-    json                                  extra_context;
-    bool                                  add_bos       = false;
-    bool                                  add_eos       = false;
-    bool                                  is_inference  = true;
-    bool                                  add_inference = false;
-    bool                                  mark_input    = true;  // whether to mark input strings in the jinja context
-};
-
-// ============================================================================
-// Analysis Result Enums
-// ============================================================================
-
-// Reasoning handling mode (derived from R1-R3 comparisons)
-enum class reasoning_mode {
-    NONE,           // No reasoning markers detected
-    TAG_BASED,      // Tag-based: <think>...</think> (start can be empty for delimiter-style)
-    TOOLS_ONLY      // Only reason on tool calls, not on normal content
-};
-
-inline std::ostream & operator<<(std::ostream & os, const reasoning_mode & mode) {
-    switch (mode) {
-        case reasoning_mode::NONE:
-            return os << "NONE";
-        case reasoning_mode::TAG_BASED:
-            return os << "TAG_BASED";
-        case reasoning_mode::TOOLS_ONLY:
-            return os << "TOOLS_ONLY";
-        default:
-            return os << "UNKNOWN";
-    }
-}
-
-// Content wrapping mode (derived from C1 comparison)
-enum class content_mode {
-    PLAIN,                   // No content markers
-    ALWAYS_WRAPPED,          // Content always wrapped with markers
-    WRAPPED_WITH_REASONING,  // Content wrapped only when reasoning present
-};
-
-inline std::ostream & operator<<(std::ostream & os, const content_mode & mode) {
-    switch (mode) {
-        case content_mode::PLAIN:
-            return os << "PLAIN";
-        case content_mode::ALWAYS_WRAPPED:
-            return os << "ALWAYS_WRAPPED";
-        case content_mode::WRAPPED_WITH_REASONING:
-            return os << "WRAPPED_WITH_REASONING";
-        default:
-            return os << "UNKNOWN";
-    }
-}
-
-// Call ID position in tool calls (for non-JSON formats)
-enum class call_id_position {
-    NONE,                   // No call ID support detected
-    PRE_FUNC_NAME,          // Call ID before function name: [CALL_ID]id[FUNC]name{args}
-    BETWEEN_FUNC_AND_ARGS,  // Call ID between function and args: [FUNC]name[CALL_ID]id{args}
-    POST_ARGS,              // Call ID after arguments: [FUNC]name{args}[CALL_ID]id
-};
-
-inline std::ostream & operator<<(std::ostream & os, const call_id_position & pos) {
-    switch (pos) {
-        case call_id_position::NONE:
-            return os << "NONE";
-        case call_id_position::PRE_FUNC_NAME:
-            return os << "PRE_FUNC_NAME";
-        case call_id_position::BETWEEN_FUNC_AND_ARGS:
-            return os << "BETWEEN_FUNC_AND_ARGS";
-        case call_id_position::POST_ARGS:
-            return os << "POST_ARGS";
-        default:
-            return os << "UNKNOWN";
-    }
-}
-
-// Tool call format classification (derived from T1-T5, A1-A3 comparisons)
-enum class tool_format {
-    NONE,             // No tool support detected
-    JSON_NATIVE,      // Pure JSON: {"name": "X", "arguments": {...}}
-    TAG_WITH_JSON,    // Tag-based with JSON args: <function=X>{...}</function>
-    TAG_WITH_TAGGED,  // Tag-based with tagged args: <param=key>value</param>
-};
-
-inline std::ostream & operator<<(std::ostream & os, const tool_format & format) {
-    switch (format) {
-        case tool_format::NONE:
-            return os << "NONE";
-        case tool_format::JSON_NATIVE:
-            return os << "JSON_NATIVE";
-        case tool_format::TAG_WITH_JSON:
-            return os << "TAG_WITH_JSON";
-        case tool_format::TAG_WITH_TAGGED:
-            return os << "TAG_WITH_TAGGED";
-        default:
-            return os << "UNKNOWN";
-    }
-}
-
-// ============================================================================
-// Sub-structs for tool analysis
-// ============================================================================
-
-struct tool_format_analysis {
-    tool_format mode = tool_format::NONE;
-
-    std::string section_start;   // e.g., "<tool_call>", "[TOOL_CALLS]", ""
-    std::string section_end;     // e.g., "</tool_call>", ""
-    std::string per_call_start;  // e.g., "<|tool_call_begin|>", "" (for multi-call templates)
-    std::string per_call_end;    // e.g., "<|tool_call_end|>", ""
-
-    bool fun_name_is_key = false;       // In JSON format function name is JSON key, i.e. { "<funname>": { ... arguments ... } }
-    bool tools_array_wrapped = false;   // Tool calls wrapped in JSON array [...]
-
-    std::string              function_field = "function";
-    std::string              name_field     = "name";
-    std::string              args_field     = "arguments";
-    std::string              id_field;
-    std::string              gen_id_field;
-    std::vector<std::string> parameter_order;
-};
-
-struct tool_function_analysis {
-    std::string name_prefix;  // e.g., "<function=", "\"name\": \"", "functions."
-    std::string name_suffix;  // e.g., ">", "\"", ":0"
-    std::string close;        // e.g., "</function>", "" (for tag-based)
-};
-
-struct tool_arguments_analysis {
-    std::string start;          // e.g., "<|tool_call_argument_begin|>", "<args>"
-    std::string end;            // e.g., "<|tool_call_argument_end|>", "</args>"
-    std::string name_prefix;   // e.g., "<param=", "<arg_key>", "\""
-    std::string name_suffix;   // e.g., ">", "</arg_key>", "\":"
-    std::string value_prefix;  // e.g., "", "<arg_value>", ""
-    std::string value_suffix;  // e.g., "</param>", "</arg_value>", ""
-    std::string separator;     // e.g., "", "\n", ","
-};
-
-struct tool_id_analysis {
-    call_id_position pos = call_id_position::NONE;
-
-    std::string prefix;  // e.g., "[CALL_ID]" (marker before call ID value)
-    std::string suffix;  // e.g., "" (marker after call ID value, before next section)
-};
-
-// ============================================================================
-// Parser build context (shared interface for build_parser methods)
-// ============================================================================
-
-struct analyze_content;
-struct analyze_reasoning;
-
-struct parser_build_context {
-    common_chat_peg_builder & p;
-    const generation_params &         inputs;
-    common_peg_parser                 reasoning_parser;
-    bool                              extracting_reasoning = false;
-    const analyze_reasoning *         reasoning            = nullptr;
-    const analyze_content *           content              = nullptr;
-
-    parser_build_context(common_chat_peg_builder & p, const generation_params & inputs);
-};
-
-// ============================================================================
-// Base class for analyzers with parser building
-// ============================================================================
-
-struct analyze_base {
-    virtual ~analyze_base() = default;
-    virtual common_peg_parser build_parser(parser_build_context & ctx) const = 0;
-
-  protected:
-    const common_chat_template * tmpl = nullptr;
-
-    analyze_base() = default;
-    explicit analyze_base(const common_chat_template & tmpl) : tmpl(&tmpl) {}
-};
-
-// ============================================================================
-// Reasoning analyzer
-// ============================================================================
-
-struct analyze_reasoning : analyze_base {
-    reasoning_mode mode = reasoning_mode::NONE;
-
-    std::string start;  // e.g., "<think>", "[THINK]", "<|START_THINKING|>", ""
-    std::string end;    // e.g., "</think>", "[BEGIN FINAL RESPONSE]", "<|END_THINKING|>"
-
-    analyze_reasoning() = default;
-    analyze_reasoning(const common_chat_template & tmpl, bool supports_tools);
-    analyze_reasoning(std::string start_, std::string end_) : start(std::move(start_)), end(std::move(end_)) {}
-
-    common_peg_parser build_parser(parser_build_context & ctx) const override;
-
-  private:
-    // Look for reasoning markers in rendered content
-    void compare_reasoning_presence();
-
-    // Compare generation prompt with enable_thinking=true vs false
-    void compare_thinking_enabled();
-
-    // Check if reasoning is always possible or only in tool calls
-    void compare_reasoning_scope();
-};
-
-// ============================================================================
-// Content analyzer
-// ============================================================================
-
-struct analyze_content : analyze_base {
-    content_mode mode = content_mode::PLAIN;
-
-    std::string start;  // e.g., "<response>", ">>>all\n", ""
-    std::string end;    // e.g., "</response>", ""
-
-    bool requires_nonnull_content = false;
-
-    analyze_content() = default;
-    analyze_content(const common_chat_template & tmpl, const analyze_reasoning & reasoning);
-
-    common_peg_parser build_parser(parser_build_context & ctx) const override;
-
-    bool is_always_wrapped() const;
-    common_peg_parser build_optional_wrapped(parser_build_context & ctx) const;
-};
-
-// ============================================================================
-// Tool analyzer
-// ============================================================================
-
-struct analyze_tools : analyze_base {
-    tool_format_analysis    format;
-    tool_function_analysis  function;
-    tool_arguments_analysis arguments;
-    tool_id_analysis        call_id;
-
-    analyze_tools() = default;
-    analyze_tools(const common_chat_template & tmpl,
-                  const jinja::caps &          caps,
-                  const analyze_reasoning &    reasoning);
-
-    common_peg_parser build_parser(parser_build_context & ctx) const override;
-
-  private:
-    // Extract tool calling 'haystack' for further analysis and delegate further analysis based on format
-    void analyze_tool_calls(const analyze_reasoning & reasoning);
-
-    // Analyze format based on position of function and argument name in needle
-    void analyze_tool_call_format(const std::string &       haystack,
-                                  const std::string &       fun_name_needle,
-                                  const std::string &       arg_name_needle,
-                                  const analyze_reasoning & reasoning);
-
-    // Analyze specifics of JSON native format (entire tool call is a JSON object)
-    void analyze_tool_call_format_json_native(const std::string & clean_haystack,
-                                              const std::string & fun_name_needle,
-                                              const std::string & arg_name_needle);
-
-    // Analyze specifics of non-JSON native format (tags for function name or for function name and arguments)
-    void analyze_tool_call_format_non_json(const std::string & clean_haystack,
-                                           const std::string & fun_name_needle);
-
-    // Check for and extract specific per-call markers for non-native-JSON templates with parallel call support
-    void check_per_call_markers();
-
-    // Extract function name markers
-    void extract_function_markers();
-
-    // Delegates to separate functions for: separator analysis, argument name analysis, argument value analysis
-    void analyze_arguments();
-
-    // Extract argument name markers
-    void extract_argument_name_markers();
-
-    // Extract argument value markers
-    void extract_argument_value_markers();
-
-    // Extract argument separator, if specified (eg. <arg=foo>...</arg><sep><arg=bar>...</arg>)
-    void extract_argument_separator();
-
-    // Extract argument wrapper markers, if present (eg. '<args><arg=foo>...</arg><arg=bar>...</arg></args>')
-    void extract_args_markers();
-
-    // Extract call ID markers, if present
-    void extract_call_id_markers();
-
-    // Per-format tool parser builders
-    common_peg_parser build_tool_parser_json_native(parser_build_context & ctx) const;
-    common_peg_parser build_tool_parser_tag_json(parser_build_context & ctx) const;
-    common_peg_parser build_tool_parser_tag_tagged(parser_build_context & ctx) const;
-
-    // Shared helper: builds func_parser from open+call_id+args, handling atomic wrapping and close.
-    // atomic_peek: if present, used as the peek expression in the third atomicity branch.
-    common_peg_parser build_func_parser(common_chat_peg_builder & p, const std::string & name,
-                                        const common_peg_parser & call_id_section, bool have_call_id,
-                                        const common_peg_parser & args,
-                                        std::optional<common_peg_parser> atomic_peek) const;
-};
-
-// ============================================================================
-// Main autoparser class
-// ============================================================================
-
-struct autoparser {
-    jinja::caps          jinja_caps;
-    analyze_reasoning    reasoning;
-    analyze_content      content;
-    analyze_tools        tools;
-    bool                 analysis_complete = false;
-
-    // Preserved tokens for tokenizer (union of all non-empty markers)
-    std::vector<std::string> preserved_tokens;
-
-    autoparser() = default;
-
-    // Run full differential analysis on a template
-    void analyze_template(const common_chat_template & tmpl);
-
-    // Build the PEG parser for this template
-    common_peg_arena build_parser(const generation_params & inputs) const;
-
-  private:
-    // Collect tokens from entire analysis to preserve
-    void collect_preserved_tokens();
-};
-
-// ============================================================================
-// Parser generator
-// ============================================================================
-
-class peg_generator {
-  public:
-    static common_chat_params generate_parser(const common_chat_template &    tmpl,
-                                              const struct generation_params & inputs);
-
-    static common_chat_params generate_parser(const common_chat_template &    tmpl,
-                                              const struct generation_params & inputs,
-                                              const autoparser &              autoparser);
-};
-
-}  // namespace autoparser
-
-enum segment_type { TEXT, MARKER };
-
-inline std::ostream & operator<<(std::ostream & os, const segment_type & type) {
-    switch (type) {
-        case segment_type::TEXT:
-            return os << "TEXT";
-        case segment_type::MARKER:
-            return os << "MARKER";
-        default:
-            return os << "UNKNOWN";
-    }
-}
-
-struct segment {
-    segment_type type;
-    std::string  value;
-
-    segment(segment_type type, std::string value) : type(type), value(std::move(value)) {}
-
-    bool operator==(const segment & other) const {
-        return type == other.type && value == other.value;
-    }
-
-    bool operator!=(const segment & other) const {
-        return !(*this == other);
-    }
-};
--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
@@ -0,0 +1,534 @@
+#include "chat-parser.h"
+#include "common.h"
+#include "log.h"
+#include "regex-partial.h"
+
+#include <algorithm>
+#include <cctype>
+#include <optional>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <vector>
+
+using json = nlohmann::ordered_json;
+
+common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax)
+    : input_(input), is_partial_(is_partial), syntax_(syntax)
+{
+    result_.role = "assistant";
+
+    while (true) {
+        std::string id = std::to_string(std::rand());
+        if (input.find(id) == std::string::npos) {
+            healing_marker_ = id;
+            break;
+        }
+    }
+}
+
+std::string common_chat_msg_parser::str(const common_string_range & rng) const {
+    GGML_ASSERT(rng.begin <= rng.end);
+    return input_.substr(rng.begin, rng.end - rng.begin);
+}
+
+void common_chat_msg_parser::add_content(const std::string &content) {
+    result_.content += content;
+}
+
+void common_chat_msg_parser::add_reasoning_content(const std::string &reasoning_content) {
+    result_.reasoning_content += reasoning_content;
+}
+
+bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::string & id, const std::string & arguments) {
+    if (name.empty()) {
+        return false;
+    }
+
+    common_chat_tool_call tool_call;
+    tool_call.name = name;
+    tool_call.arguments = arguments;
+    tool_call.id = id;
+
+    // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
+    result_.tool_calls.emplace_back(tool_call);
+
+    return true;
+}
+bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
+    std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
+    std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
+    std::string arguments = "";
+    if (tool_call.contains("arguments")) {
+        if (tool_call.at("arguments").is_object()) {
+            arguments = tool_call.at("arguments").dump();
+        } else {
+            arguments = tool_call.at("arguments");
+        }
+    }
+
+    return add_tool_call(name, id, arguments);
+}
+
+bool common_chat_msg_parser::add_tool_calls(const json & arr) {
+    for (const auto & item : arr) {
+        if (!add_tool_call(item)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+bool common_chat_msg_parser::add_tool_call_short_form(const json & tool_call) {
+    if (!tool_call.is_object() || tool_call.size() != 1) {
+        return false;
+    }
+
+    // Get the tool name (the single key in the object)
+    auto it = tool_call.begin();
+    std::string name = it.key();
+
+    if (name.empty()) {
+        return false;
+    }
+
+    // Get the arguments (the nested object)
+    const json & args_json = it.value();
+    std::string arguments = "";
+
+    if (args_json.is_object()) {
+        arguments = args_json.dump();
+    } else if (args_json.is_string()) {
+        arguments = args_json;
+    } else if (!args_json.is_null()) {
+        // For other types, convert to string representation
+        arguments = args_json.dump();
+    }
+
+    return add_tool_call(name, "", arguments);
+}
+void common_chat_msg_parser::finish() {
+    if (!is_partial_ && pos_ != input_.size()) {
+        throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
+    }
+}
+
+bool common_chat_msg_parser::consume_spaces() { // Advances pos_ past any whitespace; returns true if at least one char was skipped.
+    const auto length = input_.size();
+    auto consumed = false;
+    while (pos_ < length && std::isspace(static_cast<unsigned char>(input_[pos_]))) { // cast: a negative char is UB for std::isspace
+        ++pos_;
+        consumed = true;
+    }
+    return consumed;
+}
+
+bool common_chat_msg_parser::try_consume_literal(const std::string & literal) {
+    auto pos = pos_;
+    for (auto i = 0u; i < literal.size(); ++i) {
+        if (pos >= input_.size()) {
+            return false;
+        }
+        if (input_[pos] != literal[i]) {
+            return false;
+        }
+        ++pos;
+    }
+    pos_ = pos;
+    return true;
+}
+
+std::optional<common_chat_msg_parser::find_regex_result>  common_chat_msg_parser::try_find_literal(const std::string & literal) {
+    auto idx = input_.find(literal, pos_);
+    if (idx != std::string::npos) {
+        find_regex_result res;
+        res.prelude = input_.substr(pos_, idx - pos_);
+        auto end = idx + literal.size();
+        res.groups.emplace_back(common_string_range{idx, end});
+        move_to(end);
+        return res;
+    }
+    if (is_partial_) {
+        idx = string_find_partial_stop(input_, literal);
+        if (idx != std::string::npos && idx >= pos_) {
+            find_regex_result res;
+            res.prelude = input_.substr(pos_, idx - pos_);
+            auto end = input_.size();
+            res.groups.emplace_back(common_string_range{idx, end});
+            move_to(end);
+            return res;
+        }
+    }
+    return std::nullopt;
+}
+
+void common_chat_msg_parser::consume_literal(const std::string & literal) {
+    if (!try_consume_literal(literal)) {
+        throw common_chat_msg_partial_exception(literal);
+    }
+}
+
+bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
+    std::string pending_reasoning_prefix;
+
+    if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+        return false;
+    }
+
+    auto set_reasoning_prefix = [&](size_t prefix_pos) {
+        if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
+            return;
+        }
+        if (prefix_pos + start_think.size() > input_.size()) {
+            pending_reasoning_prefix.clear();
+            return;
+        }
+        // Capture the exact literal that opened the reasoning section so we can
+        // surface it back to callers. This ensures formats that force the
+        // reasoning tag open (e.g. DeepSeek R1) retain their original prefix
+        // instead of dropping it during parsing.
+        pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
+    };
+
+    auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
+        auto stripped_reasoning = string_strip(reasoning);
+        if (stripped_reasoning.empty()) {
+            return;
+        }
+        if (syntax_.reasoning_in_content) {
+            add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "<think>" : start_think);
+            add_content(stripped_reasoning);
+            if (closed) {
+                add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
+            }
+        } else {
+            if (!pending_reasoning_prefix.empty()) {
+                add_reasoning_content(pending_reasoning_prefix);
+                pending_reasoning_prefix.clear();
+            }
+            add_reasoning_content(stripped_reasoning);
+        }
+    };
+
+    const size_t saved_pos = pos_;
+    const size_t saved_content_size = result_.content.size();
+    const size_t saved_reasoning_size = result_.reasoning_content.size();
+
+    auto restore_state = [&]() {
+        move_to(saved_pos);
+        result_.content.resize(saved_content_size);
+        result_.reasoning_content.resize(saved_reasoning_size);
+    };
+
+    // Allow leading whitespace to be preserved as content when reasoning is present at the start
+    size_t cursor = pos_;
+    size_t whitespace_end = cursor;
+    while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
+        ++whitespace_end;
+    }
+
+    if (whitespace_end >= input_.size()) {
+        restore_state();
+        if (syntax_.thinking_forced_open) {
+            auto rest = input_.substr(saved_pos);
+            if (!rest.empty()) {
+                handle_reasoning(rest, /* closed */ !is_partial());
+            }
+            move_to(input_.size());
+            return true;
+        }
+        return false;
+    }
+
+    cursor = whitespace_end;
+    const size_t remaining = input_.size() - cursor;
+    const size_t start_prefix = std::min(start_think.size(), remaining);
+    const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;
+
+    if (has_start_tag && start_prefix < start_think.size()) {
+        move_to(input_.size());
+        return true;
+    }
+
+    if (has_start_tag) {
+        if (whitespace_end > pos_) {
+            add_content(input_.substr(pos_, whitespace_end - pos_));
+        }
+        set_reasoning_prefix(cursor);
+        cursor += start_think.size();
+    } else if (syntax_.thinking_forced_open) {
+        cursor = whitespace_end;
+    } else {
+        restore_state();
+        return false;
+    }
+    while (true) {
+        if (cursor >= input_.size()) {
+            move_to(input_.size());
+            return true;
+        }
+
+        size_t end_pos = input_.find(end_think, cursor);
+        if (end_pos == std::string::npos) {
+            std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
+            size_t partial_off = string_find_partial_stop(remaining_view, end_think);
+            size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
+            if (reasoning_end > cursor) {
+                handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
+            }
+            move_to(input_.size());
+            return true;
+        }
+
+        if (end_pos > cursor) {
+            handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
+        } else {
+            handle_reasoning("", /* closed */ true);
+        }
+
+        cursor = end_pos + end_think.size();
+
+        while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
+            ++cursor;
+        }
+
+        const size_t next_remaining = input_.size() - cursor;
+        if (next_remaining == 0) {
+            move_to(cursor);
+            return true;
+        }
+
+        const size_t next_prefix = std::min(start_think.size(), next_remaining);
+        if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
+            if (next_prefix < start_think.size()) {
+                move_to(input_.size());
+                return true;
+            }
+            set_reasoning_prefix(cursor);
+            cursor += start_think.size();
+            continue;
+        }
+
+        move_to(cursor);
+        return true;
+    }
+}
+
+std::string common_chat_msg_parser::consume_rest() {
+    auto rest = input_.substr(pos_);
+    pos_ = input_.size();
+    return rest;
+}
+
+// Tries to find the regex, consumes it (pos_ ends right after the match) and returns the prelude (the text right before the match) together with the match groups.
+std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from, bool add_prelude_to_content) {
+    auto m = regex.search(input_, from == std::string::npos ? pos_ : from);
+    if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
+        return std::nullopt;
+    }
+    auto prelude = input_.substr(pos_, m.groups[0].begin - pos_);
+    pos_ = m.groups[0].end;
+
+    if (add_prelude_to_content) {
+        add_content(prelude);
+    }
+    if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
+        if (is_partial()) {
+            throw common_chat_msg_partial_exception(regex.str());
+        }
+        return std::nullopt;
+    }
+    return find_regex_result{prelude, m.groups};
+}
+
+common_chat_msg_parser::find_regex_result common_chat_msg_parser::consume_regex(const common_regex & regex) {
+    if (auto result = try_consume_regex(regex)) {
+        return *result;
+    }
+    throw common_chat_msg_partial_exception(regex.str());
+}
+
+std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_consume_regex(const common_regex & regex) {
+    auto m = regex.search(input_, pos_);
+    if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
+        return std::nullopt;
+    }
+    if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
+        if (is_partial()) {
+            throw common_chat_msg_partial_exception(regex.str());
+        }
+        return std::nullopt;
+    }
+    if (m.groups[0].begin != pos_) {
+        // Didn't match at the current position.
+        return std::nullopt;
+    }
+    pos_ = m.groups[0].end;
+
+    return find_regex_result {
+        /* .prelude = */ "",
+        m.groups,
+    };
+}
+
+std::optional<common_json> common_chat_msg_parser::try_consume_json() {
+    auto it = input_.cbegin() + pos_;
+    const auto end = input_.cend();
+    common_json result;
+    if (!common_json_parse(it, end, healing_marker_, result)) {
+        return std::nullopt;
+    }
+    pos_ = std::distance(input_.cbegin(), it);
+    if (result.healing_marker.marker.empty()) {
+        // No healing marker, just return the parsed json
+        return result;
+    }
+    if (!is_partial()) {
+        throw common_chat_msg_partial_exception("JSON");
+    }
+    return result;
+}
+
+common_json common_chat_msg_parser::consume_json() {
+    if (auto result = try_consume_json()) {
+        return *result;
+    }
+    throw common_chat_msg_partial_exception("JSON");
+}
+
+common_chat_msg_parser::consume_json_result common_chat_msg_parser::consume_json_with_dumped_args(
+    const std::vector<std::vector<std::string>> & args_paths,
+    const std::vector<std::vector<std::string>> & content_paths
+) {
+    if (auto result = try_consume_json_with_dumped_args(args_paths, content_paths)) {
+        return *result;
+    }
+    throw common_chat_msg_partial_exception("JSON");
+}
+
+std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parser::try_consume_json_with_dumped_args(
+    const std::vector<std::vector<std::string>> & args_paths,
+    const std::vector<std::vector<std::string>> & content_paths
+) {
+    auto partial = try_consume_json();
+    if (!partial) {
+        return std::nullopt;
+    }
+    auto is_arguments_path = [&](const std::vector<std::string> & path) {
+        return std::find(args_paths.begin(), args_paths.end(), path) != args_paths.end();
+    };
+    auto is_content_path = [&](const std::vector<std::string> & path) {
+        return std::find(content_paths.begin(), content_paths.end(), path) != content_paths.end();
+    };
+
+    if (partial->healing_marker.marker.empty()) {
+        if (args_paths.empty()) {
+            // No arguments to dump, and JSON was parsed fully.
+            return consume_json_result {
+                partial->json,
+                /* .is_partial = */ false,
+            };
+        }
+        if (is_arguments_path({})) {
+            // Entire JSON is the arguments and was parsed fully.
+            return consume_json_result {
+                partial->json.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true),
+                /* .is_partial = */ false,
+            };
+        }
+    }
+
+    LOG_DBG("Parsed partial JSON: %s (json_healing_marker: %s)\n", partial->json.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
+
+    auto found_healing_marker = false;
+    std::vector<std::string> path;
+    std::function<json(const json &)> remove_unsupported_healings_and_dump_args = [&](const json & j) -> json {
+        if (is_arguments_path(path)) {
+            auto arguments = j.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true);
+            if (is_partial() && !partial->healing_marker.marker.empty()) {
+                auto idx = arguments.find(partial->healing_marker.json_dump_marker);
+                if (idx != std::string::npos) {
+                    arguments.resize(idx);
+                    found_healing_marker = true;
+                }
+                if (arguments == "\"") {
+                    // This happens because of completing `:"$magic` after `"arguments"`
+                    arguments = "";
+                }
+            }
+            return arguments;
+        }
+        if (is_content_path(path)) {
+            if (!j.is_string()) {
+                throw std::runtime_error("Content path must be a string");
+            }
+            std::string str = j;
+            auto idx = str.find(partial->healing_marker.marker); // not using json_dump_marker as we're inside a string
+            if (idx != std::string::npos) {
+                str.resize(idx);
+                found_healing_marker = true;
+            }
+            return str;
+        }
+        if (j.is_object()) {
+            auto obj = json::object();
+            for (const auto & p : j.items()) {
+                const auto & key = p.key();
+                const auto & value = p.value();
+                const std::string key_str = key; // NOLINT
+                auto idx = key_str.find(healing_marker_);
+                if (idx != std::string::npos) {
+                    found_healing_marker = true;
+                    break;
+                }
+                path.push_back(key_str);
+                if (value.is_string()) {
+                    const std::string value_str = value;
+                    if (value_str.find(healing_marker_) != std::string::npos) {
+                        found_healing_marker = true;
+                        if (is_content_path(path)) {
+                            if (partial->healing_marker.marker == partial->healing_marker.json_dump_marker) {
+                                // The healing occurred inside the string: good. Otherwise we just ditch the entire key/value pair.
+                                obj[key] = remove_unsupported_healings_and_dump_args(value);
+                            }
+                        }
+                        break;
+                    }
+                    obj[key] = value;
+                } else {
+                    obj[key] = remove_unsupported_healings_and_dump_args(value);
+                }
+                path.pop_back();
+            }
+            return obj;
+        }
+        if (j.is_array()) {
+            auto arr = json::array();
+            for (const auto & value : j) {
+                if (value.is_string()) {
+                    std::string str = value;
+                    auto idx = str.find(healing_marker_);
+                    if (idx != std::string::npos) {
+                        // Don't heal array values that aren't in the arguments.
+                        found_healing_marker = true;
+                        break;
+                    }
+                }
+                arr.push_back(remove_unsupported_healings_and_dump_args(value));
+            }
+            return arr;
+        }
+        return j;
+    };
+
+    auto cleaned = remove_unsupported_healings_and_dump_args(partial->json);
+    LOG_DBG("Cleaned up JSON %s to %s (json_healing_marker : '%s')\n", partial->json.dump().c_str(), cleaned.dump().c_str(), partial->healing_marker.json_dump_marker.c_str());
+    return consume_json_result {
+        cleaned,
+        /* .is_partial = */ found_healing_marker,
+    };
+}
+
+void common_chat_msg_parser::clear_tools() {
+    result_.tool_calls.clear();
+}
--- a/common/chat-parser.h
+++ b/common/chat-parser.h
@@ -0,0 +1,123 @@
+#pragma once
+
+#include "chat.h"
+#include "json-partial.h"
+#include "regex-partial.h"
+
+#include <nlohmann/json.hpp>
+
+#include <optional>
+#include <string>
+#include <vector>
+
+class common_chat_msg_partial_exception : public std::runtime_error {
+  public:
+    common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}
+};
+
+class common_chat_msg_parser {
+    std::string input_;
+    bool is_partial_;
+    common_chat_syntax syntax_;
+    std::string healing_marker_;
+
+    size_t pos_ = 0;
+    common_chat_msg result_;
+
+  public:
+    common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
+    const std::string & input() const { return input_; }
+    size_t pos() const { return pos_; }
+    const std::string & healing_marker() const { return healing_marker_; }
+    const bool & is_partial() const { return is_partial_; }
+    const common_chat_msg & result() const { return result_; }
+    const common_chat_syntax & syntax() const { return syntax_; }
+
+    void move_to(size_t pos) {
+        if (pos > input_.size()) {
+            throw std::runtime_error("Invalid position!");
+        }
+        pos_ = pos;
+    }
+    void move_back(size_t n) {
+        if (pos_ < n) {
+            throw std::runtime_error("Can't move back that far!");
+        }
+        pos_ -= n;
+    }
+
+    // Get the substring of the input at the given range
+    std::string str(const common_string_range & rng) const;
+
+    // Appends to the result.content field
+    void add_content(const std::string & content);
+
+    // Appends to the result.reasoning_content field
+    void add_reasoning_content(const std::string & reasoning_content);
+
+    // Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.
+    bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);
+
+    // Adds a tool call using the "name", "id" and "arguments" fields of the json object
+    bool add_tool_call(const nlohmann::ordered_json & tool_call);
+
+    // Adds an array of tool calls using their "name", "id" and "arguments" fields.
+    bool add_tool_calls(const nlohmann::ordered_json & arr);
+
+    // Adds a tool call using the short form: { "tool_name": { "arg1": val, "arg2": val } }
+    bool add_tool_call_short_form(const nlohmann::ordered_json & tool_call);
+
+    void finish();
+
+    bool consume_spaces();
+
+    void consume_literal(const std::string & literal);
+
+    bool try_parse_reasoning(const std::string & start_think, const std::string & end_think);
+
+    std::string consume_rest();
+
+    struct find_regex_result {
+        std::string prelude;
+        std::vector<common_string_range> groups;
+    };
+
+    std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true);
+
+    bool try_consume_literal(const std::string & literal);
+
+    std::optional<find_regex_result> try_find_literal(const std::string & literal);
+
+    find_regex_result consume_regex(const common_regex & regex);
+
+    std::optional<find_regex_result> try_consume_regex(const common_regex & regex);
+
+    std::optional<common_json> try_consume_json();
+    common_json consume_json();
+
+    struct consume_json_result {
+        nlohmann::ordered_json value;
+        bool is_partial;
+    };
+
+    /*
+        Consumes (possibly partial) JSON and converts specific subtrees to (possibly truncated) JSON strings.
+
+        By default, object keys can't be truncated, nor can string values (their corresponding key is removed),
+        e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}`
+
+        But one can allow subpaths to be kept truncated, and possibly json-dumped to truncated JSON strings:
+        - with `content_paths={{"foo"}}` -> `{"foo": "b` -> `{"foo": "b"}`
+        - with `args_paths={{"foo"}}` -> `{"foo": {"b` -> `{"foo": "{\"b"}`
+    */
+    consume_json_result consume_json_with_dumped_args(
+        const std::vector<std::vector<std::string>> & args_paths = {},
+        const std::vector<std::vector<std::string>> & content_paths = {}
+    );
+    std::optional<consume_json_result> try_consume_json_with_dumped_args(
+        const std::vector<std::vector<std::string>> & args_paths = {},
+        const std::vector<std::vector<std::string>> & content_paths = {}
+    );
+
+    void clear_tools();
+};
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
--- a/common/chat-peg-parser.h
+++ b/common/chat-peg-parser.h
@@ -1,195 +0,0 @@
-#pragma once
-
-#include "chat.h"
-#include "peg-parser.h"
-
-#include <map>
-#include <optional>
-#include <vector>
-
-class common_chat_peg_mapper {
-  public:
-    common_chat_msg & result;
-
-    common_chat_peg_mapper(common_chat_msg & msg) : result(msg) {}
-
-    virtual ~common_chat_peg_mapper() = default;
-
-    virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
-    virtual void map(const common_peg_ast_node & node);
-  protected:
-    virtual std::string normalize_container_value(const std::string & input);
-  private:
-      // Tool call handling state
-      std::optional<common_chat_tool_call> pending_tool_call;  // Tool call waiting for name
-      common_chat_tool_call *              current_tool          = nullptr;
-      int                                  arg_count             = 0;
-      bool                                 closing_quote_pending = false;
-      std::string                          args_buffer;  // Buffer to delay arguments until tool name is known
-
-      // Returns a reference to the active argument destination string.
-      // Before tool_name is known, writes go to args_buffer; after, to current_tool->arguments.
-      std::string & args_target();
-};
-
-class common_chat_peg_gemma4_mapper : public common_chat_peg_mapper {
-  public:
-    common_chat_peg_gemma4_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
-    virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
-  private:
-    void visit(const common_peg_ast_arena & arena, common_peg_ast_id id);
-};
-
-struct content_structure;
-struct tool_call_structure;
-
-class common_chat_peg_builder : public common_peg_parser_builder {
-  public:
-    // Tag constants (from former common_chat_peg_base_builder)
-    static constexpr const char * REASONING_BLOCK = "reasoning-block";
-    static constexpr const char * REASONING       = "reasoning";
-    static constexpr const char * CONTENT         = "content";
-
-    // Tag constants
-    static constexpr const char * TOOL           = "tool";
-    static constexpr const char * TOOL_OPEN      = "tool-open";
-    static constexpr const char * TOOL_CLOSE     = "tool-close";
-    static constexpr const char * TOOL_ID        = "tool-id";
-    static constexpr const char * TOOL_NAME      = "tool-name";
-    static constexpr const char * TOOL_ARGS      = "tool-args";
-    static constexpr const char * TOOL_ARG       = "tool-arg";
-    static constexpr const char * TOOL_ARG_OPEN  = "tool-arg-open";
-    static constexpr const char * TOOL_ARG_CLOSE = "tool-arg-close";
-    static constexpr const char * TOOL_ARG_NAME         = "tool-arg-name";
-    static constexpr const char * TOOL_ARG_VALUE        = "tool-arg-value";
-    static constexpr const char * TOOL_ARG_STRING_VALUE = "tool-arg-string-value";  // For schema-declared string types
-
-    // Low-level tag methods (from former common_chat_peg_base_builder)
-    common_peg_parser reasoning_block(const common_peg_parser & p) { return tag(REASONING_BLOCK, p); }
-
-    common_peg_parser reasoning(const common_peg_parser & p) { return tag(REASONING, p); }
-
-    common_peg_parser content(const common_peg_parser & p) { return tag(CONTENT, p); }
-
-    common_peg_parser tag_with_safe_content(const std::string &       tag_name,
-                        const std::string &       marker,
-                        const common_peg_parser & p);
-
-    // Low-level tag methods
-    common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
-    common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
-    common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
-    common_peg_parser tool_id(const common_peg_parser & p) { return atomic(tag(TOOL_ID, p)); }
-    common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
-    common_peg_parser tool_args(const common_peg_parser & p) { return tag(TOOL_ARGS, p); }
-    common_peg_parser tool_arg(const common_peg_parser & p) { return tag(TOOL_ARG, p); }
-    common_peg_parser tool_arg_open(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_OPEN, p)); }
-    common_peg_parser tool_arg_close(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_CLOSE, p)); }
-    common_peg_parser tool_arg_name(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_NAME, p)); }
-    common_peg_parser tool_arg_value(const common_peg_parser & p) { return tag(TOOL_ARG_VALUE, p); }
-
-    // Use for schema-declared string types - won't be treated as potential JSON container
-    common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
-    common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_VALUE, p)); }
-
-
-    // Return a parser that parses the prefix of a string, up to a given delimiter.
-    common_peg_parser prefix(const std::string & s, const std::string & delimiter = {});
-
-    // Legacy-compatible helper for building standard JSON tool calls
-    // Used by tests and manual parsers
-    // name_key/args_key: JSON key names for function name and arguments
-    //   Empty or "name"/"arguments" will accept both common variations
-    //   Supports dot notation for nested objects (e.g., "function.name")
-    // array_wrapped: if true, tool calls are wrapped in JSON array [...]
-    // function_is_key: if true, function name is the JSON key (e.g., {"func_name": {...}})
-    // call_id_key: JSON key for string call ID (e.g., "id")
-    // gen_call_id_key: JSON key for generated integer call ID (e.g., "tool_call_id")
-    // parameters_order: order in which JSON fields should be parsed
-    common_peg_parser standard_json_tools(const std::string &              section_start,
-                                          const std::string &              section_end,
-                                          const nlohmann::ordered_json &   tools,
-                                          bool                             parallel_tool_calls,
-                                          bool                             force_tool_calls,
-                                          const std::string &              name_key = "",
-                                          const std::string &              args_key = "",
-                                          bool                             array_wrapped = false,
-                                          bool                             function_is_key = false,
-                                          const std::string &              call_id_key = "",
-                                          const std::string &              gen_call_id_key = "",
-                                          const std::vector<std::string> & parameters_order = {});
-
-    // Legacy-compatible helper for building XML/tagged style tool calls
-    // Used by tests and manual parsers
-    common_peg_parser standard_constructed_tools(const std::map<std::string, std::string> & markers,
-                                                 const nlohmann::ordered_json &             tools,
-                                                 bool                                       parallel_tool_calls,
-                                                 bool                                       force_tool_calls);
-
-    // Helper for Python-style function call format: name(arg1="value1", arg2=123)
-    // Used by LFM2 and similar templates
-    common_peg_parser python_style_tool_calls(const nlohmann::ordered_json & tools,
-                                              bool                           parallel_tool_calls);
-
-  private:
-    // Implementation helpers for standard_json_tools — one per JSON tool call layout mode
-    common_peg_parser build_json_tools_function_is_key(const nlohmann::ordered_json & tools,
-                                                       const std::string &            args_key,
-                                                       const std::string &            effective_args_key,
-                                                       const std::string &            call_id_key,
-                                                       const std::string &            gen_call_id_key);
-
-    common_peg_parser build_json_tools_nested_keys(const nlohmann::ordered_json & tools,
-                                                   const std::string &            effective_name_key,
-                                                   const std::string &            effective_args_key,
-                                                   const std::string &            call_id_key,
-                                                   const std::string &            gen_call_id_key);
-
-    common_peg_parser build_json_tools_flat_keys(const nlohmann::ordered_json &   tools,
-                                                 const std::string &              effective_name_key,
-                                                 const std::string &              effective_args_key,
-                                                 const std::string &              call_id_key,
-                                                 const std::string &              gen_call_id_key,
-                                                 const std::vector<std::string> & parameters_order);
-};
-
-inline common_peg_arena build_chat_peg_parser(
-  const std::function<common_peg_parser(common_chat_peg_builder & builder)> & fn) {
-  common_chat_peg_builder builder;
-  builder.set_root(fn(builder));
-  return builder.build();
-}
-
-class tag_based_peg_mapper {
-  public:
-    std::map<std::string, std::string> tags;
-
-    void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
-};
-
-struct tagged_parse_result {
-    common_peg_parse_result              result;
-    std::map<std::string, std::string> tags;
-};
-
-struct tagged_peg_parser {
-    common_peg_arena arena;
-    common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_NONE;
-
-    tagged_peg_parser & withDebug() {
-      flags |= COMMON_PEG_PARSE_FLAG_DEBUG;
-      return *this;
-    }
-
-    tagged_peg_parser & withoutDebug() {
-      flags = flags & ~COMMON_PEG_PARSE_FLAG_DEBUG;
-      return *this;
-    }
-
-    tagged_parse_result parse_and_extract(const std::string & input, common_peg_parse_flags extra_flags = COMMON_PEG_PARSE_FLAG_NONE) const;
-    tagged_parse_result parse_anywhere_and_extract(const std::string & input) const;
-};
-
-tagged_peg_parser build_tagged_peg_parser(
-    const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn);
-
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Francis Couture-Harpin	93fbd407f3	Merge branch 'master' into compilade/convert-prequant Some checks failed Check Pre-Tokenizer Hashes / pre-tokenizer-hashes (push) Has been cancelled Python check requirements.txt / check-requirements (push) Has been cancelled Python Type-Check / pyright type-check (push) Has been cancelled Update Operations Documentation / update-ops-docs (push) Has been cancelled	2025-10-23 14:23:12 -04:00
Francis Couture-Harpin	0d5cfed596	Merge branch 'master' into compilade/convert-prequant Some checks failed Copilot Setup Steps / copilot-setup-steps (push) Has been cancelled Check Pre-Tokenizer Hashes / pre-tokenizer-hashes (push) Has been cancelled Python check requirements.txt / check-requirements (push) Has been cancelled Python Type-Check / pyright type-check (push) Has been cancelled Update Operations Documentation / update-ops-docs (push) Has been cancelled	2025-09-09 14:23:06 -04:00
Francis Couture-Harpin	adec43d774	Merge branch 'master' into compilade/convert-prequant	2025-09-01 10:13:29 -04:00
Francis Couture-Harpin	899398277d	convert : fix conversion from FP8 for Deepseek-V3.1-Base	2025-08-19 17:27:59 -04:00
Francis Couture-Harpin	1ae6ab7601	Merge branch 'master' into compilade/convert-prequant	2025-08-14 17:05:21 -04:00
Francis Couture-Harpin	de12f8ac50	convert : begin handling pre-quantized models	2025-07-22 04:11:34 -04:00
				`@@ -1 +0,0 @@`
				`IMPORTANT: Ensure you’ve thoroughly reviewed the [AGENTS.md](AGENTS.md) file before beginning any work.`